graph and plot updates

pull/123/head
Simon Let 6 years ago
parent 1607c2e9aa
commit c5319a6813
  1. 4
      pkg/histanal/histload.go
  2. 133
      scripts/resh-evaluate-plot.py

@ -172,7 +172,9 @@ func (e *HistLoad) loadHistoryRecords(fname string) []records.EnrichedRecord {
} else if record.CmdLength == 0 { } else if record.CmdLength == 0 {
log.Fatal("Assert failed - 'cmdLength' is unset in the data. This should not happen.") log.Fatal("Assert failed - 'cmdLength' is unset in the data. This should not happen.")
} }
recs = append(recs, records.Enriched(record)) if !e.skipFailedCmds || record.ExitCode == 0 {
recs = append(recs, records.Enriched(record))
}
} }
return recs return recs
} }

@ -102,9 +102,9 @@ def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records
top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records
top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records
print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent)) print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent))
print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent)) print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent))
print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent)) print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent))
ranks = range(1, len(cmdFrq)+1) ranks = range(1, len(cmdFrq)+1)
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.plot(ranks, zipf(len(ranks)), 'o-') plt.plot(ranks, zipf(len(ranks)), 'o-')
@ -132,6 +132,9 @@ def plot_cmdVocabularySize_cmdLinesEntered():
# x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items())) # x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items()))
# x_values = range(0, x_count) # x_values = range(0, x_count)
for user in DATA_records_by_user.items(): for user in DATA_records_by_user.items():
new_cmds_after_1k = 0
new_cmds_after_2k = 0
new_cmds_after_3k = 0
cmd_vocabulary = set() cmd_vocabulary = set()
y_cmd_count = [0] y_cmd_count = [0]
name, records = user name, records = user
@ -144,7 +147,24 @@ def plot_cmdVocabularySize_cmdLinesEntered():
cmd_vocabulary.add(cmd) cmd_vocabulary.add(cmd)
# append last value +1 # append last value +1
y_cmd_count.append(y_cmd_count[-1] + 1) y_cmd_count.append(y_cmd_count[-1] + 1)
if len(y_cmd_count) > 1000:
new_cmds_after_1k+=1
if len(y_cmd_count) > 2000:
new_cmds_after_2k+=1
if len(y_cmd_count) > 3000:
new_cmds_after_3k+=1
if len(y_cmd_count) == 1000:
print("% {}: Cmd adoption rate at 1k (between 0 and 1k) cmdlines = {}".format(name ,len(cmd_vocabulary) / (len(y_cmd_count))))
if len(y_cmd_count) == 2000:
print("% {}: Cmd adoption rate at 2k cmdlines = {}".format(name ,len(cmd_vocabulary) / (len(y_cmd_count))))
print("% {}: Cmd adoption rate between 1k and 2k cmdlines = {}".format(name ,new_cmds_after_1k / (len(y_cmd_count) - 1000)))
if len(y_cmd_count) == 3000:
print("% {}: Cmd adoption rate between 2k and 3k cmdlines = {}".format(name ,new_cmds_after_2k / (len(y_cmd_count) - 2000)))
print("% {}: New cmd adoption rate after 1k cmdlines = {}".format(name ,new_cmds_after_1k / (len(y_cmd_count) - 1000)))
print("% {}: New cmd adoption rate after 2k cmdlines = {}".format(name ,new_cmds_after_2k / (len(y_cmd_count) - 2000)))
print("% {}: New cmd adoption rate after 3k cmdlines = {}".format(name ,new_cmds_after_3k / (len(y_cmd_count) - 3000)))
x_cmds_entered = range(0, len(y_cmd_count)) x_cmds_entered = range(0, len(y_cmd_count))
plt.plot(x_cmds_entered, y_cmd_count, '-') plt.plot(x_cmds_entered, y_cmd_count, '-')
legend.append(name + " (TODO: sanitize!)") legend.append(name + " (TODO: sanitize!)")
@ -160,26 +180,33 @@ def plot_cmdVocabularySize_cmdLinesEntered():
# Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals. # Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals.
def plot_cmdLineVocabularySize_cmdLinesEntered(): def plot_cmdLineVocabularySize_cmdLinesEntered():
cmdLine_vocabulary = set()
y_cmdLine_count = [0]
for record in DATA_records:
cmdLine = record["cmdLine"]
if cmdLine in cmdLine_vocabulary:
# repeat last value
y_cmdLine_count.append(y_cmdLine_count[-1])
else:
cmdLine_vocabulary.add(cmdLine)
# append last value +1
y_cmdLine_count.append(y_cmdLine_count[-1] + 1)
# print(cmdLine_vocabulary)
x_cmdLines_entered = range(0, len(y_cmdLine_count))
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.plot(x_cmdLines_entered, y_cmdLine_count, '-')
plt.title("Command line vocabulary size vs. the number of command lines entered") plt.title("Command line vocabulary size vs. the number of command lines entered")
plt.ylabel("Command line vocabulary size") plt.ylabel("Command line vocabulary size")
plt.xlabel("# of command lines entered") plt.xlabel("# of command lines entered")
legend = []
for user in DATA_records_by_user.items():
cmdLine_vocabulary = set()
y_cmdLine_count = [0]
name, records = user
for record in records:
cmdLine = record["cmdLine"]
if cmdLine in cmdLine_vocabulary:
# repeat last value
y_cmdLine_count.append(y_cmdLine_count[-1])
else:
cmdLine_vocabulary.add(cmdLine)
# append last value +1
y_cmdLine_count.append(y_cmdLine_count[-1] + 1)
# print(cmdLine_vocabulary)
x_cmdLines_entered = range(0, len(y_cmdLine_count))
plt.plot(x_cmdLines_entered, y_cmdLine_count, '-')
legend.append(name + " (TODO: sanitize!)")
plt.legend(legend, loc="best")
if async_draw: if async_draw:
plt.draw() plt.draw()
else: else:
@ -190,11 +217,14 @@ def plot_cmdLineVocabularySize_cmdLinesEntered():
# solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001). # solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True): def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
START_CMD = "_start_" START_CMD = "_start_"
END_CMD = "_end_"
cmd_count = defaultdict(int) cmd_count = defaultdict(int)
cmdSeq_count = defaultdict(lambda: defaultdict(int)) cmdSeq_count = defaultdict(lambda: defaultdict(int))
cmd_id = dict() cmd_id = dict()
x = 0 x = 0
cmd_id[START_CMD] = str(x) cmd_id[START_CMD] = str(x)
x += 1
cmd_id[END_CMD] = str(x)
for pid, session in DATA_records_by_session.items(): for pid, session in DATA_records_by_session.items():
cmd_count[START_CMD] += 1 cmd_count[START_CMD] += 1
prev_cmd = START_CMD prev_cmd = START_CMD
@ -206,6 +236,10 @@ def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
x += 1 x += 1
cmd_id[cmd] = str(x) cmd_id[cmd] = str(x)
prev_cmd = cmd prev_cmd = cmd
# end the session
cmdSeq_count[prev_cmd][END_CMD] += 1
cmd_count[END_CMD] += 1
# get `node_count` of largest nodes # get `node_count` of largest nodes
sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True) sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)
@ -275,15 +309,18 @@ def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
scale_ = seq_count / cmd_count[cmd] scale_ = seq_count / cmd_count[cmd]
penwidth_ = str((0.5 + 4.5 * scale_) * extra_scaling_factor) penwidth_ = str((0.5 + 4.5 * scale_) * extra_scaling_factor)
#penwidth_bold_ = str(8 * scale_) #penwidth_bold_ = str(8 * scale_)
if scale_ > 0.5: # if scale_ > 0.5:
# graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
# penwidth=penwidth_, style='bold', arrowhead='diamond')
# elif scale_ > 0.2:
if scale_ > 0.3:
scale_ = str(int(scale_ * 100)/100)
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved', graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_, style='bold', arrowhead='diamond') penwidth=penwidth_, forcelables='true', label=scale_)
elif scale_ > 0.2: elif scale_ > 0.2:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_)
elif scale_ > 0.1:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved', graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_, style='dashed') penwidth=penwidth_, style='dashed')
# elif scale_ > 0.1:
else: else:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved', graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
penwidth=penwidth_, style='dotted', arrowhead='empty') penwidth=penwidth_, style='dotted', arrowhead='empty')
@ -360,7 +397,7 @@ def plot_strategies_matches(plot_size=50, selected_strategies=[]):
assert(saved_matches_total is not None) assert(saved_matches_total is not None)
assert(saved_dataPoint_count is not None) assert(saved_dataPoint_count is not None)
max_values = [100 * saved_matches_total / saved_dataPoint_count] * len(x_values) max_values = [100 * saved_matches_total / saved_dataPoint_count] * len(x_values)
print(">>> Avg recurrence rate = {}".format(max_values[0])) print("% >>> Avg recurrence rate = {}".format(max_values[0]))
plt.plot(x_values, max_values, 'r-') plt.plot(x_values, max_values, 'r-')
legend.append("maximum possible") legend.append("maximum possible")
@ -432,7 +469,7 @@ def plot_strategies_charsRecalled(plot_size=50, selected_strategies=[]):
assert(saved_charsRecalled_total is not None) assert(saved_charsRecalled_total is not None)
assert(saved_dataPoint_count is not None) assert(saved_dataPoint_count is not None)
max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values) max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values)
print(">>> Max avg recalled characters = {}".format(max_values[0])) print("% >>> Max avg recalled characters = {}".format(max_values[0]))
plt.plot(x_values, max_values, 'r-') plt.plot(x_values, max_values, 'r-')
legend.append("maximum possible") legend.append("maximum possible")
@ -508,7 +545,7 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
assert(saved_charsRecalled_total is not None) assert(saved_charsRecalled_total is not None)
assert(saved_dataPoint_count is not None) assert(saved_dataPoint_count is not None)
max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values) max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values)
print(">>> Max avg recalled characters (including prefix matches) = {}".format(max_values[0])) print("% >>> Max avg recalled characters (including prefix matches) = {}".format(max_values[0]))
plt.plot(x_values, max_values, 'r-') plt.plot(x_values, max_values, 'r-')
legend.append("maximum possible") legend.append("maximum possible")
@ -522,24 +559,30 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
plt.show() plt.show()
plot_cmdLineFrq_rank()
plot_cmdFrq_rank()
plot_cmdLineVocabularySize_cmdLinesEntered()
plot_cmdVocabularySize_cmdLinesEntered()
plot_strategies_matches(20)
plot_strategies_charsRecalled(20)
plot_strategies_charsRecalled_prefix(20)
graph_cmdSequences(node_count=33, edge_minValue=0.048)
graph_cmdSequences(node_count=28, edge_minValue=0.06)
# for n in range(29, 35): # plot_cmdLineFrq_rank()
# for e in range(44, 56, 2): # plot_cmdFrq_rank()
# e *= 0.001 #
# graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False) # plot_cmdLineVocabularySize_cmdLinesEntered()
# plot_cmdVocabularySize_cmdLinesEntered()
#
# plot_strategies_matches(20)
# plot_strategies_charsRecalled(20)
# plot_strategies_charsRecalled_prefix(20)
#
# graph_cmdSequences(node_count=33, edge_minValue=0.048)
#
# graph_cmdSequences(node_count=28, edge_minValue=0.06)
for n in range(40, 43):
for e in range(94, 106, 2):
e *= 0.001
graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)
#for n in range(29, 35):
# for e in range(44, 56, 2):
# e *= 0.001
# graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)
# be careful and check if labels fit the display # be careful and check if labels fit the display

Loading…
Cancel
Save