graph and plot updates

pull/123/head
Simon Let 6 years ago
parent 1607c2e9aa
commit c5319a6813
  1. 2
      pkg/histanal/histload.go
  2. 103
      scripts/resh-evaluate-plot.py

@ -172,7 +172,9 @@ func (e *HistLoad) loadHistoryRecords(fname string) []records.EnrichedRecord {
} else if record.CmdLength == 0 {
log.Fatal("Assert failed - 'cmdLength' is unset in the data. This should not happen.")
}
if !e.skipFailedCmds || record.ExitCode == 0 {
recs = append(recs, records.Enriched(record))
}
}
return recs
}

@ -102,9 +102,9 @@ def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records
top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records
top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records
print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent))
print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent))
print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent))
print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent))
print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent))
print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent))
ranks = range(1, len(cmdFrq)+1)
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.plot(ranks, zipf(len(ranks)), 'o-')
@ -132,6 +132,9 @@ def plot_cmdVocabularySize_cmdLinesEntered():
# x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items()))
# x_values = range(0, x_count)
for user in DATA_records_by_user.items():
new_cmds_after_1k = 0
new_cmds_after_2k = 0
new_cmds_after_3k = 0
cmd_vocabulary = set()
y_cmd_count = [0]
name, records = user
@ -144,7 +147,24 @@ def plot_cmdVocabularySize_cmdLinesEntered():
cmd_vocabulary.add(cmd)
# append last value +1
y_cmd_count.append(y_cmd_count[-1] + 1)
if len(y_cmd_count) > 1000:
new_cmds_after_1k+=1
if len(y_cmd_count) > 2000:
new_cmds_after_2k+=1
if len(y_cmd_count) > 3000:
new_cmds_after_3k+=1
if len(y_cmd_count) == 1000:
print("% {}: Cmd adoption rate at 1k (between 0 and 1k) cmdlines = {}".format(name ,len(cmd_vocabulary) / (len(y_cmd_count))))
if len(y_cmd_count) == 2000:
print("% {}: Cmd adoption rate at 2k cmdlines = {}".format(name ,len(cmd_vocabulary) / (len(y_cmd_count))))
print("% {}: Cmd adoption rate between 1k and 2k cmdlines = {}".format(name ,new_cmds_after_1k / (len(y_cmd_count) - 1000)))
if len(y_cmd_count) == 3000:
print("% {}: Cmd adoption rate between 2k and 3k cmdlines = {}".format(name ,new_cmds_after_2k / (len(y_cmd_count) - 2000)))
print("% {}: New cmd adoption rate after 1k cmdlines = {}".format(name ,new_cmds_after_1k / (len(y_cmd_count) - 1000)))
print("% {}: New cmd adoption rate after 2k cmdlines = {}".format(name ,new_cmds_after_2k / (len(y_cmd_count) - 2000)))
print("% {}: New cmd adoption rate after 3k cmdlines = {}".format(name ,new_cmds_after_3k / (len(y_cmd_count) - 3000)))
x_cmds_entered = range(0, len(y_cmd_count))
plt.plot(x_cmds_entered, y_cmd_count, '-')
legend.append(name + " (TODO: sanitize!)")
@ -160,9 +180,17 @@ def plot_cmdVocabularySize_cmdLinesEntered():
# Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals.
def plot_cmdLineVocabularySize_cmdLinesEntered():
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.title("Command line vocabulary size vs. the number of command lines entered")
plt.ylabel("Command line vocabulary size")
plt.xlabel("# of command lines entered")
legend = []
for user in DATA_records_by_user.items():
cmdLine_vocabulary = set()
y_cmdLine_count = [0]
for record in DATA_records:
name, records = user
for record in records:
cmdLine = record["cmdLine"]
if cmdLine in cmdLine_vocabulary:
# repeat last value
@ -174,12 +202,11 @@ def plot_cmdLineVocabularySize_cmdLinesEntered():
# print(cmdLine_vocabulary)
x_cmdLines_entered = range(0, len(y_cmdLine_count))
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.plot(x_cmdLines_entered, y_cmdLine_count, '-')
plt.title("Command line vocabulary size vs. the number of command lines entered")
plt.ylabel("Command line vocabulary size")
plt.xlabel("# of command lines entered")
legend.append(name + " (TODO: sanitize!)")
plt.legend(legend, loc="best")
if async_draw:
plt.draw()
else:
@ -190,11 +217,14 @@ def plot_cmdLineVocabularySize_cmdLinesEntered():
# solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
START_CMD = "_start_"
END_CMD = "_end_"
cmd_count = defaultdict(int)
cmdSeq_count = defaultdict(lambda: defaultdict(int))
cmd_id = dict()
x = 0
cmd_id[START_CMD] = str(x)
x += 1
cmd_id[END_CMD] = str(x)
for pid, session in DATA_records_by_session.items():
cmd_count[START_CMD] += 1
prev_cmd = START_CMD
@ -206,6 +236,10 @@ def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
x += 1
cmd_id[cmd] = str(x)
prev_cmd = cmd
# end the session
cmdSeq_count[prev_cmd][END_CMD] += 1
cmd_count[END_CMD] += 1
# get `node_count` of largest nodes
sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)
@ -275,15 +309,18 @@ def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
scale_ = seq_count / cmd_count[cmd]
penwidth_ = str((0.5 + 4.5 * scale_) * extra_scaling_factor)
#penwidth_bold_ = str(8 * scale_)
if scale_ > 0.5:
# if scale_ > 0.5:
# graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
# penwidth=penwidth_, style='bold', arrowhead='diamond')
# elif scale_ > 0.2:
if scale_ > 0.3:
scale_ = str(int(scale_ * 100)/100)
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_, style='bold', arrowhead='diamond')
penwidth=penwidth_, forcelables='true', label=scale_)
elif scale_ > 0.2:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_)
elif scale_ > 0.1:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_, style='dashed')
# elif scale_ > 0.1:
else:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
penwidth=penwidth_, style='dotted', arrowhead='empty')
@ -360,7 +397,7 @@ def plot_strategies_matches(plot_size=50, selected_strategies=[]):
assert(saved_matches_total is not None)
assert(saved_dataPoint_count is not None)
max_values = [100 * saved_matches_total / saved_dataPoint_count] * len(x_values)
print(">>> Avg recurrence rate = {}".format(max_values[0]))
print("% >>> Avg recurrence rate = {}".format(max_values[0]))
plt.plot(x_values, max_values, 'r-')
legend.append("maximum possible")
@ -432,7 +469,7 @@ def plot_strategies_charsRecalled(plot_size=50, selected_strategies=[]):
assert(saved_charsRecalled_total is not None)
assert(saved_dataPoint_count is not None)
max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values)
print(">>> Max avg recalled characters = {}".format(max_values[0]))
print("% >>> Max avg recalled characters = {}".format(max_values[0]))
plt.plot(x_values, max_values, 'r-')
legend.append("maximum possible")
@ -508,7 +545,7 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
assert(saved_charsRecalled_total is not None)
assert(saved_dataPoint_count is not None)
max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values)
print(">>> Max avg recalled characters (including prefix matches) = {}".format(max_values[0]))
print("% >>> Max avg recalled characters (including prefix matches) = {}".format(max_values[0]))
plt.plot(x_values, max_values, 'r-')
legend.append("maximum possible")
@ -522,21 +559,27 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
plt.show()
plot_cmdLineFrq_rank()
plot_cmdFrq_rank()
plot_cmdLineVocabularySize_cmdLinesEntered()
plot_cmdVocabularySize_cmdLinesEntered()
plot_strategies_matches(20)
plot_strategies_charsRecalled(20)
plot_strategies_charsRecalled_prefix(20)
graph_cmdSequences(node_count=33, edge_minValue=0.048)
# plot_cmdLineFrq_rank()
# plot_cmdFrq_rank()
#
# plot_cmdLineVocabularySize_cmdLinesEntered()
# plot_cmdVocabularySize_cmdLinesEntered()
#
# plot_strategies_matches(20)
# plot_strategies_charsRecalled(20)
# plot_strategies_charsRecalled_prefix(20)
#
# graph_cmdSequences(node_count=33, edge_minValue=0.048)
#
# graph_cmdSequences(node_count=28, edge_minValue=0.06)
graph_cmdSequences(node_count=28, edge_minValue=0.06)
for n in range(40, 43):
for e in range(94, 106, 2):
e *= 0.001
graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)
# for n in range(29, 35):
#for n in range(29, 35):
# for e in range(44, 56, 2):
# e *= 0.001
# graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)

Loading…
Cancel
Save