From c5319a6813a3b32acb340fbf4adc62ac9047dd8d Mon Sep 17 00:00:00 2001 From: Simon Let Date: Sat, 14 Mar 2020 17:10:39 +0100 Subject: [PATCH] graph an plot updates --- pkg/histanal/histload.go | 4 +- scripts/resh-evaluate-plot.py | 133 ++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 46 deletions(-) diff --git a/pkg/histanal/histload.go b/pkg/histanal/histload.go index 2bc50b1..ec81cc2 100644 --- a/pkg/histanal/histload.go +++ b/pkg/histanal/histload.go @@ -172,7 +172,9 @@ func (e *HistLoad) loadHistoryRecords(fname string) []records.EnrichedRecord { } else if record.CmdLength == 0 { log.Fatal("Assert failed - 'cmdLength' is unset in the data. This should not happen.") } - recs = append(recs, records.Enriched(record)) + if !e.skipFailedCmds || record.ExitCode == 0 { + recs = append(recs, records.Enriched(record)) + } } return recs } diff --git a/scripts/resh-evaluate-plot.py b/scripts/resh-evaluate-plot.py index 946d2a1..326dd51 100755 --- a/scripts/resh-evaluate-plot.py +++ b/scripts/resh-evaluate-plot.py @@ -102,9 +102,9 @@ def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False): top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records - print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent)) - print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent)) - print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent)) + print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent)) + print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent)) + print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent)) ranks = range(1, len(cmdFrq)+1) plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) plt.plot(ranks, zipf(len(ranks)), 'o-') @@ -132,6 +132,9 @@ def plot_cmdVocabularySize_cmdLinesEntered(): # x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items())) # x_values = range(0, x_count) for user in DATA_records_by_user.items(): + new_cmds_after_1k = 0 + new_cmds_after_2k = 0 + new_cmds_after_3k = 0 cmd_vocabulary = set() y_cmd_count = [0] name, records = user @@ -144,7 +147,24 @@ def plot_cmdVocabularySize_cmdLinesEntered(): cmd_vocabulary.add(cmd) # append last value +1 y_cmd_count.append(y_cmd_count[-1] + 1) - + if len(y_cmd_count) > 1000: + new_cmds_after_1k+=1 + if len(y_cmd_count) > 2000: + new_cmds_after_2k+=1 + if len(y_cmd_count) > 3000: + new_cmds_after_3k+=1 + + if len(y_cmd_count) == 1000: + print("% {}: Cmd adoption rate at 1k (between 0 and 1k) cmdlines = {}".format(name ,len(cmd_vocabulary) / (len(y_cmd_count)))) + if len(y_cmd_count) == 2000: + print("% {}: Cmd adoption rate at 2k cmdlines = {}".format(name ,len(cmd_vocabulary) / (len(y_cmd_count)))) + print("% {}: Cmd adoption rate between 1k and 2k cmdlines = {}".format(name ,new_cmds_after_1k / (len(y_cmd_count) - 1000))) + if len(y_cmd_count) == 3000: + print("% {}: Cmd adoption rate between 2k and 3k cmdlines = {}".format(name ,new_cmds_after_2k / (len(y_cmd_count) - 2000))) + + print("% {}: New cmd adoption rate after 1k cmdlines = {}".format(name ,new_cmds_after_1k / (len(y_cmd_count) - 1000))) + print("% {}: New cmd adoption rate after 2k cmdlines = {}".format(name ,new_cmds_after_2k / (len(y_cmd_count) - 2000))) + print("% {}: New cmd adoption rate after 3k cmdlines = {}".format(name ,new_cmds_after_3k / (len(y_cmd_count) - 3000))) x_cmds_entered = range(0, len(y_cmd_count)) plt.plot(x_cmds_entered, y_cmd_count, '-') legend.append(name + " (TODO: sanitize!)") @@ -160,26 +180,33 @@ def plot_cmdVocabularySize_cmdLinesEntered(): # Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals. def plot_cmdLineVocabularySize_cmdLinesEntered(): - cmdLine_vocabulary = set() - y_cmdLine_count = [0] - for record in DATA_records: - cmdLine = record["cmdLine"] - if cmdLine in cmdLine_vocabulary: - # repeat last value - y_cmdLine_count.append(y_cmdLine_count[-1]) - else: - cmdLine_vocabulary.add(cmdLine) - # append last value +1 - y_cmdLine_count.append(y_cmdLine_count[-1] + 1) - - # print(cmdLine_vocabulary) - x_cmdLines_entered = range(0, len(y_cmdLine_count)) - plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) - plt.plot(x_cmdLines_entered, y_cmdLine_count, '-') plt.title("Command line vocabulary size vs. the number of command lines entered") plt.ylabel("Command line vocabulary size") plt.xlabel("# of command lines entered") + legend = [] + + for user in DATA_records_by_user.items(): + cmdLine_vocabulary = set() + y_cmdLine_count = [0] + name, records = user + for record in records: + cmdLine = record["cmdLine"] + if cmdLine in cmdLine_vocabulary: + # repeat last value + y_cmdLine_count.append(y_cmdLine_count[-1]) + else: + cmdLine_vocabulary.add(cmdLine) + # append last value +1 + y_cmdLine_count.append(y_cmdLine_count[-1] + 1) + + # print(cmdLine_vocabulary) + x_cmdLines_entered = range(0, len(y_cmdLine_count)) + plt.plot(x_cmdLines_entered, y_cmdLine_count, '-') + legend.append(name + " (TODO: sanitize!)") + + plt.legend(legend, loc="best") + if async_draw: plt.draw() else: @@ -190,11 +217,14 @@ def plot_cmdLineVocabularySize_cmdLinesEntered(): # solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001). def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True): START_CMD = "_start_" + END_CMD = "_end_" cmd_count = defaultdict(int) cmdSeq_count = defaultdict(lambda: defaultdict(int)) cmd_id = dict() x = 0 cmd_id[START_CMD] = str(x) + x += 1 + cmd_id[END_CMD] = str(x) for pid, session in DATA_records_by_session.items(): cmd_count[START_CMD] += 1 prev_cmd = START_CMD @@ -206,6 +236,10 @@ def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True): x += 1 cmd_id[cmd] = str(x) prev_cmd = cmd + # end the session + cmdSeq_count[prev_cmd][END_CMD] += 1 + cmd_count[END_CMD] += 1 + # get `node_count` of largest nodes sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True) @@ -275,15 +309,18 @@ def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True): scale_ = seq_count / cmd_count[cmd] penwidth_ = str((0.5 + 4.5 * scale_) * extra_scaling_factor) #penwidth_bold_ = str(8 * scale_) - if scale_ > 0.5: + # if scale_ > 0.5: + # graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved', + # penwidth=penwidth_, style='bold', arrowhead='diamond') + # elif scale_ > 0.2: + if scale_ > 0.3: + scale_ = str(int(scale_ * 100)/100) graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved', - penwidth=penwidth_, style='bold', arrowhead='diamond') + penwidth=penwidth_, forcelables='true', label=scale_) elif scale_ > 0.2: - graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved', - penwidth=penwidth_) - elif scale_ > 0.1: graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved', penwidth=penwidth_, style='dashed') + # elif scale_ > 0.1: else: graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved', penwidth=penwidth_, style='dotted', arrowhead='empty') @@ -360,7 +397,7 @@ def plot_strategies_matches(plot_size=50, selected_strategies=[]): assert(saved_matches_total is not None) assert(saved_dataPoint_count is not None) max_values = [100 * saved_matches_total / saved_dataPoint_count] * len(x_values) - print(">>> Avg recurrence rate = {}".format(max_values[0])) + print("% >>> Avg recurrence rate = {}".format(max_values[0])) plt.plot(x_values, max_values, 'r-') legend.append("maximum possible") @@ -432,7 +469,7 @@ def plot_strategies_charsRecalled(plot_size=50, selected_strategies=[]): assert(saved_charsRecalled_total is not None) assert(saved_dataPoint_count is not None) max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values) - print(">>> Max avg recalled characters = {}".format(max_values[0])) + print("% >>> Max avg recalled characters = {}".format(max_values[0])) plt.plot(x_values, max_values, 'r-') legend.append("maximum possible") @@ -508,7 +545,7 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]): assert(saved_charsRecalled_total is not None) assert(saved_dataPoint_count is not None) max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values) - print(">>> Max avg recalled characters (including prefix matches) = {}".format(max_values[0])) + print("% >>> Max avg recalled characters (including prefix matches) = {}".format(max_values[0])) plt.plot(x_values, max_values, 'r-') legend.append("maximum possible") @@ -522,24 +559,30 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]): plt.show() -plot_cmdLineFrq_rank() -plot_cmdFrq_rank() - -plot_cmdLineVocabularySize_cmdLinesEntered() -plot_cmdVocabularySize_cmdLinesEntered() - -plot_strategies_matches(20) -plot_strategies_charsRecalled(20) -plot_strategies_charsRecalled_prefix(20) - -graph_cmdSequences(node_count=33, edge_minValue=0.048) - -graph_cmdSequences(node_count=28, edge_minValue=0.06) -# for n in range(29, 35): -# for e in range(44, 56, 2): -# e *= 0.001 -# graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False) +# plot_cmdLineFrq_rank() +# plot_cmdFrq_rank() +# +# plot_cmdLineVocabularySize_cmdLinesEntered() +# plot_cmdVocabularySize_cmdLinesEntered() +# +# plot_strategies_matches(20) +# plot_strategies_charsRecalled(20) +# plot_strategies_charsRecalled_prefix(20) +# +# graph_cmdSequences(node_count=33, edge_minValue=0.048) +# +# graph_cmdSequences(node_count=28, edge_minValue=0.06) + +for n in range(40, 43): + for e in range(94, 106, 2): + e *= 0.001 + graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False) + +#for n in range(29, 35): +# for e in range(44, 56, 2): +# e *= 0.001 +# graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False) # be careful and check if labels fit the display