From 34daa9ba8a5a874ad4d97c1acd4ede069840b6da Mon Sep 17 00:00:00 2001 From: Simon Let Date: Sun, 22 Mar 2020 21:27:05 +0100 Subject: [PATCH] evaluation changes --- cmd/evaluate/main.go | 14 +- scripts/resh-evaluate-plot.py | 316 ++++++++++++++++++++++++++++++++-- 2 files changed, 310 insertions(+), 20 deletions(-) diff --git a/cmd/evaluate/main.go b/cmd/evaluate/main.go index ffd4aae..5b4bda8 100644 --- a/cmd/evaluate/main.go +++ b/cmd/evaluate/main.go @@ -9,7 +9,6 @@ import ( "path/filepath" "github.com/curusarn/resh/pkg/histanal" - "github.com/curusarn/resh/pkg/records" "github.com/curusarn/resh/pkg/strat" ) @@ -109,12 +108,13 @@ func main() { // dynamicDistG.Init() // strategies = append(strategies, &dynamicDistG) - distanceStaticBest := strat.RecordDistance{ - MaxDepth: 3000, - DistParams: records.DistParams{Pwd: 10, RealPwd: 10, SessionID: 1, Time: 1}, - Label: "10*pwd,10*realpwd,session,time", - } - strategies = append(strategies, &distanceStaticBest) + // NOTE: this is the decent one !!! + // distanceStaticBest := strat.RecordDistance{ + // MaxDepth: 3000, + // DistParams: records.DistParams{Pwd: 10, RealPwd: 10, SessionID: 1, Time: 1}, + // Label: "10*pwd,10*realpwd,session,time", + // } + // strategies = append(strategies, &distanceStaticBest) recentBash := strat.RecentBash{} recentBash.Init() diff --git a/scripts/resh-evaluate-plot.py b/scripts/resh-evaluate-plot.py index 98808c0..89792cb 100755 --- a/scripts/resh-evaluate-plot.py +++ b/scripts/resh-evaluate-plot.py @@ -15,6 +15,7 @@ rcParams['font.family'] = 'serif' import matplotlib.pyplot as plt import matplotlib.path as mpath +import matplotlib.patches as mpatches PLOT_WIDTH = 10 # inches PLOT_HEIGHT = 7 # inches @@ -27,14 +28,18 @@ DATA_records = [] DATA_records_by_session = defaultdict(list) DATA_records_by_user = defaultdict(list) for user in data["UsersRecords"]: + if user["Devices"] is None: + continue for device in user["Devices"]: + if device["Records"] is None: + continue for record in device["Records"]: if "invalid" in record and record["invalid"]: continue DATA_records.append(record) DATA_records_by_session[record["seqSessionId"]].append(record) - DATA_records_by_user[user["Name"]].append(record) + DATA_records_by_user[user["Name"] + ":" + device["Name"]].append(record) DATA_records = list(sorted(DATA_records, key=lambda x: x["realtimeAfterLocal"])) @@ -213,6 +218,283 @@ def plot_cmdVocabularySize_cmdLinesEntered(): else: plt.show() + +def plot_cmdVocabularySize_daily(): + SECONDS_IN_A_DAY = 86400 + plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) + plt.title("Command vocabulary size in days") + plt.ylabel("Command vocabulary size") + plt.xlabel("Days") + legend = [] + + # x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items())) + # x_values = range(0, x_count) + for user in DATA_records_by_user.items(): + new_cmds_after_100 = 0 + new_cmds_after_200 = 0 + new_cmds_after_300 = 0 + cmd_vocabulary = set() + y_cmd_count = [0] + name, records = user + + cmd_fail_count = 0 + + if not len(records): + print("ERROR: no records for user {}".format(name)) + continue + + first_day = records[0]["realtimeAfter"] + this_day = first_day + + for record in records: + cmd = record["command"] + timestamp = record["realtimeAfter"] + + if cmd == "": + cmd_fail_count += 1 + continue + + if timestamp >= this_day + SECONDS_IN_A_DAY: + this_day += SECONDS_IN_A_DAY + while timestamp >= this_day + SECONDS_IN_A_DAY: + y_cmd_count.append(-10) + this_day += SECONDS_IN_A_DAY + + y_cmd_count.append(len(cmd_vocabulary)) + cmd_vocabulary = set() # wipes the vocabulary each day + + if len(y_cmd_count) > 100: + new_cmds_after_100+=1 + if len(y_cmd_count) > 200: + new_cmds_after_200+=1 + if len(y_cmd_count) > 300: + new_cmds_after_300+=1 + + if len(y_cmd_count) == 100: + print("% {}: Cmd adoption rate at 100 days (between 0 and 100 days) = {}".format(name, len(cmd_vocabulary) / (len(y_cmd_count)))) + if len(y_cmd_count) == 200: + print("% {}: Cmd adoption rate at 200 days days = {}".format(name, len(cmd_vocabulary) / (len(y_cmd_count)))) + print("% {}: Cmd adoption rate between 100 and 200 days = {}".format(name, new_cmds_after_100 / (len(y_cmd_count) - 100))) + if len(y_cmd_count) == 300: + print("% {}: Cmd adoption rate between 200 and 300 days = {}".format(name, new_cmds_after_200 / (len(y_cmd_count) - 200))) + + if cmd not in cmd_vocabulary: + cmd_vocabulary.add(cmd) + + + print("% {}: New cmd adoption rate after 100 days = {}".format(name, new_cmds_after_100 / (len(y_cmd_count) - 100))) + print("% {}: New cmd adoption rate after 200 days = {}".format(name, new_cmds_after_200 / (len(y_cmd_count) - 200))) + print("% {}: New cmd adoption rate after 300 days = {}".format(name, new_cmds_after_300 / (len(y_cmd_count) - 300))) + print("% {}: cmd_fail_count = {}".format(name, cmd_fail_count)) + x_cmds_entered = range(0, len(y_cmd_count)) + plt.plot(x_cmds_entered, y_cmd_count, 'o', markersize=2) + legend.append(name + " (TODO: sanitize!)") + + # print(cmd_vocabulary) + + plt.legend(legend, loc="best") + plt.ylim(bottom=-5) + + if async_draw: + plt.draw() + else: + plt.show() + + +def matplotlib_escape(ss): + ss = ss.replace('$', '\\$') + return ss + + +def plot_cmdUsage_in_time(sort_cmds=False, num_cmds=None): + SECONDS_IN_A_DAY = 86400 + tab_colors = ("tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple", "tab:brown", "tab:pink", "tab:gray") + plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) + plt.title("Command use in time") + plt.ylabel("Commands") + plt.xlabel("Days") + legend_patches = [] + + cmd_ids = {} + y_labels = [] + + all_x_values = [] + all_y_values = [] + all_s_values = [] # size + all_c_values = [] # color + + x_values = [] + y_values = [] + s_values = [] # size + c_values = [] # color + + if sort_cmds: + cmd_count = defaultdict(int) + for user in DATA_records_by_user.items(): + name, records = user + for record in records: + cmd = record["command"] + cmd_count[cmd] += 1 + + sorted_cmds = map(lambda x: x[0], sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)) + + for cmd in sorted_cmds: + cmd_ids[cmd] = len(cmd_ids) + y_labels.append(matplotlib_escape(cmd)) + + + for user_idx, user in enumerate(DATA_records_by_user.items()): + name, records = user + + if not len(records): + print("ERROR: no records for user {}".format(name)) + continue + + + first_day = records[0]["realtimeAfter"] + this_day = first_day + day_no = 0 + today_cmds = defaultdict(int) + + for record in records: + cmd = record["command"] + timestamp = record["realtimeAfter"] + + if cmd == "": + print("NOTICE: Empty cmd for {}".format(record["cmdLine"])) + continue + + if timestamp >= this_day + SECONDS_IN_A_DAY: + for item in today_cmds.items(): + cmd, count = item + cmd_id = cmd_ids[cmd] + # skip commands with high ids + if num_cmds is not None and cmd_id >= num_cmds: + continue + + x_values.append(day_no) + y_values.append(cmd_id) + s_values.append(count) + c_values.append(tab_colors[user_idx]) + + today_cmds = defaultdict(int) + + this_day += SECONDS_IN_A_DAY + day_no += 1 + while timestamp >= this_day + SECONDS_IN_A_DAY: + this_day += SECONDS_IN_A_DAY + day_no += 1 + + if cmd not in cmd_ids: + cmd_ids[cmd] = len(cmd_ids) + y_labels.append(matplotlib_escape(cmd)) + + today_cmds[cmd] += 1 + + all_x_values.extend(x_values) + all_y_values.extend(y_values) + all_s_values.extend(s_values) + all_c_values.extend(c_values) + x_values = [] + y_values = [] + s_values = [] + c_values = [] + legend_patches.append(mpatches.Patch(color=tab_colors[user_idx], label="{} ({}) (TODO: sanitize!)".format(name, user_idx))) + + if num_cmds is not None and len(y_labels) > num_cmds: + y_labels = y_labels[:num_cmds] + plt.yticks(ticks=range(0, len(y_labels)), labels=y_labels, fontsize=6) + plt.scatter(all_x_values, all_y_values, s=all_s_values, c=all_c_values, marker='o') + plt.legend(handles=legend_patches, loc="best") + + if async_draw: + plt.draw() + else: + plt.show() + + +# Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals. +def plot_cmdVocabularySize_time(): + SECONDS_IN_A_DAY = 86400 + plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) + plt.title("Command vocabulary size growth in time") + plt.ylabel("Command vocabulary size") + plt.xlabel("Days") + legend = [] + + # x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items())) + # x_values = range(0, x_count) + for user in DATA_records_by_user.items(): + new_cmds_after_100 = 0 + new_cmds_after_200 = 0 + new_cmds_after_300 = 0 + cmd_vocabulary = set() + y_cmd_count = [0] + name, records = user + + cmd_fail_count = 0 + + if not len(records): + print("ERROR: no records for user {}".format(name)) + continue + + first_day = records[0]["realtimeAfter"] + this_day = first_day + + for record in records: + cmd = record["command"] + timestamp = record["realtimeAfter"] + + if cmd == "": + cmd_fail_count += 1 + continue + + if timestamp >= this_day + SECONDS_IN_A_DAY: + this_day += SECONDS_IN_A_DAY + while timestamp >= this_day + SECONDS_IN_A_DAY: + y_cmd_count.append(-10) + this_day += SECONDS_IN_A_DAY + + y_cmd_count.append(len(cmd_vocabulary)) + + if len(y_cmd_count) > 100: + new_cmds_after_100+=1 + if len(y_cmd_count) > 200: + new_cmds_after_200+=1 + if len(y_cmd_count) > 300: + new_cmds_after_300+=1 + + if len(y_cmd_count) == 100: + print("% {}: Cmd adoption rate at 100 days (between 0 and 100 days) = {}".format(name, len(cmd_vocabulary) / (len(y_cmd_count)))) + if len(y_cmd_count) == 200: + print("% {}: Cmd adoption rate at 200 days days = {}".format(name, len(cmd_vocabulary) / (len(y_cmd_count)))) + print("% {}: Cmd adoption rate between 100 and 200 days = {}".format(name, new_cmds_after_100 / (len(y_cmd_count) - 100))) + if len(y_cmd_count) == 300: + print("% {}: Cmd adoption rate between 200 and 300 days = {}".format(name, new_cmds_after_200 / (len(y_cmd_count) - 200))) + + if cmd not in cmd_vocabulary: + cmd_vocabulary.add(cmd) + + + print("% {}: New cmd adoption rate after 100 days = {}".format(name, new_cmds_after_100 / (len(y_cmd_count) - 100))) + print("% {}: New cmd adoption rate after 200 days = {}".format(name, new_cmds_after_200 / (len(y_cmd_count) - 200))) + print("% {}: New cmd adoption rate after 300 days = {}".format(name, new_cmds_after_300 / (len(y_cmd_count) - 300))) + print("% {}: cmd_fail_count = {}".format(name, cmd_fail_count)) + x_cmds_entered = range(0, len(y_cmd_count)) + plt.plot(x_cmds_entered, y_cmd_count, 'o', markersize=2) + legend.append(name + " (TODO: sanitize!)") + + # print(cmd_vocabulary) + + plt.legend(legend, loc="best") + plt.ylim(bottom=0) + + if async_draw: + plt.draw() + else: + plt.show() + + # Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals. def plot_cmdLineVocabularySize_cmdLinesEntered(): plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) @@ -601,7 +883,7 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]): plt.show() -def plot_strategies_matches_noncummulative(plot_size=50, selected_strategies=["recent (bash-like)"], show_strat_title=False): +def plot_strategies_matches_noncummulative(plot_size=50, selected_strategies=["recent (bash-like)"], show_strat_title=False, force_strat_title=None): plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) plt.title("Matches at distance (noncumulative) <{}>".format(datetime.now().strftime('%H:%M:%S'))) plt.ylabel('%' + " of matches") @@ -655,7 +937,10 @@ def plot_strategies_matches_noncummulative(plot_size=50, selected_strategies=["r matches_percent = list(map(lambda x: 100 * x / dataPoint_count, matches)) plt.plot(x_values, matches_percent, 'o-') - legend.append(strategy_title) + if force_strat_title is not None: + legend.append(force_strat_title) + else: + legend.append(strategy_title) assert(saved_matches_total is not None) assert(saved_dataPoint_count is not None) @@ -891,24 +1176,29 @@ def print_avg_cmdline_length(): # plot_cmdFrq_rank() print_top_cmds(30) print_top_cmds_by_user(30) -print_avg_cmdline_length() +# print_avg_cmdline_length() # # plot_cmdLineVocabularySize_cmdLinesEntered() -# plot_cmdVocabularySize_cmdLinesEntered() +plot_cmdVocabularySize_cmdLinesEntered() +plot_cmdVocabularySize_time() +# plot_cmdVocabularySize_daily() +plot_cmdUsage_in_time(num_cmds=100) +plot_cmdUsage_in_time(sort_cmds=True, num_cmds=100) # recent_strats=("recent", "recent (bash-like)") recurrence_strat=("recent (bash-like)",) -plot_strategies_matches(20) -plot_strategies_charsRecalled(20) -plot_strategies_charsRecalled_prefix(20) +# plot_strategies_matches(20) +# plot_strategies_charsRecalled(20) +# plot_strategies_charsRecalled_prefix(20) # plot_strategies_charsRecalled_noncummulative(20, selected_strategies=recent_strats) -plot_strategies_matches_noncummulative(20) -plot_strategies_charsRecalled_noncummulative(20) -plot_strategies_charsRecalled_prefix_noncummulative(20) -plot_strategies_matches(20, selected_strategies=recurrence_strat, show_strat_title=True, force_strat_title="recurrence rate") +# plot_strategies_matches_noncummulative(20) +# plot_strategies_charsRecalled_noncummulative(20) +# plot_strategies_charsRecalled_prefix_noncummulative(20) +# plot_strategies_matches(20, selected_strategies=recurrence_strat, show_strat_title=True, force_strat_title="recurrence rate") +# plot_strategies_matches_noncummulative(20, selected_strategies=recurrence_strat, show_strat_title=True, force_strat_title="recurrence rate") # graph_cmdSequences(node_count=33, edge_minValue=0.048) -# + # graph_cmdSequences(node_count=28, edge_minValue=0.06) # new improved