diff --git a/scripts/resh-evaluate-plot.py b/scripts/resh-evaluate-plot.py index f1aee96..946d2a1 100755 --- a/scripts/resh-evaluate-plot.py +++ b/scripts/resh-evaluate-plot.py @@ -25,6 +25,7 @@ data = json.load(sys.stdin) DATA_records = [] DATA_records_by_session = defaultdict(list) +DATA_records_by_user = defaultdict(list) for user in data["UsersRecords"]: for device in user["Devices"]: for record in device["Records"]: @@ -33,6 +34,7 @@ for user in data["UsersRecords"]: DATA_records.append(record) DATA_records_by_session[record["seqSessionId"]].append(record) + DATA_records_by_user[user["Name"]].append(record) DATA_records = list(sorted(DATA_records, key=lambda x: x["realtimeAfterLocal"])) @@ -85,16 +87,24 @@ def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False): # similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf. def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False): cmd_count = defaultdict(int) + len_records = 0 for record in DATA_records: cmd = record["command"] if cmd == "": continue cmd_count[cmd] += 1 + len_records += 1 tmp = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)[:plotSize] cmdFrq = list(map(lambda x: x[1] / tmp[0][1], tmp)) labels = list(map(lambda x: trim(x[0], 7), tmp)) + top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records + top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records + top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records + print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent)) + print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent)) + print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent)) ranks = range(1, len(cmdFrq)+1) plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) plt.plot(ranks, zipf(len(ranks)), 'o-') @@ -113,26 +123,36 @@ def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False): # Figure 3.2. Command vocabulary size vs. the number of command lines entered for four individuals. def plot_cmdVocabularySize_cmdLinesEntered(): - cmd_vocabulary = set() - y_cmd_count = [0] - for record in DATA_records: - cmd = record["command"] - if cmd in cmd_vocabulary: - # repeat last value - y_cmd_count.append(y_cmd_count[-1]) - else: - cmd_vocabulary.add(cmd) - # append last value +1 - y_cmd_count.append(y_cmd_count[-1] + 1) - - # print(cmd_vocabulary) - x_cmds_entered = range(0, len(y_cmd_count)) - plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) - plt.plot(x_cmds_entered, y_cmd_count, '-') plt.title("Command vocabulary size vs. the number of command lines entered") plt.ylabel("Command vocabulary size") plt.xlabel("# of command lines entered") + legend = [] + + # x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items())) + # x_values = range(0, x_count) + for user in DATA_records_by_user.items(): + cmd_vocabulary = set() + y_cmd_count = [0] + name, records = user + for record in records: + cmd = record["command"] + if cmd in cmd_vocabulary: + # repeat last value + y_cmd_count.append(y_cmd_count[-1]) + else: + cmd_vocabulary.add(cmd) + # append last value +1 + y_cmd_count.append(y_cmd_count[-1] + 1) + + x_cmds_entered = range(0, len(y_cmd_count)) + plt.plot(x_cmds_entered, y_cmd_count, '-') + legend.append(name + " (TODO: sanitize!)") + + # print(cmd_vocabulary) + + plt.legend(legend, loc="best") + if async_draw: plt.draw() else: