improvements to plots

6 years ago · 1607c2e9aa
parent 2857927608
commit 1607c2e9aa
1 changed files with 36 additions and 16 deletions
--- a/scripts/resh-evaluate-plot.py
+++ b/scripts/resh-evaluate-plot.py
@ -25,6 +25,7 @@ data = json.load(sys.stdin)

 DATA_records = []
 DATA_records_by_session = defaultdict(list) 
+DATA_records_by_user = defaultdict(list) 
 for user in data["UsersRecords"]:
    for device in user["Devices"]:
        for record in device["Records"]:
@ -33,6 +34,7 @@ for user in data["UsersRecords"]:
            
            DATA_records.append(record)
            DATA_records_by_session[record["seqSessionId"]].append(record)
+            DATA_records_by_user[user["Name"]].append(record)

 DATA_records = list(sorted(DATA_records, key=lambda x: x["realtimeAfterLocal"]))

@ -85,16 +87,24 @@ def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
 # similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
 def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    cmd_count = defaultdict(int)
+    len_records = 0
    for record in DATA_records:
        cmd = record["command"]
        if cmd == "":
            continue
        cmd_count[cmd] += 1
+        len_records += 1

    tmp = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)[:plotSize]
    cmdFrq = list(map(lambda x: x[1] / tmp[0][1], tmp))
    labels = list(map(lambda x: trim(x[0], 7), tmp))

+    top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records
+    top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records
+    top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records
+    print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent))
+    print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent))
+    print(">>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent))
    ranks = range(1, len(cmdFrq)+1)
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(ranks, zipf(len(ranks)), 'o-')
@ -113,26 +123,36 @@ def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):

 # Figure 3.2. Command vocabulary size vs. the number of command lines entered for four individuals.
 def plot_cmdVocabularySize_cmdLinesEntered():
-    cmd_vocabulary = set()
-    y_cmd_count = [0]
-    for record in DATA_records:
-        cmd = record["command"]
-        if cmd in cmd_vocabulary:
-            # repeat last value
-            y_cmd_count.append(y_cmd_count[-1])
-        else:
-            cmd_vocabulary.add(cmd)  
-            # append last value +1
-            y_cmd_count.append(y_cmd_count[-1] + 1)
-
-    # print(cmd_vocabulary)
-    x_cmds_entered = range(0, len(y_cmd_count))
-
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
-    plt.plot(x_cmds_entered, y_cmd_count, '-')
    plt.title("Command vocabulary size vs. the number of command lines entered")
    plt.ylabel("Command vocabulary size")
    plt.xlabel("# of command lines entered")
+    legend = []
+
+    # x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items()))
+    # x_values = range(0, x_count)  
+    for user in DATA_records_by_user.items():
+        cmd_vocabulary = set()
+        y_cmd_count = [0]
+        name, records = user
+        for record in records:
+            cmd = record["command"]
+            if cmd in cmd_vocabulary:
+                # repeat last value
+                y_cmd_count.append(y_cmd_count[-1])
+            else:
+                cmd_vocabulary.add(cmd)  
+                # append last value +1
+                y_cmd_count.append(y_cmd_count[-1] + 1)
+
+        x_cmds_entered = range(0, len(y_cmd_count))
+        plt.plot(x_cmds_entered, y_cmd_count, '-')
+        legend.append(name + " (TODO: sanitize!)")
+
+    # print(cmd_vocabulary)
+
+    plt.legend(legend, loc="best")
+
    if async_draw:
        plt.draw()
    else: