From 475b74589c7c2e8fb5b22aa323ae0f8a272bd49b Mon Sep 17 00:00:00 2001
From: Simon Let <simon.a.let@gmail.com>
Date: Thu, 19 Mar 2020 18:30:51 +0100
Subject: [PATCH] plot by user

---
 scripts/resh-evaluate-plot.py | 106 +++++++++++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 14 deletions(-)

diff --git a/scripts/resh-evaluate-plot.py b/scripts/resh-evaluate-plot.py
index 326dd51..6fb4ae7 100755
--- a/scripts/resh-evaluate-plot.py
+++ b/scripts/resh-evaluate-plot.py
@@ -86,6 +86,13 @@ def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
 
 # similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
 def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
+    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
+    plt.title("Command frequency / rank")
+    plt.ylabel("Normalized command frequency")
+    plt.xlabel("Command rank")
+    legend = []
+
+
     cmd_count = defaultdict(int)
     len_records = 0
     for record in DATA_records:
@@ -102,17 +109,43 @@ def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
     top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records
     top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records
     top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records
-    print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent))
-    print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent))
-    print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent))
+    print("% ALL: Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent))
+    print("% ALL: Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent))
+    print("% ALL: Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent))
     ranks = range(1, len(cmdFrq)+1)
-    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
-    plt.plot(ranks, zipf(len(ranks)), 'o-')
+    plt.plot(ranks, zipf(len(ranks)), '-')
+    legend.append("Zipf distribution")
     plt.plot(ranks, cmdFrq, 'o-')
-    plt.title("Command frequency / rank")
-    plt.ylabel("Normalized command frequency")
-    plt.xlabel("Command rank")
-    plt.legend(("Zipf", "Command"), loc="best")
+    legend.append("All subjects")
+
+
+    for user in DATA_records_by_user.items():
+        cmd_count = defaultdict(int)
+        len_records = 0
+        name, records = user
+        for record in records:
+            cmd = record["command"]
+            if cmd == "":
+                continue
+            cmd_count[cmd] += 1
+            len_records += 1
+
+        tmp = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)[:plotSize]
+        cmdFrq = list(map(lambda x: x[1] / tmp[0][1], tmp))
+        labels = list(map(lambda x: trim(x[0], 7), tmp))
+
+        top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records
+        top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records
+        top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records
+        print("% {}: Top {} %% of cmds amounts for {} %% of all command lines".format(name, 100, top100percent))
+        print("% {}: Top {} %% of cmds amounts for {} %% of all command lines".format(name, 10, top10percent))
+        print("% {}: Top {} %% of cmds amounts for {} %% of all command lines".format(name, 20, top20percent))
+        ranks = range(1, len(cmdFrq)+1)
+        plt.plot(ranks, cmdFrq, 'o-')
+        legend.append("{} (sanitize!)".format(name))
+
+    plt.legend(legend, loc="best")
+
     if show_labels:
         plt.xticks(ranks, labels, rotation=-60)
     # TODO: make xticks integral
@@ -140,6 +173,8 @@ def plot_cmdVocabularySize_cmdLinesEntered():
         name, records = user
         for record in records:
             cmd = record["command"]
+            if cmd == "":
+                continue
             if cmd in cmd_vocabulary:
                 # repeat last value
                 y_cmd_count.append(y_cmd_count[-1])
@@ -230,6 +265,8 @@ def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
         prev_cmd = START_CMD
         for record in session:
             cmd = record["command"]
+            if cmd == "":
+                continue
             cmdSeq_count[prev_cmd][cmd] += 1
             cmd_count[cmd] += 1
             if cmd not in cmd_id:
@@ -559,9 +596,49 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
         plt.show()
 
 
+def print_top_cmds(num_cmds=20):
+    """Print a header and the `num_cmds` most frequent commands over all sessions as "<command> <count>" lines."""
+    cmd_count = defaultdict(int)
+    cmd_total = 0  # NOTE(review): accumulated below but never read in this function
+    for pid, session in DATA_records_by_session.items():
+        for record in session:
+            cmd = record["command"]
+            if cmd == "":
+                # records with an empty command are excluded from the counts
+                continue
+            cmd_count[cmd] += 1
+            cmd_total += 1
+
+    # order (command, count) pairs by count, highest first
+    sorted_cmd_count = list(sorted(cmd_count.items(), key=lambda x: x[1], reverse=True))
+    print("\n\n% All subjects: Top commands")
+    for cmd, count in sorted_cmd_count[:num_cmds]:
+        print("{} {}".format(cmd, count))
+
+def print_top_cmds_by_user(num_cmds=20):
+    """For each (name, records) entry of DATA_records_by_user, print a header and the `num_cmds` most frequent commands."""
+    for user in DATA_records_by_user.items():
+        name, records = user
+        cmd_count = defaultdict(int)
+        cmd_total = 0  # NOTE(review): accumulated below but never read in this function
+        for record in records:
+            cmd = record["command"]
+            if cmd == "":
+                # records with an empty command are excluded from the counts
+                continue
+            cmd_count[cmd] += 1
+            cmd_total += 1
+
+        # order (command, count) pairs by count, highest first
+        sorted_cmd_count = list(sorted(cmd_count.items(), key=lambda x: x[1], reverse=True))
+        print("\n\n% {}: Top commands".format(name))
+        for cmd, count in sorted_cmd_count[:num_cmds]:
+            print("{} {}".format(cmd, count))
 
 # plot_cmdLineFrq_rank()
-# plot_cmdFrq_rank()
+plot_cmdFrq_rank()
+print_top_cmds(30)
+print_top_cmds_by_user(30)
 #         
 # plot_cmdLineVocabularySize_cmdLinesEntered()
 # plot_cmdVocabularySize_cmdLinesEntered()
@@ -574,10 +651,11 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
 # 
 # graph_cmdSequences(node_count=28, edge_minValue=0.06)
 
-for n in range(40, 43):
-    for e in range(94, 106, 2):
-        e *= 0.001
-        graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)
+# new improved
+# for n in range(40, 43):
+#     for e in range(94, 106, 2):
+#         e *= 0.001
+#         graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)
 
 #for n in range(29, 35):
 #    for e in range(44, 56, 2):