polish graphviz command sequences

7 years ago · 050af919dc
parent 7cfc4f579a
commit 050af919dc
1 changed files with 93 additions and 59 deletions
--- a/evaluate/resh-evaluate-plot.py
+++ b/evaluate/resh-evaluate-plot.py
@ -1,5 +1,7 @@
 #!/usr/bin/env python3

+
+import traceback
 import sys
 import json
 from collections import defaultdict
@ -111,14 +113,15 @@ def plot_cmdVocabularySize_cmdLinesEntered():
    plt.xlabel("# of command lines entered")
    plt.show()

+
 # Figure 3.3. Sequential structure of UNIX command usage, from Figure 4 in Hanson et al. (1984).
 #       Ball diameters are proportional to stationary probability. Lines indicate significant dependencies,
 #       solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
-def graphviz_cmdSequences(cmd_displayTreshold=28, edge_displayTreshold=0.05):
+def graph_cmdSequences(node_count=33, edge_minValue=0.05):
    cmd_count = defaultdict(int)
    cmdSeq_count = defaultdict(lambda: defaultdict(int))
    cmd_id = dict()
-    prev_cmd = "_SESSION_INIT_" # XXX: not actually session init yet
+    prev_cmd = "<start>" # XXX: not actually session init yet
    cmd_id[prev_cmd] = str(-1) 
    for x, record in enumerate(data["Records"]):
        if record["invalid"]:
@ -130,64 +133,95 @@ def graphviz_cmdSequences(cmd_displayTreshold=28, edge_displayTreshold=0.05):
        cmd_id[cmd] = str(x)
        prev_cmd = cmd

-    graph = Digraph(engine='neato', graph_attr={'overlap':'scale', 'overlap_shrink':'true', 'splines':'true', 'sep':'0.25'})
-
-    # for cmd_entry in cmdSeq_count.items():
-    #     cmd, seq = cmd_entry
-
-    #     if cmd_count[cmd] < cmd_displayTreshold:
-    #         continue
-    #     
-    #     graph.node(cmd_id[cmd], cmd)
-
-    for cmd_entry in cmdSeq_count.items():
-        cmd, seq = cmd_entry
+    # get `node_count` of largest nodes
+    sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)
+    cmds_to_graph = list(map(lambda x: x[0], sorted_cmd_count))[:node_count]
+
+    # use 3 biggest nodes as a reference point for scaling (NOTE(review): the 3rd lookup below reuses index 1 - index 2 was probably intended)
+    biggest_node = cmd_count[cmds_to_graph[0]]
+    nd_biggest_node = cmd_count[cmds_to_graph[1]]
+    rd_biggest_node = cmd_count[cmds_to_graph[1]]
+    count2scale_coef = 3 / (biggest_node + nd_biggest_node + rd_biggest_node)
+
+    # scaling constant
+    #       affects node size and node label
+    base_scaling_factor = 21
+    # extra scaling for experiments - not really useful imho
+    #       affects everything: nodes, edges, node labels, threshold for turning label into xlabel, xlabel size, ...
+    extra_scaling_factor = 1.0 
+    for x in range(0, 10):
+        # graphviz is not the most reliable piece of software
+        #       -> retry on fail but scale nodes down by 1%
+        scaling_factor = base_scaling_factor * (1 - x * 0.01)
+
+        # overlap: scale -> solve overlap by scaling the graph
+        # overlap_shrink -> try to shrink the graph a bit after you are done
+        # splines -> don't draw edges over nodes
+        # sep: 0.25 -> leave an extra margin around nodes when removing overlap (value set below)
+        graph_attr={'overlap':'scale', 'overlap_shrink':'true',
+                    'splines':'true', 'sep':'0.25'}
+        graph = Digraph(name='command_sequentiality', engine='neato', graph_attr=graph_attr)
+
+        # iterate over all nodes
+        for cmd in cmds_to_graph:
+            seq = cmdSeq_count[cmd]
+            count = cmd_count[cmd]
+
+            # iterate over all "following" commands (for each node)
+            for seq_entry in seq.items():
+                cmd2, seq_count = seq_entry
+                relative_seq_count = seq_count / count
+
+                # check if "follow" command is supposed to be in the graph
+                if cmd2 not in cmds_to_graph:
+                    continue
+                # check if the edge value is high enough
+                if relative_seq_count < edge_minValue:
+                    continue
+                
+                # create starting node and end node for the edge
+                #       duplicates don't matter 
+                for id_, cmd_ in ((cmd_id[cmd], cmd), (cmd_id[cmd2], cmd2)):
+                    count_ = cmd_count[cmd_]
+                    scale_ = count_ * count2scale_coef * scaling_factor * extra_scaling_factor
+                    width_ = 0.08 * scale_
+                    fontsize_ = 8.5 * scale_ / (len(cmd_) + 3)
+
+                    width_ = str(width_) 
+                    if fontsize_ < 12 * extra_scaling_factor:
+                        graph.node(id_, ' ', shape='circle', fixedsize='true', fontname='monospace bold',
+                                width=width_, fontsize=str(12 * extra_scaling_factor), forcelabels='true', xlabel=cmd_)
+                    else:
+                        fontsize_ = str(fontsize_)
+                        graph.node(id_, cmd_, shape='circle', fixedsize='true', fontname='monospace bold',
+                                width=width_, fontsize=fontsize_, forcelabels='true', labelloc='c')
+                
+                # value of the edge as a fraction (seq_count / cmd_count[cmd]); 1.0 is max
+                scale_ = seq_count / cmd_count[cmd]
+                penwidth_ = str((0.5 + 4.5 * scale_) * extra_scaling_factor)
+                #penwidth_bold_ = str(8 * scale_)
+                if scale_ > 0.5:
+                    graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
+                            penwidth=penwidth_, style='bold')
+                elif scale_ > 0.2:
+                    graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
+                            penwidth=penwidth_, arrowhead='open')
+                elif scale_ > 0.1:
+                    graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
+                            penwidth=penwidth_, style='dashed', arrowhead='open')
+                else:
+                    graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
+                            penwidth=penwidth_, style='dotted', arrowhead='empty')

-        count = cmd_count[cmd]
-        if count < cmd_displayTreshold:
-            continue
+        # graphviz sometimes fails - see above
+        try:
+            graph.view()
+            # graph.render('/tmp/resh-graphviz-cmdSeq.gv', view=True)
+            break
+        except Exception as e:
+            trace = traceback.format_exc()
+            print("GRAPHVIZ EXCEPTION: <{}>\nGRAPHVIZ TRACE: <{}>".format(str(e), trace))

-        for seq_entry in seq.items():
-            cmd2, seq_count = seq_entry
-            relative_seq_count = seq_count / count
-
-            if cmd_count[cmd2] < cmd_displayTreshold:
-                continue
-            if relative_seq_count < edge_displayTreshold:
-                continue
-            
-            for id_, cmd_ in ((cmd_id[cmd], cmd), (cmd_id[cmd2], cmd2)):
-                count_ = cmd_count[cmd_]
-                scale_ = count_ / (cmd_displayTreshold)
-                width_ = str(0.08*scale_) 
-                fontsize_ = str(1*scale_)
-                if scale_ < 12:
-                    graph.node(id_, '', shape='circle', fixedsize='true', fontname='bold',
-                            width=width_, fontsize='12', forcelabels='true', xlabel=cmd_)
-                else:
-                    graph.node(id_, cmd_, shape='circle', fixedsize='true', fontname='bold',
-                            width=width_, fontsize=fontsize_, forcelabels='true')
-
-            
-            # 1.0 is max
-            scale_ = seq_count / cmd_count[cmd]
-            penwidth_ = str(0.5 + 4.5 * scale_)
-            #penwidth_bold_ = str(8 * scale_)
-            if scale_ > 0.5:
-                graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
-                         penwidth=penwidth_, style='bold')
-            elif scale_ > 0.2:
-                graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
-                         penwidth=penwidth_, arrowhead='open')
-            elif scale_ > 0.1:
-                graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
-                         penwidth=penwidth_, style='dashed', arrowhead='open')
-            else:
-                graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
-                         penwidth=penwidth_, style='dotted', arrowhead='empty')
-
-    graph.view()
-    # graph.render('/tmp/resh-graphviz-cmdSeq.gv', view=True)

 def plot_strategy_recency():
    recent = None
@ -267,7 +301,7 @@ def plot_strategy_recency():
        
 # plot_strategy_recency()

-graphviz_cmdSequences()
+graph_cmdSequences(node_count=28, edge_minValue=0.06)
 # plot_cmdVocabularySize_cmdLinesEntered()
 # plot_cmdLineFrq_rank()
 # plot_cmdFrq_rank()