polish graphviz command sequences

pull/13/head
Simon Let 6 years ago
parent 7cfc4f579a
commit 050af919dc
  1. 152
      evaluate/resh-evaluate-plot.py

@ -1,5 +1,7 @@
#!/usr/bin/env python3
import traceback
import sys
import json
from collections import defaultdict
@ -111,14 +113,15 @@ def plot_cmdVocabularySize_cmdLinesEntered():
plt.xlabel("# of command lines entered")
plt.show()
# Figure 3.3. Sequential structure of UNIX command usage, from Figure 4 in Hanson et al. (1984).
# Ball diameters are proportional to stationary probability. Lines indicate significant dependencies,
# solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
def graphviz_cmdSequences(cmd_displayTreshold=28, edge_displayTreshold=0.05):
def graph_cmdSequences(node_count=33, edge_minValue=0.05):
cmd_count = defaultdict(int)
cmdSeq_count = defaultdict(lambda: defaultdict(int))
cmd_id = dict()
prev_cmd = "_SESSION_INIT_" # XXX: not actually session init yet
prev_cmd = "<start>" # XXX: not actually session init yet
cmd_id[prev_cmd] = str(-1)
for x, record in enumerate(data["Records"]):
if record["invalid"]:
@ -130,64 +133,95 @@ def graphviz_cmdSequences(cmd_displayTreshold=28, edge_displayTreshold=0.05):
cmd_id[cmd] = str(x)
prev_cmd = cmd
graph = Digraph(engine='neato', graph_attr={'overlap':'scale', 'overlap_shrink':'true', 'splines':'true', 'sep':'0.25'})
# for cmd_entry in cmdSeq_count.items():
# cmd, seq = cmd_entry
# if cmd_count[cmd] < cmd_displayTreshold:
# continue
#
# graph.node(cmd_id[cmd], cmd)
for cmd_entry in cmdSeq_count.items():
cmd, seq = cmd_entry
# get `node_count` of largest nodes
sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)
cmds_to_graph = list(map(lambda x: x[0], sorted_cmd_count))[:node_count]
# use 3 biggest nodes as a reference point for scaling
biggest_node = cmd_count[cmds_to_graph[0]]
nd_biggest_node = cmd_count[cmds_to_graph[1]]
rd_biggest_node = cmd_count[cmds_to_graph[1]]
count2scale_coef = 3 / (biggest_node + nd_biggest_node + rd_biggest_node)
# scaling constant
# affects node size and node label
base_scaling_factor = 21
# extra scaling for experiments - not really useful imho
# affects everything nodes, edges, node labels, treshold for turning label into xlabel, xlabel size, ...
extra_scaling_factor = 1.0
for x in range(0, 10):
# graphviz is not the most reliable piece of software
# -> retry on fail but scale nodes down by 1%
scaling_factor = base_scaling_factor * (1 - x * 0.01)
# overlap: scale -> solve overlap by scaling the graph
# overlap_shrink -> try to shrink the graph a bit after you are done
# splines -> don't draw edges over nodes
# sep: 2.5 -> assume that nodes are 2.5 inches larger
graph_attr={'overlap':'scale', 'overlap_shrink':'true',
'splines':'true', 'sep':'0.25'}
graph = Digraph(name='command_sequentiality', engine='neato', graph_attr=graph_attr)
# iterate over all nodes
for cmd in cmds_to_graph:
seq = cmdSeq_count[cmd]
count = cmd_count[cmd]
# iterate over all "following" commands (for each node)
for seq_entry in seq.items():
cmd2, seq_count = seq_entry
relative_seq_count = seq_count / count
# check if "follow" command is supposed to be in the graph
if cmd2 not in cmds_to_graph:
continue
# check if the edge value is high enough
if relative_seq_count < edge_minValue:
continue
# create starting node and end node for the edge
# duplicates don't matter
for id_, cmd_ in ((cmd_id[cmd], cmd), (cmd_id[cmd2], cmd2)):
count_ = cmd_count[cmd_]
scale_ = count_ * count2scale_coef * scaling_factor * extra_scaling_factor
width_ = 0.08 * scale_
fontsize_ = 8.5 * scale_ / (len(cmd_) + 3)
width_ = str(width_)
if fontsize_ < 12 * extra_scaling_factor:
graph.node(id_, ' ', shape='circle', fixedsize='true', fontname='monospace bold',
width=width_, fontsize=str(12 * extra_scaling_factor), forcelabels='true', xlabel=cmd_)
else:
fontsize_ = str(fontsize_)
graph.node(id_, cmd_, shape='circle', fixedsize='true', fontname='monospace bold',
width=width_, fontsize=fontsize_, forcelabels='true', labelloc='c')
# value of the edge (percentage) 1.0 is max
scale_ = seq_count / cmd_count[cmd]
penwidth_ = str((0.5 + 4.5 * scale_) * extra_scaling_factor)
#penwidth_bold_ = str(8 * scale_)
if scale_ > 0.5:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_, style='bold')
elif scale_ > 0.2:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_, arrowhead='open')
elif scale_ > 0.1:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
penwidth=penwidth_, style='dashed', arrowhead='open')
else:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
penwidth=penwidth_, style='dotted', arrowhead='empty')
count = cmd_count[cmd]
if count < cmd_displayTreshold:
continue
# graphviz sometimes fails - see above
try:
graph.view()
# graph.render('/tmp/resh-graphviz-cmdSeq.gv', view=True)
break
except Exception as e:
trace = traceback.format_exc()
print("GRAPHVIZ EXCEPTION: <{}>\nGRAPHVIZ TRACE: <{}>".format(str(e), trace))
for seq_entry in seq.items():
cmd2, seq_count = seq_entry
relative_seq_count = seq_count / count
if cmd_count[cmd2] < cmd_displayTreshold:
continue
if relative_seq_count < edge_displayTreshold:
continue
for id_, cmd_ in ((cmd_id[cmd], cmd), (cmd_id[cmd2], cmd2)):
count_ = cmd_count[cmd_]
scale_ = count_ / (cmd_displayTreshold)
width_ = str(0.08*scale_)
fontsize_ = str(1*scale_)
if scale_ < 12:
graph.node(id_, '', shape='circle', fixedsize='true', fontname='bold',
width=width_, fontsize='12', forcelabels='true', xlabel=cmd_)
else:
graph.node(id_, cmd_, shape='circle', fixedsize='true', fontname='bold',
width=width_, fontsize=fontsize_, forcelabels='true')
# 1.0 is max
scale_ = seq_count / cmd_count[cmd]
penwidth_ = str(0.5 + 4.5 * scale_)
#penwidth_bold_ = str(8 * scale_)
if scale_ > 0.5:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
penwidth=penwidth_, style='bold')
elif scale_ > 0.2:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
penwidth=penwidth_, arrowhead='open')
elif scale_ > 0.1:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
penwidth=penwidth_, style='dashed', arrowhead='open')
else:
graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
penwidth=penwidth_, style='dotted', arrowhead='empty')
graph.view()
# graph.render('/tmp/resh-graphviz-cmdSeq.gv', view=True)
def plot_strategy_recency():
recent = None
@ -267,7 +301,7 @@ def plot_strategy_recency():
# plot_strategy_recency()
graphviz_cmdSequences()
graph_cmdSequences(node_count=28, edge_minValue=0.06)
# plot_cmdVocabularySize_cmdLinesEntered()
# plot_cmdLineFrq_rank()
# plot_cmdFrq_rank()

Loading…
Cancel
Save