#!/usr/bin/env python3
"""Plot shell-history statistics from a JSON document read on stdin.

The input is expected to contain a "UsersRecords" list (users -> devices ->
records) and a "Strategies" list; both are consumed by the plotting
functions defined in this file.
"""

import json
import sys
import traceback
from collections import defaultdict
from datetime import datetime

import matplotlib.path as mpath
import matplotlib.pyplot as plt
import numpy as np
from graphviz import Digraph

PLOT_WIDTH = 10  # inches
PLOT_HEIGHT = 7  # inches

PLOT_SIZE_zipf = 20


def _record_time(record):
    """Sort key: the "realtimeAfterLocal" value of a history record."""
    return record["realtimeAfterLocal"]


data = json.load(sys.stdin)

# Flatten users -> devices -> records, dropping records flagged as invalid,
# and group the remaining records by their "seqSessionId".
DATA_records = []
DATA_records_by_session = defaultdict(list)
for user in data["UsersRecords"]:
    for device in user["Devices"]:
        for record in device["Records"]:
            if record.get("invalid"):
                continue

            DATA_records.append(record)
            DATA_records_by_session[record["seqSessionId"]].append(record)

DATA_records.sort(key=_record_time)

# Sort each session in place. The previous loop rebound its loop variable to
# a sorted copy, which left the lists stored in the dictionary unsorted.
for session_records in DATA_records_by_session.values():
    session_records.sort(key=_record_time)

# TODO: this should be a cmdline option
async_draw = True

# for strategy in data["Strategies"]:
#     print(json.dumps(strategy))


def zipf(length):
    """Return the `length` reference values 1, 1/2, 1/4, ... (1 / 2**rank).

    NOTE(review): this is a geometric sequence rather than the classic
    1 / rank Zipf curve - confirm that this is the intended reference.
    """
    return [1 / 2**rank for rank in range(length)]


def trim(text, length, add_elipse=True):
    """Shorten `text` to at most `length` characters.

    When `add_elipse` is true and the text had to be cut, the last kept
    character is replaced by "…" so the result is still `length` long.
    A non-positive `length` always yields an empty string.
    """
    if length <= 0:
        # text[:length-1] would slice from the end and return almost
        # the whole text instead of nothing.
        return ""
    if add_elipse and len(text) > length:
        return text[:length - 1] + "…"
    return text[:length]


# Figure 3.1. The normalized command frequency, compared with Zipf.
def _plot_frequency_rank(counts, plot_size, show_labels, subject, zipf_style):
    """Draw normalized frequency against rank for the most frequent keys.

    counts      -- mapping of key (command / command line) to occurrence count
    plot_size   -- number of top-ranked keys to draw
    show_labels -- label the x axis with the (trimmed) keys
    subject     -- capitalized noun used in title, axis labels and legend
    zipf_style  -- matplotlib format string for the reference curve
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:plot_size]

    # Normalize against the most frequent key. With nothing to rank there is
    # no such key; fall back to 1 so an empty chart is drawn instead of
    # raising IndexError.
    top_count = ranked[0][1] if ranked else 1
    frequencies = [count / top_count for _, count in ranked]
    labels = [trim(key, 7) for key, _ in ranked]
    ranks = range(1, len(frequencies) + 1)

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(ranks, zipf(len(ranks)), zipf_style)
    plt.plot(ranks, frequencies, 'o-')
    plt.title("{} frequency / rank".format(subject))
    plt.ylabel("Normalized {} frequency".format(subject.lower()))
    plt.xlabel("{} rank".format(subject))
    plt.legend(("Zipf", subject), loc="best")
    if show_labels:
        plt.xticks(ranks, labels, rotation=-60)
    # TODO: make xticks integral
    if async_draw:
        plt.draw()
    else:
        plt.show()


def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    """Plot the normalized frequency of whole command lines by rank.

    plotSize    -- how many of the most frequent command lines to draw
    show_labels -- use the trimmed command lines as x tick labels
    """
    cmdLine_count = defaultdict(int)
    for record in DATA_records:
        cmdLine_count[record["cmdLine"]] += 1

    _plot_frequency_rank(cmdLine_count, plotSize, show_labels, "Commandline", '-')


# similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    """Plot the normalized frequency of commands by rank.

    Records whose "command" is an empty string are ignored.

    plotSize    -- how many of the most frequent commands to draw
    show_labels -- use the trimmed commands as x tick labels
    """
    cmd_count = defaultdict(int)
    for record in DATA_records:
        cmd = record["command"]
        if cmd == "":
            continue
        cmd_count[cmd] += 1

    _plot_frequency_rank(cmd_count, plotSize, show_labels, "Command", 'o-')


# Figure 3.2. Command vocabulary size vs. the number of command lines entered for four individuals.
def _plot_vocabulary_growth(field, title, ylabel):
    """Plot how many distinct `field` values have been seen after each record.

    The curve starts at 0 (no records yet) and gains one point per record in
    DATA_records; it rises by one whenever a record introduces a value of
    `field` that has not occurred before.
    """
    seen = set()
    sizes = [0]
    for record in DATA_records:
        # Adding an already known value leaves the set size unchanged, which
        # repeats the previous point; a new value raises it by exactly one.
        seen.add(record[field])
        sizes.append(len(seen))

    entered = range(len(sizes))

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(entered, sizes, '-')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel("# of command lines entered")
    if not async_draw:
        plt.show()
    else:
        plt.draw()


def plot_cmdVocabularySize_cmdLinesEntered():
    """Plot the number of distinct commands against command lines entered.

    NOTE(review): an empty "command" counts as a vocabulary entry here,
    whereas plot_cmdFrq_rank skips it - confirm which is intended.
    """
    _plot_vocabulary_growth(
        "command",
        "Command vocabulary size vs. the number of command lines entered",
        "Command vocabulary size",
    )


# Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals.
def plot_cmdLineVocabularySize_cmdLinesEntered():
    """Plot the number of distinct command lines against command lines entered."""
    _plot_vocabulary_growth(
        "cmdLine",
        "Command line vocabulary size vs. the number of command lines entered",
        "Command line vocabulary size",
    )


# Figure 3.3. Sequential structure of UNIX command usage, from Figure 4 in Hanson et al. (1984).
#      Ball diameters are proportional to stationary probability. Lines indicate significant dependencies,
#      solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
def _edge_attrs(share):
    """Return the graphviz edge attributes for a transition of the given share.

    `share` is the fraction (0.0 - 1.0) of occurrences of the source command
    that were followed by the target command.
    """
    if share > 0.5:
        return {'constraint': 'true', 'style': 'bold'}
    if share > 0.2:
        return {'constraint': 'true', 'arrowhead': 'open'}
    if share > 0.1:
        return {'constraint': 'true', 'style': 'dashed', 'arrowhead': 'open'}
    return {'constraint': 'false', 'style': 'dotted', 'arrowhead': 'empty'}


def _add_command_node(graph, node_id, command, scale, extra_scaling_factor):
    """Declare one command node; small nodes get their name as an external label."""
    width = str(0.08 * scale)
    fontsize = 8.5 * scale / (len(command) + 3)
    min_fontsize = 12 * extra_scaling_factor
    common = dict(shape='circle', fixedsize='true', fontname='monospace bold',
                  width=width, forcelabels='true')
    if fontsize < min_fontsize:
        # the name would be unreadable inside the circle -> put it next to it
        graph.node(node_id, ' ', fontsize=str(min_fontsize), xlabel=command, **common)
    else:
        graph.node(node_id, command, fontsize=str(fontsize), labelloc='c', **common)


def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
    """Render a graph of which command tends to follow which.

    Nodes are the `node_count` most frequent commands plus a synthetic
    "_start_" node marking the beginning of every session; node size grows
    with the command's count. An edge A -> B is drawn when at least
    `edge_minValue` of the occurrences of A were directly followed by B.

    node_count    -- maximum number of nodes in the graph
    edge_minValue -- minimum relative frequency (0.0 - 1.0) of an edge
    view_graph    -- passed to graphviz' render() as `view`
    """
    START_CMD = "_start_"
    cmd_count = defaultdict(int)
    cmdSeq_count = defaultdict(lambda: defaultdict(int))
    cmd_id = {START_CMD: "0"}
    for session in DATA_records_by_session.values():
        cmd_count[START_CMD] += 1
        prev_cmd = START_CMD
        for record in session:
            cmd = record["command"]
            cmdSeq_count[prev_cmd][cmd] += 1
            cmd_count[cmd] += 1
            # ids are handed out in order of first appearance: "1", "2", ...
            cmd_id.setdefault(cmd, str(len(cmd_id)))
            prev_cmd = cmd

    # get `node_count` of largest nodes
    sorted_cmd_count = sorted(cmd_count.items(), key=lambda item: item[1], reverse=True)
    print(sorted_cmd_count)
    cmds_to_graph = [cmd for cmd, _ in sorted_cmd_count][:node_count]
    if not cmds_to_graph:
        print("graph_cmdSequences: no commands to graph", file=sys.stderr)
        return
    graphed = set(cmds_to_graph)

    # use the (up to) 3 biggest nodes as a reference point for scaling;
    # the third value used to be a copy of the second one, and fewer than
    # three commands used to raise IndexError
    reference_counts = [cmd_count[cmd] for cmd in cmds_to_graph[:3]]
    count2scale_coef = len(reference_counts) / sum(reference_counts)

    # scaling constant
    #   affects node size and node label
    base_scaling_factor = 21
    # extra scaling for experiments - not really useful imho
    #   affects everything nodes, edges, node labels, threshold for turning label into xlabel, xlabel size, ...
    extra_scaling_factor = 1.0
    for attempt in range(10):
        # graphviz is not the most reliable piece of software
        #   -> retry on fail but scale nodes down by 1%
        scaling_factor = base_scaling_factor * (1 - attempt * 0.01)

        # overlap: scale -> solve overlap by scaling the graph
        # overlap_shrink -> try to shrink the graph a bit after you are done
        # splines -> don't draw edges over nodes
        # sep: 0.25 -> margin kept around nodes while removing overlaps
        #   (per the Graphviz docs a plain number is multiplicative,
        #    i.e. nodes are treated as 25% larger - TODO confirm)
        graph_attr = {'overlap': 'scale', 'overlap_shrink': 'true',
                      'splines': 'true', 'sep': '0.25'}
        graph = Digraph(name='command_sequentiality', engine='neato', graph_attr=graph_attr)

        # iterate over all nodes
        for cmd in cmds_to_graph:
            count = cmd_count[cmd]

            # iterate over all "following" commands (for each node)
            for cmd2, seq_count in cmdSeq_count[cmd].items():
                # check if "follow" command is supposed to be in the graph
                if cmd2 not in graphed:
                    continue
                # value of the edge (percentage) 1.0 is max
                share = seq_count / count
                # check if the edge value is high enough
                if share < edge_minValue:
                    continue

                # create starting node and end node for the edge
                #   duplicates don't matter
                for endpoint in (cmd, cmd2):
                    node_scale = (cmd_count[endpoint] * count2scale_coef
                                  * scaling_factor * extra_scaling_factor)
                    _add_command_node(graph, cmd_id[endpoint], endpoint,
                                      node_scale, extra_scaling_factor)

                penwidth = str((0.5 + 4.5 * share) * extra_scaling_factor)
                graph.edge(cmd_id[cmd], cmd_id[cmd2], splines='curved',
                           penwidth=penwidth, **_edge_attrs(share))

        # graphviz sometimes fails - see above
        try:
            # graph.view()
            graph.render('/tmp/resh-graph-command_sequence-nodeCount_{}-edgeMinVal_{}.gv'.format(node_count, edge_minValue), view=view_graph)
            break
        except Exception as e:
            # deliberately broad: any rendering failure triggers a retry
            trace = traceback.format_exc()
            print("GRAPHVIZ EXCEPTION: <{}>\nGRAPHVIZ TRACE: <{}>".format(str(e), trace))
    else:
        print("graph_cmdSequences: giving up, every rendering attempt failed", file=sys.stderr)


def _is_selected(title, selected_strategies):
    """Return True if the strategy should be drawn (empty selection = all)."""
    return not selected_strategies or title in selected_strategies


def _begin_strategy_figure(title, ylabel):
    """Open a new figure; the current time is appended to the title."""
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.title("{} <{}>".format(title, datetime.now().strftime('%H:%M:%S')))
    plt.ylabel(ylabel)
    plt.xlabel("Distance")


def _tally_matches(strategy, plot_size):
    """Summarize strategy["Matches"].

    Returns (dataPoint_count, matches, matches_total, charsRecalled,
    charsRecalled_total) where the two lists hold per-distance sums for
    distances 1..plot_size and the totals also include farther matches.
    """
    dataPoint_count = 0
    matches_total = 0
    charsRecalled_total = 0
    matches = [0] * plot_size
    charsRecalled = [0] * plot_size
    for match in strategy["Matches"]:
        dataPoint_count += 1
        if not match["Match"]:
            continue
        chars = match["CharsRecalled"]
        matches_total += 1
        charsRecalled_total += chars
        # assumes "Distance" is 1-based - TODO confirm
        dist = match["Distance"]
        if dist > plot_size:
            continue
        matches[dist - 1] += 1
        charsRecalled[dist - 1] += chars
    return dataPoint_count, matches, matches_total, charsRecalled, charsRecalled_total


def _tally_prefix_matches(strategy, plot_size):
    """Summarize strategy["PrefixMatches"].

    Every entry contributes only the characters it recalls on top of the
    previous entry of the same prefix match. Returns (dataPoint_count,
    charsRecalled, charsRecalled_total).

    Raises ValueError when an entry does not recall more characters than
    the entry before it.
    """
    dataPoint_count = 0
    charsRecalled_total = 0
    charsRecalled = [0] * plot_size
    for multiMatch in strategy["PrefixMatches"]:
        dataPoint_count += 1
        if not multiMatch["Match"]:
            continue
        last_charsRecalled = 0
        for entry in multiMatch["Entries"]:
            chars = entry["CharsRecalled"]
            charsIncrease = chars - last_charsRecalled
            if charsIncrease <= 0:
                raise ValueError(
                    "prefix match entries must recall more characters than the "
                    "previous entry ({} after {})".format(chars, last_charsRecalled))
            charsRecalled_total += charsIncrease
            # Remember this entry even when it is too far away to be drawn;
            # otherwise later increases would be measured against a stale
            # value and the total would be over-counted.
            last_charsRecalled = chars
            # assumes "Distance" is 1-based - TODO confirm
            dist = entry["Distance"]
            if dist > plot_size:
                continue
            charsRecalled[dist - 1] += charsIncrease
    return dataPoint_count, charsRecalled, charsRecalled_total


def _running_share(bucket_totals, dataPoint_count, scale=1):
    """Return the cumulative sums of `bucket_totals` as scale * sum / count.

    A strategy without data points yields zeros instead of dividing by zero.
    """
    from itertools import accumulate  # local import keeps this block self-contained

    if not dataPoint_count:
        return [0.0] * len(bucket_totals)
    return [scale * running / dataPoint_count for running in accumulate(bucket_totals)]


def _finish_strategy_plot(x_values, legend, plot_size, reference_total, reference_count, scale=1):
    """Add the "maximum possible" line, ticks and legend, then draw the figure.

    The reference line is skipped (with a warning) when the "recent"
    strategy was not present in the data or had no data points.
    """
    if reference_total is None or not reference_count:
        print('no usable "recent" strategy - "maximum possible" line omitted', file=sys.stderr)
    else:
        max_values = [scale * reference_total / reference_count] * len(x_values)
        plt.plot(x_values, max_values, 'r-')
        legend.append("maximum possible")

    x_ticks = list(range(1, plot_size + 1, 2))
    x_labels = x_ticks[:]
    plt.xticks(x_ticks, x_labels)
    plt.legend(legend, loc="best")
    if async_draw:
        plt.draw()
    else:
        plt.show()


def plot_strategies_matches(plot_size=50, selected_strategies=()):
    """Plot the cumulative percentage of matches found within each distance.

    plot_size           -- largest distance shown on the x axis
    selected_strategies -- titles of the strategies to draw; empty = all
    """
    _begin_strategy_figure("Matches at distance", "% of matches")
    legend = []
    x_values = range(1, plot_size + 1)
    saved_matches_total = None
    saved_dataPoint_count = None
    for strategy in data["Strategies"]:
        strategy_title = strategy["Title"]
        # strategy_description = strategy["Description"]
        dataPoint_count, matches, matches_total, _, _ = _tally_matches(strategy, plot_size)

        # recent is very simple strategy so we will believe
        # that there is no bug in it and we can use it to determine total
        # (must be recorded before filtering, "recent" may not be selected)
        if strategy_title == "recent":
            saved_matches_total = matches_total
            saved_dataPoint_count = dataPoint_count

        if not _is_selected(strategy_title, selected_strategies):
            continue

        plt.plot(x_values, _running_share(matches, dataPoint_count, scale=100), 'o-')
        legend.append(strategy_title)

    _finish_strategy_plot(x_values, legend, plot_size,
                          saved_matches_total, saved_dataPoint_count, scale=100)


def plot_strategies_charsRecalled(plot_size=50, selected_strategies=()):
    """Plot the cumulative average of characters recalled within each distance.

    plot_size           -- largest distance shown on the x axis
    selected_strategies -- titles of the strategies to draw; empty = all
    """
    _begin_strategy_figure("Average characters recalled at distance",
                           "Average characters recalled")
    x_values = range(1, plot_size + 1)
    legend = []
    saved_charsRecalled_total = None
    saved_dataPoint_count = None
    for strategy in data["Strategies"]:
        strategy_title = strategy["Title"]
        # strategy_description = strategy["Description"]
        dataPoint_count, _, _, charsRecalled, charsRecalled_total = _tally_matches(strategy, plot_size)

        # recent is very simple strategy so we will believe
        # that there is no bug in it and we can use it to determine total
        if strategy_title == "recent":
            saved_charsRecalled_total = charsRecalled_total
            saved_dataPoint_count = dataPoint_count

        if not _is_selected(strategy_title, selected_strategies):
            continue

        plt.plot(x_values, _running_share(charsRecalled, dataPoint_count), 'o-')
        legend.append(strategy_title)

    _finish_strategy_plot(x_values, legend, plot_size,
                          saved_charsRecalled_total, saved_dataPoint_count)


def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=()):
    """Like plot_strategies_charsRecalled but based on "PrefixMatches".

    plot_size           -- largest distance shown on the x axis
    selected_strategies -- titles of the strategies to draw; empty = all

    Raises ValueError when the prefix match data is inconsistent (see
    _tally_prefix_matches).
    """
    _begin_strategy_figure("Average characters recalled at distance (including prefix matches)",
                           "Average characters recalled (including prefix matches)")
    x_values = range(1, plot_size + 1)
    legend = []
    saved_charsRecalled_total = None
    saved_dataPoint_count = None
    for strategy in data["Strategies"]:
        strategy_title = strategy["Title"]
        # strategy_description = strategy["Description"]
        dataPoint_count, charsRecalled, charsRecalled_total = _tally_prefix_matches(strategy, plot_size)

        # recent is very simple strategy so we will believe
        # that there is no bug in it and we can use it to determine total
        if strategy_title == "recent":
            saved_charsRecalled_total = charsRecalled_total
            saved_dataPoint_count = dataPoint_count

        if not _is_selected(strategy_title, selected_strategies):
            continue

        plt.plot(x_values, _running_share(charsRecalled, dataPoint_count), 'o-')
        legend.append(strategy_title)

    _finish_strategy_plot(x_values, legend, plot_size,
                          saved_charsRecalled_total, saved_dataPoint_count)


# plot_cmdLineFrq_rank()
# plot_cmdFrq_rank()

# plot_cmdLineVocabularySize_cmdLinesEntered()
# plot_cmdVocabularySize_cmdLinesEntered()

plot_strategies_matches(20)
plot_strategies_charsRecalled(20)
plot_strategies_charsRecalled_prefix(20)

# graph_cmdSequences(node_count=33, edge_minValue=0.048)

# graph_cmdSequences(node_count=28, edge_minValue=0.06)

# for n in range(29, 35):
#     for e in range(44, 56, 2):
#         e *= 0.001
#         graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)

# be careful and check if labels fit the display

if async_draw:
    plt.show()