resh/evaluate/resh-evaluate-plot.py

#!/usr/bin/env python3


import traceback
import sys
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib.path as mpath
import numpy as np
from graphviz import Digraph
from datetime import datetime

PLOT_WIDTH = 10 # inches
PLOT_HEIGHT = 7 # inches

PLOT_SIZE_zipf = 20

data = json.load(sys.stdin)

DATA_records = []
DATA_records_by_session = defaultdict(list)
for user in data["UsersRecords"]:
    for device in user["Devices"]:
        for record in device["Records"]:
            if "invalid" in record and record["invalid"]:
                continue

            DATA_records.append(record)
            DATA_records_by_session[record["seqSessionId"]].append(record)

DATA_records = list(sorted(DATA_records, key=lambda x: x["realtimeAfterLocal"]))

for pid, session in DATA_records_by_session.items():
    session = list(sorted(session, key=lambda x: x["realtimeAfterLocal"]))

# TODO: this should be a cmdline option
async_draw = True

# for strategy in data["Strategies"]:
#     print(json.dumps(strategy))

def zipf(length):
    return list(map(lambda x: 1/2**x, range(0, length)))


def trim(text, length, add_elipse=True):
    if add_elipse and len(text) > length:
        return text[:length-1] + "…"
    return text[:length]


# Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    cmdLine_count = defaultdict(int)
    for record in DATA_records:
        cmdLine_count[record["cmdLine"]] += 1

    tmp = sorted(cmdLine_count.items(), key=lambda x: x[1], reverse=True)[:plotSize]
    cmdLineFrq = list(map(lambda x: x[1] / tmp[0][1], tmp))
    labels = list(map(lambda x: trim(x[0], 7), tmp))

    ranks = range(1, len(cmdLineFrq)+1)
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(ranks, zipf(len(ranks)), '-')
    plt.plot(ranks, cmdLineFrq, 'o-')
    plt.title("Commandline frequency / rank")
    plt.ylabel("Normalized commandline frequency")
    plt.xlabel("Commandline rank")
    plt.legend(("Zipf", "Commandline"), loc="best")
    if show_labels:
        plt.xticks(ranks, labels, rotation=-60)
    # TODO: make xticks integral
    if async_draw:
        plt.draw()
    else:
        plt.show()


# similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    cmd_count = defaultdict(int)
    for record in DATA_records:
        cmd = record["command"]
        if cmd == "":
            continue
        cmd_count[cmd] += 1

    tmp = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)[:plotSize]
    cmdFrq = list(map(lambda x: x[1] / tmp[0][1], tmp))
    labels = list(map(lambda x: trim(x[0], 7), tmp))

    ranks = range(1, len(cmdFrq)+1)
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(ranks, zipf(len(ranks)), 'o-')
    plt.plot(ranks, cmdFrq, 'o-')
    plt.title("Command frequency / rank")
    plt.ylabel("Normalized command frequency")
    plt.xlabel("Command rank")
    plt.legend(("Zipf", "Command"), loc="best")
    if show_labels:
        plt.xticks(ranks, labels, rotation=-60)
    # TODO: make xticks integral
    if async_draw:
        plt.draw()
    else:
        plt.show()

# Figure 3.2. Command vocabulary size vs. the number of command lines entered for four individuals.
def plot_cmdVocabularySize_cmdLinesEntered():
    cmd_vocabulary = set()
    y_cmd_count = [0]
    for record in DATA_records:
        cmd = record["command"]
        if cmd in cmd_vocabulary:
            # repeat last value
            y_cmd_count.append(y_cmd_count[-1])
        else:
            cmd_vocabulary.add(cmd)
            # append last value +1
            y_cmd_count.append(y_cmd_count[-1] + 1)

    # print(cmd_vocabulary)
    x_cmds_entered = range(0, len(y_cmd_count))

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(x_cmds_entered, y_cmd_count, '-')
    plt.title("Command vocabulary size vs. the number of command lines entered")
    plt.ylabel("Command vocabulary size")
    plt.xlabel("# of command lines entered")
    if async_draw:
        plt.draw()
    else:
        plt.show()

# Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals.
def plot_cmdLineVocabularySize_cmdLinesEntered():
    cmdLine_vocabulary = set()
    y_cmdLine_count = [0]
    for record in DATA_records:
        cmdLine = record["cmdLine"]
        if cmdLine in cmdLine_vocabulary:
            # repeat last value
            y_cmdLine_count.append(y_cmdLine_count[-1])
        else:
            cmdLine_vocabulary.add(cmdLine)
            # append last value +1
            y_cmdLine_count.append(y_cmdLine_count[-1] + 1)

    # print(cmdLine_vocabulary)
    x_cmdLines_entered = range(0, len(y_cmdLine_count))

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(x_cmdLines_entered, y_cmdLine_count, '-')
    plt.title("Command line vocabulary size vs. the number of command lines entered")
    plt.ylabel("Command line vocabulary size")
    plt.xlabel("# of command lines entered")
    if async_draw:
        plt.draw()
    else:
        plt.show()

# Figure 3.3. Sequential structure of UNIX command usage, from Figure 4 in Hanson et al. (1984).
#       Ball diameters are proportional to stationary probability. Lines indicate significant dependencies,
#       solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
    START_CMD = "_start_"
    cmd_count = defaultdict(int)
    cmdSeq_count = defaultdict(lambda: defaultdict(int))
    cmd_id = dict()
    x = 0
    cmd_id[START_CMD] = str(x)
    for pid, session in DATA_records_by_session.items():
        cmd_count[START_CMD] += 1
        prev_cmd = START_CMD
        for record in session:
            cmd = record["command"]
            cmdSeq_count[prev_cmd][cmd] += 1
            cmd_count[cmd] += 1
            if cmd not in cmd_id:
                x += 1
                cmd_id[cmd] = str(x)
            prev_cmd = cmd

    # get `node_count` of largest nodes
    sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)
    print(sorted_cmd_count)
    cmds_to_graph = list(map(lambda x: x[0], sorted_cmd_count))[:node_count]

    # use 3 biggest nodes as a reference point for scaling
    biggest_node = cmd_count[cmds_to_graph[0]]
    nd_biggest_node = cmd_count[cmds_to_graph[1]]
    rd_biggest_node = cmd_count[cmds_to_graph[1]]
    count2scale_coef = 3 / (biggest_node + nd_biggest_node + rd_biggest_node)

    # scaling constant
    #       affects node size and node label
    base_scaling_factor = 21
    # extra scaling for experiments - not really useful imho
    #       affects everything nodes, edges, node labels, treshold for turning label into xlabel, xlabel size, ...
    extra_scaling_factor = 1.0
    for x in range(0, 10):
        # graphviz is not the most reliable piece of software
        #       -> retry on fail but scale nodes down by 1%
        scaling_factor = base_scaling_factor * (1 - x * 0.01)

        # overlap: scale -> solve overlap by scaling the graph
        # overlap_shrink -> try to shrink the graph a bit after you are done
        # splines -> don't draw edges over nodes
        # sep: 2.5 -> assume that nodes are 2.5 inches larger
        graph_attr={'overlap':'scale', 'overlap_shrink':'true',
                    'splines':'true', 'sep':'0.25'}
        graph = Digraph(name='command_sequentiality', engine='neato', graph_attr=graph_attr)

        # iterate over all nodes
        for cmd in cmds_to_graph:
            seq = cmdSeq_count[cmd]
            count = cmd_count[cmd]

            # iterate over all "following" commands (for each node)
            for seq_entry in seq.items():
                cmd2, seq_count = seq_entry
                relative_seq_count = seq_count / count

                # check if "follow" command is supposed to be in the graph
                if cmd2 not in cmds_to_graph:
                    continue
                # check if the edge value is high enough
                if relative_seq_count < edge_minValue:
                    continue

                # create starting node and end node for the edge
                #       duplicates don't matter
                for id_, cmd_ in ((cmd_id[cmd], cmd), (cmd_id[cmd2], cmd2)):
                    count_ = cmd_count[cmd_]
                    scale_ = count_ * count2scale_coef * scaling_factor * extra_scaling_factor
                    width_ = 0.08 * scale_
                    fontsize_ = 8.5 * scale_ / (len(cmd_) + 3)

                    width_ = str(width_)
                    if fontsize_ < 12 * extra_scaling_factor:
                        graph.node(id_, ' ', shape='circle', fixedsize='true', fontname='monospace bold',
                                width=width_, fontsize=str(12 * extra_scaling_factor), forcelabels='true', xlabel=cmd_)
                    else:
                        fontsize_ = str(fontsize_)
                        graph.node(id_, cmd_, shape='circle', fixedsize='true', fontname='monospace bold',
                                width=width_, fontsize=fontsize_, forcelabels='true', labelloc='c')

                # value of the edge (percentage) 1.0 is max
                scale_ = seq_count / cmd_count[cmd]
                penwidth_ = str((0.5 + 4.5 * scale_) * extra_scaling_factor)
                #penwidth_bold_ = str(8 * scale_)
                if scale_ > 0.5:
                    graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
                            penwidth=penwidth_, style='bold')
                elif scale_ > 0.2:
                    graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
                            penwidth=penwidth_, arrowhead='open')
                elif scale_ > 0.1:
                    graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='true', splines='curved',
                            penwidth=penwidth_, style='dashed', arrowhead='open')
                else:
                    graph.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                            penwidth=penwidth_, style='dotted', arrowhead='empty')

        # graphviz sometimes fails - see above
        try:
            # graph.view()
            graph.render('/tmp/resh-graph-command_sequence-nodeCount_{}-edgeMinVal_{}.gv'.format(node_count, edge_minValue), view=view_graph)
            break
        except Exception as e:
            trace = traceback.format_exc()
            print("GRAPHVIZ EXCEPTION: <{}>\nGRAPHVIZ TRACE: <{}>".format(str(e), trace))


def plot_strategies_matches(plot_size=50, selected_strategies=[]):
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.title("Matches at distance <{}>".format(datetime.now().strftime('%H:%M:%S')))
    plt.ylabel('%' + " of matches")
    plt.xlabel("Distance")
    legend = []
    x_values = range(1, plot_size+1)
    saved_matches_total = None
    saved_dataPoint_count = None
    for strategy in data["Strategies"]:
        strategy_title = strategy["Title"]
        # strategy_description = strategy["Description"]

        if len(selected_strategies) and strategy_title not in selected_strategies:
            continue

        dataPoint_count = 0
        matches = [0] * plot_size
        matches_total = 0
        charsRecalled = [0] * plot_size
        charsRecalled_total = 0

        for match in strategy["Matches"]:
            dataPoint_count += 1

            if not match["Match"]:
                continue

            chars = match["CharsRecalled"]
            charsRecalled_total += chars
            matches_total += 1

            dist = match["Distance"]
            if dist > plot_size:
                continue

            matches[dist-1] += 1
            charsRecalled[dist-1] += chars

        # recent is very simple strategy so we will believe
        #       that there is no bug in it and we can use it to determine total
        if strategy_title == "recent":
            saved_matches_total = matches_total
            saved_dataPoint_count = dataPoint_count

        if len(selected_strategies) and strategy_title not in selected_strategies:
            continue

        acc = 0
        matches_cumulative = []
        for x in matches:
            acc += x
            matches_cumulative.append(acc)
        # matches_cumulative.append(matches_total)
        matches_percent = list(map(lambda x: 100 * x / dataPoint_count, matches_cumulative))

        plt.plot(x_values, matches_percent, 'o-')
        legend.append(strategy_title)

    assert(saved_matches_total is not None)
    assert(saved_dataPoint_count is not None)
    max_values = [100 * saved_matches_total / saved_dataPoint_count] * len(x_values)
    plt.plot(x_values, max_values, 'r-')
    legend.append("maximum possible")

    x_ticks = list(range(1, plot_size+1, 2))
    x_labels = x_ticks[:]
    plt.xticks(x_ticks, x_labels)
    plt.legend(legend, loc="best")
    if async_draw:
        plt.draw()
    else:
        plt.show()


def plot_strategies_charsRecalled(plot_size=50, selected_strategies=[]):
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.title("Average characters recalled at distance <{}>".format(datetime.now().strftime('%H:%M:%S')))
    plt.ylabel("Average characters recalled")
    plt.xlabel("Distance")
    x_values = range(1, plot_size+1)
    legend = []
    saved_charsRecalled_total = None
    saved_dataPoint_count = None
    for strategy in data["Strategies"]:
        strategy_title = strategy["Title"]
        # strategy_description = strategy["Description"]

        dataPoint_count = 0
        matches = [0] * plot_size
        matches_total = 0
        charsRecalled = [0] * plot_size
        charsRecalled_total = 0

        for match in strategy["Matches"]:
            dataPoint_count += 1

            if not match["Match"]:
                continue

            chars = match["CharsRecalled"]
            charsRecalled_total += chars
            matches_total += 1

            dist = match["Distance"]
            if dist > plot_size:
                continue

            matches[dist-1] += 1
            charsRecalled[dist-1] += chars

        # recent is very simple strategy so we will believe
        #       that there is no bug in it and we can use it to determine total
        if strategy_title == "recent":
            saved_charsRecalled_total = charsRecalled_total
            saved_dataPoint_count = dataPoint_count

        if len(selected_strategies) and strategy_title not in selected_strategies:
            continue

        acc = 0
        charsRecalled_cumulative = []
        for x in charsRecalled:
            acc += x
            charsRecalled_cumulative.append(acc)
        charsRecalled_average = list(map(lambda x: x / dataPoint_count, charsRecalled_cumulative))

        plt.plot(x_values, charsRecalled_average, 'o-')
        legend.append(strategy_title)

    assert(saved_charsRecalled_total is not None)
    assert(saved_dataPoint_count is not None)
    max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values)
    plt.plot(x_values, max_values, 'r-')
    legend.append("maximum possible")

    x_ticks = list(range(1, plot_size+1, 2))
    x_labels = x_ticks[:]
    plt.xticks(x_ticks, x_labels)
    plt.legend(legend, loc="best")
    if async_draw:
        plt.draw()
    else:
        plt.show()


def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.title("Average characters recalled at distance (including prefix matches) <{}>".format(datetime.now().strftime('%H:%M:%S')))
    plt.ylabel("Average characters recalled (including prefix matches)")
    plt.xlabel("Distance")
    x_values = range(1, plot_size+1)
    legend = []
    saved_charsRecalled_total = None
    saved_dataPoint_count = None
    for strategy in data["Strategies"]:
        strategy_title = strategy["Title"]
        # strategy_description = strategy["Description"]

        dataPoint_count = 0
        matches_total = 0
        charsRecalled = [0] * plot_size
        charsRecalled_total = 0

        for multiMatch in strategy["PrefixMatches"]:
            dataPoint_count += 1

            if not multiMatch["Match"]:
                continue
            matches_total += 1

            last_charsRecalled = 0
            for match in multiMatch["Entries"]:

                chars = match["CharsRecalled"]
                charsIncrease = chars - last_charsRecalled
                assert(charsIncrease > 0)
                charsRecalled_total += charsIncrease

                dist = match["Distance"]
                if dist > plot_size:
                    continue

                charsRecalled[dist-1] += charsIncrease
                last_charsRecalled = chars

        # recent is very simple strategy so we will believe
        #       that there is no bug in it and we can use it to determine total
        if strategy_title == "recent":
            saved_charsRecalled_total = charsRecalled_total
            saved_dataPoint_count = dataPoint_count

        if len(selected_strategies) and strategy_title not in selected_strategies:
            continue

        acc = 0
        charsRecalled_cumulative = []
        for x in charsRecalled:
            acc += x
            charsRecalled_cumulative.append(acc)
        charsRecalled_average = list(map(lambda x: x / dataPoint_count, charsRecalled_cumulative))

        plt.plot(x_values, charsRecalled_average, 'o-')
        legend.append(strategy_title)

    assert(saved_charsRecalled_total is not None)
    assert(saved_dataPoint_count is not None)
    max_values = [saved_charsRecalled_total / saved_dataPoint_count] * len(x_values)
    plt.plot(x_values, max_values, 'r-')
    legend.append("maximum possible")

    x_ticks = list(range(1, plot_size+1, 2))
    x_labels = x_ticks[:]
    plt.xticks(x_ticks, x_labels)
    plt.legend(legend, loc="best")
    if async_draw:
        plt.draw()
    else:
        plt.show()


# plot_cmdLineFrq_rank()
# plot_cmdFrq_rank()

# plot_cmdLineVocabularySize_cmdLinesEntered()
# plot_cmdVocabularySize_cmdLinesEntered()

plot_strategies_matches(20)
plot_strategies_charsRecalled(20)
# plot_strategies_charsRecalled_prefix(20)

# graph_cmdSequences(node_count=33, edge_minValue=0.048)

# graph_cmdSequences(node_count=28, edge_minValue=0.06)

# for n in range(29, 35):
#     for e in range(44, 56, 2):
#         e *= 0.001
#         graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)

# be careful and check if labels fit the display

if async_draw:
    plt.show()