resh/evaluate/resh-evaluate-plot.py

#!/usr/bin/env python3

import sys
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib.path as mpath
import numpy as np
from graphviz import Digraph

# Figure dimensions; every plot in this file passes them to
# plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)).
PLOT_WIDTH = 10 # inches
PLOT_HEIGHT = 7 # inches

# Default number of top-ranked entries shown by the frequency/rank plots
# (used as the default for their plotSize parameter).
PLOT_SIZE_zipf = 20

# The whole input is read from stdin as a single JSON document when the
# module is loaded. The functions below read data["Records"] and
# data["Strategies"] from it.
data = json.load(sys.stdin)
# Debug aid: dump each strategy as JSON.
# for strategy in data["Strategies"]:
#     print(json.dumps(strategy))


def zipf(length):
    """Return the reference curve drawn next to the measured frequencies.

    The result is a list of ``length`` floats: 1, 1/2, 1/4, ... (the k-th
    element, counting from zero, is ``1 / 2**k``). An empty list is
    returned for ``length <= 0``.

    NOTE(review): despite the name, the values halve at every rank instead
    of following 1/rank -- confirm this is the intended reference curve.
    """
    return [1 / 2**exponent for exponent in range(length)]


def trim(text, length, add_elipse=True):
    """Shorten *text* to at most *length* characters.

    text       -- the string to shorten
    length     -- maximum number of characters in the result
    add_elipse -- when true and *text* is longer than *length*, the last
                  kept character is replaced by "…" so the result still
                  has exactly *length* characters

    Returns *text* unchanged when it already fits. A non-positive *length*
    always yields the empty string.
    """
    # Without this guard, length == 0 would slice text[:-1] in the ellipsis
    # branch and return almost the whole text plus "…".
    if length <= 0:
        return ""
    if add_elipse and len(text) > length:
        return text[:length - 1] + "…"
    return text[:length]


# Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    """Plot normalized frequency of whole command lines against their rank.

    Every record in ``data["Records"]`` whose "invalid" field is falsy is
    counted by its "cmdLine" value. The ``plotSize`` most frequent command
    lines are kept and each count is divided by the highest count, so the
    first point is 1. The curve from ``zipf`` is drawn for comparison.

    plotSize    -- how many top-ranked command lines to show
    show_labels -- when true, x ticks are labelled with the command lines,
                   shortened to 7 characters by ``trim``
    """
    occurrences = defaultdict(int)
    for entry in data["Records"]:
        if not entry["invalid"]:
            occurrences[entry["cmdLine"]] += 1

    top = sorted(occurrences.items(), key=lambda item: item[1], reverse=True)[:plotSize]
    # top[0][1] is looked up inside the comprehension on purpose: with no
    # valid records the body never runs, so an empty list is not indexed.
    normalized = [count / top[0][1] for _, count in top]
    tick_labels = [trim(line, 7) for line, _ in top]

    positions = range(1, len(normalized) + 1)
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(positions, zipf(len(positions)), '-')
    plt.plot(positions, normalized, 'o-')
    plt.title("Commandline frequency / rank")
    plt.ylabel("Normalized commandline frequency")
    plt.xlabel("Commandline rank")
    plt.legend(("Zipf", "Commandline"), loc="best")
    if show_labels:
        plt.xticks(positions, tick_labels, rotation=-60)
    # TODO: make xticks integral
    plt.show()


# similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    """Plot normalized frequency of commands (first words) against rank.

    Every record in ``data["Records"]`` whose "invalid" field is falsy and
    whose "firstWord" is not the empty string is counted by that first
    word. The ``plotSize`` most frequent commands are kept and each count
    is divided by the highest count, so the first point is 1. The curve
    from ``zipf`` is drawn for comparison.

    plotSize    -- how many top-ranked commands to show
    show_labels -- when true, x ticks are labelled with the command names,
                   shortened to 7 characters by ``trim``
    """
    tally = defaultdict(int)
    for entry in data["Records"]:
        if entry["invalid"]:
            continue

        word = entry["firstWord"]
        if word != "":
            tally[word] += 1

    top = sorted(tally.items(), key=lambda item: item[1], reverse=True)[:plotSize]
    # top[0][1] is looked up inside the comprehension on purpose: with no
    # counted records the body never runs, so an empty list is not indexed.
    normalized = [count / top[0][1] for _, count in top]
    tick_labels = [trim(name, 7) for name, _ in top]

    positions = range(1, len(normalized) + 1)
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(positions, zipf(len(positions)), 'o-')
    plt.plot(positions, normalized, 'o-')
    plt.title("Command frequency / rank")
    plt.ylabel("Normalized command frequency")
    plt.xlabel("Command rank")
    plt.legend(("Zipf", "Command"), loc="best")
    if show_labels:
        plt.xticks(positions, tick_labels, rotation=-60)
    # TODO: make xticks integral
    plt.show()

# Figure 3.2. Command vocabulary size vs. the number of command lines entered for four individuals.
def plot_cmdVocabularySize_cmdLinesEntered():
    """Plot how the number of distinct commands grows with lines entered.

    Walks ``data["Records"]`` in order, skipping records whose "invalid"
    field is truthy. After each remaining record the number of distinct
    "firstWord" values seen so far is recorded; the series starts at 0
    before any record. The set of distinct commands is printed to stdout.

    NOTE(review): an empty "firstWord" is not filtered out here, so it
    counts as one vocabulary entry -- confirm that is intended.
    """
    seen = set()
    growth = [0]
    for entry in data["Records"]:
        if entry["invalid"]:
            continue

        # Adding an already-known command leaves the set unchanged, so its
        # size is exactly "previous value" or "previous value + 1".
        seen.add(entry["firstWord"])
        growth.append(len(seen))

    print(seen)
    lines_entered = range(len(growth))

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(lines_entered, growth, '-')
    plt.title("Command vocabulary size vs. the number of command lines entered")
    plt.ylabel("Command vocabulary size")
    plt.xlabel("# of command lines entered")
    plt.show()

# Figure 3.3. Sequential structure of UNIX command usage, from Figure 4 in Hanson et al. (1984).
#       Ball diameters are proportional to stationary probability. Lines indicate significant dependencies,
#       solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
def graphviz_cmdSequences(cmd_displayTreshold=28, edge_displayTreshold=0.05):
    """Build a Graphviz digraph of which command follows which and render it.

    Walks the valid records of ``data["Records"]`` in order and counts how
    often each first word occurs and how often each
    (previous command -> command) pair occurs. Nodes and edges are emitted
    only for pairs that pass both thresholds, and the graph is rendered to
    '/tmp/resh-graphviz-cmdSeq.gv'.

    cmd_displayTreshold  -- minimum occurrence count a command needs, both
                            as source and as target, for an edge to be
                            drawn. It is also the divisor for node scaling.
                            NOTE(review): a value of 0 leads to a division
                            by zero below.
    edge_displayTreshold -- minimum share of a source command's occurrences
                            that a transition needs for its edge to be drawn
    """
    # cmd_count: occurrences per command.
    # cmdSeq_count: previous command -> following command -> occurrences.
    # cmd_id: command -> node id string.
    cmd_count = defaultdict(int)
    cmdSeq_count = defaultdict(lambda: defaultdict(int))
    cmd_id = dict()
    # Placeholder predecessor for the first valid record. It is never counted
    # in cmd_count by this initialisation, so its count stays 0 unless a
    # record's first word happens to equal it.
    prev_cmd = "_SESSION_INIT_" # XXX: not actually session init yet
    cmd_id[prev_cmd] = str(-1)
    for x, record in enumerate(data["Records"]):
        if record["invalid"]:
            continue

        cmd = record["firstWord"]
        cmdSeq_count[prev_cmd][cmd] += 1
        cmd_count[cmd] += 1
        # Overwritten on every occurrence: the id ends up being the index of
        # the command's last valid record.
        cmd_id[cmd] = str(x)
        prev_cmd = cmd

    dot = Digraph(comment="Command sequences", graph_attr={'overlap':'scale', 'splines':'true', 'sep':'0.25'})

    # Disabled alternative: declare one plain node per frequent command up front.
    # for cmd_entry in cmdSeq_count.items():
    #     cmd, seq = cmd_entry

    #     if cmd_count[cmd] < cmd_displayTreshold:
    #         continue
    #
    #     dot.node(cmd_id[cmd], cmd)

    for cmd_entry in cmdSeq_count.items():
        cmd, seq = cmd_entry

        # Skip source commands that occur too rarely.
        count = cmd_count[cmd]
        if count < cmd_displayTreshold:
            continue

        for seq_entry in seq.items():
            cmd2, seq_count = seq_entry
            # Share of cmd's occurrences that were followed by cmd2.
            relative_seq_count = seq_count / count

            # Skip rare target commands and rare transitions.
            if cmd_count[cmd2] < cmd_displayTreshold:
                continue
            if relative_seq_count < edge_displayTreshold:
                continue

            # Declare both endpoints of the edge. This runs once per surviving
            # edge, so the same node id can be passed to dot.node repeatedly.
            for id_, cmd_ in ((cmd_id[cmd], cmd), (cmd_id[cmd2], cmd2)):
                count_ = cmd_count[cmd_]
                # Node size grows with the count relative to the threshold.
                scale_ = count_ / (cmd_displayTreshold)
                width_ = str(0.08*scale_)
                fontsize_ = str(1*scale_)
                if scale_ < 12:
                    # Smaller nodes: empty node label, command name passed as
                    # xlabel with a fixed font size.
                    dot.node(id_, '', shape='circle', fixedsize='true', fontname='bold',
                            width=width_, fontsize='12', forcelabels='true', xlabel=cmd_)
                else:
                    # Larger nodes: command name is the node label, font size
                    # scaled with the node.
                    dot.node(id_, cmd_, shape='circle', fixedsize='true', fontname='bold',
                            width=width_, fontsize=fontsize_, forcelabels='true')


            # 1.0 is max
            # Same ratio as relative_seq_count; drives pen width and the
            # style tier of the edge.
            scale_ = seq_count / cmd_count[cmd]
            penwidth_ = str(0.5 + 4.5 * scale_)
            #penwidth_bold_ = str(8 * scale_)
            # Tiers by share: > 0.5 bold, > 0.2 solid, > 0.1 dashed, else dotted.
            if scale_ > 0.5:
                dot.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                         penwidth=penwidth_, style='bold')
            elif scale_ > 0.2:
                dot.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                         penwidth=penwidth_, arrowhead='open')
            elif scale_ > 0.1:
                dot.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                         penwidth=penwidth_, style='dashed', arrowhead='open')
            else:
                dot.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                         penwidth=penwidth_, style='dotted', arrowhead='empty')

    # view=False: render without opening a viewer.
    dot.render('/tmp/resh-graphviz-cmdSeq.gv', view=False)

def _cumulative_with_total(buckets, total):
    """Return the running sums of *buckets* followed by *total* as a list."""
    running = 0
    series = []
    for value in buckets:
        running += value
        series.append(running)
    series.append(total)
    return series


def _show_distance_plot(x_values, y_values, title, ylabel, x_ticks, x_labels):
    """Show one 'value at distance' line plot with the given tick layout."""
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(x_values, y_values, 'o-')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel("Distance")
    plt.xticks(x_ticks, x_labels)
    plt.show()


def plot_strategy_recency(size=50):
    """Plot cumulative results of the "recent" strategy by match distance.

    Looks up the first entry of ``data["Strategies"]`` whose "Title" is
    "recent" and walks its "Matches". Two figures are shown, one after the
    other:

    * the cumulative percentage of data points matched within each
      distance 1..size, and
    * the cumulative sum of "CharsRecalled" within each distance, divided
      by the number of data points.

    Both figures end with an extra "total" point that also includes matches
    whose distance has no bucket of its own.

    size -- number of distance buckets (distances 1..size) on the x axis

    Raises ValueError if *size* is smaller than 1, if there is no "recent"
    strategy, or if that strategy has no data points.
    """
    if size < 1:
        raise ValueError("size must be at least 1")

    recent = next(
        (strategy for strategy in data["Strategies"] if strategy["Title"] == "recent"),
        None,
    )
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if recent is None:
        raise ValueError('no strategy titled "recent" in the input data')

    dataPoint_count = 0
    matches = [0] * size
    matches_total = 0
    charsRecalled = [0] * size
    charsRecalled_total = 0

    for match in recent["Matches"]:
        dataPoint_count += 1

        if not match["Match"]:
            continue

        chars = match["CharsRecalled"]
        charsRecalled_total += chars
        matches_total += 1

        dist = match["Distance"]
        # Only distances 1..size have a bucket. A distance below 1 would
        # otherwise wrap around through negative indexing and be credited
        # to a bucket at the far end. Such matches still count in the total.
        if not 1 <= dist <= size:
            continue

        matches[dist - 1] += 1
        charsRecalled[dist - 1] += chars

    # Both series are divided by the number of data points.
    if dataPoint_count == 0:
        raise ValueError('the "recent" strategy contains no matches to plot')

    # One x position per bucket plus a final position for the total.
    x_values = range(1, size + 2)
    x_ticks = list(range(1, size + 1, 2))
    x_labels = x_ticks[:]
    x_ticks.append(size + 1)
    x_labels.append("total")

    matches_percent = [
        100 * value / dataPoint_count
        for value in _cumulative_with_total(matches, matches_total)
    ]
    _show_distance_plot(x_values, matches_percent, "Matches at distance",
                        "% of matches", x_ticks, x_labels)

    charsRecalled_average = [
        value / dataPoint_count
        for value in _cumulative_with_total(charsRecalled, charsRecalled_total)
    ]
    _show_distance_plot(x_values, charsRecalled_average,
                        "Average characters recalled at distance",
                        "Average characters recalled", x_ticks, x_labels)


# Script driver: these calls run when the file is executed. Only the
# recency plot is enabled.
plot_strategy_recency()

# Other figures, currently disabled; uncomment a call to produce it.
# graphviz_cmdSequences()
# plot_cmdVocabularySize_cmdLinesEntered()
# plot_cmdLineFrq_rank()
# plot_cmdFrq_rank()


# be careful and check if labels fit the display