command sequence graph draft

7 years ago · 7ca44f9177
parent 29b449a9e3
commit 7ca44f9177
3 changed files with 191 additions and 34 deletions
--- a/common/resh-common.go
+++ b/common/resh-common.go
@ -75,6 +75,7 @@ type Record struct {
 	// enriching fields - added "later"
 	FirstWord string `json:"firstWord"`
 	Invalid   bool   `json:"invalid"`
 }
 // FallbackRecord when record is too old and can't be parsed into regular Record
@ -212,6 +213,19 @@ func ConvertRecord(r *FallbackRecord) Record {
 // Enrich - fills in the derived ("enriching") fields of the record:
 // FirstWord is extracted from CmdLine and Invalid reflects the outcome of Validate.
 func (r *Record) Enrich() {
 	// Get command/first word from commandline
 	r.FirstWord = GetCommandFromCommandLine(r.CmdLine)
 	// Assign the default before the check, not after it, so that a failed
 	// validation leaves Invalid set to true instead of being overwritten.
 	r.Invalid = false
 	if err := r.Validate(); err != nil {
 		log.Println("Invalid command:", r.CmdLine)
 		r.Invalid = true
 	}
 	// TODO: Detect and mark simple commands r.Simple
 }
 // Validate - returns error if the record is invalid
 // NOTE: currently a stub - no checks are implemented yet, so it always
 // returns nil and every record is treated as valid.
 func (r *Record) Validate() error {
 	return nil
 }
 // GetCommandFromCommandLine func
--- a/evaluate/resh-evaluate-plot.py
+++ b/evaluate/resh-evaluate-plot.py
@ -6,39 +6,192 @@ from collections import defaultdict
 import matplotlib.pyplot as plt
 import matplotlib.path as mpath
 import numpy as np
 from graphviz import Digraph
 PLOT_WIDTH = 10 # inches
 PLOT_HEIGHT = 7 # inches
-def addRank(data):
+PLOT_SIZE_zipf = 20
    return list(enumerate(data, start=1))
 data = json.load(sys.stdin)
 # for strategy in data["Strategies"]:
 #     print(json.dumps(strategy))
 cmd_count = defaultdict(int)
 cmdLine_count = defaultdict(int)
-for record in data["Records"]:
+def zipf(length):
    # Returns a list of `length` reference values 1/2**x for x = 0 .. length-1,
    # i.e. 1, 1/2, 1/4, ... (each value is half of the previous one).
    # NOTE(review): a Zipf curve is usually 1/rank (1, 1/2, 1/3, ...); this is a
    # halving sequence instead - confirm that this is the intended reference curve.
-    cmd_count[record["firstWord"]] += 1
+    return list(map(lambda x: 1/2**x, range(0, length)))
 def trim(text, length, add_elipse=True):
    """Shorten `text` so that the result is at most `length` characters long.

    text       -- string to shorten
    length     -- maximum length of the returned string
    add_elipse -- when True and the text had to be cut, the last kept
                  character is replaced by "…" to signal the truncation

    Returns the (possibly shortened) string; a non-positive `length`
    yields an empty string.
    """
    # Without this guard text[:length-1] would slice from the END of the string
    # (negative index) and return something longer than the requested length.
    if length <= 0:
        return ""
    if add_elipse and len(text) > length:
        return text[:length-1] + "…"
    return text[:length]
 # Figure 3.1. The normalized command frequency, compared with Zipf.
 def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    # Plots the normalized frequency of whole command lines against their rank,
    # together with the zipf() reference curve, and shows the figure.
    #   plotSize    - how many of the most frequent command lines are plotted
    #   show_labels - when True, x ticks are labeled with the (trimmed) command lines
    cmdLine_count = defaultdict(int)
    # count occurrences of each command line; records flagged as invalid are ignored
    for record in data["Records"]:
        if record["invalid"]:
            continue
        cmdLine_count[record["cmdLine"]] += 1
-cmdTmp = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)[:50]
+    tmp = sorted(cmdLine_count.items(), key=lambda x: x[1], reverse=True)[:plotSize]
-cmdFrq = list(map(lambda x: x[1] / cmdTmp[0][1], cmdTmp))
+    cmdLineFrq = list(map(lambda x: x[1] / tmp[0][1], tmp))
    # tmp is sorted by count (descending), so tmp[0][1] is the highest count and
    # the most frequent command line is normalized to 1.0
    # labels are cut to 7 characters to keep the ticks readable
    labels = list(map(lambda x: trim(x[0], 7), tmp))
    ranks = range(1, len(cmdLineFrq)+1)
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(ranks, zipf(len(ranks)), '-')
    plt.plot(ranks, cmdLineFrq, 'o-')
    plt.title("Commandline frequency / rank")
    plt.ylabel("Normalized commandline frequency")
    plt.xlabel("Commandline rank")
    # legend entries follow the order of the two plt.plot() calls above
    plt.legend(("Zipf", "Commandline"), loc="best")
    if show_labels:
        plt.xticks(ranks, labels, rotation=-60)
    # TODO: make xticks integral
    plt.show()
 # similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
 def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    """Plot normalized command (first word) frequency against rank.

    Counts the first word of every record that is not flagged invalid and has
    a non-empty first word, keeps the `plotSize` most frequent commands and
    draws their counts - divided by the highest count - next to the zipf()
    reference curve. With `show_labels` the x ticks carry the command names
    cut to 7 characters.
    """
    counts = defaultdict(int)
    for rec in data["Records"]:
        # the invalid flag is checked first so firstWord of invalid records is never read
        if rec["invalid"] or rec["firstWord"] == "":
            continue
        counts[rec["firstWord"]] += 1

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:plotSize]
    # ranked[0] holds the most frequent command, so its frequency becomes 1.0
    frequencies = [count / ranked[0][1] for _, count in ranked]
    tick_labels = [trim(name, 7) for name, _ in ranked]
    ranks = range(1, len(frequencies) + 1)

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(ranks, zipf(len(ranks)), 'o-')
    plt.plot(ranks, frequencies, 'o-')
    plt.title("Command frequency / rank")
    plt.ylabel("Normalized command frequency")
    plt.xlabel("Command rank")
    plt.legend(("Zipf", "Command"), loc="best")
    if show_labels:
        plt.xticks(ranks, tick_labels, rotation=-60)
    # TODO: make xticks integral
    plt.show()
 # Figure 3.2. Command vocabulary size vs. the number of command lines entered for four individuals.
 def plot_cmdVocabularySize_cmdLinesEntered():
    """Plot how the command vocabulary grows with the number of command lines.

    Walks the records in order (skipping those flagged invalid) and tracks the
    number of distinct first words seen so far; the curve starts at 0 before
    the first record. The collected vocabulary is printed before plotting.
    """
    seen_cmds = set()
    vocabulary_sizes = [0]
    for rec in data["Records"]:
        if rec["invalid"]:
            continue
        # adding an already known command leaves the set (and its size) unchanged,
        # so the size of the set is exactly the running vocabulary size
        seen_cmds.add(rec["firstWord"])
        vocabulary_sizes.append(len(seen_cmds))

    print(seen_cmds)
    lines_entered = range(len(vocabulary_sizes))

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(lines_entered, vocabulary_sizes, '-')
    plt.title("Command vocabulary size vs. the number of command lines entered")
    plt.ylabel("Command vocabulary size")
    plt.xlabel("# of command lines entered")
    plt.show()
 # Figure 3.3. Sequential structure of UNIX command usage, from Figure 4 in Hanson et al. (1984).
 #       Ball diameters are proportional to stationary probability. Lines indicate significant dependencies,
 #       solid ones being more probable (p < .0001) and dashed ones less probable (.0001 < p < .005).
 def graphviz_cmdSequences(cmd_displayTreshold=20, edge_displayTreshold=0.03):
    # Builds a directed graph of command (first word) sequences and renders it
    # to /tmp/resh-graphviz-cmdSeq.gv.
    #   cmd_displayTreshold  - commands that occurred fewer times than this are not drawn
    #   edge_displayTreshold - transitions whose share of the source command's
    #                          occurrences is below this value are not drawn
    cmd_count = defaultdict(int)
    # cmdSeq_count[a][b] = how many times command b directly followed command a
    cmdSeq_count = defaultdict(lambda: defaultdict(int))
    cmd_id = dict()
    prev_cmd = "_SESSION_INIT_" # XXX: not actually session init yet
    cmd_id[prev_cmd] = str(-1) 
    for x, record in enumerate(data["Records"]):
        if record["invalid"]:
            continue
        cmd = record["firstWord"]
        cmdSeq_count[prev_cmd][cmd] += 1
        cmd_count[cmd] += 1
        # node id = index of the most recent record with this command (overwritten on every occurrence)
        cmd_id[cmd] = str(x)
        prev_cmd = cmd
    dot = Digraph(comment="Command sequences", graph_attr={'overlap':'scale', 'splines':'true'})
    # for cmd_entry in cmdSeq_count.items():
    #     cmd, seq = cmd_entry
    #     if cmd_count[cmd] < cmd_displayTreshold:
    #         continue
    #     
    #     dot.node(cmd_id[cmd], cmd)
    for cmd_entry in cmdSeq_count.items():
        cmd, seq = cmd_entry
        # cmd_count is a defaultdict, so a command that was never counted reads as 0 here
        count = cmd_count[cmd]
        if count < cmd_displayTreshold:
            continue
        for seq_entry in seq.items():
            cmd2, seq_count = seq_entry
            # share of cmd's occurrences that were directly followed by cmd2
            relative_seq_count = seq_count / count
            if cmd_count[cmd2] < cmd_displayTreshold:
                continue
            if relative_seq_count < edge_displayTreshold:
                continue
            # declare both endpoints of the edge; this runs for every drawn edge,
            # so the same node can be declared more than once
            for id_, cmd_ in ((cmd_id[cmd], cmd), (cmd_id[cmd2], cmd2)):
                count_ = cmd_count[cmd_]
                # node size grows with the command count relative to the display threshold
                scale_ = count_ / (cmd_displayTreshold)
                width_ = str(0.08*scale_) 
                fontsize_ = str(1*scale_)
                if scale_ < 12:
                    # smaller nodes: empty label inside, name attached as xlabel with fixed font size 12
                    dot.node(id_, '', shape='circle', fixedsize='true', fontname='bold',
                            width=width_, fontsize='12', forcelabels='true', xlabel=cmd_)
                else:
                    # larger nodes: name is the node label, font size scales with the node
                    dot.node(id_, cmd_, shape='circle', fixedsize='true', fontname='bold',
                            width=width_, fontsize=fontsize_, forcelabels='true')
            # 1.0 is max
            # same value as relative_seq_count above (count == cmd_count[cmd])
            scale_ = seq_count / cmd_count[cmd]
            penwidth_ = str(0.5 + 4.5 * scale_)
            #penwidth_bold_ = str(8 * scale_)
            # edge style depends on the share: > 0.5 bold, > 0.2 plain, > 0.1 dashed, otherwise dotted
            if scale_ > 0.5:
                dot.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                         penwidth=penwidth_, style='bold')
            elif scale_ > 0.2:
                dot.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                         penwidth=penwidth_, arrowhead='open')
            elif scale_ > 0.1:
                dot.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                         penwidth=penwidth_, style='dashed', arrowhead='open')
            else:
                dot.edge(cmd_id[cmd], cmd_id[cmd2], constraint='false', splines='curved',
                         penwidth=penwidth_, style='dotted', arrowhead='empty')
-cmdLineTmp = sorted(cmdLine_count.items(), key=lambda x: x[1], reverse=True)[:50]
+    dot.render('/tmp/resh-graphviz-cmdSeq.gv', view=False)
 cmdLineFrq = list(map(lambda x: x[1] / cmdLineTmp[0][1], cmdLineTmp))
-print(cmdFrq)
+graphviz_cmdSequences()
-print("#################")
+# plot_cmdVocabularySize_cmdLinesEntered()
-#print(cmdLineFrq_rank)
+# plot_cmdLineFrq_rank()
 # plot_cmdFrq_rank()
 plt.plot(range(1, len(cmdFrq)+1), cmdFrq, 'o-')
 plt.title("Command frequency")
 plt.yticks()
 #plt.xticks(range(1, len(cmdFrq)+1))
 plt.show()
-plt.plot(range(1, len(cmdLineFrq)+1), cmdLineFrq, 'o-')
+# be careful and check if labels fit the display
 plt.title("Commandline frequency")
 plt.show()
--- a/evaluate/resh-evaluate.go
+++ b/evaluate/resh-evaluate.go
@ -33,7 +33,7 @@ func main() {
 	inputPath := flag.String("input", "",
 		"Input file (default: "+historyPath+"OR"+sanitizedHistoryPath+
 			" depending on --sanitized-input option)")
-	outputDir := flag.String("output", "/tmp/resh-evaluate", "Output directory")
+	// outputDir := flag.String("output", "/tmp/resh-evaluate", "Output directory")
 	sanitizedInput := flag.Bool("sanitized-input", false,
 		"Handle input as sanitized (also changes default value for input argument)")
 	plottingScript := flag.String("plotting-script", "resh-evaluate-plot.py", "Script to use for plotting")
@ -80,15 +80,6 @@ func main() {
 	}
 	// evaluator.dumpJSON(tmpPath)
 	// run python script to stat and plot/
 	cmd := exec.Command("echo", *outputDir)
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
 	log.Printf("")
 	err = cmd.Run()
 	if err != nil {
 		log.Printf("Command finished with error: %v", err)
 	}
 	evaluator.calculateStatsAndPlot(*plottingScript)
 }
@ -141,7 +132,6 @@ func (e *evaluator) calculateStatsAndPlot(scriptName string) {
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
 	cmd.Stdin = &buffer
 	log.Printf("...")
 	err = cmd.Run()
 	if err != nil {
 		log.Printf("Command finished with error: %v", err)