add grouping by session (pid), add batch mode

pull/13/head
Simon Let 6 years ago
parent 050af919dc
commit abb786c478
  1. 1
      common/resh-common.go
  2. 71
      evaluate/resh-evaluate-plot.py
  3. 162
      evaluate/resh-evaluate.go

@ -224,7 +224,6 @@ func (r *Record) Enrich() {
// Validate - returns error if the record is invalid
// NOTE: as visible here, the body performs no checks and always returns nil,
// so every record is currently reported as valid.
func (r *Record) Validate() error {
return nil
}

@ -16,6 +16,24 @@ PLOT_HEIGHT = 7 # inches
PLOT_SIZE_zipf = 20
data = json.load(sys.stdin)
# DATA_records: flat list of every valid record from all users and devices.
# DATA_records_by_session: the same records grouped by their "sessionPid" value.
# NOTE(review): the grouping key is the pid alone, so records from different
# users/devices that happen to share a pid land in the same group - confirm
# that this is acceptable.
DATA_records = []
DATA_records_by_session = defaultdict(list)
# The `or []` guards cover lists that were serialized as JSON null
# (e.g. a device directory for which no history file was loaded).
for user in data["UsersRecords"] or []:
    for device in user["Devices"] or []:
        for record in device["Records"] or []:
            if record["invalid"]:
                continue

            DATA_records.append(record)
            DATA_records_by_session[record["sessionPid"]].append(record)

# Order the flat list and every session by "realtimeBeforeLocal".
DATA_records.sort(key=lambda x: x["realtimeBeforeLocal"])
for session in DATA_records_by_session.values():
    # Sort in place: rebinding the loop variable to a sorted copy would leave
    # the list stored in the dict untouched.
    session.sort(key=lambda x: x["realtimeBeforeLocal"])
# for strategy in data["Strategies"]:
# print(json.dumps(strategy))
@ -33,10 +51,7 @@ def trim(text, length, add_elipse=True):
# Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
cmdLine_count = defaultdict(int)
for record in data["Records"]:
if record["invalid"]:
continue
for record in DATA_records:
cmdLine_count[record["cmdLine"]] += 1
tmp = sorted(cmdLine_count.items(), key=lambda x: x[1], reverse=True)[:plotSize]
@ -60,10 +75,7 @@ def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
# similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
cmd_count = defaultdict(int)
for record in data["Records"]:
if record["invalid"]:
continue
for record in DATA_records:
cmd = record["firstWord"]
if cmd == "":
continue
@ -90,10 +102,7 @@ def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
def plot_cmdVocabularySize_cmdLinesEntered():
cmd_vocabulary = set()
y_cmd_count = [0]
for record in data["Records"]:
if record["invalid"]:
continue
for record in DATA_records:
cmd = record["firstWord"]
if cmd in cmd_vocabulary:
# repeat last value
@ -103,7 +112,7 @@ def plot_cmdVocabularySize_cmdLinesEntered():
# append last value +1
y_cmd_count.append(y_cmd_count[-1] + 1)
print(cmd_vocabulary)
# print(cmd_vocabulary)
x_cmds_entered = range(0, len(y_cmd_count))
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
@ -118,23 +127,27 @@ def plot_cmdVocabularySize_cmdLinesEntered():
# Ball diameters are proportional to stationary probability. Lines indicate significant dependencies,
# solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
def graph_cmdSequences(node_count=33, edge_minValue=0.05):
START_CMD = "_start_"
cmd_count = defaultdict(int)
cmdSeq_count = defaultdict(lambda: defaultdict(int))
cmd_id = dict()
prev_cmd = "<start>" # XXX: not actually session init yet
cmd_id[prev_cmd] = str(-1)
for x, record in enumerate(data["Records"]):
if record["invalid"]:
continue
cmd = record["firstWord"]
cmdSeq_count[prev_cmd][cmd] += 1
cmd_count[cmd] += 1
cmd_id[cmd] = str(x)
prev_cmd = cmd
x = 0
cmd_id[START_CMD] = str(x)
for pid, session in DATA_records_by_session.items():
cmd_count[START_CMD] += 1
prev_cmd = START_CMD
for record in session:
cmd = record["firstWord"]
cmdSeq_count[prev_cmd][cmd] += 1
cmd_count[cmd] += 1
if cmd not in cmd_id:
x += 1
cmd_id[cmd] = str(x)
prev_cmd = cmd
# get `node_count` of largest nodes
sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)
print(sorted_cmd_count)
cmds_to_graph = list(map(lambda x: x[0], sorted_cmd_count))[:node_count]
# use 3 biggest nodes as a reference point for scaling
@ -298,13 +311,15 @@ def plot_strategy_recency():
# plot_cmdLineFrq_rank()
# plot_cmdFrq_rank()
# plot_cmdVocabularySize_cmdLinesEntered()
# plot_strategy_recency()
graph_cmdSequences(node_count=28, edge_minValue=0.06)
# plot_cmdVocabularySize_cmdLinesEntered()
# plot_cmdLineFrq_rank()
# plot_cmdFrq_rank()
graph_cmdSequences()
# graph_cmdSequences(node_count=28, edge_minValue=0.06)
# be careful and check if labels fit the display

@ -6,6 +6,7 @@ import (
"encoding/json"
"flag"
"fmt"
"io/ioutil"
"log"
"os"
"os/exec"
@ -25,30 +26,25 @@ func main() {
usr, _ := user.Current()
dir := usr.HomeDir
historyPath := filepath.Join(dir, ".resh_history.json")
historyPathBatchMode := filepath.Join(dir, "resh_history.json")
sanitizedHistoryPath := filepath.Join(dir, "resh_history_sanitized.json")
// tmpPath := "/tmp/resh-evaluate-tmp.json"
showVersion := flag.Bool("version", false, "Show version and exit")
showRevision := flag.Bool("revision", false, "Show git revision and exit")
inputPath := flag.String("input", "",
input := flag.String("input", "",
"Input file (default: "+historyPath+"OR"+sanitizedHistoryPath+
" depending on --sanitized-input option)")
// outputDir := flag.String("output", "/tmp/resh-evaluate", "Output directory")
sanitizedInput := flag.Bool("sanitized-input", false,
"Handle input as sanitized (also changes default value for input argument)")
plottingScript := flag.String("plotting-script", "resh-evaluate-plot.py", "Script to use for plotting")
inputDataRoot := flag.String("input-data-root", "",
"Input data root, enables batch mode, looks for files matching --input option")
flag.Parse()
// set default input
if *inputPath == "" {
if *sanitizedInput {
*inputPath = sanitizedHistoryPath
} else {
*inputPath = historyPath
}
}
// handle show{Version,Revision} options
if *showVersion == true {
fmt.Println(Version)
os.Exit(0)
@ -58,10 +54,33 @@ func main() {
os.Exit(0)
}
evaluator := evaluator{sanitizedInput: *sanitizedInput, maxCandidates: 50}
err := evaluator.init(*inputPath)
if err != nil {
log.Fatal("Evaluator init() error:", err)
// handle batch mode
batchMode := false
if *inputDataRoot != "" {
batchMode = true
}
// set default input
if *input == "" {
if *sanitizedInput {
*input = sanitizedHistoryPath
} else if batchMode {
*input = historyPathBatchMode
} else {
*input = historyPath
}
}
evaluator := evaluator{sanitizedInput: *sanitizedInput, maxCandidates: 50, BatchMode: batchMode}
if batchMode {
err := evaluator.initBatchMode(*input, *inputDataRoot)
if err != nil {
log.Fatal("Evaluator initBatchMode() error:", err)
}
} else {
err := evaluator.init(*input)
if err != nil {
log.Fatal("Evaluator init() error:", err)
}
}
var strategies []strategy
@ -73,12 +92,11 @@ func main() {
strategies = append(strategies, &recent)
for _, strat := range strategies {
err = evaluator.evaluate(strat)
err := evaluator.evaluate(strat)
if err != nil {
log.Println("Evaluator evaluate() error:", err)
}
}
// evaluator.dumpJSON(tmpPath)
evaluator.calculateStatsAndPlot(*plottingScript)
}
@ -102,26 +120,42 @@ type strategyJSON struct {
Matches []matchJSON
}
type evaluateJSON struct {
Strategies []strategyJSON
Records []common.Record
// deviceRecords holds the history records loaded for a single device.
// Both fields are exported because the struct is serialized to JSON as part
// of the evaluator and handed to the plotting script.
type deviceRecords struct {
// Name is the device directory name in batch mode.
Name string
// Records are the history records loaded for this device; stays nil when
// no history file was loaded for it.
Records []common.Record
}
// userRecords groups the per-device history records that belong to one user.
// Both fields are exported because the struct is serialized to JSON as part
// of the evaluator and handed to the plotting script.
type userRecords struct {
// Name is the user directory name in batch mode.
Name string
// Devices holds one entry per device of this user.
Devices []deviceRecords
}
type evaluator struct {
sanitizedInput bool
BatchMode bool
maxCandidates int
historyRecords []common.Record
data evaluateJSON
UsersRecords []userRecords
Strategies []strategyJSON
}
// initBatchMode prepares the evaluator for batch mode: it loads the history
// file named input for every user/device found under inputDataRoot, stores
// the result in e.UsersRecords and then runs processRecords on it.
// The returned error is always nil.
func (e *evaluator) initBatchMode(input string, inputDataRoot string) error {
	loaded := e.loadHistoryRecordsBatchMode(input, inputDataRoot)
	e.UsersRecords = loaded
	e.processRecords()
	return nil
}
func (e *evaluator) init(inputPath string) error {
e.historyRecords = e.loadHistoryRecords(inputPath)
records := e.loadHistoryRecords(inputPath)
device := deviceRecords{Records: records}
user := userRecords{}
user.Devices = append(user.Devices, device)
e.UsersRecords = append(e.UsersRecords, user)
e.processRecords()
return nil
}
func (e *evaluator) calculateStatsAndPlot(scriptName string) {
evalJSON, err := json.Marshal(e.data)
evalJSON, err := json.Marshal(e)
if err != nil {
log.Fatal("json marshal error", err)
}
@ -140,25 +174,28 @@ func (e *evaluator) calculateStatsAndPlot(scriptName string) {
// enrich records and add them to serializable structure
func (e *evaluator) processRecords() {
for _, record := range e.historyRecords {
for i := range e.UsersRecords {
for j := range e.UsersRecords[i].Devices {
for k, record := range e.UsersRecords[i].Devices[j].Records {
// assert
if record.Sanitized != e.sanitizedInput {
if e.sanitizedInput {
log.Fatal("ASSERT failed: '--sanitized-input' is present but data is not sanitized")
}
log.Fatal("ASSERT failed: data is sanitized but '--sanitized-input' is not present")
}
// assert
if record.Sanitized != e.sanitizedInput {
if e.sanitizedInput {
log.Fatal("ASSERT failed: '--sanitized-input' is present but data is not sanitized")
e.UsersRecords[i].Devices[j].Records[k].Enrich()
// device.Records = append(device.Records, record)
}
log.Fatal("ASSERT failed: data is sanitized but '--sanitized-input' is not present")
}
record.Enrich()
e.data.Records = append(e.data.Records, record)
}
}
func (e *evaluator) evaluate(strategy strategy) error {
title, description := strategy.GetTitleAndDescription()
strategyData := strategyJSON{Title: title, Description: description}
for _, record := range e.historyRecords {
for _, record := range e.UsersRecords[0].Devices[0].Records {
candidates := strategy.GetCandidates()
matchFound := false
@ -183,10 +220,67 @@ func (e *evaluator) evaluate(strategy strategy) error {
return err
}
}
e.data.Strategies = append(e.data.Strategies, strategyData)
e.Strategies = append(e.Strategies, strategyData)
return nil
}
// loadHistoryRecordsBatchMode walks a <dataRootPath>/<user>/<device>/ directory
// tree and loads the history file named fname from every device directory.
//
// It returns one userRecords entry per user directory, each holding one
// deviceRecords entry per device directory. A device directory without a file
// named fname is still included, with no records. Entries that are not
// directories at the user or device level are skipped with a warning.
//
// Progress is printed to standard output. Any failure to stat or read a
// directory terminates the program via log.Fatalf.
func (e *evaluator) loadHistoryRecordsBatchMode(fname string, dataRootPath string) []userRecords {
	var records []userRecords
	info, err := os.Stat(dataRootPath)
	if err != nil {
		// os.Stat can fail for reasons other than a missing path, so report the
		// underlying error instead of claiming the directory does not exist.
		log.Fatalf("Error: Cannot access directory %s - exiting! (%v)", dataRootPath, err)
	}
	if !info.IsDir() {
		log.Fatalf("Error: %s is not a directory - exiting!", dataRootPath)
	}
	userEntries, err := ioutil.ReadDir(dataRootPath)
	if err != nil {
		log.Fatalf("Could not read directory: %s (%v)", dataRootPath, err)
	}
	fmt.Println("Listing users in <", dataRootPath, ">...")
	for _, userEntry := range userEntries {
		userFullPath := filepath.Join(dataRootPath, userEntry.Name())
		if !userEntry.IsDir() {
			log.Println("Warn: Unexpected file (not a directory) <", userFullPath, "> - skipping.")
			continue
		}
		usr := userRecords{Name: userEntry.Name()}
		fmt.Println()
		fmt.Printf("*- %s\n", userEntry.Name())
		deviceEntries, err := ioutil.ReadDir(userFullPath)
		if err != nil {
			log.Fatalf("Could not read directory: %s (%v)", userFullPath, err)
		}
		for _, deviceEntry := range deviceEntries {
			deviceFullPath := filepath.Join(userFullPath, deviceEntry.Name())
			if !deviceEntry.IsDir() {
				log.Println("Warn: Unexpected file (not a directory) <", deviceFullPath, "> - skipping.")
				continue
			}
			dev := deviceRecords{Name: deviceEntry.Name()}
			fmt.Printf("   \\- %s\n", deviceEntry.Name())
			fileEntries, err := ioutil.ReadDir(deviceFullPath)
			if err != nil {
				log.Fatalf("Could not read directory: %s (%v)", deviceFullPath, err)
			}
			for _, fileEntry := range fileEntries {
				if fileEntry.Name() != fname {
					fmt.Printf("      \\- %s - skipped\n", fileEntry.Name())
					continue
				}
				fmt.Printf("      \\- %s - loading ...", fileEntry.Name())
				// load the data
				dev.Records = e.loadHistoryRecords(filepath.Join(deviceFullPath, fileEntry.Name()))
				fmt.Println(" OK ✓")
			}
			// The device is recorded even when no matching file was found.
			usr.Devices = append(usr.Devices, dev)
		}
		records = append(records, usr)
	}
	return records
}
func (e *evaluator) loadHistoryRecords(fname string) []common.Record {
file, err := os.Open(fname)
if err != nil {

Loading…
Cancel
Save