mirror of https://github.com/curusarn/resh
commit
22a0bf3f5e
@ -1,2 +1,4 @@ |
||||
resh-collect |
||||
resh-daemon |
||||
resh-sanitize-history |
||||
resh-evaluate |
||||
|
||||
@ -0,0 +1,438 @@ |
||||
#!/usr/bin/env python3 |
||||
|
||||
|
||||
import traceback |
||||
import sys |
||||
import json |
||||
from collections import defaultdict |
||||
import matplotlib.pyplot as plt |
||||
import matplotlib.path as mpath |
||||
import numpy as np |
||||
from graphviz import Digraph |
||||
|
||||
PLOT_WIDTH = 10  # inches
PLOT_HEIGHT = 7  # inches

# number of top-ranked items shown in the frequency/rank plots
PLOT_SIZE_zipf = 20

# The evaluation results are piped in as a single JSON document on stdin.
data = json.load(sys.stdin)

# DATA_records: flat list of every valid record.
# DATA_records_by_session: the same records grouped by their "sessionId".
DATA_records = []
DATA_records_by_session = defaultdict(list)
# `or []` guards against JSON `null`, which is what the Go side emits for
# slices that were never filled.
for user in data["UsersRecords"] or []:
    for device in user["Devices"] or []:
        for record in device["Records"] or []:
            if record["invalid"]:
                continue

            DATA_records.append(record)
            DATA_records_by_session[record["sessionId"]].append(record)

DATA_records.sort(key=lambda x: x["realtimeAfterLocal"])

# Sort every session in place. Rebinding the loop variable to a sorted copy
# would leave the lists stored in the dict untouched.
for session in DATA_records_by_session.values():
    session.sort(key=lambda x: x["realtimeAfterLocal"])

# TODO: this should be a cmdline option
async_draw = True
||||
|
||||
# for strategy in data["Strategies"]: |
||||
# print(json.dumps(strategy)) |
||||
|
||||
|
||||
def zipf(length):
    """Return `length` reference values 1/2**i for i = 0 .. length-1."""
    return [1 / 2**exponent for exponent in range(length)]
||||
|
||||
|
||||
def trim(text, length, add_elipse=True):
    """Shorten `text` to at most `length` characters.

    When the text is too long and `add_elipse` is set, the last kept
    character is replaced by an ellipsis; otherwise the text is simply cut.
    """
    if not add_elipse or len(text) <= length:
        return text[:length]
    return text[:length - 1] + "…"
||||
|
||||
|
||||
# Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    """Plot normalized command line frequency against rank.

    The `plotSize` most frequent command lines in DATA_records are drawn,
    each count divided by the count of the most frequent one, next to the
    reference curve from zipf(). With `show_labels` the x ticks show the
    trimmed command lines instead of the ranks.
    """
    occurrences = defaultdict(int)
    for rec in DATA_records:
        occurrences[rec["cmdLine"]] += 1

    top = sorted(occurrences.items(), key=lambda item: item[1], reverse=True)[:plotSize]
    frequencies = [count / top[0][1] for _, count in top]
    tick_labels = [trim(cmd_line, 7) for cmd_line, _ in top]

    ranks = range(1, len(frequencies) + 1)
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(ranks, zipf(len(ranks)), '-')
    plt.plot(ranks, frequencies, 'o-')
    plt.title("Commandline frequency / rank")
    plt.ylabel("Normalized commandline frequency")
    plt.xlabel("Commandline rank")
    plt.legend(("Zipf", "Commandline"), loc="best")
    if show_labels:
        plt.xticks(ranks, tick_labels, rotation=-60)
    # TODO: make xticks integral
    if not async_draw:
        plt.show()
    else:
        plt.draw()
||||
|
||||
|
||||
# similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
    """Plot normalized command (first word) frequency against rank.

    Records with an empty "firstWord" are ignored. The `plotSize` most
    frequent commands are drawn, each count divided by the count of the most
    frequent one, next to the reference curve from zipf().
    """
    occurrences = defaultdict(int)
    for rec in DATA_records:
        first_word = rec["firstWord"]
        if first_word != "":
            occurrences[first_word] += 1

    top = sorted(occurrences.items(), key=lambda item: item[1], reverse=True)[:plotSize]
    frequencies = [count / top[0][1] for _, count in top]
    tick_labels = [trim(command, 7) for command, _ in top]

    ranks = range(1, len(frequencies) + 1)
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(ranks, zipf(len(ranks)), 'o-')
    plt.plot(ranks, frequencies, 'o-')
    plt.title("Command frequency / rank")
    plt.ylabel("Normalized command frequency")
    plt.xlabel("Command rank")
    plt.legend(("Zipf", "Command"), loc="best")
    if show_labels:
        plt.xticks(ranks, tick_labels, rotation=-60)
    # TODO: make xticks integral
    if not async_draw:
        plt.show()
    else:
        plt.draw()
||||
|
||||
# Figure 3.2. Command vocabulary size vs. the number of command lines entered for four individuals.
def plot_cmdVocabularySize_cmdLinesEntered():
    """Plot how the number of distinct commands grows with each entered record.

    Walks DATA_records in order and records, after every record, how many
    different "firstWord" values have been seen so far (starting from 0).
    """
    seen_cmds = set()
    vocabulary_sizes = [0]
    for rec in DATA_records:
        # the set size only grows when the command was not seen before
        seen_cmds.add(rec["firstWord"])
        vocabulary_sizes.append(len(seen_cmds))

    entered = range(0, len(vocabulary_sizes))

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(entered, vocabulary_sizes, '-')
    plt.title("Command vocabulary size vs. the number of command lines entered")
    plt.ylabel("Command vocabulary size")
    plt.xlabel("# of command lines entered")
    if not async_draw:
        plt.show()
    else:
        plt.draw()
||||
|
||||
# Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals.
def plot_cmdLineVocabularySize_cmdLinesEntered():
    """Plot how the number of distinct command lines grows with each entered record.

    Walks DATA_records in order and records, after every record, how many
    different "cmdLine" values have been seen so far (starting from 0).
    """
    seen_cmd_lines = set()
    vocabulary_sizes = [0]
    for rec in DATA_records:
        # the set size only grows when the command line was not seen before
        seen_cmd_lines.add(rec["cmdLine"])
        vocabulary_sizes.append(len(seen_cmd_lines))

    entered = range(0, len(vocabulary_sizes))

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.plot(entered, vocabulary_sizes, '-')
    plt.title("Command line vocabulary size vs. the number of command lines entered")
    plt.ylabel("Command line vocabulary size")
    plt.xlabel("# of command lines entered")
    if not async_draw:
        plt.show()
    else:
        plt.draw()
||||
|
||||
# Figure 3.3. Sequential structure of UNIX command usage, from Figure 4 in Hanson et al. (1984).
# Ball diameters are proportional to stationary probability. Lines indicate significant dependencies,
# solid ones being more probable (p < .0001) and dashed ones less probable (.005 < p < .0001).
def graph_cmdSequences(node_count=33, edge_minValue=0.05):
    """Render a graph of which command tends to follow which.

    Nodes are the `node_count` most frequent commands (first words), plus a
    synthetic "_start_" node counted once per session. An edge A -> B is
    drawn when B directly followed A in at least `edge_minValue` of all
    occurrences of A. Node size grows with the command count, edge width and
    style with the relative frequency of the transition.

    Does nothing (apart from printing a note) when there are no sessions to
    graph.
    """
    START_CMD = "_start_"
    cmd_count = defaultdict(int)
    cmdSeq_count = defaultdict(lambda: defaultdict(int))
    # node ids are consecutive numbers (as strings) in order of first appearance
    cmd_id = {START_CMD: "0"}
    for session in DATA_records_by_session.values():
        cmd_count[START_CMD] += 1
        prev_cmd = START_CMD
        for record in session:
            cmd = record["firstWord"]
            cmdSeq_count[prev_cmd][cmd] += 1
            cmd_count[cmd] += 1
            if cmd not in cmd_id:
                cmd_id[cmd] = str(len(cmd_id))
            prev_cmd = cmd

    # get `node_count` of largest nodes
    sorted_cmd_count = sorted(cmd_count.items(), key=lambda item: item[1], reverse=True)
    print(sorted_cmd_count)
    cmds_to_graph = [cmd for cmd, _ in sorted_cmd_count[:node_count]]
    if not cmds_to_graph:
        print("No commands to graph - skipping command sequence graph")
        return
    # set for O(1) membership tests inside the nested loops below
    cmds_to_graph_set = set(cmds_to_graph)

    # use the (up to) 3 biggest nodes as a reference point for scaling;
    # the coefficient is the inverse of their average count
    reference_counts = [cmd_count[cmd] for cmd in cmds_to_graph[:3]]
    count2scale_coef = len(reference_counts) / sum(reference_counts)

    # scaling constant
    # affects node size and node label
    base_scaling_factor = 21
    # extra scaling for experiments - not really useful imho
    # affects everything nodes, edges, node labels, treshold for turning label into xlabel, xlabel size, ...
    extra_scaling_factor = 1.0
    for attempt in range(0, 10):
        # graphviz is not the most reliable piece of software
        # -> retry on fail but scale nodes down by 1%
        scaling_factor = base_scaling_factor * (1 - attempt * 0.01)

        # overlap: scale -> solve overlap by scaling the graph
        # overlap_shrink -> try to shrink the graph a bit after you are done
        # splines -> don't draw edges over nodes
        # sep: 0.25 -> margin left around nodes when removing overlap
        graph_attr = {'overlap': 'scale', 'overlap_shrink': 'true',
                      'splines': 'true', 'sep': '0.25'}
        graph = Digraph(name='command_sequentiality', engine='neato', graph_attr=graph_attr)

        # iterate over all nodes
        for cmd in cmds_to_graph:
            count = cmd_count[cmd]

            # iterate over all "following" commands (for each node)
            for cmd2, seq_count in cmdSeq_count[cmd].items():
                # value of the edge (percentage) 1.0 is max
                relative_seq_count = seq_count / count

                # check if "follow" command is supposed to be in the graph
                if cmd2 not in cmds_to_graph_set:
                    continue
                # check if the edge value is high enough
                if relative_seq_count < edge_minValue:
                    continue

                # create starting node and end node for the edge
                # duplicates don't matter
                for id_, cmd_ in ((cmd_id[cmd], cmd), (cmd_id[cmd2], cmd2)):
                    scale_ = cmd_count[cmd_] * count2scale_coef * scaling_factor * extra_scaling_factor
                    width_ = str(0.08 * scale_)
                    fontsize_ = 8.5 * scale_ / (len(cmd_) + 3)

                    if fontsize_ < 12 * extra_scaling_factor:
                        # label would be too small to read inside the node
                        # -> put it next to the node as an external label
                        graph.node(id_, ' ', shape='circle', fixedsize='true', fontname='monospace bold',
                                   width=width_, fontsize=str(12 * extra_scaling_factor), forcelabels='true', xlabel=cmd_)
                    else:
                        graph.node(id_, cmd_, shape='circle', fixedsize='true', fontname='monospace bold',
                                   width=width_, fontsize=str(fontsize_), forcelabels='true', labelloc='c')

                penwidth_ = str((0.5 + 4.5 * relative_seq_count) * extra_scaling_factor)
                # the more probable the transition, the more prominent the edge style
                if relative_seq_count > 0.5:
                    edge_attrs = {'constraint': 'true', 'style': 'bold'}
                elif relative_seq_count > 0.2:
                    edge_attrs = {'constraint': 'true', 'arrowhead': 'open'}
                elif relative_seq_count > 0.1:
                    edge_attrs = {'constraint': 'true', 'style': 'dashed', 'arrowhead': 'open'}
                else:
                    edge_attrs = {'constraint': 'false', 'style': 'dotted', 'arrowhead': 'empty'}
                graph.edge(cmd_id[cmd], cmd_id[cmd2], splines='curved',
                           penwidth=penwidth_, **edge_attrs)

        # graphviz sometimes fails - see above
        try:
            graph.view()
            # graph.render('/tmp/resh-graphviz-cmdSeq.gv', view=True)
            break
        except Exception as e:
            trace = traceback.format_exc()
            print("GRAPHVIZ EXCEPTION: <{}>\nGRAPHVIZ TRACE: <{}>".format(str(e), trace))
||||
|
||||
|
||||
def plot_strategies_matches(plot_size=50, selected_strategies=None):
    """Plot, per strategy, the cumulative percentage of matches up to each distance.

    plot_size -- largest distance shown on the x axis
    selected_strategies -- titles of strategies to draw; empty/None draws all

    The "recent" strategy is always evaluated (even when it is not selected)
    because its overall match rate is drawn as the "maximum possible" line.
    """
    selected_strategies = selected_strategies or []
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.title("Matches at distance")
    plt.ylabel('%' + " of matches")
    plt.xlabel("Distance")
    legend = []
    x_values = range(1, plot_size+1)
    saved_matches_total = None
    saved_dataPoint_count = None
    for strategy in data["Strategies"]:
        strategy_title = strategy["Title"]
        # strategy_description = strategy["Description"]

        dataPoint_count = 0
        matches = [0] * plot_size
        matches_total = 0

        # "Matches" is JSON null when the strategy was evaluated on no records
        for match in strategy["Matches"] or []:
            dataPoint_count += 1

            if not match["Match"]:
                continue

            matches_total += 1

            dist = match["Distance"]
            if dist > plot_size:
                continue

            matches[dist-1] += 1

        # recent is very simple strategy so we will believe
        # that there is no bug in it and we can use it to determine total
        if strategy_title == "recent":
            saved_matches_total = matches_total
            saved_dataPoint_count = dataPoint_count

        # filter only after the "recent" totals were recorded
        if selected_strategies and strategy_title not in selected_strategies:
            continue

        acc = 0
        matches_cumulative = []
        for x in matches:
            acc += x
            matches_cumulative.append(acc)
        # matches_cumulative.append(matches_total)
        matches_percent = [100 * x / dataPoint_count if dataPoint_count else 0
                           for x in matches_cumulative]

        plt.plot(x_values, matches_percent, 'o-')
        legend.append(strategy_title)

    assert saved_matches_total is not None, "strategy 'recent' is missing in the data"
    assert saved_dataPoint_count is not None, "strategy 'recent' is missing in the data"
    max_percent = 100 * saved_matches_total / saved_dataPoint_count if saved_dataPoint_count else 0
    max_values = [max_percent] * len(x_values)
    plt.plot(x_values, max_values, 'r-')
    legend.append("maximum possible")

    x_ticks = list(range(1, plot_size+1, 2))
    x_labels = x_ticks[:]
    plt.xticks(x_ticks, x_labels)
    plt.legend(legend, loc="best")
    if async_draw:
        plt.draw()
    else:
        plt.show()
||||
|
||||
|
||||
|
||||
def plot_strategies_charsRecalled(plot_size=50, selected_strategies=None):
    """Plot, per strategy, the average characters recalled up to each distance.

    plot_size -- largest distance shown on the x axis
    selected_strategies -- titles of strategies to draw; empty/None draws all

    For every distance d the plotted value is the sum of "CharsRecalled" of
    all matches with distance <= d, divided by the number of data points.
    The overall average of the "recent" strategy is drawn as the
    "maximum possible" line.
    """
    selected_strategies = selected_strategies or []
    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    plt.title("Average characters recalled at distance")
    plt.ylabel("Average characters recalled")
    plt.xlabel("Distance")
    x_values = range(1, plot_size+1)
    legend = []
    saved_charsRecalled_total = None
    saved_dataPoint_count = None
    for strategy in data["Strategies"]:
        strategy_title = strategy["Title"]
        # strategy_description = strategy["Description"]

        dataPoint_count = 0
        charsRecalled = [0] * plot_size
        charsRecalled_total = 0

        # "Matches" is JSON null when the strategy was evaluated on no records
        for match in strategy["Matches"] or []:
            dataPoint_count += 1

            if not match["Match"]:
                continue

            chars = match["CharsRecalled"]
            charsRecalled_total += chars

            dist = match["Distance"]
            if dist > plot_size:
                continue

            charsRecalled[dist-1] += chars

        # recent is very simple strategy so we will believe
        # that there is no bug in it and we can use it to determine total
        if strategy_title == "recent":
            saved_charsRecalled_total = charsRecalled_total
            saved_dataPoint_count = dataPoint_count

        # filter only after the "recent" totals were recorded
        if selected_strategies and strategy_title not in selected_strategies:
            continue

        acc = 0
        charsRecalled_cumulative = []
        for x in charsRecalled:
            acc += x
            charsRecalled_cumulative.append(acc)
        charsRecalled_average = [x / dataPoint_count if dataPoint_count else 0
                                 for x in charsRecalled_cumulative]

        plt.plot(x_values, charsRecalled_average, 'o-')
        legend.append(strategy_title)

    assert saved_charsRecalled_total is not None, "strategy 'recent' is missing in the data"
    assert saved_dataPoint_count is not None, "strategy 'recent' is missing in the data"
    max_average = saved_charsRecalled_total / saved_dataPoint_count if saved_dataPoint_count else 0
    max_values = [max_average] * len(x_values)
    plt.plot(x_values, max_values, 'r-')
    legend.append("maximum possible")

    x_ticks = list(range(1, plot_size+1, 2))
    x_labels = x_ticks[:]
    plt.xticks(x_ticks, x_labels)
    plt.legend(legend, loc="best")
    if async_draw:
        plt.draw()
    else:
        plt.show()
||||
|
||||
|
||||
|
||||
# Script driver: render the command sequence graph, then build every plot.

# graph_cmdSequences(node_count=33, edge_minValue=0.05)
graph_cmdSequences(node_count=28, edge_minValue=0.06)

# frequency / rank plots (whole command lines, then first words only)
plot_cmdLineFrq_rank()
plot_cmdFrq_rank()

# vocabulary growth plots
plot_cmdLineVocabularySize_cmdLinesEntered()
plot_cmdVocabularySize_cmdLinesEntered()

# strategy comparison plots, limited to distance 20
plot_strategies_matches(20)
plot_strategies_charsRecalled(20)

# In async mode the plot functions above only call plt.draw(); this single
# plt.show() is what finally shows the figures.
if async_draw:
    plt.show()
# be careful and check if labels fit the display
||||
@ -0,0 +1,340 @@ |
||||
package main |
||||
|
||||
import ( |
||||
"bufio" |
||||
"bytes" |
||||
"encoding/json" |
||||
"flag" |
||||
"fmt" |
||||
"io/ioutil" |
||||
"log" |
||||
"os" |
||||
"os/exec" |
||||
"os/user" |
||||
"path/filepath" |
||||
"sort" |
||||
|
||||
"github.com/curusarn/resh/common" |
||||
) |
||||
|
||||
// Version from git set during build
|
||||
var Version string |
||||
|
||||
// Revision from git set during build
|
||||
var Revision string |
||||
|
||||
func main() { |
||||
usr, _ := user.Current() |
||||
dir := usr.HomeDir |
||||
historyPath := filepath.Join(dir, ".resh_history.json") |
||||
historyPathBatchMode := filepath.Join(dir, "resh_history.json") |
||||
sanitizedHistoryPath := filepath.Join(dir, "resh_history_sanitized.json") |
||||
// tmpPath := "/tmp/resh-evaluate-tmp.json"
|
||||
|
||||
showVersion := flag.Bool("version", false, "Show version and exit") |
||||
showRevision := flag.Bool("revision", false, "Show git revision and exit") |
||||
input := flag.String("input", "", |
||||
"Input file (default: "+historyPath+"OR"+sanitizedHistoryPath+ |
||||
" depending on --sanitized-input option)") |
||||
// outputDir := flag.String("output", "/tmp/resh-evaluate", "Output directory")
|
||||
sanitizedInput := flag.Bool("sanitized-input", false, |
||||
"Handle input as sanitized (also changes default value for input argument)") |
||||
plottingScript := flag.String("plotting-script", "resh-evaluate-plot.py", "Script to use for plotting") |
||||
inputDataRoot := flag.String("input-data-root", "", |
||||
"Input data root, enables batch mode, looks for files matching --input option") |
||||
|
||||
flag.Parse() |
||||
|
||||
// handle show{Version,Revision} options
|
||||
if *showVersion == true { |
||||
fmt.Println(Version) |
||||
os.Exit(0) |
||||
} |
||||
if *showRevision == true { |
||||
fmt.Println(Revision) |
||||
os.Exit(0) |
||||
} |
||||
|
||||
// handle batch mode
|
||||
batchMode := false |
||||
if *inputDataRoot != "" { |
||||
batchMode = true |
||||
} |
||||
// set default input
|
||||
if *input == "" { |
||||
if *sanitizedInput { |
||||
*input = sanitizedHistoryPath |
||||
} else if batchMode { |
||||
*input = historyPathBatchMode |
||||
} else { |
||||
*input = historyPath |
||||
} |
||||
} |
||||
|
||||
evaluator := evaluator{sanitizedInput: *sanitizedInput, maxCandidates: 50, BatchMode: batchMode} |
||||
if batchMode { |
||||
err := evaluator.initBatchMode(*input, *inputDataRoot) |
||||
if err != nil { |
||||
log.Fatal("Evaluator initBatchMode() error:", err) |
||||
} |
||||
} else { |
||||
err := evaluator.init(*input) |
||||
if err != nil { |
||||
log.Fatal("Evaluator init() error:", err) |
||||
} |
||||
} |
||||
|
||||
var strategies []strategy |
||||
|
||||
// dummy := strategyDummy{}
|
||||
// strategies = append(strategies, &dummy)
|
||||
|
||||
recent := strategyRecent{} |
||||
frequent := strategyFrequent{} |
||||
frequent.init() |
||||
directory := strategyDirectorySensitive{} |
||||
directory.init() |
||||
|
||||
strategies = append(strategies, &recent, &frequent, &directory) |
||||
|
||||
for _, strat := range strategies { |
||||
err := evaluator.evaluate(strat) |
||||
if err != nil { |
||||
log.Println("Evaluator evaluate() error:", err) |
||||
} |
||||
} |
||||
|
||||
evaluator.calculateStatsAndPlot(*plottingScript) |
||||
} |
||||
|
||||
// strategy is the interface every evaluated history-recall strategy implements.
// The evaluator asks for candidates before each record and then feeds the
// record to the strategy.
type strategy interface {
	// GetTitleAndDescription returns the strategy title and its description.
	GetTitleAndDescription() (string, string)
	// GetCandidates returns the candidate command lines; the evaluator treats
	// the position in the slice as the distance (first element = distance 1).
	GetCandidates() []string
	// AddHistoryRecord makes the strategy aware of an executed record.
	AddHistoryRecord(record *common.Record) error
	// ResetHistory drops the history collected so far.
	ResetHistory() error
}
||||
|
||||
// matchJSON is the result of evaluating a strategy on a single record.
// The zero value stands for "no candidate matched".
type matchJSON struct {
	// Match is true when the record's command line was among the candidates.
	Match bool
	// Distance is the 1-based position of the matching candidate.
	Distance int
	// CharsRecalled is the CmdLength of the matched record.
	CharsRecalled int
}
||||
|
||||
// strategyJSON holds the serializable evaluation results of one strategy.
type strategyJSON struct {
	Title       string
	Description string
	// Matches has one entry per evaluated record, in evaluation order.
	Matches []matchJSON
}
||||
|
||||
// deviceRecords groups the history records loaded for one device.
type deviceRecords struct {
	// Name is the device directory name in batch mode; left empty otherwise.
	Name    string
	Records []common.Record
}
||||
|
||||
// userRecords groups the devices (and their records) belonging to one user.
type userRecords struct {
	// Name is the user directory name in batch mode; left empty otherwise.
	Name    string
	Devices []deviceRecords
}
||||
|
||||
// evaluator loads history records, runs strategies on them and collects the
// results. The whole struct is marshalled to JSON for the plotting script;
// only the exported fields end up in that JSON.
type evaluator struct {
	// sanitizedInput tells whether the input is expected to be sanitized.
	sanitizedInput bool
	BatchMode      bool
	// maxCandidates is currently only referenced by commented-out code in evaluate.
	maxCandidates int
	UsersRecords  []userRecords
	Strategies    []strategyJSON
}
||||
|
||||
func (e *evaluator) initBatchMode(input string, inputDataRoot string) error { |
||||
e.UsersRecords = e.loadHistoryRecordsBatchMode(input, inputDataRoot) |
||||
e.processRecords() |
||||
return nil |
||||
} |
||||
|
||||
func (e *evaluator) init(inputPath string) error { |
||||
records := e.loadHistoryRecords(inputPath) |
||||
device := deviceRecords{Records: records} |
||||
user := userRecords{} |
||||
user.Devices = append(user.Devices, device) |
||||
e.UsersRecords = append(e.UsersRecords, user) |
||||
e.processRecords() |
||||
return nil |
||||
} |
||||
|
||||
func (e *evaluator) calculateStatsAndPlot(scriptName string) { |
||||
evalJSON, err := json.Marshal(e) |
||||
if err != nil { |
||||
log.Fatal("json marshal error", err) |
||||
} |
||||
buffer := bytes.Buffer{} |
||||
buffer.Write(evalJSON) |
||||
// run python script to stat and plot/
|
||||
cmd := exec.Command(scriptName) |
||||
cmd.Stdout = os.Stdout |
||||
cmd.Stderr = os.Stderr |
||||
cmd.Stdin = &buffer |
||||
err = cmd.Run() |
||||
if err != nil { |
||||
log.Printf("Command finished with error: %v", err) |
||||
} |
||||
} |
||||
|
||||
// enrich records and add them to serializable structure
|
||||
func (e *evaluator) processRecords() { |
||||
for i := range e.UsersRecords { |
||||
for j, device := range e.UsersRecords[i].Devices { |
||||
sessionIDs := map[string]uint64{} |
||||
var nextID uint64 |
||||
nextID = 0 |
||||
for k, record := range e.UsersRecords[i].Devices[j].Records { |
||||
id, found := sessionIDs[record.SessionId] |
||||
if found == false { |
||||
id = nextID |
||||
sessionIDs[record.SessionId] = id |
||||
nextID++ |
||||
} |
||||
record.SeqSessionID = id |
||||
// assert
|
||||
if record.Sanitized != e.sanitizedInput { |
||||
if e.sanitizedInput { |
||||
log.Fatal("ASSERT failed: '--sanitized-input' is present but data is not sanitized") |
||||
} |
||||
log.Fatal("ASSERT failed: data is sanitized but '--sanitized-input' is not present") |
||||
} |
||||
|
||||
e.UsersRecords[i].Devices[j].Records[k].Enrich() |
||||
// device.Records = append(device.Records, record)
|
||||
} |
||||
sort.SliceStable(e.UsersRecords[i].Devices[j].Records, func(x, y int) bool { |
||||
if device.Records[x].SeqSessionID == device.Records[y].SeqSessionID { |
||||
return device.Records[x].RealtimeAfterLocal < device.Records[y].RealtimeAfterLocal |
||||
} |
||||
return device.Records[x].SeqSessionID < device.Records[y].SeqSessionID |
||||
}) |
||||
} |
||||
} |
||||
} |
||||
|
||||
func (e *evaluator) evaluate(strategy strategy) error { |
||||
title, description := strategy.GetTitleAndDescription() |
||||
strategyData := strategyJSON{Title: title, Description: description} |
||||
for _, record := range e.UsersRecords[0].Devices[0].Records { |
||||
candidates := strategy.GetCandidates() |
||||
|
||||
matchFound := false |
||||
for i, candidate := range candidates { |
||||
// make an option (--calculate-total) to turn this on/off ?
|
||||
// if i >= e.maxCandidates {
|
||||
// break
|
||||
// }
|
||||
if candidate == record.CmdLine { |
||||
match := matchJSON{Match: true, Distance: i + 1, CharsRecalled: record.CmdLength} |
||||
strategyData.Matches = append(strategyData.Matches, match) |
||||
matchFound = true |
||||
break |
||||
} |
||||
} |
||||
if matchFound == false { |
||||
strategyData.Matches = append(strategyData.Matches, matchJSON{}) |
||||
} |
||||
err := strategy.AddHistoryRecord(&record) |
||||
if err != nil { |
||||
log.Println("Error while evauating", err) |
||||
return err |
||||
} |
||||
} |
||||
e.Strategies = append(e.Strategies, strategyData) |
||||
return nil |
||||
} |
||||
|
||||
func (e *evaluator) loadHistoryRecordsBatchMode(fname string, dataRootPath string) []userRecords { |
||||
var records []userRecords |
||||
info, err := os.Stat(dataRootPath) |
||||
if err != nil { |
||||
log.Fatal("Error: Directory", dataRootPath, "does not exist - exiting! (", err, ")") |
||||
} |
||||
if info.IsDir() == false { |
||||
log.Fatal("Error:", dataRootPath, "is not a directory - exiting!") |
||||
} |
||||
users, err := ioutil.ReadDir(dataRootPath) |
||||
if err != nil { |
||||
log.Fatal("Could not read directory:", dataRootPath) |
||||
} |
||||
fmt.Println("Listing users in <", dataRootPath, ">...") |
||||
for _, user := range users { |
||||
userRecords := userRecords{Name: user.Name()} |
||||
userFullPath := filepath.Join(dataRootPath, user.Name()) |
||||
if user.IsDir() == false { |
||||
log.Println("Warn: Unexpected file (not a directory) <", userFullPath, "> - skipping.") |
||||
continue |
||||
} |
||||
fmt.Println() |
||||
fmt.Printf("*- %s\n", user.Name()) |
||||
devices, err := ioutil.ReadDir(userFullPath) |
||||
if err != nil { |
||||
log.Fatal("Could not read directory:", userFullPath) |
||||
} |
||||
for _, device := range devices { |
||||
deviceRecords := deviceRecords{Name: device.Name()} |
||||
deviceFullPath := filepath.Join(userFullPath, device.Name()) |
||||
if device.IsDir() == false { |
||||
log.Println("Warn: Unexpected file (not a directory) <", deviceFullPath, "> - skipping.") |
||||
continue |
||||
} |
||||
fmt.Printf(" \\- %s\n", device.Name()) |
||||
files, err := ioutil.ReadDir(deviceFullPath) |
||||
if err != nil { |
||||
log.Fatal("Could not read directory:", deviceFullPath) |
||||
} |
||||
for _, file := range files { |
||||
fileFullPath := filepath.Join(deviceFullPath, file.Name()) |
||||
if file.Name() == fname { |
||||
fmt.Printf(" \\- %s - loading ...", file.Name()) |
||||
// load the data
|
||||
deviceRecords.Records = e.loadHistoryRecords(fileFullPath) |
||||
fmt.Println(" OK ✓") |
||||
} else { |
||||
fmt.Printf(" \\- %s - skipped\n", file.Name()) |
||||
} |
||||
} |
||||
userRecords.Devices = append(userRecords.Devices, deviceRecords) |
||||
} |
||||
records = append(records, userRecords) |
||||
} |
||||
return records |
||||
} |
||||
|
||||
func (e *evaluator) loadHistoryRecords(fname string) []common.Record { |
||||
file, err := os.Open(fname) |
||||
if err != nil { |
||||
log.Fatal("Open() resh history file error:", err) |
||||
} |
||||
defer file.Close() |
||||
|
||||
var records []common.Record |
||||
scanner := bufio.NewScanner(file) |
||||
for scanner.Scan() { |
||||
record := common.Record{} |
||||
fallbackRecord := common.FallbackRecord{} |
||||
line := scanner.Text() |
||||
err = json.Unmarshal([]byte(line), &record) |
||||
if err != nil { |
||||
err = json.Unmarshal([]byte(line), &fallbackRecord) |
||||
if err != nil { |
||||
log.Println("Line:", line) |
||||
log.Fatal("Decoding error:", err) |
||||
} |
||||
record = common.ConvertRecord(&fallbackRecord) |
||||
} |
||||
if e.sanitizedInput == false { |
||||
if record.CmdLength != 0 { |
||||
log.Fatal("Assert failed - 'cmdLength' is set in raw data. Maybe you want to use '--sanitized-input' option?") |
||||
} |
||||
record.CmdLength = len(record.CmdLine) |
||||
} |
||||
if record.CmdLength == 0 { |
||||
log.Fatal("Assert failed - 'cmdLength' is unset in the data. This should not happen.") |
||||
} |
||||
records = append(records, record) |
||||
} |
||||
return records |
||||
} |
||||
@ -0,0 +1,42 @@ |
||||
package main |
||||
|
||||
import ( |
||||
"github.com/curusarn/resh/common" |
||||
) |
||||
|
||||
// strategyDirectorySensitive keeps a separate most-recent-first history for
// every working directory.
type strategyDirectorySensitive struct {
	// history maps a directory (record.Pwd) to its command lines, newest first
	history map[string][]string
	// lastPwd is the PwdAfter of the last added record; candidates are
	// taken from the history of this directory
	lastPwd string
}
||||
|
||||
func (s *strategyDirectorySensitive) init() { |
||||
s.history = map[string][]string{} |
||||
} |
||||
|
||||
func (s *strategyDirectorySensitive) GetTitleAndDescription() (string, string) { |
||||
return "directory sensitive (recent)", "Use recent commands executed is the same directory" |
||||
} |
||||
|
||||
func (s *strategyDirectorySensitive) GetCandidates() []string { |
||||
return s.history[s.lastPwd] |
||||
} |
||||
|
||||
func (s *strategyDirectorySensitive) AddHistoryRecord(record *common.Record) error { |
||||
// work on history for PWD
|
||||
pwd := record.Pwd |
||||
// remove previous occurance of record
|
||||
for i, cmd := range s.history[pwd] { |
||||
if cmd == record.CmdLine { |
||||
s.history[pwd] = append(s.history[pwd][:i], s.history[pwd][i+1:]...) |
||||
} |
||||
} |
||||
// append new record
|
||||
s.history[pwd] = append([]string{record.CmdLine}, s.history[pwd]...) |
||||
s.lastPwd = record.PwdAfter |
||||
return nil |
||||
} |
||||
|
||||
func (s *strategyDirectorySensitive) ResetHistory() error { |
||||
s.history = map[string][]string{} |
||||
return nil |
||||
} |
||||
@ -0,0 +1,24 @@ |
||||
package main |
||||
|
||||
import "github.com/curusarn/resh/common" |
||||
|
||||
// strategyDummy is a strategy that never offers any candidates.
type strategyDummy struct {
	// history collects the added command lines; it is never read by the strategy
	history []string
}
||||
|
||||
func (s *strategyDummy) GetTitleAndDescription() (string, string) { |
||||
return "dummy", "Return empty candidate list" |
||||
} |
||||
|
||||
func (s *strategyDummy) GetCandidates() []string { |
||||
return nil |
||||
} |
||||
|
||||
func (s *strategyDummy) AddHistoryRecord(record *common.Record) error { |
||||
s.history = append(s.history, record.CmdLine) |
||||
return nil |
||||
} |
||||
|
||||
func (s *strategyDummy) ResetHistory() error { |
||||
return nil |
||||
} |
||||
@ -0,0 +1,47 @@ |
||||
package main |
||||
|
||||
import ( |
||||
"sort" |
||||
|
||||
"github.com/curusarn/resh/common" |
||||
) |
||||
|
||||
// strategyFrequent offers command lines ordered by how often they were used.
type strategyFrequent struct {
	// history maps a command line to the number of times it was added
	history map[string]int
}
||||
|
||||
// strFrqEntry is a command line together with its usage count; it is used to
// sort the contents of the strategyFrequent history map.
type strFrqEntry struct {
	cmdLine string
	count   int
}
||||
|
||||
func (s *strategyFrequent) init() { |
||||
s.history = map[string]int{} |
||||
} |
||||
|
||||
func (s *strategyFrequent) GetTitleAndDescription() (string, string) { |
||||
return "frequent", "Use frequent commands" |
||||
} |
||||
|
||||
func (s *strategyFrequent) GetCandidates() []string { |
||||
var mapItems []strFrqEntry |
||||
for cmdLine, count := range s.history { |
||||
mapItems = append(mapItems, strFrqEntry{cmdLine, count}) |
||||
} |
||||
sort.Slice(mapItems, func(i int, j int) bool { return mapItems[i].count > mapItems[j].count }) |
||||
var hist []string |
||||
for _, item := range mapItems { |
||||
hist = append(hist, item.cmdLine) |
||||
} |
||||
return hist |
||||
} |
||||
|
||||
func (s *strategyFrequent) AddHistoryRecord(record *common.Record) error { |
||||
s.history[record.CmdLine]++ |
||||
return nil |
||||
} |
||||
|
||||
func (s *strategyFrequent) ResetHistory() error { |
||||
s.history = map[string]int{} |
||||
return nil |
||||
} |
||||
@ -0,0 +1,32 @@ |
||||
package main |
||||
|
||||
import "github.com/curusarn/resh/common" |
||||
|
||||
// strategyRecent offers previously used command lines, most recent first.
type strategyRecent struct {
	// history holds the command lines, newest first
	history []string
}
||||
|
||||
func (s *strategyRecent) GetTitleAndDescription() (string, string) { |
||||
return "recent", "Use recent commands" |
||||
} |
||||
|
||||
func (s *strategyRecent) GetCandidates() []string { |
||||
return s.history |
||||
} |
||||
|
||||
func (s *strategyRecent) AddHistoryRecord(record *common.Record) error { |
||||
// remove previous occurance of record
|
||||
for i, cmd := range s.history { |
||||
if cmd == record.CmdLine { |
||||
s.history = append(s.history[:i], s.history[i+1:]...) |
||||
} |
||||
} |
||||
// append new record
|
||||
s.history = append([]string{record.CmdLine}, s.history...) |
||||
return nil |
||||
} |
||||
|
||||
func (s *strategyRecent) ResetHistory() error { |
||||
s.history = nil |
||||
return nil |
||||
} |
||||
@ -1,2 +1,13 @@ |
||||
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= |
||||
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= |
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g= |
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= |
||||
github.com/mattn/go-shellwords v1.0.6 h1:9Jok5pILi5S1MnDirGVTufYGtksUs/V2BWUP3ZkeUUI= |
||||
github.com/mattn/go-shellwords v1.0.6/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= |
||||
github.com/wcharczuk/go-chart v2.0.1+incompatible h1:0pz39ZAycJFF7ju/1mepnk26RLVLBCWz1STcD3doU0A= |
||||
github.com/wcharczuk/go-chart v2.0.1+incompatible/go.mod h1:PF5tmL4EIx/7Wf+hEkpCqYi5He4u90sw+0+6FhrryuE= |
||||
github.com/whilp/git-urls v0.0.0-20160530060445-31bac0d230fa h1:rW+Lu6281ed/4XGuVIa4/YebTRNvoUJlfJ44ktEVwZk= |
||||
github.com/whilp/git-urls v0.0.0-20160530060445-31bac0d230fa/go.mod h1:2rx5KE5FLD0HRfkkpyn8JwbVLBdhgeiOb2D2D9LLKM4= |
||||
golang.org/x/image v0.0.0-20190902063713-cb417be4ba39 h1:4dQcAORh9oYBwVSBVIkP489LUPC+f1HBkTYXgmqfR+o= |
||||
golang.org/x/image v0.0.0-20190902063713-cb417be4ba39/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= |
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= |
||||
|
||||
@ -0,0 +1,424 @@ |
||||
package main |
||||
|
||||
import ( |
||||
"bufio" |
||||
"crypto/sha1" |
||||
"encoding/binary" |
||||
"encoding/hex" |
||||
"encoding/json" |
||||
"errors" |
||||
"flag" |
||||
"fmt" |
||||
"log" |
||||
"net/url" |
||||
"os" |
||||
"os/user" |
||||
"path" |
||||
"path/filepath" |
||||
"strconv" |
||||
"strings" |
||||
"unicode" |
||||
|
||||
"github.com/curusarn/resh/common" |
||||
giturls "github.com/whilp/git-urls" |
||||
) |
||||
|
||||
// Version is the version string taken from git and set during the build;
// main prints it for the "-version" flag.
var Version string

// Revision is the git revision set during the build; main prints it for the
// "-revision" flag.
var Revision string
||||
|
||||
func main() { |
||||
usr, _ := user.Current() |
||||
dir := usr.HomeDir |
||||
historyPath := filepath.Join(dir, ".resh_history.json") |
||||
// outputPath := filepath.Join(dir, "resh_history_sanitized.json")
|
||||
sanitizerDataPath := filepath.Join(dir, ".resh", "sanitizer_data") |
||||
|
||||
showVersion := flag.Bool("version", false, "Show version and exit") |
||||
showRevision := flag.Bool("revision", false, "Show git revision and exit") |
||||
trimHashes := flag.Int("trim-hashes", 12, "Trim hashes to N characters, '0' turns off trimming") |
||||
inputPath := flag.String("input", historyPath, "Input file") |
||||
outputPath := flag.String("output", "", "Output file (default: use stdout)") |
||||
|
||||
flag.Parse() |
||||
|
||||
if *showVersion == true { |
||||
fmt.Println(Version) |
||||
os.Exit(0) |
||||
} |
||||
if *showRevision == true { |
||||
fmt.Println(Revision) |
||||
os.Exit(0) |
||||
} |
||||
sanitizer := sanitizer{hashLength: *trimHashes} |
||||
err := sanitizer.init(sanitizerDataPath) |
||||
if err != nil { |
||||
log.Fatal("Sanitizer init() error:", err) |
||||
} |
||||
|
||||
inputFile, err := os.Open(*inputPath) |
||||
if err != nil { |
||||
log.Fatal("Open() resh history file error:", err) |
||||
} |
||||
defer inputFile.Close() |
||||
|
||||
var writer *bufio.Writer |
||||
if *outputPath == "" { |
||||
writer = bufio.NewWriter(os.Stdout) |
||||
} else { |
||||
outputFile, err := os.Create(*outputPath) |
||||
if err != nil { |
||||
log.Fatal("Create() output file error:", err) |
||||
} |
||||
defer outputFile.Close() |
||||
writer = bufio.NewWriter(outputFile) |
||||
} |
||||
defer writer.Flush() |
||||
|
||||
scanner := bufio.NewScanner(inputFile) |
||||
for scanner.Scan() { |
||||
record := common.Record{} |
||||
fallbackRecord := common.FallbackRecord{} |
||||
line := scanner.Text() |
||||
err = json.Unmarshal([]byte(line), &record) |
||||
if err != nil { |
||||
err = json.Unmarshal([]byte(line), &fallbackRecord) |
||||
if err != nil { |
||||
log.Println("Line:", line) |
||||
log.Fatal("Decoding error:", err) |
||||
} |
||||
record = common.ConvertRecord(&fallbackRecord) |
||||
} |
||||
err = sanitizer.sanitizeRecord(&record) |
||||
if err != nil { |
||||
log.Println("Line:", line) |
||||
log.Fatal("Sanitization error:", err) |
||||
} |
||||
outLine, err := json.Marshal(&record) |
||||
if err != nil { |
||||
log.Println("Line:", line) |
||||
log.Fatal("Encoding error:", err) |
||||
} |
||||
// fmt.Println(string(outLine))
|
||||
n, err := writer.WriteString(string(outLine) + "\n") |
||||
if err != nil { |
||||
log.Fatal(err) |
||||
} |
||||
if n == 0 { |
||||
log.Fatal("Nothing was written", n) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// sanitizer replaces sensitive parts of history records with trimmed hashes.
type sanitizer struct {
	// hashLength is the number of characters trimHash keeps of every hash;
	// zero or a negative value leaves hashes untrimmed.
	hashLength int
	// whitelist is the set of tokens that are kept as they are instead of
	// being hashed; init loads it from "whitelist.txt".
	whitelist map[string]bool
}
||||
|
||||
func (s *sanitizer) init(dataPath string) error { |
||||
globalData := path.Join(dataPath, "whitelist.txt") |
||||
s.whitelist = loadData(globalData) |
||||
return nil |
||||
} |
||||
|
||||
func loadData(fname string) map[string]bool { |
||||
file, err := os.Open(fname) |
||||
if err != nil { |
||||
log.Fatal("Open() file error:", err) |
||||
} |
||||
defer file.Close() |
||||
|
||||
scanner := bufio.NewScanner(file) |
||||
data := make(map[string]bool) |
||||
for scanner.Scan() { |
||||
line := scanner.Text() |
||||
data[line] = true |
||||
} |
||||
return data |
||||
} |
||||
|
||||
func (s *sanitizer) sanitizeRecord(record *common.Record) error { |
||||
// hash directories of the paths
|
||||
record.Pwd = s.sanitizePath(record.Pwd) |
||||
record.RealPwd = s.sanitizePath(record.RealPwd) |
||||
record.PwdAfter = s.sanitizePath(record.PwdAfter) |
||||
record.RealPwdAfter = s.sanitizePath(record.RealPwdAfter) |
||||
record.GitDir = s.sanitizePath(record.GitDir) |
||||
record.GitRealDir = s.sanitizePath(record.GitRealDir) |
||||
record.Home = s.sanitizePath(record.Home) |
||||
record.ShellEnv = s.sanitizePath(record.ShellEnv) |
||||
|
||||
// hash the most sensitive info, do not tokenize
|
||||
record.Host = s.hashToken(record.Host) |
||||
record.Login = s.hashToken(record.Login) |
||||
record.MachineId = s.hashToken(record.MachineId) |
||||
|
||||
var err error |
||||
// this changes git url a bit but I'm still happy with the result
|
||||
// e.g. "git@github.com:curusarn/resh" becomes "ssh://git@github.com/3385162f14d7/5a7b2909005c"
|
||||
// notice the "ssh://" prefix
|
||||
record.GitOriginRemote, err = s.sanitizeGitURL(record.GitOriginRemote) |
||||
if err != nil { |
||||
log.Println("Error while snitizing GitOriginRemote url", record.GitOriginRemote, ":", err) |
||||
return err |
||||
} |
||||
|
||||
// sanitization destroys original CmdLine length -> save it
|
||||
record.CmdLength = len(record.CmdLine) |
||||
|
||||
record.CmdLine, err = s.sanitizeCmdLine(record.CmdLine) |
||||
if err != nil { |
||||
log.Fatal("Cmd:", record.CmdLine, "; sanitization error:", err) |
||||
} |
||||
|
||||
// add a flag to signify that the record has been sanitized
|
||||
record.Sanitized = true |
||||
return nil |
||||
} |
||||
|
||||
// sanitizeCmdLine returns a copy of cmdLine in which the letter/digit tokens
// are replaced by the result of sanitizeCmdToken, while every other rune and
// simple command-line options (such as "-v" or "--long-name") are copied
// through unchanged.
//
// The command line is processed rune by rune. Letters and digits are collected
// in a buffer; any other rune ends the current token, which is then passed to
// sanitizeCmdToken. When a token starts right after whitespace followed by "-"
// or "--" (the start of the command line counts as whitespace), the function
// switches to option mode: the buffer then also accepts "-" and "_" and is
// emitted unsanitized once whitespace or an option-ending character is seen.
//
// Errors reported by sanitizeCmdToken are only logged as warnings; every
// return statement of this function returns a nil error.
func (s *sanitizer) sanitizeCmdLine(cmdLine string) (string, error) {
	const optionEndingChars = "\"$'\\#[]!><|;{}()*,?~&=`:@^/+%." // all bash control characters, '=', ...
	const optionAllowedChars = "-_"                              // characters commonly found inside of options
	sanCmdLine := "" // sanitized output built so far
	buff := ""       // token currently being collected

	// simple options shouldn't be sanitized
	// 1) whitespace 2) "-" or "--" 3) letters, digits, "-", "_" 4) ending whitespace or any of "=;)"
	var optionDetected bool

	// the three runes preceding r; initialized to spaces so that an option at
	// the very beginning of the command line is detected as well
	prevR3 := ' '
	prevR2 := ' '
	prevR := ' '
	for _, r := range cmdLine {
		switch optionDetected {
		case true:
			if unicode.IsSpace(r) || strings.ContainsRune(optionEndingChars, r) {
				// whitespace or an option-ending character ends the option
				// => add option unsanitized
				optionDetected = false
				if len(buff) > 0 {
					sanCmdLine += buff
					buff = ""
				}
				sanCmdLine += string(r)
			} else if unicode.IsLetter(r) == false && unicode.IsDigit(r) == false &&
				strings.ContainsRune(optionAllowedChars, r) == false {
				// r is not any of allowed chars for an option: letter, digit, "-" or "_"
				// => sanitize
				// NOTE: optionDetected stays true in this branch
				if len(buff) > 0 {
					sanToken, err := s.sanitizeCmdToken(buff)
					if err != nil {
						// the error is deliberately not propagated, only logged
						log.Println("WARN: got error while sanitizing cmdLine:", cmdLine)
						// return cmdLine, err
					}
					sanCmdLine += sanToken
					buff = ""
				}
				sanCmdLine += string(r)
			} else {
				// r is an allowed option character => keep collecting
				buff += string(r)
			}
		case false:
			// split command on all non-letter and non-digit characters
			if unicode.IsLetter(r) == false && unicode.IsDigit(r) == false {
				// split token
				if len(buff) > 0 {
					sanToken, err := s.sanitizeCmdToken(buff)
					if err != nil {
						// the error is deliberately not propagated, only logged
						log.Println("WARN: got error while sanitizing cmdLine:", cmdLine)
						// return cmdLine, err
					}
					sanCmdLine += sanToken
					buff = ""
				}
				// the separator itself is copied through unchanged
				sanCmdLine += string(r)
			} else {
				// a letter or digit directly after <whitespace>"-" or
				// <whitespace>"--" starts an option
				if (unicode.IsSpace(prevR2) && prevR == '-') ||
					(unicode.IsSpace(prevR3) && prevR2 == '-' && prevR == '-') {
					optionDetected = true
				}
				buff += string(r)
			}
		}
		prevR3 = prevR2
		prevR2 = prevR
		prevR = r
	}
	if len(buff) <= 0 {
		// nothing in the buffer => work is done
		return sanCmdLine, nil
	}
	if optionDetected {
		// option detected => don't sanitize
		sanCmdLine += buff
		return sanCmdLine, nil
	}
	// the command line ended in the middle of a regular token => sanitize it
	sanToken, err := s.sanitizeCmdToken(buff)
	if err != nil {
		// the error is deliberately not propagated, only logged
		log.Println("WARN: got error while sanitizing cmdLine:", cmdLine)
		// return cmdLine, err
	}
	sanCmdLine += sanToken
	return sanCmdLine, nil
}
||||
|
||||
func (s *sanitizer) sanitizeGitURL(rawURL string) (string, error) { |
||||
if len(rawURL) <= 0 { |
||||
return rawURL, nil |
||||
} |
||||
parsedURL, err := giturls.Parse(rawURL) |
||||
if err != nil { |
||||
return rawURL, err |
||||
} |
||||
return s.sanitizeParsedURL(parsedURL) |
||||
} |
||||
|
||||
func (s *sanitizer) sanitizeURL(rawURL string) (string, error) { |
||||
if len(rawURL) <= 0 { |
||||
return rawURL, nil |
||||
} |
||||
parsedURL, err := url.Parse(rawURL) |
||||
if err != nil { |
||||
return rawURL, err |
||||
} |
||||
return s.sanitizeParsedURL(parsedURL) |
||||
} |
||||
|
||||
func (s *sanitizer) sanitizeParsedURL(parsedURL *url.URL) (string, error) { |
||||
parsedURL.Opaque = s.sanitizeToken(parsedURL.Opaque) |
||||
|
||||
userinfo := parsedURL.User.Username() // only get username => password won't even make it to the sanitized data
|
||||
if len(userinfo) > 0 { |
||||
parsedURL.User = url.User(s.sanitizeToken(userinfo)) |
||||
} else { |
||||
// we need to do this because `gitUrls.Parse()` sets `User` to `url.User("")` instead of `nil`
|
||||
parsedURL.User = nil |
||||
} |
||||
var err error |
||||
parsedURL.Host, err = s.sanitizeTwoPartToken(parsedURL.Host, ":") |
||||
if err != nil { |
||||
return parsedURL.String(), err |
||||
} |
||||
parsedURL.Path = s.sanitizePath(parsedURL.Path) |
||||
// ForceQuery bool
|
||||
parsedURL.RawQuery = s.sanitizeToken(parsedURL.RawQuery) |
||||
parsedURL.Fragment = s.sanitizeToken(parsedURL.Fragment) |
||||
|
||||
return parsedURL.String(), nil |
||||
} |
||||
|
||||
func (s *sanitizer) sanitizePath(path string) string { |
||||
var sanPath string |
||||
for _, token := range strings.Split(path, "/") { |
||||
if s.whitelist[token] != true { |
||||
token = s.hashToken(token) |
||||
} |
||||
sanPath += token + "/" |
||||
} |
||||
if len(sanPath) > 0 { |
||||
sanPath = sanPath[:len(sanPath)-1] |
||||
} |
||||
return sanPath |
||||
} |
||||
|
||||
func (s *sanitizer) sanitizeTwoPartToken(token string, delimeter string) (string, error) { |
||||
tokenParts := strings.Split(token, delimeter) |
||||
if len(tokenParts) <= 1 { |
||||
return s.sanitizeToken(token), nil |
||||
} |
||||
if len(tokenParts) == 2 { |
||||
return s.sanitizeToken(tokenParts[0]) + delimeter + s.sanitizeToken(tokenParts[1]), nil |
||||
} |
||||
return token, errors.New("Token has more than two parts") |
||||
} |
||||
|
||||
func (s *sanitizer) sanitizeCmdToken(token string) (string, error) { |
||||
// there shouldn't be tokens with letters or digits mixed together with symbols
|
||||
if len(token) <= 1 { |
||||
// NOTE: do not sanitize single letter tokens
|
||||
return token, nil |
||||
} |
||||
if s.isInWhitelist(token) == true { |
||||
return token, nil |
||||
} |
||||
|
||||
isLettersOrDigits := true |
||||
// isDigits := true
|
||||
isOtherCharacters := true |
||||
for _, r := range token { |
||||
if unicode.IsDigit(r) == false && unicode.IsLetter(r) == false { |
||||
isLettersOrDigits = false |
||||
// isDigits = false
|
||||
} |
||||
// if unicode.IsDigit(r) == false {
|
||||
// isDigits = false
|
||||
// }
|
||||
if unicode.IsDigit(r) || unicode.IsLetter(r) { |
||||
isOtherCharacters = false |
||||
} |
||||
} |
||||
// NOTE: I decided that I don't want a special sanitization for numbers
|
||||
// if isDigits {
|
||||
// return s.hashNumericToken(token), nil
|
||||
// }
|
||||
if isLettersOrDigits { |
||||
return s.hashToken(token), nil |
||||
} |
||||
if isOtherCharacters { |
||||
return token, nil |
||||
} |
||||
log.Println("WARN: cmd token is made of mix of letters or digits and other characters; token:", token) |
||||
// return token, errors.New("cmd token is made of mix of letters or digits and other characters")
|
||||
return s.hashToken(token), errors.New("cmd token is made of mix of letters or digits and other characters") |
||||
} |
||||
|
||||
func (s *sanitizer) sanitizeToken(token string) string { |
||||
if len(token) <= 1 { |
||||
// NOTE: do not sanitize single letter tokens
|
||||
return token |
||||
} |
||||
if s.isInWhitelist(token) { |
||||
return token |
||||
} |
||||
return s.hashToken(token) |
||||
} |
||||
|
||||
func (s *sanitizer) hashToken(token string) string { |
||||
if len(token) <= 0 { |
||||
return token |
||||
} |
||||
// hash with sha1
|
||||
h := sha1.New() |
||||
h.Write([]byte(token)) |
||||
sum := h.Sum(nil) |
||||
return s.trimHash(hex.EncodeToString(sum)) |
||||
} |
||||
|
||||
func (s *sanitizer) hashNumericToken(token string) string { |
||||
if len(token) <= 0 { |
||||
return token |
||||
} |
||||
h := sha1.New() |
||||
h.Write([]byte(token)) |
||||
sum := h.Sum(nil) |
||||
sumInt := int(binary.LittleEndian.Uint64(sum)) |
||||
if sumInt < 0 { |
||||
return strconv.Itoa(sumInt * -1) |
||||
} |
||||
return s.trimHash(strconv.Itoa(sumInt)) |
||||
} |
||||
|
||||
func (s *sanitizer) trimHash(hash string) string { |
||||
length := s.hashLength |
||||
if length <= 0 || len(hash) < length { |
||||
length = len(hash) |
||||
} |
||||
return hash[:length] |
||||
} |
||||
|
||||
func (s *sanitizer) isInWhitelist(token string) bool { |
||||
return s.whitelist[strings.ToLower(token)] == true |
||||
} |
||||
@ -0,0 +1,7 @@ |
||||
# copyright information |
||||
|
||||
The whitelist contains content from a variety of sources. |
||||
|
||||
Part of the whitelist (`./whitelist.txt`) is made of copyrighted content from [FileInfo.com](https://fileinfo.com/filetypes/common). |
||||
|
||||
This content was used with permission from FileInfo.com. |
||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue