evaluation changes

pull/123/head v2.5.26
Simon Let 6 years ago
parent 82336a5cf1
commit 34daa9ba8a
  1. 14
      cmd/evaluate/main.go
  2. 314
      scripts/resh-evaluate-plot.py

@ -9,7 +9,6 @@ import (
"path/filepath"
"github.com/curusarn/resh/pkg/histanal"
"github.com/curusarn/resh/pkg/records"
"github.com/curusarn/resh/pkg/strat"
)
@ -109,12 +108,13 @@ func main() {
// dynamicDistG.Init()
// strategies = append(strategies, &dynamicDistG)
distanceStaticBest := strat.RecordDistance{
MaxDepth: 3000,
DistParams: records.DistParams{Pwd: 10, RealPwd: 10, SessionID: 1, Time: 1},
Label: "10*pwd,10*realpwd,session,time",
}
strategies = append(strategies, &distanceStaticBest)
// NOTE: this is the decent one !!!
// distanceStaticBest := strat.RecordDistance{
// MaxDepth: 3000,
// DistParams: records.DistParams{Pwd: 10, RealPwd: 10, SessionID: 1, Time: 1},
// Label: "10*pwd,10*realpwd,session,time",
// }
// strategies = append(strategies, &distanceStaticBest)
recentBash := strat.RecentBash{}
recentBash.Init()

@ -15,6 +15,7 @@ rcParams['font.family'] = 'serif'
import matplotlib.pyplot as plt
import matplotlib.path as mpath
import matplotlib.patches as mpatches
PLOT_WIDTH = 10 # inches
PLOT_HEIGHT = 7 # inches
@ -27,14 +28,18 @@ DATA_records = []
DATA_records_by_session = defaultdict(list)
DATA_records_by_user = defaultdict(list)
for user in data["UsersRecords"]:
if user["Devices"] is None:
continue
for device in user["Devices"]:
if device["Records"] is None:
continue
for record in device["Records"]:
if "invalid" in record and record["invalid"]:
continue
DATA_records.append(record)
DATA_records_by_session[record["seqSessionId"]].append(record)
DATA_records_by_user[user["Name"]].append(record)
DATA_records_by_user[user["Name"] + ":" + device["Name"]].append(record)
DATA_records = list(sorted(DATA_records, key=lambda x: x["realtimeAfterLocal"]))
@ -213,6 +218,283 @@ def plot_cmdVocabularySize_cmdLinesEntered():
else:
plt.show()
def plot_cmdVocabularySize_daily():
SECONDS_IN_A_DAY = 86400
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.title("Command vocabulary size in days")
plt.ylabel("Command vocabulary size")
plt.xlabel("Days")
legend = []
# x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items()))
# x_values = range(0, x_count)
for user in DATA_records_by_user.items():
new_cmds_after_100 = 0
new_cmds_after_200 = 0
new_cmds_after_300 = 0
cmd_vocabulary = set()
y_cmd_count = [0]
name, records = user
cmd_fail_count = 0
if not len(records):
print("ERROR: no records for user {}".format(name))
continue
first_day = records[0]["realtimeAfter"]
this_day = first_day
for record in records:
cmd = record["command"]
timestamp = record["realtimeAfter"]
if cmd == "":
cmd_fail_count += 1
continue
if timestamp >= this_day + SECONDS_IN_A_DAY:
this_day += SECONDS_IN_A_DAY
while timestamp >= this_day + SECONDS_IN_A_DAY:
y_cmd_count.append(-10)
this_day += SECONDS_IN_A_DAY
y_cmd_count.append(len(cmd_vocabulary))
cmd_vocabulary = set() # wipes the vocabulary each day
if len(y_cmd_count) > 100:
new_cmds_after_100+=1
if len(y_cmd_count) > 200:
new_cmds_after_200+=1
if len(y_cmd_count) > 300:
new_cmds_after_300+=1
if len(y_cmd_count) == 100:
print("% {}: Cmd adoption rate at 100 days (between 0 and 100 days) = {}".format(name, len(cmd_vocabulary) / (len(y_cmd_count))))
if len(y_cmd_count) == 200:
print("% {}: Cmd adoption rate at 200 days days = {}".format(name, len(cmd_vocabulary) / (len(y_cmd_count))))
print("% {}: Cmd adoption rate between 100 and 200 days = {}".format(name, new_cmds_after_100 / (len(y_cmd_count) - 100)))
if len(y_cmd_count) == 300:
print("% {}: Cmd adoption rate between 200 and 300 days = {}".format(name, new_cmds_after_200 / (len(y_cmd_count) - 200)))
if cmd not in cmd_vocabulary:
cmd_vocabulary.add(cmd)
print("% {}: New cmd adoption rate after 100 days = {}".format(name, new_cmds_after_100 / (len(y_cmd_count) - 100)))
print("% {}: New cmd adoption rate after 200 days = {}".format(name, new_cmds_after_200 / (len(y_cmd_count) - 200)))
print("% {}: New cmd adoption rate after 300 days = {}".format(name, new_cmds_after_300 / (len(y_cmd_count) - 300)))
print("% {}: cmd_fail_count = {}".format(name, cmd_fail_count))
x_cmds_entered = range(0, len(y_cmd_count))
plt.plot(x_cmds_entered, y_cmd_count, 'o', markersize=2)
legend.append(name + " (TODO: sanitize!)")
# print(cmd_vocabulary)
plt.legend(legend, loc="best")
plt.ylim(bottom=-5)
if async_draw:
plt.draw()
else:
plt.show()
def matplotlib_escape(ss):
ss = ss.replace('$', '\\$')
return ss
def plot_cmdUsage_in_time(sort_cmds=False, num_cmds=None):
SECONDS_IN_A_DAY = 86400
tab_colors = ("tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple", "tab:brown", "tab:pink", "tab:gray")
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.title("Command use in time")
plt.ylabel("Commands")
plt.xlabel("Days")
legend_patches = []
cmd_ids = {}
y_labels = []
all_x_values = []
all_y_values = []
all_s_values = [] # size
all_c_values = [] # color
x_values = []
y_values = []
s_values = [] # size
c_values = [] # color
if sort_cmds:
cmd_count = defaultdict(int)
for user in DATA_records_by_user.items():
name, records = user
for record in records:
cmd = record["command"]
cmd_count[cmd] += 1
sorted_cmds = map(lambda x: x[0], sorted(cmd_count.items(), key=lambda x: x[1], reverse=True))
for cmd in sorted_cmds:
cmd_ids[cmd] = len(cmd_ids)
y_labels.append(matplotlib_escape(cmd))
for user_idx, user in enumerate(DATA_records_by_user.items()):
name, records = user
if not len(records):
print("ERROR: no records for user {}".format(name))
continue
first_day = records[0]["realtimeAfter"]
this_day = first_day
day_no = 0
today_cmds = defaultdict(int)
for record in records:
cmd = record["command"]
timestamp = record["realtimeAfter"]
if cmd == "":
print("NOTICE: Empty cmd for {}".format(record["cmdLine"]))
continue
if timestamp >= this_day + SECONDS_IN_A_DAY:
for item in today_cmds.items():
cmd, count = item
cmd_id = cmd_ids[cmd]
# skip commands with high ids
if num_cmds is not None and cmd_id >= num_cmds:
continue
x_values.append(day_no)
y_values.append(cmd_id)
s_values.append(count)
c_values.append(tab_colors[user_idx])
today_cmds = defaultdict(int)
this_day += SECONDS_IN_A_DAY
day_no += 1
while timestamp >= this_day + SECONDS_IN_A_DAY:
this_day += SECONDS_IN_A_DAY
day_no += 1
if cmd not in cmd_ids:
cmd_ids[cmd] = len(cmd_ids)
y_labels.append(matplotlib_escape(cmd))
today_cmds[cmd] += 1
all_x_values.extend(x_values)
all_y_values.extend(y_values)
all_s_values.extend(s_values)
all_c_values.extend(c_values)
x_values = []
y_values = []
s_values = []
c_values = []
legend_patches.append(mpatches.Patch(color=tab_colors[user_idx], label="{} ({}) (TODO: sanitize!)".format(name, user_idx)))
if num_cmds is not None and len(y_labels) > num_cmds:
y_labels = y_labels[:num_cmds]
plt.yticks(ticks=range(0, len(y_labels)), labels=y_labels, fontsize=6)
plt.scatter(all_x_values, all_y_values, s=all_s_values, c=all_c_values, marker='o')
plt.legend(handles=legend_patches, loc="best")
if async_draw:
plt.draw()
else:
plt.show()
# Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals.
def plot_cmdVocabularySize_time():
SECONDS_IN_A_DAY = 86400
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.title("Command vocabulary size growth in time")
plt.ylabel("Command vocabulary size")
plt.xlabel("Days")
legend = []
# x_count = max(map(lambda x: len(x[1]), DATA_records_by_user.items()))
# x_values = range(0, x_count)
for user in DATA_records_by_user.items():
new_cmds_after_100 = 0
new_cmds_after_200 = 0
new_cmds_after_300 = 0
cmd_vocabulary = set()
y_cmd_count = [0]
name, records = user
cmd_fail_count = 0
if not len(records):
print("ERROR: no records for user {}".format(name))
continue
first_day = records[0]["realtimeAfter"]
this_day = first_day
for record in records:
cmd = record["command"]
timestamp = record["realtimeAfter"]
if cmd == "":
cmd_fail_count += 1
continue
if timestamp >= this_day + SECONDS_IN_A_DAY:
this_day += SECONDS_IN_A_DAY
while timestamp >= this_day + SECONDS_IN_A_DAY:
y_cmd_count.append(-10)
this_day += SECONDS_IN_A_DAY
y_cmd_count.append(len(cmd_vocabulary))
if len(y_cmd_count) > 100:
new_cmds_after_100+=1
if len(y_cmd_count) > 200:
new_cmds_after_200+=1
if len(y_cmd_count) > 300:
new_cmds_after_300+=1
if len(y_cmd_count) == 100:
print("% {}: Cmd adoption rate at 100 days (between 0 and 100 days) = {}".format(name, len(cmd_vocabulary) / (len(y_cmd_count))))
if len(y_cmd_count) == 200:
print("% {}: Cmd adoption rate at 200 days days = {}".format(name, len(cmd_vocabulary) / (len(y_cmd_count))))
print("% {}: Cmd adoption rate between 100 and 200 days = {}".format(name, new_cmds_after_100 / (len(y_cmd_count) - 100)))
if len(y_cmd_count) == 300:
print("% {}: Cmd adoption rate between 200 and 300 days = {}".format(name, new_cmds_after_200 / (len(y_cmd_count) - 200)))
if cmd not in cmd_vocabulary:
cmd_vocabulary.add(cmd)
print("% {}: New cmd adoption rate after 100 days = {}".format(name, new_cmds_after_100 / (len(y_cmd_count) - 100)))
print("% {}: New cmd adoption rate after 200 days = {}".format(name, new_cmds_after_200 / (len(y_cmd_count) - 200)))
print("% {}: New cmd adoption rate after 300 days = {}".format(name, new_cmds_after_300 / (len(y_cmd_count) - 300)))
print("% {}: cmd_fail_count = {}".format(name, cmd_fail_count))
x_cmds_entered = range(0, len(y_cmd_count))
plt.plot(x_cmds_entered, y_cmd_count, 'o', markersize=2)
legend.append(name + " (TODO: sanitize!)")
# print(cmd_vocabulary)
plt.legend(legend, loc="best")
plt.ylim(bottom=0)
if async_draw:
plt.draw()
else:
plt.show()
# Figure 5.6. Command line vocabulary size vs. the number of commands entered for four typical individuals.
def plot_cmdLineVocabularySize_cmdLinesEntered():
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
@ -601,7 +883,7 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
plt.show()
def plot_strategies_matches_noncummulative(plot_size=50, selected_strategies=["recent (bash-like)"], show_strat_title=False):
def plot_strategies_matches_noncummulative(plot_size=50, selected_strategies=["recent (bash-like)"], show_strat_title=False, force_strat_title=None):
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.title("Matches at distance (noncumulative) <{}>".format(datetime.now().strftime('%H:%M:%S')))
plt.ylabel('%' + " of matches")
@ -655,6 +937,9 @@ def plot_strategies_matches_noncummulative(plot_size=50, selected_strategies=["r
matches_percent = list(map(lambda x: 100 * x / dataPoint_count, matches))
plt.plot(x_values, matches_percent, 'o-')
if force_strat_title is not None:
legend.append(force_strat_title)
else:
legend.append(strategy_title)
assert(saved_matches_total is not None)
@ -891,24 +1176,29 @@ def print_avg_cmdline_length():
# plot_cmdFrq_rank()
print_top_cmds(30)
print_top_cmds_by_user(30)
print_avg_cmdline_length()
# print_avg_cmdline_length()
#
# plot_cmdLineVocabularySize_cmdLinesEntered()
# plot_cmdVocabularySize_cmdLinesEntered()
plot_cmdVocabularySize_cmdLinesEntered()
plot_cmdVocabularySize_time()
# plot_cmdVocabularySize_daily()
plot_cmdUsage_in_time(num_cmds=100)
plot_cmdUsage_in_time(sort_cmds=True, num_cmds=100)
#
recent_strats=("recent", "recent (bash-like)")
recurrence_strat=("recent (bash-like)",)
plot_strategies_matches(20)
plot_strategies_charsRecalled(20)
plot_strategies_charsRecalled_prefix(20)
# plot_strategies_matches(20)
# plot_strategies_charsRecalled(20)
# plot_strategies_charsRecalled_prefix(20)
# plot_strategies_charsRecalled_noncummulative(20, selected_strategies=recent_strats)
plot_strategies_matches_noncummulative(20)
plot_strategies_charsRecalled_noncummulative(20)
plot_strategies_charsRecalled_prefix_noncummulative(20)
plot_strategies_matches(20, selected_strategies=recurrence_strat, show_strat_title=True, force_strat_title="recurrence rate")
# plot_strategies_matches_noncummulative(20)
# plot_strategies_charsRecalled_noncummulative(20)
# plot_strategies_charsRecalled_prefix_noncummulative(20)
# plot_strategies_matches(20, selected_strategies=recurrence_strat, show_strat_title=True, force_strat_title="recurrence rate")
# plot_strategies_matches_noncummulative(20, selected_strategies=recurrence_strat, show_strat_title=True, force_strat_title="recurrence rate")
# graph_cmdSequences(node_count=33, edge_minValue=0.048)
#
# graph_cmdSequences(node_count=28, edge_minValue=0.06)
# new improved

Loading…
Cancel
Save