plot by user

pull/123/head
Simon Let 6 years ago
parent c5319a6813
commit 475b74589c
  1. 106
      scripts/resh-evaluate-plot.py

@ -86,6 +86,13 @@ def plot_cmdLineFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
# similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf. # similar to ~ Figure 3.1. The normalized command frequency, compared with Zipf.
def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False): def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
plt.title("Command frequency / rank")
plt.ylabel("Normalized command frequency")
plt.xlabel("Command rank")
legend = []
cmd_count = defaultdict(int) cmd_count = defaultdict(int)
len_records = 0 len_records = 0
for record in DATA_records: for record in DATA_records:
@ -102,17 +109,43 @@ def plot_cmdFrq_rank(plotSize=PLOT_SIZE_zipf, show_labels=False):
top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records
top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records
top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records
print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent)) print("% ALL: Top {} %% of cmds amounts for {} %% of all command lines".format(100, top100percent))
print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent)) print("% ALL: Top {} %% of cmds amounts for {} %% of all command lines".format(10, top10percent))
print("% >>> Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent)) print("% ALL: Top {} %% of cmds amounts for {} %% of all command lines".format(20, top20percent))
ranks = range(1, len(cmdFrq)+1) ranks = range(1, len(cmdFrq)+1)
plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT)) plt.plot(ranks, zipf(len(ranks)), '-')
plt.plot(ranks, zipf(len(ranks)), 'o-') legend.append("Zipf distribution")
plt.plot(ranks, cmdFrq, 'o-') plt.plot(ranks, cmdFrq, 'o-')
plt.title("Command frequency / rank") legend.append("All subjects")
plt.ylabel("Normalized command frequency")
plt.xlabel("Command rank")
plt.legend(("Zipf", "Command"), loc="best") for user in DATA_records_by_user.items():
cmd_count = defaultdict(int)
len_records = 0
name, records = user
for record in records:
cmd = record["command"]
if cmd == "":
continue
cmd_count[cmd] += 1
len_records += 1
tmp = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)[:plotSize]
cmdFrq = list(map(lambda x: x[1] / tmp[0][1], tmp))
labels = list(map(lambda x: trim(x[0], 7), tmp))
top100percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(1 * len(cmd_count))])) / len_records
top10percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.1 * len(cmd_count))])) / len_records
top20percent = 100 * sum(map(lambda x: x[1], list(cmd_count.items())[:int(0.2 * len(cmd_count))])) / len_records
print("% {}: Top {} %% of cmds amounts for {} %% of all command lines".format(name, 100, top100percent))
print("% {}: Top {} %% of cmds amounts for {} %% of all command lines".format(name, 10, top10percent))
print("% {}: Top {} %% of cmds amounts for {} %% of all command lines".format(name, 20, top20percent))
ranks = range(1, len(cmdFrq)+1)
plt.plot(ranks, cmdFrq, 'o-')
legend.append("{} (sanitize!)".format(name))
plt.legend(legend, loc="best")
if show_labels: if show_labels:
plt.xticks(ranks, labels, rotation=-60) plt.xticks(ranks, labels, rotation=-60)
# TODO: make xticks integral # TODO: make xticks integral
@ -140,6 +173,8 @@ def plot_cmdVocabularySize_cmdLinesEntered():
name, records = user name, records = user
for record in records: for record in records:
cmd = record["command"] cmd = record["command"]
if cmd == "":
continue
if cmd in cmd_vocabulary: if cmd in cmd_vocabulary:
# repeat last value # repeat last value
y_cmd_count.append(y_cmd_count[-1]) y_cmd_count.append(y_cmd_count[-1])
@ -230,6 +265,8 @@ def graph_cmdSequences(node_count=33, edge_minValue=0.05, view_graph=True):
prev_cmd = START_CMD prev_cmd = START_CMD
for record in session: for record in session:
cmd = record["command"] cmd = record["command"]
if cmd == "":
continue
cmdSeq_count[prev_cmd][cmd] += 1 cmdSeq_count[prev_cmd][cmd] += 1
cmd_count[cmd] += 1 cmd_count[cmd] += 1
if cmd not in cmd_id: if cmd not in cmd_id:
@ -559,9 +596,49 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
plt.show() plt.show()
def print_top_cmds(num_cmds=20):
    """Print the most frequently used commands across all sessions.

    Reads the module-level ``DATA_records_by_session`` mapping, counts the
    ``"command"`` value of every record whose command is non-empty, and
    prints a header followed by up to ``num_cmds`` lines of the form
    ``<command> <count>``, most frequent first.

    Args:
        num_cmds: Maximum number of commands to print.
    """
    cmd_count = defaultdict(int)
    # Only the records matter here; the session keys are not used.
    for session in DATA_records_by_session.values():
        for record in session:
            cmd = record["command"]
            # Records with an empty command are not counted.
            if cmd == "":
                continue
            cmd_count[cmd] += 1

    # Most frequent first. sorted() is stable (also with reverse=True), so
    # commands with equal counts keep their first-seen order.
    sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)

    print("\n\n% All subjects: Top commands")
    for cmd, count in sorted_cmd_count[:num_cmds]:
        print("{} {}".format(cmd, count))
def print_top_cmds_by_user(num_cmds=20):
    """Print the most frequently used commands separately for each user.

    Iterates the module-level ``DATA_records_by_user`` mapping (name ->
    records). For every user it counts the ``"command"`` value of each
    record whose command is non-empty and prints a header containing the
    user's name followed by up to ``num_cmds`` lines of the form
    ``<command> <count>``, most frequent first.

    Args:
        num_cmds: Maximum number of commands to print per user.
    """
    for name, records in DATA_records_by_user.items():
        # Fresh counter per user so counts do not leak between users.
        cmd_count = defaultdict(int)
        for record in records:
            cmd = record["command"]
            # Records with an empty command are not counted.
            if cmd == "":
                continue
            cmd_count[cmd] += 1

        # Most frequent first. sorted() is stable (also with reverse=True),
        # so commands with equal counts keep their first-seen order.
        sorted_cmd_count = sorted(cmd_count.items(), key=lambda x: x[1], reverse=True)

        print("\n\n% {}: Top commands".format(name))
        for cmd, count in sorted_cmd_count[:num_cmds]:
            print("{} {}".format(cmd, count))
# plot_cmdLineFrq_rank() # plot_cmdLineFrq_rank()
# plot_cmdFrq_rank() plot_cmdFrq_rank()
print_top_cmds(30)
print_top_cmds_by_user(30)
# #
# plot_cmdLineVocabularySize_cmdLinesEntered() # plot_cmdLineVocabularySize_cmdLinesEntered()
# plot_cmdVocabularySize_cmdLinesEntered() # plot_cmdVocabularySize_cmdLinesEntered()
@ -574,10 +651,11 @@ def plot_strategies_charsRecalled_prefix(plot_size=50, selected_strategies=[]):
# #
# graph_cmdSequences(node_count=28, edge_minValue=0.06) # graph_cmdSequences(node_count=28, edge_minValue=0.06)
for n in range(40, 43): # new improved
for e in range(94, 106, 2): # for n in range(40, 43):
e *= 0.001 # for e in range(94, 106, 2):
graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False) # e *= 0.001
# graph_cmdSequences(node_count=n, edge_minValue=e, view_graph=False)
#for n in range(29, 35): #for n in range(29, 35):
# for e in range(44, 56, 2): # for e in range(44, 56, 2):

Loading…
Cancel
Save