From 8723bd7cd2d227634d67f24e9514d88e9f1c73a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Mo=C5=BEina?=
Date: Sun, 20 May 2018 17:36:11 +0200
Subject: Added create_data and evaluate_orange.

---
Reviewer note: the added Python below was revised after the original commit
(attributes.txt is written inside a `with` block, learn_and_write accepts an
explicit output_dir, unused locals were dropped, docstrings were added). Hunk
sizes and the diffstat describe the revised text; the commit id above and the
blob ids on the `index` lines are still those of the original commit.

 create_data.py      |  65 +++++++++++++++++
 dynamic/__init__.py |   9 +++
 evaluate_orange.py  | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 main.py             |   6 +-
 4 files changed, 281 insertions(+), 3 deletions(-)
 create mode 100644 create_data.py
 create mode 100644 evaluate_orange.py

diff --git a/create_data.py b/create_data.py
new file mode 100644
index 0000000..2296727
--- /dev/null
+++ b/create_data.py
@@ -0,0 +1,65 @@
+"""Build an Orange data table of pattern attributes for student programs."""
+
+import argparse
+import ast
+import collections
+import os
+
+import pandas
+import sklearn.dummy
+import sklearn.ensemble
+import sklearn.model_selection
+import sklearn.tree
+
+import canonicalize
+import dynamic
+import regex
+
+import main
+import Orange
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
+    parser.add_argument('filename', help='Orange file name')
+    parser.add_argument('path', help='path to data directory')
+    parser.add_argument('--dynamic', action="store_true", help='include dynamic atts')
+    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
+    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
+    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')
+
+    args = parser.parse_args()
+    path = args.path.rstrip('/')
+
+    programs = main.get_programs(path, args.names, do_canonicalize=True)
+
+    attrs = collections.OrderedDict()
+    attrs.update(regex.get_attributes(programs))
+    if args.dynamic:
+        attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))
+
+    orange_attrs = []
+    for at in attrs:
+        orange_attrs.append(Orange.data.DiscreteVariable(at, values=('F', 'T')))
+
+    cl = Orange.data.DiscreteVariable('correct', values=('F', 'T'))
+    mcode = Orange.data.StringVariable('code')
+    orange_domain = Orange.data.Domain(orange_attrs, cl, metas=[mcode])
+    orange_data = Orange.data.Table.from_domain(orange_domain)
+    for program in programs:
+        if not program:
+            continue
+        instance = Orange.data.Instance(orange_domain)
+        for at in attrs:
+            instance[at] = program in attrs[at]['programs']
+        instance[cl] = programs[program]['correct']
+        instance[mcode] = program
+        # repeat the row once per entry in the program's 'users' list
+        for _ in range(len(programs[program]['users'])):
+            orange_data.append(instance)
+
+    orange_data.save(args.filename)
+
+    with open("attributes.txt", "wt") as fatt:
+        for at in attrs:
+            fatt.write("{}: {}\n".format(at, str(attrs[at]["desc"]).replace('\n',' ')))
diff --git a/dynamic/__init__.py b/dynamic/__init__.py
index fdcde57..b77dd22 100644
--- a/dynamic/__init__.py
+++ b/dynamic/__init__.py
@@ -57,6 +57,13 @@ def following_pairs_patterns(trace_data):
         for v1, v2 in zip(val, val[1:]):
             yield "[{}]:[{}]".format(v1[2], v2[2])
 
+def single_value_patterns(trace_data):
+    """Yield 'value: X' for every item of every series in trace_data (X = item[2])."""
+    for val in trace_data['series'].values():
+        for v in val:
+            yield "value: {}".format(v[2])
+
+
 def get_trace_data(code, call=None, inputs=None):
     if call:
         code += '\n\n' + call
@@ -70,6 +77,8 @@ def get_attributes(programs, call, inputs):
         trace = get_trace_data(program, call, inputs)
         for pat in following_pairs_patterns(trace):
             patterns[pat] += [program]
+        for pat in single_value_patterns(trace):
+            patterns[pat] += [program]
 
     attrs = collections.OrderedDict()
     for pat, progs in sorted(patterns.items(), key=lambda x: len(x[1]), reverse=True):
diff --git a/evaluate_orange.py b/evaluate_orange.py
new file mode 100644
index 0000000..681c063
--- /dev/null
+++ b/evaluate_orange.py
@@ -0,0 +1,204 @@
+"""Learn rules and cross-validate classifiers on student-program attributes."""
+
+import argparse
+import ast
+import collections
+import os
+
+import pandas
+import numpy as np
+np.set_printoptions(linewidth=1000)
+import sklearn.dummy
+import sklearn.ensemble
+import sklearn.model_selection
+import sklearn.tree
+
+import canonicalize
+import dynamic
+import regex
+
+import main
+import Orange
+from Orange.evaluation import CrossValidation, CA, AUC, LogLoss, Precision, Recall
+from learning.rules import RL4T, RL4TFull
+
+datasets = [
+("introduction/fahrenheit_to_celsius", [100]),
+("introduction/ballistics", [45, 100]),
+("hw-fkkt/hw1", [298, 1.4, 0.028964]),
+("introduction/pythagorean_theorem", [3, 4]),
+("while_and_if/buy_five", [5,4,3,2,1]),
+("while_and_if/top_shop", [2,4,1,0]),
+("introduction/average", [2,4,5]),
+("introduction-fkkt/pythagorean_theorem_fkkt", [3,4]),
+("introduction-fkkt/what_is_your_name", ["Ana"]),
+("while_and_if/competition", [3, 2, 4, 1]),
+#("introduction-fkkt/hello_world", []),
+("introduction-fkkt/area_of_a_triangle", [3,4]),
+("lists_and_if-fkkt/is_palindrome", ["kisik"]),
+("for-fkkt/sum_and_average", [[23,42,87,34,-1]]),
+("for-fkkt/star_tree", [3]),
+("lists_and_if-fkkt/temp_converter", ['32\nK\n']),
+("for-fkkt/sum_to_n", [7]),
+("while_and_if/checking_account", [10,-100,1000,-10000]),
+("introduction-fkkt/molar_mass", [2]),
+("lists_and_if-fkkt/itm", [165,70]),
+("functions/greatest_negative", [4,-6,2,-1], ["max_neg"]),
+("for-fkkt/star_triangle", [3]),
+("lists_and_if-fkkt/square_equation", [1,2,1]),
+("while_and_if/consumers_anonymous", [10,5,90,1,1,0]),
+("while_and_if/minimax", [2,4,1,0]),
+("functions/greatest_absolutist", [-8,6,2,0], ["max_abs"]),
+("functions/greatest", [-8,6,2,0], ["max_val"])
+]
+
+# NOTE(review): this rebinding replaces the full list above, so only the
+# entries listed here are processed.
+datasets = [
+("functions/greatest_absolutist", [-8,6,2,0], ["max_abs"]),]
+#("while_and_if/consumers_anonymous", [10,5,90,1,1,0])]
+#("while_and_if/buy_five", [5,4,3,2,1]),]
+
+def create_data(path, names, include_dynamic, inputs):
+    """Build an Orange table of pattern attributes for the programs in path.
+
+    Regex attributes are always used; dynamic ones are added (with inputs
+    passed to dynamic.get_attributes) when include_dynamic is true.
+    Returns (table, attrs).
+    """
+    programs = main.get_programs(path, names, do_canonicalize=True)
+    attrs = collections.OrderedDict()
+    attrs.update(regex.get_attributes(programs))
+    if include_dynamic:
+        attrs.update(dynamic.get_attributes(programs, "", inputs))
+    #print('Attributes:', attrs.keys())
+    orange_attrs = []
+    for at in attrs:
+        orange_attrs.append(Orange.data.DiscreteVariable(at, values=('F', 'T')))
+
+    cl = Orange.data.DiscreteVariable('correct', values=('F', 'T'))
+    mcode = Orange.data.StringVariable('code')
+    orange_domain = Orange.data.Domain(orange_attrs, cl, metas=[mcode])
+    orange_data = Orange.data.Table.from_domain(orange_domain)
+    for program in programs:
+        if not program:
+            continue
+        instance = Orange.data.Instance(orange_domain)
+        for at in attrs:
+            instance[at] = program in attrs[at]['programs']
+        instance[cl] = programs[program]['correct']
+        instance[mcode] = program
+        # repeat the row once per entry in the program's 'users' list
+        for _ in range(len(programs[program]['users'])):
+            orange_data.append(instance)
+    return orange_data, attrs
+
+def get_coverages(rules, data):
+    """Return the coverage of correct and incorrect examples by rules.
+
+    Rules with target class 0 are pooled as "incorrect" rules, all others as
+    "correct" rules. The first result is the number of class-1 examples
+    covered by a correct rule divided by data.Y.sum(); the second is the
+    class-0 analogue divided by len(data) - data.Y.sum(). Empty rules: 0, 0.
+    """
+    if not rules:
+        return 0, 0
+    corr, inc = np.zeros(len(data), dtype=bool), np.zeros(len(data), dtype=bool)
+    for r in rules:
+        if r.target_class == 0: # 0 ... 'F', should be like that or it wont work
+            inc |= r.covered_examples
+        else:
+            corr |= r.covered_examples
+    corr_perc = (corr & (data.Y == 1)).sum()
+    corr_perc /= data.Y.sum()
+    inc_perc = (inc & (data.Y == 0)).sum()
+    inc_perc /= len(data) - data.Y.sum()
+    return corr_perc, inc_perc
+
+
+def learn_and_write(filename, learner_pos, learner_all, data, output_dir=None):
+    """Fit both rule learners on data and write their rules and coverages.
+
+    The report is written to filename inside output_dir; when output_dir is
+    None the module-level output_path set by the script section is used.
+    """
+    if output_dir is None:
+        output_dir = output_path
+    rules_pos = learner_pos(data).rule_list
+    rules_all = learner_all(data).rule_list
+    pos_cov_corr, pos_cov_inc = get_coverages(rules_pos, data)
+    all_cov_corr, all_cov_inc = get_coverages(rules_all, data)
+    with open(os.path.join(output_dir, filename), "wt") as f:
+        print("Only positive values of attributes:", file=f)
+        for r in rules_pos:
+            print(r, r.curr_class_dist, r.quality, np.where(r.covered_examples==1)[0], file=f)
+        print("Coverage incorrect: ", pos_cov_inc, file=f)
+        print("Coverage correct: ", pos_cov_corr, file=f)
+        print(file=f)
+        print("All values (positive and negative) of attributes:", file=f)
+        for r in rules_all:
+            print(r, r.curr_class_dist, r.quality, np.where(r.covered_examples==1)[0], file=f)
+        print("Coverage incorrect: ", all_cov_inc, file=f)
+        print("Coverage correct: ", all_cov_corr, file=f)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
+    parser.add_argument('path', help='path to data directory')
+    parser.add_argument('output_path', help='path to output data directory')
+
+    args = parser.parse_args()
+    path = args.path.rstrip('/')
+
+    rule_learner_positive = RL4TFull(parent_alpha=0.05, threshold=0.9)
+    rule_learner_all = RL4TFull(parent_alpha=0.05, threshold=0.9, positive_only=False)
+    learners = [
+        rule_learner_positive,
+        rule_learner_all,
+        Orange.classification.TreeLearner(),
+        Orange.classification.RandomForestLearner(n_estimators=100),
+        Orange.classification.MajorityLearner()]
+
+    for d in datasets:
+        print(d)
+
+        # create orange data
+        names = d[2] if len(d)==3 else None
+        problem_path = os.path.join(path, d[0])
+        data, attrs = create_data(problem_path, names, False, [])
+        data_dyn, attrs_dyn = create_data(problem_path, names, True, [str(v) for v in d[1]])
+
+        # save data
+        output_path = os.path.join(args.output_path, d[0])
+        os.makedirs(output_path, exist_ok=True)
+        data.save(os.path.join(output_path, "regex.tab"))
+        data_dyn.save(os.path.join(output_path, "regex_dynamic.tab"))
+        with open(os.path.join(output_path, "regex_attributes.txt"), "wt") as fatt:
+            for at in attrs:
+                fatt.write("{}: {}\n".format(at, str(attrs[at]["desc"]).replace('\n',' ')))
+        with open(os.path.join(output_path, "both_attributes.txt"), "wt") as fatt:
+            for at in attrs_dyn:
+                fatt.write("{}: {}\n".format(at, str(attrs_dyn[at]["desc"]).replace('\n',' ')))
+
+        # learn rules regex
+        learn_and_write("rules_regex.txt", rule_learner_positive, rule_learner_all, data, output_dir=output_path)
+        learn_and_write("rules_both.txt", rule_learner_positive, rule_learner_all, data_dyn, output_dir=output_path)
+
+        res = CrossValidation(data, learners, k=5, random_state=0)
+        res_dyn = CrossValidation(data_dyn, learners, k=5, random_state=0)
+        with open(os.path.join(output_path, "results.txt"), "wt") as f:
+            print("Methods: rules(positive only), rules(all values), decision tree, random forest, majority", file=f)
+            print(file=f)
+            print("Without dynamic attributes: ", file=f)
+            print("ca", CA(res), file=f)
+            print("auc", AUC(res), file=f)
+            print("ll", LogLoss(res), file=f)
+            print("precision", Precision(res, target=0), file=f)
+            print("recall", Recall(res, target=0), file=f)
+            print(file=f)
+            print("With dynamic attributes: ", file=f)
+            print("ca", CA(res_dyn), file=f)
+            print("auc", AUC(res_dyn), file=f)
+            print("ll", LogLoss(res_dyn), file=f)
+            print("precision", Precision(res_dyn, target=0), file=f)
+            print("recall", Recall(res_dyn, target=0), file=f)
+
diff --git a/main.py b/main.py
index 777f770..ecd2b75 100644
--- a/main.py
+++ b/main.py
@@ -66,9 +66,9 @@ if __name__ == '__main__':
     attrs.update(regex.get_attributes(programs))
     attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))
 
-    print('Attributes:')
-    for attr in attrs:
-        print(attr, attrs[attr]['desc'].to_string(inline=True))
+    #print('Attributes:')
+    #for attr in attrs:
+    #    print(attr, attrs[attr]['desc'].to_string(inline=True))
 
     for program in programs:
         for attr in attrs:
-- 
cgit v1.2.1