Added create_data and evaluate_orange.

author: Martin Možina <martin.mozina@fri.uni-lj.si> 2018-05-20 17:36:11 +0200
committer: Martin Možina <martin.mozina@fri.uni-lj.si> 2018-05-20 17:36:11 +0200
commit: 8723bd7cd2d227634d67f24e9514d88e9f1c73a7 (patch)
tree: 5b92bf0360157c7148be13d4e0261bdb2c8ceebd
parent: 1dbb57208d2b8163a1c007ad0931f859651fc1c2 (diff)
4 files changed, 255 insertions, 3 deletions
diff --git a/create_data.py b/create_data.py
new file mode 100644
index 0000000..2296727
--- /dev/null
+++ b/create_data.py
@@ -0,0 +1,63 @@
+import argparse
+import ast
+import collections
+import os
+
+import pandas
+import sklearn.dummy
+import sklearn.ensemble
+import sklearn.model_selection
+import sklearn.tree
+
+import canonicalize
+import dynamic
+import regex
+
+import main
+import Orange
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
+    parser.add_argument('filename', help='Orange file name')
+    parser.add_argument('path', help='path to data directory')
+    parser.add_argument('--dynamic', action="store_true", help='include dynamic atts')
+    parser.add_argument('--names', nargs='*', required=False, help='names that should not be anonymized')
+    parser.add_argument('--exec', required=False, help='code to append for dynamic patterns')
+    parser.add_argument('--inputs', nargs='*', required=False, help='inputs for dynamic patterns')
+
+    args = parser.parse_args()
+    path = args.path.rstrip('/')
+
+    # programs is keyed by program code; each value supplies 'correct' and 'users' (read below)
+    programs = main.get_programs(path, args.names, do_canonicalize=True)
+
+    attrs = collections.OrderedDict()
+    attrs.update(regex.get_attributes(programs))
+    if args.dynamic:
+        attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))
+
+    # one two-valued ('F'/'T') Orange variable per attribute
+    orange_attrs = [Orange.data.DiscreteVariable(at, values=('F', 'T'))
+                    for at in attrs]
+
+    cl = Orange.data.DiscreteVariable('correct', values=('F', 'T'))
+    mcode = Orange.data.StringVariable('code')
+    orange_domain = Orange.data.Domain(orange_attrs, cl, metas=[mcode])
+    orange_data = Orange.data.Table.from_domain(orange_domain)
+    for program in programs:
+        if not program:  # skip an empty program key
+            continue
+        instance = Orange.data.Instance(orange_domain)
+        for at in attrs:
+            instance[at] = program in attrs[at]['programs']
+        instance[cl] = programs[program]['correct']
+        instance[mcode] = program
+        for _ in range(len(programs[program]['users'])):  # one row per 'users' entry
+            orange_data.append(instance)
+
+    orange_data.save(args.filename)
+    # write one "name: description" line per attribute; `with` closes (and flushes) the file
+    with open("attributes.txt", "wt") as fatt:
+        for at in attrs:
+            fatt.write("{}: {}\n".format(at, str(attrs[at]["desc"]).replace('\n',' ')))
diff --git a/dynamic/__init__.py b/dynamic/__init__.py
index fdcde57..b77dd22 100644
--- a/dynamic/__init__.py
+++ b/dynamic/__init__.py
@@ -57,6 +57,13 @@ def following_pairs_patterns(trace_data):
         for v1, v2 in zip(val, val[1:]):
             yield "[{}]:[{}]".format(v1[2], v2[2])
 
+def single_value_patterns(trace_data):
+    """Yield "value: X" for every entry of every series in trace_data, X being the entry's item 2."""
+    for entries in trace_data['series'].values():
+        for entry in entries:
+            yield "value: {}".format(entry[2])
+
+
 def get_trace_data(code, call=None, inputs=None):
     if call:
         code += '\n\n' + call
@@ -70,6 +77,8 @@ def get_attributes(programs, call, inputs):
         trace = get_trace_data(program, call, inputs)
         for pat in following_pairs_patterns(trace):
             patterns[pat] += [program]
+        for pat in single_value_patterns(trace):
+            patterns[pat] += [program]
 
     attrs = collections.OrderedDict()
     for pat, progs in sorted(patterns.items(), key=lambda x: len(x[1]), reverse=True):
diff --git a/evaluate_orange.py b/evaluate_orange.py
new file mode 100644
index 0000000..681c063
--- /dev/null
+++ b/evaluate_orange.py
@@ -0,0 +1,180 @@
+import argparse
+import ast
+import collections
+import os
+
+import pandas
+import numpy as np
+np.set_printoptions(linewidth=1000)
+import sklearn.dummy
+import sklearn.ensemble
+import sklearn.model_selection
+import sklearn.tree
+
+import canonicalize
+import dynamic
+import regex
+
+import main
+import Orange
+from Orange.evaluation import CrossValidation, CA, AUC, LogLoss, Precision, Recall
+from learning.rules import RL4T, RL4TFull
+
+datasets = [  # (problem dir relative to the data path, inputs[, names passed on to main.get_programs])
+("introduction/fahrenheit_to_celsius", [100]),
+("introduction/ballistics", [45, 100]),
+("hw-fkkt/hw1", [298, 1.4, 0.028964]),
+("introduction/pythagorean_theorem", [3, 4]),
+("while_and_if/buy_five", [5,4,3,2,1]),
+("while_and_if/top_shop", [2,4,1,0]),
+("introduction/average", [2,4,5]),
+("introduction-fkkt/pythagorean_theorem_fkkt", [3,4]),
+("introduction-fkkt/what_is_your_name", ["Ana"]),
+("while_and_if/competition", [3, 2, 4, 1]),
+#("introduction-fkkt/hello_world", []),
+("introduction-fkkt/area_of_a_triangle", [3,4]),
+("lists_and_if-fkkt/is_palindrome", ["kisik"]),
+("for-fkkt/sum_and_average", [[23,42,87,34,-1]]),
+("for-fkkt/star_tree", [3]),
+("lists_and_if-fkkt/temp_converter", ['32\nK\n']),
+("for-fkkt/sum_to_n", [7]),
+("while_and_if/checking_account", [10,-100,1000,-10000]),
+("introduction-fkkt/molar_mass", [2]),
+("lists_and_if-fkkt/itm", [165,70]),
+("functions/greatest_negative", [4,-6,2,-1], ["max_neg"]),
+("for-fkkt/star_triangle", [3]),
+("lists_and_if-fkkt/square_equation", [1,2,1]),
+("while_and_if/consumers_anonymous", [10,5,90,1,1,0]),
+("while_and_if/minimax", [2,4,1,0]),
+("functions/greatest_absolutist", [-8,6,2,0], ["max_abs"]),
+("functions/greatest", [-8,6,2,0], ["max_val"])
+]
+
+datasets = [  # NOTE(review): this rebinding replaces the full list above, so only this entry is evaluated
+("functions/greatest_absolutist", [-8,6,2,0], ["max_abs"]),]
+#("while_and_if/consumers_anonymous", [10,5,90,1,1,0])]
+#("while_and_if/buy_five", [5,4,3,2,1]),]
+
+def create_data(path, names, include_dynamic, inputs):
+    """Return (Orange table with one row per 'users' entry of each program, attribute dict)."""
+    programs = main.get_programs(path, names, do_canonicalize=True)
+    attrs = collections.OrderedDict()
+    attrs.update(regex.get_attributes(programs))
+    if include_dynamic:  # dynamic attributes are requested with an empty call string
+        attrs.update(dynamic.get_attributes(programs, "", inputs))
+    # one two-valued ('F'/'T') Orange variable per attribute
+    orange_attrs = [Orange.data.DiscreteVariable(at, values=('F', 'T'))
+                    for at in attrs]
+
+    cl = Orange.data.DiscreteVariable('correct', values=('F', 'T'))
+    mcode = Orange.data.StringVariable('code')
+    orange_domain = Orange.data.Domain(orange_attrs, cl, metas=[mcode])
+    orange_data = Orange.data.Table.from_domain(orange_domain)
+    for program in programs:
+        if not program:  # skip an empty program key
+            continue
+        instance = Orange.data.Instance(orange_domain)
+        for at in attrs:
+            instance[at] = program in attrs[at]['programs']
+        instance[cl] = programs[program]['correct']
+        instance[mcode] = program
+        # the same instance is appended once per entry in the program's 'users'
+        for _ in range(len(programs[program]['users'])):
+            orange_data.append(instance)
+    return orange_data, attrs
+
+def get_coverages(rules, data):
+    """Return the covered fractions (correct, incorrect) of data; 0, 0 when there are no rules."""
+    if not rules:
+        return 0, 0
+    covered_pos = np.zeros(len(data), dtype=bool)
+    covered_neg = np.zeros(len(data), dtype=bool)
+    for rule in rules:
+        # target class 0 is 'F' (incorrect); any other target is pooled as correct
+        mask = covered_neg if rule.target_class == 0 else covered_pos
+        mask |= rule.covered_examples  # in-place OR updates the chosen array
+    n_pos = data.Y.sum()
+    frac_pos = (covered_pos & (data.Y == 1)).sum() / n_pos
+    frac_neg = (covered_neg & (data.Y == 0)).sum() / (len(data) - n_pos)
+    return frac_pos, frac_neg
+
+
+def learn_and_write(filename, learner_pos, learner_all, data):
+    """Fit both rule learners on data and write their rules and coverages to output_path/filename."""
+    # output_path is a module-level name; the __main__ loop assigns it before calling this function
+    titled_rules = [
+        ("Only positive values of attributes:", learner_pos(data).rule_list),
+        ("All values (positive and negative) of attributes:", learner_all(data).rule_list)]
+    coverages = [get_coverages(rules, data) for _, rules in titled_rules]
+    with open(os.path.join(output_path, filename), "wt") as f:
+        for idx, ((title, rules), (cov_corr, cov_inc)) in enumerate(zip(titled_rules, coverages)):
+            if idx:
+                print(file=f)  # blank line between the two sections
+            print(title, file=f)
+            for rule in rules:
+                covered_idx = np.where(rule.covered_examples==1)[0]
+                print(rule, rule.curr_class_dist, rule.quality, covered_idx, file=f)
+            print("Coverage incorrect: ", cov_inc, file=f)
+            print("Coverage correct: ", cov_corr, file=f)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Get patterns from student programs.')
+    parser.add_argument('path', help='path to data directory')
+    parser.add_argument('output_path', help='path to output data directory')
+
+    args = parser.parse_args()
+    path = args.path.rstrip('/')
+    # the two rule learners are used on their own by learn_and_write and also cross-validated below
+    rule_learner_positive = RL4TFull(parent_alpha=0.05, threshold=0.9)
+    rule_learner_all = RL4TFull(parent_alpha=0.05, threshold=0.9, positive_only=False)
+    learners = [
+        rule_learner_positive,
+        rule_learner_all,
+        Orange.classification.TreeLearner(),
+        Orange.classification.RandomForestLearner(n_estimators=100),
+        Orange.classification.MajorityLearner()]
+    # per dataset entry: build both tables, save them, learn rules, then cross-validate
+    for d in datasets:
+        print(d)
+
+        # build two tables: regex attributes only, and regex + dynamic attributes (inputs as strings)
+        names = d[2] if len(d)==3 else None
+        problem_path = os.path.join(path, d[0])
+        data, attrs = create_data(problem_path, names, False, [])
+        data_dyn, attrs_dyn = create_data(problem_path, names, True, [str(v) for v in d[1]])
+
+        # save both tables and their "name: description" attribute listings in the output directory
+        output_path = os.path.join(args.output_path, d[0])  # module-level; learn_and_write reads it as a global
+        os.makedirs(output_path, exist_ok=True)
+        data.save(os.path.join(output_path, "regex.tab"))
+        data_dyn.save(os.path.join(output_path, "regex_dynamic.tab"))
+        with open(os.path.join(output_path, "regex_attributes.txt"), "wt") as fatt:
+            for at in attrs:
+                fatt.write("{}: {}\n".format(at, str(attrs[at]["desc"]).replace('\n',' ')))
+        with open(os.path.join(output_path, "both_attributes.txt"), "wt") as fatt:
+            for at in attrs_dyn:
+                fatt.write("{}: {}\n".format(at, str(attrs_dyn[at]["desc"]).replace('\n',' ')))
+
+        # learn rules on each table (regex only, then regex + dynamic) and write them to text files
+        learn_and_write("rules_regex.txt", rule_learner_positive, rule_learner_all, data)
+        learn_and_write("rules_both.txt", rule_learner_positive, rule_learner_all, data_dyn)
+        # 5-fold cross-validation of every learner on each table, with a fixed random_state
+        res = CrossValidation(data, learners, k=5, random_state=0)
+        res_dyn = CrossValidation(data_dyn, learners, k=5, random_state=0)
+        with open(os.path.join(output_path, "results.txt"), "wt") as f:
+            print("Methods: rules(positive only), rules(all values), decision tree, random forest, majority", file=f)
+            print(file=f)
+            print("Without dynamic attributes: ", file=f)
+            print("ca", CA(res), file=f)
+            print("auc", AUC(res), file=f)
+            print("ll", LogLoss(res), file=f)
+            print("precision", Precision(res, target=0), file=f)  # target=0 is class value 'F'
+            print("recall", Recall(res, target=0), file=f)
+            print(file=f)
+            print("With dynamic attributes: ", file=f)
+            print("ca", CA(res_dyn), file=f)
+            print("auc", AUC(res_dyn), file=f)
+            print("ll", LogLoss(res_dyn), file=f)
+            print("precision", Precision(res_dyn, target=0), file=f)
+            print("recall", Recall(res_dyn, target=0), file=f)
+
diff --git a/main.py b/main.py
index 777f770..ecd2b75 100644
--- a/main.py
+++ b/main.py
@@ -66,9 +66,9 @@ if __name__ == '__main__':
     attrs.update(regex.get_attributes(programs))
     attrs.update(dynamic.get_attributes(programs, args.exec, args.inputs))
 
-    print('Attributes:')
-    for attr in attrs:
-        print(attr, attrs[attr]['desc'].to_string(inline=True))
+    #print('Attributes:')
+    #for attr in attrs:
+    #    print(attr, attrs[attr]['desc'].to_string(inline=True))
 
     for program in programs:
         for attr in attrs:
author	Martin Možina <martin.mozina@fri.uni-lj.si>	2018-05-20 17:36:11 +0200
committer	Martin Možina <martin.mozina@fri.uni-lj.si>	2018-05-20 17:36:11 +0200
commit	8723bd7cd2d227634d67f24e9514d88e9f1c73a7 (patch)
tree	5b92bf0360157c7148be13d4e0261bdb2c8ceebd
parent	1dbb57208d2b8163a1c007ad0931f859651fc1c2 (diff)