"""Learn rules describing correct/incorrect student programs.

For every configured problem the script builds an Orange data table from
the pattern attributes of the submitted programs, saves the table and the
attribute descriptions, learns rule lists, and cross-validates several
learners.  All output is written below the given output directory.
"""

import argparse
import ast
import collections
import os
import pandas
import numpy as np
np.set_printoptions(linewidth=1000)
import sklearn.dummy
import sklearn.ensemble
import sklearn.model_selection
import sklearn.tree

import canonicalize
import dynamic
import regex
import main

import Orange
from Orange.evaluation import CrossValidation, CA, AUC, LogLoss, Precision, Recall
from learning.rules import RL4T, RL4TFull

# Each entry is (problem path relative to the data directory,
#                inputs passed to the dynamic attribute extraction,
#                optional third element forwarded as ``names`` to
#                main.get_programs).
datasets = [
    ("introduction/fahrenheit_to_celsius", [100]),
    ("introduction/ballistics", [45, 100]),
    ("hw-fkkt/hw1", [298, 1.4, 0.028964]),
    ("introduction/pythagorean_theorem", [3, 4]),
    ("while_and_if/buy_five", [5, 4, 3, 2, 1]),
    ("while_and_if/top_shop", [2, 4, 1, 0]),
    ("introduction/average", [2, 4, 5]),
    ("introduction-fkkt/pythagorean_theorem_fkkt", [3, 4]),
    ("introduction-fkkt/what_is_your_name", ["Ana"]),
    ("while_and_if/competition", [3, 2, 4, 1]),
    # ("introduction-fkkt/hello_world", []),
    ("introduction-fkkt/area_of_a_triangle", [3, 4]),
    ("lists_and_if-fkkt/is_palindrome", ["kisik"]),
    ("for-fkkt/sum_and_average", [[23, 42, 87, 34, -1]]),
    ("for-fkkt/star_tree", [3]),
    ("lists_and_if-fkkt/temp_converter", ['32\nK\n']),
    ("for-fkkt/sum_to_n", [7]),
    ("while_and_if/checking_account", [10, -100, 1000, -10000]),
    ("introduction-fkkt/molar_mass", [2]),
    ("lists_and_if-fkkt/itm", [165, 70]),
    ("functions/greatest_negative", [4, -6, 2, -1], ["max_neg"]),
    ("for-fkkt/star_triangle", [3]),
    ("lists_and_if-fkkt/square_equation", [1, 2, 1]),
    ("while_and_if/consumers_anonymous", [10, 5, 90, 1, 1, 0]),
    ("while_and_if/minimax", [2, 4, 1, 0]),
    ("functions/greatest_absolutist", [-8, 6, 2, 0], ["max_abs"]),
    ("functions/greatest", [-8, 6, 2, 0], ["max_val"]),
]

# NOTE(review): this assignment replaces the full list above, so only one
# problem is processed.  It looks like a debugging selection -- confirm
# before removing it.
datasets = [
    ("functions/greatest_absolutist", [-8, 6, 2, 0], ["max_abs"]),
]
# ("while_and_if/consumers_anonymous", [10,5,90,1,1,0])]
# ("while_and_if/buy_five", [5,4,3,2,1]),]


def create_data(path, names, include_dynamic, inputs):
    """Build an Orange table describing the programs found under *path*.

    Every attribute is a binary ('F'/'T') variable telling whether a
    program is listed among the programs of that attribute.  The class
    variable ``correct`` is taken from the program record and the program
    text is stored in the ``code`` meta column.  A program is added once
    per user recorded for it.

    Args:
        path: directory of a single problem, passed to main.get_programs.
        names: forwarded unchanged to main.get_programs.
        include_dynamic: when true, attributes from dynamic.get_attributes
            are added after the regex attributes.
        inputs: forwarded to dynamic.get_attributes; unused otherwise.

    Returns:
        A pair ``(table, attrs)`` with the Orange table and the ordered
        mapping from attribute name to its attribute record.
    """
    problem_name = os.path.basename(path)
    programs = main.get_programs(path, names, do_canonicalize=True)

    attrs = collections.OrderedDict()
    attrs.update(regex.get_attributes(programs))
    if include_dynamic:
        # The meaning of the empty second argument is defined by the
        # dynamic module, which is not visible here.
        attrs.update(dynamic.get_attributes(programs, "", inputs))

    orange_attrs = [
        Orange.data.DiscreteVariable(name, values=('F', 'T'))
        for name in attrs
    ]
    class_var = Orange.data.DiscreteVariable('correct', values=('F', 'T'))
    code_var = Orange.data.StringVariable('code')
    domain = Orange.data.Domain(orange_attrs, class_var, metas=[code_var])
    table = Orange.data.Table.from_domain(domain)

    for program, record in programs.items():
        if not program:
            continue
        instance = Orange.data.Instance(domain)
        for name, attribute in attrs.items():
            instance[name] = program in attribute['programs']
        instance[class_var] = record['correct']
        instance[code_var] = program
        # One row per user, so programs are weighted by how many users
        # are recorded for them.
        for _ in range(len(record['users'])):
            table.append(instance)
    return table, attrs


def _fraction(count, total):
    """Return ``count / total``, or NaN when *total* is zero.

    A plain division of NumPy scalars also yields NaN for 0/0 but emits a
    RuntimeWarning; returning NaN directly keeps the result and drops the
    warning.
    """
    if not total:
        return float('nan')
    return count / total


def get_coverages(rules, data):
    """Return the fraction of each class covered by *rules*.

    Rules whose ``target_class`` is 0 are counted towards the incorrect
    class, all other rules towards the correct class.

    Args:
        rules: rule objects exposing ``target_class`` and a boolean
            ``covered_examples`` mask over *data*.
        data: the table the rules were learned from; ``data.Y`` is
            compared against 1 (correct) and 0 (incorrect).

    Returns:
        ``(correct_coverage, incorrect_coverage)``.  ``(0, 0)`` when there
        are no rules; a coverage is NaN when its class has no examples.
    """
    if not rules:
        return 0, 0

    n_examples = len(data)
    corr = np.zeros(n_examples, dtype=bool)
    inc = np.zeros(n_examples, dtype=bool)
    for rule in rules:
        # Class index 0 corresponds to the value 'F' of the class variable
        # created in create_data (values=('F', 'T')).
        if rule.target_class == 0:
            inc |= rule.covered_examples
        else:
            corr |= rule.covered_examples

    n_correct = data.Y.sum()
    n_incorrect = n_examples - n_correct
    corr_perc = _fraction((corr & (data.Y == 1)).sum(), n_correct)
    inc_perc = _fraction((inc & (data.Y == 0)).sum(), n_incorrect)
    return corr_perc, inc_perc


def _write_rules(f, rules, cov_inc, cov_corr):
    """Write one line per rule followed by the two coverage lines."""
    for r in rules:
        print(r, r.curr_class_dist, r.quality,
              np.where(r.covered_examples == 1)[0], file=f)
    print("Coverage incorrect: ", cov_inc, file=f)
    print("Coverage correct: ", cov_corr, file=f)


def learn_and_write(filename, learner_pos, learner_all, data, output_dir=None):
    """Learn two rule lists on *data* and write them to a text file.

    Args:
        filename: name of the file to create inside the output directory.
        learner_pos: learner whose rules are written under the
            "Only positive values" heading.
        learner_all: learner whose rules are written under the
            "All values" heading.
        data: table passed to both learners and to get_coverages.
        output_dir: directory for the output file.  When omitted, the
            module-level ``output_path`` set by the script body is used,
            which is how this function located its directory before the
            parameter existed.

    Raises:
        NameError: if *output_dir* is omitted and ``output_path`` has not
            been defined (e.g. the module was imported, not run).
    """
    if output_dir is None:
        output_dir = output_path

    rules_pos = learner_pos(data).rule_list
    rules_all = learner_all(data).rule_list
    pos_cov_corr, pos_cov_inc = get_coverages(rules_pos, data)
    all_cov_corr, all_cov_inc = get_coverages(rules_all, data)

    with open(os.path.join(output_dir, filename), "wt") as f:
        print("Only positive values of attributes:", file=f)
        _write_rules(f, rules_pos, pos_cov_inc, pos_cov_corr)
        print(file=f)
        print("All values (positive and negative) of attributes:", file=f)
        _write_rules(f, rules_all, all_cov_inc, all_cov_corr)


def _write_attribute_descriptions(filepath, attrs):
    """Write ``name: description`` lines, one attribute per line."""
    with open(filepath, "wt") as fatt:
        for at in attrs:
            # Newlines inside a description are flattened so that every
            # attribute stays on a single line.
            fatt.write("{}: {}\n".format(
                at, str(attrs[at]["desc"]).replace('\n', ' ')))


def _write_scores(f, heading, results):
    """Write *heading* followed by the evaluation scores of *results*."""
    print(heading, file=f)
    print("ca", CA(results), file=f)
    print("auc", AUC(results), file=f)
    print("ll", LogLoss(results), file=f)
    print("precision", Precision(results, target=0), file=f)
    print("recall", Recall(results, target=0), file=f)


# The script body stays at module level on purpose: the name ``main`` is
# already bound to the imported ``main`` module used by create_data, so a
# ``def main()`` entry point would shadow it.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Get patterns from student programs.')
    parser.add_argument('path', help='path to data directory')
    parser.add_argument('output_path', help='path to output data directory')
    args = parser.parse_args()

    path = args.path.rstrip('/')

    rule_learner_positive = RL4TFull(parent_alpha=0.05, threshold=0.9)
    rule_learner_all = RL4TFull(parent_alpha=0.05, threshold=0.9,
                                positive_only=False)
    # The order must match the "Methods:" line written to results.txt.
    learners = [
        rule_learner_positive,
        rule_learner_all,
        Orange.classification.TreeLearner(),
        Orange.classification.RandomForestLearner(n_estimators=100),
        Orange.classification.MajorityLearner(),
    ]

    for d in datasets:
        print(d)

        # create orange data
        names = d[2] if len(d) == 3 else None
        problem_path = os.path.join(path, d[0])
        data, attrs = create_data(problem_path, names, False, [])
        data_dyn, attrs_dyn = create_data(
            problem_path, names, True, [str(v) for v in d[1]])

        # save data
        output_path = os.path.join(args.output_path, d[0])
        os.makedirs(output_path, exist_ok=True)
        data.save(os.path.join(output_path, "regex.tab"))
        data_dyn.save(os.path.join(output_path, "regex_dynamic.tab"))
        _write_attribute_descriptions(
            os.path.join(output_path, "regex_attributes.txt"), attrs)
        _write_attribute_descriptions(
            os.path.join(output_path, "both_attributes.txt"), attrs_dyn)

        # learn rules
        learn_and_write("rules_regex.txt", rule_learner_positive,
                        rule_learner_all, data, output_path)
        learn_and_write("rules_both.txt", rule_learner_positive,
                        rule_learner_all, data_dyn, output_path)

        res = CrossValidation(data, learners, k=5, random_state=0)
        res_dyn = CrossValidation(data_dyn, learners, k=5, random_state=0)
        with open(os.path.join(output_path, "results.txt"), "wt") as f:
            print("Methods: rules(positive only), rules(all values), "
                  "decision tree, random forest, majority", file=f)
            print(file=f)
            _write_scores(f, "Without dynamic attributes: ", res)
            print(file=f)
            _write_scores(f, "With dynamic attributes: ", res_dyn)