From 651e2be4480b19ac486cb8a4dd2fb08b448ebc67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Mo=C5=BEina?=
Date: Tue, 17 Jan 2017 20:12:29 +0100
Subject: Added scripts for learning rules.

---
Review note: the file bodies below were revised in review (line counts are
unchanged); the index blob ids still name the pre-review contents.

 abml/evaluate.py     |  75 ++++++++++++++++++++++
 abml/learn_dist.py   |  15 +++++
 abml/rules_prolog.py | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 263 insertions(+)
 create mode 100644 abml/evaluate.py
 create mode 100644 abml/learn_dist.py
 create mode 100644 abml/rules_prolog.py

diff --git a/abml/evaluate.py b/abml/evaluate.py
new file mode 100644
index 0000000..2e318fb
--- /dev/null
+++ b/abml/evaluate.py
@@ -0,0 +1,75 @@
+"""Learn rules for a Prolog predicate and score them against other learners."""
+
+import pickle
+import argparse
+import Orange
+from Orange.evaluation import TestOnTestData, CA, AUC, LogLoss
+import abml.rules_prolog as rp
+
+import orangecontrib.evcrules.logistic as logistic
+import orangecontrib.abml.abrules as rules
+import orangecontrib.abml.argumentation as arg
+
+
+def accuracy(model, table):
+    """Return the share of rows in `table` that `model` labels correctly."""
+    predictions = model(table)
+    return sum(p == y for p, y in zip(predictions, table.Y)) / len(table)
+
+
+parser = argparse.ArgumentParser(description='Learn and test rules for prolog programs.')
+parser.add_argument('Name', type=str, help='Predicate name.')
+args = parser.parse_args()
+name = args.Name
+
+# load data
+data = Orange.data.Table('data/{}/programs-train'.format(name))
+testdata = Orange.data.Table('data/{}/programs-test'.format(name))
+
+# create learner; Rules4Prolog loads data/<Name>/evds.pickle, which
+# learn_dist.py writes, so that script has to be run first
+rule_learner = rp.Rules4Prolog(name, 0.9)
+
+# learn a classifier
+classifier = rule_learner(data)
+
+# save model
+with open("data/{}/model.txt".format(name), "wt") as fmodel:
+    for r in classifier.rule_list:
+        print(r, r.curr_class_dist, r.quality)
+        fmodel.write("{} dist={} quality={}\n".format(str(r), str(r.curr_class_dist), r.quality))
+
+# accuracy of model
+print("Accuracy on test data: ", accuracy(classifier, testdata))
+print("Accuracy on train data: ", accuracy(classifier, data))
+
+# test model + other methods; the local is named logreg so that it does
+# not rebind the `logistic` module imported above
+bayes = Orange.classification.NaiveBayesLearner()
+logreg = Orange.classification.LogisticRegressionLearner()
+tree = Orange.classification.TreeLearner()
+random_forest = Orange.classification.RandomForestLearner()
+svm = Orange.classification.SVMLearner()
+cn2 = Orange.classification.rules.CN2UnorderedLearner()
+learners = [rule_learner, logreg, bayes, cn2, tree, random_forest, svm]
+res = TestOnTestData(data, testdata, learners)
+ca = CA(res)
+auc = AUC(res)
+ll = LogLoss(res)
+
+# one row per learner; names must follow the order of `learners`
+names = ['logrules', 'logistic', 'naive-bayes', 'cn2', 'tree', 'random-forest', 'svm']
+rows = ["CA\tAUC\tLogLoss\tMethod\n"]
+for ni, n in enumerate(names):
+    rows.append("{}\t{}\t{}\t{}\n".format(ca[ni], auc[ni], ll[ni], n))
+scores = "".join(rows)
+print(scores)
+with open("data/{}/scores.txt".format(name), "wt") as fscores:
+    fscores.write(scores)
+
+# rules from best to worst; sorted() leaves classifier.rule_list untouched
+all_rules = sorted(classifier.rule_list, key=lambda r: r.quality, reverse=True)
+with open("data/{}/rules.txt".format(name), "wt") as rfile:
+    for r in all_rules:
+        print(r, r.curr_class_dist, r.quality)
+        rfile.write("{} {} {}\n".format(r, r.curr_class_dist, r.quality))
diff --git a/abml/learn_dist.py b/abml/learn_dist.py
new file mode 100644
index 0000000..58e4968
--- /dev/null
+++ b/abml/learn_dist.py
@@ -0,0 +1,15 @@
+import pickle
+import argparse
+from Orange.data import Table
+import abml.rules_prolog as rp
+
+parser = argparse.ArgumentParser(description='Learn and test rules for prolog programs.')
+parser.add_argument('Name', type=str, help='Predicate name.')
+name = parser.parse_args().Name
+
+data = Table('data/{}/programs-train'.format(name))
+# evds=False: the pickle that this script writes may not exist yet
+rule_learner = rp.create_learner(name, evds=False)
+rule_learner.calculate_evds(data)
+with open("data/{}/evds.pickle".format(name), "wb") as fevds:
+    pickle.dump(rule_learner.evds, fevds)
diff --git a/abml/rules_prolog.py b/abml/rules_prolog.py
new file mode 100644
index 0000000..c5c4134
--- /dev/null
+++ b/abml/rules_prolog.py
@@ -0,0 +1,173 @@
+"""Rule learning for Prolog programs: validators, classifier and learners."""
+import numpy as np
+import pickle
+import itertools
+from Orange.classification.rules import _RuleClassifier, GuardianValidator
+import orangecontrib.abml.abrules as rules
+from Orange.classification.rules import Rule
+
+class TrueCondValidator:
+    """
+    Checks whether all conditions have positive values: a rule with a
+    selector `att != T` or `att == F` is rejected, any other rule is handed
+    to Orange's GuardianValidator.
+    """
+    def __init__(self, max_rule_length, min_covered_examples):
+        self.max_rule_length = max_rule_length
+        self.min_covered_examples = min_covered_examples
+        self.guardian = GuardianValidator(self.max_rule_length, self.min_covered_examples)
+
+    def validate_rule(self, rule):
+        for att, op, val in rule.selectors:
+            if op not in ("!=", "=="):
+                continue  # other operators are not inspected here
+            negated = "T" if op == "!=" else "F"
+            if rule.domain[att].values[int(val)] == negated:
+                return False
+        return self.guardian.validate_rule(rule)
+
+class PureAccuracyValidator:
+    """
+    Accepts a rule for the class `negative` only if it is pure (all covered
+    examples belong to its target class) or its quality is not below
+    `threshold`. Rules for other classes are always accepted.
+    """
+    def __init__(self, negative, threshold):
+        self.negative = negative
+        self.threshold = threshold
+
+    def validate_rule(self, rule):
+        if rule.target_class != self.negative:
+            return True
+        dist = rule.curr_class_dist
+        pure = dist[rule.target_class] == dist.sum()
+        return bool(pure or not rule.quality < self.threshold)
+
+class RelativePureValidator:
+    """
+    Accepts a rule for the class `target` only if, among the examples it
+    covers that are not marked in `covered`, the share of `target` examples
+    is not below `threshold`. `covered` is a boolean mask over the learning
+    examples and `Y` holds their integer class indices.
+    """
+    def __init__(self, target, threshold, covered, Y):
+        self.target = target
+        self.threshold = threshold
+        self.covered = covered
+        self.Y = Y
+
+    def validate_rule(self, rule):
+        if rule.target_class != self.target:
+            return True
+        rel_covered = rule.covered_examples & ~self.covered
+        n_rel = rel_covered.sum()
+        if n_rel == 0:
+            # NOTE(review): the old 0/0 gave nan and accepted too - intended?
+            return True
+        # count matching labels (summing them is only right for class 1)
+        rf = np.count_nonzero(self.Y[rel_covered] == rule.target_class) / n_rel
+        return not rf < self.threshold
+
+class NegativeFirstClassifier(_RuleClassifier):
+    """
+    Classifier from rules that first checks if a negative rule covers
+    an example. If it does, it will automatically classify example as negative.
+    If it doesn't, then it checks for positive rules and assigns this example
+    the quality of the first rule in rule_list that covers it. Examples that
+    no rule covers get a uniform class distribution.
+    """
+    def __init__(self, domain, rule_list):
+        self.domain = domain
+        self.rule_list = rule_list
+        self.num_classes = len(self.domain.class_var.values)
+        self.negative = self.domain.class_var.values.index("F")
+
+    def coverage(self, data):
+        """Return a bool matrix (examples x rules), True where a rule covers an example."""
+        self.X = data.X  # predict() stores its input here as well
+        coverages = np.zeros((self.X.shape[0], len(self.rule_list)), dtype=bool)
+        for ri, r in enumerate(self.rule_list):
+            coverages[:, ri] = r.evaluate_data(self.X)
+        return coverages
+
+    def predict(self, X):
+        """Return class probabilities for the rows of X, one column per class."""
+        self.X = X
+        probabilities = np.zeros((X.shape[0], self.num_classes), dtype=float)
+        # negative rules first
+        solved = np.zeros(X.shape[0], dtype=bool)
+        for rule in self.rule_list:
+            if rule.target_class == self.negative:
+                solved |= rule.evaluate_data(X)
+        probabilities[solved, self.negative] = 1.0
+        # now positive class
+        for rule in self.rule_list:
+            if rule.target_class == self.negative:
+                continue
+            covered = rule.evaluate_data(X)
+            # full rows; the old pair of boolean masks only worked for 2 classes
+            row = np.full(self.num_classes, (1 - rule.quality) / (self.num_classes - 1))
+            row[rule.target_class] = rule.quality
+            probabilities[covered & ~solved] = row
+            solved |= covered
+
+        probabilities[~solved] = np.ones(self.num_classes) / self.num_classes
+        return probabilities
+
+class Rules4Prolog:
+    """
+    Learner that first learns rules for the negative class "F", then rules
+    for "T" that are judged only on the examples the negative rules do not
+    cover, and returns a NegativeFirstClassifier built from both rule sets.
+    """
+    def __init__(self, name, threshold):
+        self.threshold = threshold
+        self.learner = create_learner(name, evds=True, threshold=threshold)
+
+    def __call__(self, data):
+        # first learn rules for negative class (quality should be higher than
+        # threshold or distribution should be pure)
+        self.learner.target_class = "F"
+        neg_rules = self.learner(data).rule_list
+
+        # mark all examples that the negative rules cover
+        coverage = np.zeros(len(data), dtype=bool)
+        for r in neg_rules:
+            coverage |= r.covered_examples
+
+        # learn positive rules, however accept them only if their relative
+        # frequency on the not yet covered examples is not below threshold
+        # (class index 1 is assumed to be "T" - TODO confirm)
+        old_validator = self.learner.rule_validator
+        self.learner.target_class = "T"
+        self.learner.rule_validator = RelativePureValidator(
+            1, self.threshold, coverage, data.Y.astype(dtype=int))
+        try:
+            cls = self.learner(data)
+        finally:
+            # restore old validator to self.learner, also when learning fails
+            self.learner.rule_validator = old_validator
+        # NOTE: the disabled sub-rule expansion that sat here was dropped
+        return self.learner.classifier(domain=cls.domain, rule_list=neg_rules + cls.rule_list)
+
+
+def create_learner(name, evds=True, threshold=0.9):
+    """
+    Build the ABRuleLearner used by Rules4Prolog and by learn_dist.py.
+    name      -- predicate name; evds are read from data/<name>/evds.pickle
+    evds      -- load the pickled evds (False when they are yet to be computed)
+    threshold -- quality a non-pure negative rule must reach; 0.9 is the value
+                 evaluate.py passes to Rules4Prolog
+    """
+    rule_learner = rules.ABRuleLearner(width=50, parent_alpha=0.05)
+    guardian = rule_learner.rule_finder.general_validator
+    rule_learner.rule_finder.general_validator = TrueCondValidator(
+        guardian.max_rule_length, guardian.min_covered_examples)
+    # class index 0 is assumed to be the negative class "F" - TODO confirm
+    rule_learner.rule_validator = PureAccuracyValidator(0, threshold)
+    rule_learner.classifier = NegativeFirstClassifier
+    if evds:
+        # pickle can run code on load: only read files produced locally
+        with open("data/{}/evds.pickle".format(name), "rb") as fevds:
+            rule_learner.evds = pickle.load(fevds)
+    return rule_learner
-- 
cgit v1.2.1