# CodeQ: an online programming tutor.
# Copyright (C) 2015 UL FRI
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import collections
from itertools import combinations
import pickle
import random
import re
import sys

from nltk import ParentedTree, Tree
from nltk.tgrep import tgrep_positions

from prolog.util import Token, parse as prolog_parse

# construct pattern to match the structure of nodes given by [include]
def pattern(node, include):
    if isinstance(node, Token):
        if node.type == 'NAME':
            return '"{}"'.format(node.val)
        return None

    if any(n is node for n in include):
        if node.label() == 'variable':
            return 'variable <: "{}"'.format(node[0].val)
        return None

    label = node.label()
    subpats = [pattern(child, include) for child in node]

    if label == 'functor':
        return '{} <1 ({})'.format(label, subpats[0])

    pat = None
    if any(subpats):
        if label == 'and':
            if subpats[1]:
                pat = subpats[1]
            if subpats[0]:
                if pat:
                    pat = subpats[0] + ' .. (' + pat + ')'
                else:
                    pat = subpats[0]
        elif label == 'args':
            pat = label
            for i, subpat in enumerate(subpats):
                if subpat:
                    pat += ' <{} ({})'.format(i+1, subpat)
        elif label == 'binop':
            pat = label
            if subpats[0]:
                pat += ' <1 ({})'.format(subpats[0])
            pat += ' <2 ("{}")'.format(node[1].val)
            if subpats[2]:
                pat += ' <3 ({})'.format(subpats[2])
        elif label == 'clause':
            pat = label
            for i, subpat in enumerate(subpats):
                if subpat:
                    pat += ' {} ({})'.format('<1' if i == 0 else '<<', subpats[i])
        elif label == 'compound':
            if len(subpats) > 1 and subpats[1]:
                pat = label
                for i, subpat in enumerate(subpats):
                    pat += ' <{} ({})'.format(i+1, subpat)
            else:
                return None
        elif label == 'head':
            pat = label
            pat += ' <1 ({})'.format(subpats[0])
        if not pat:
            for s in subpats:
                if s:
                    pat = s
                    break
    return pat

# construct a pattern connecting (variable) node [a] to [b]
def connect(a, b):
    path_a = []
    node = a
    while node.parent():
        node = node.parent()
        path_a.insert(0, node)

    path_b = []
    node = b
    while node.parent():
        node = node.parent()
        path_b.insert(0, node)

    common_ancestor = None
    for i in range(min(len(path_a), len(path_b))):
        if path_a[i] is not path_b[i]:
            break
        if path_a[i].label() in {'compound', 'head'}:
            break
        common_ancestor = path_a[i]

    def node_label(node):
        if node.label() == 'compound':
            right = node[1]
            nargs = 1
            while right._label == "args" and len(right) == 2:
                right = right[1]
                nargs += 1
            return 'compound <1 (functor <: "{}{}")'.format(node[0][0].val, nargs)
        if node.label() == 'binop':
            return 'binop <2 "{}"'.format(node[1].val)
        return node.label()
    i = 0
    while path_a[i].label() != 'clause':
        i += 1

    # path from top to common ancestor
    pat = path_a[i].label()
    i += 1
    n_top = 0
    while i < min(len(path_a), len(path_b)) and path_a[i] is path_b[i]:
        node = path_a[i]
        i += 1
        if node.label() == 'and':
            continue
        if node.parent().label() == 'and':
            op = '<+(and)'
        else:
            op = '<{}'.format(node.parent_index()+1)
        pat += ' {} ({}'.format(op, node_label(node))
        n_top += 1

    path_a = path_a[i:]
    path_b = path_b[i:]

    # path from common ancestor to [a]
    n_a = 0
    for node in path_a:
        if node.label() == 'and':
            continue
        op = '<'
        if node.parent().label() == 'and':
            op = '<+(and)'
        elif node.parent_index() is not None:
            op = '<{}'.format(node.parent_index()+1)
        pat += ' {} ({}'.format(op, node_label(node))
        n_a += 1
    pat += ' <{} ({} <: "{}")'.format(a.parent_index()+1, a.label(), a[0].val)
    pat += ')' * n_a

    # path from common ancestor to [b]
    n_b = 0
    for node in path_b:
        if node.label() == 'and':
            continue
        op = '<'
        if node.parent().label() == 'and':
            op = '<+(and)'
        elif node.parent_index() is not None:
            op = '<{}'.format(node.parent_index()+1)
        pat += ' {} ({}'.format(op, node_label(node))
        n_b += 1
    pat += ' <{} ({} <: "{}")'.format(b.parent_index()+1, b.label(), b[0].val)

    pat += ')' * (n_top + n_b)
    return pat

# replace variable names with patterns and backlinks
def postprocess(pattern):
    macros = '@ VAR /[A-Z]/; '
    #m = re.search(r'"[A-Z][^"]*_[0-9]+[^"]*"', pattern)
    m = re.search(r'"[A-Z]_[0-9]+"', pattern)
    nvars = 0
    while m is not None:
        orig_name = m.group(0)
        n = orig_name.strip('"').split("_")[1]
        pat_name = 'v{}'.format(nvars)
        nvars += 1
        pattern = pattern[:m.start()] + '@VAR={}{}'.format(pat_name, n) + pattern[m.end():]
        for m in re.finditer(orig_name, pattern):
            pattern = pattern[:m.start()] + '~{}{}'.format(pat_name, n) + pattern[m.end():]
        m = re.search(r'"([A-Z]*_[0-9]+)"', pattern)
    return macros + pattern

def postprocess_simple(pattern):
    pattern = postprocess(pattern)
    if pattern.startswith("@ VAR /[A-Z]/; clause <2 (or"):
        return None
    #relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|\(variable.*?\))', pattern)
    relevant = re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', pattern)
    
    # elements
    elements = []
    current = ""
    i = 0
    par = 0
    while i < len(pattern):
        if par > 0 and pattern[i] == ")":
            par -= 1
            if par == 0:
                elements.append(current)
                current = ""
        if par > 0 and pattern[i] == "(":
            par += 1
        if par == 0 and \
                (pattern[i:].startswith("(head") or 
                 pattern[i:].startswith("(compound") or
                 pattern[i:].startswith("(binop")):
            par = 1
        if par > 0:
            current += pattern[i]
        i += 1
    # simplify variable
    for ei, e in enumerate(elements):
    #    #elements[ei] = re.sub("\(variable.*?\)", "(variable)", e)
        elements[ei] = "("+" ".join(re.findall('(head|\(functor.*?\)|\(binop.*?".*?"|args|variable|literal)', e))+")"
    elements = sorted(elements)
    #print(pattern)
    #print(relevant)
    #return "_".join(relevant)#pattern
    return " ".join(elements)#pattern

# construct pattern to match the structure of nodes given by [include],
# supports variables and literals
def pattern2(node, include):
    if isinstance(node, Token):
        return None

    label = node.label()
    if any(n is node for n in include):
        if label == 'literal':
            return '"{}"'.format(node[0].val)
        if label == 'variable':
            return '{}'.format(label)
        return None
    if label == 'functor':
        return '({} "{}")'.format(label, node[0].val)

    subpats = [pattern2(child, include) for child in node]
    pat = None
    if any(subpats):
        if label == 'and':
            if subpats[1]:
                pat = subpats[1]
            if subpats[0]:
                if pat:
                    pat = subpats[0] + ' ' + pat
                else:
                    pat = subpats[0]
        elif label == 'args':
            pat = label
            for i, subpat in enumerate(subpats):
                if subpat:
                    pat += ' {}'.format(subpat)
            pat = '(' + pat + ')'
        elif label == 'unop':
            pat = '(' + label + ' ' + node[0].val + ' ' + subpats[1] + ')'
        elif label == 'binop':
            pat = label
            if subpats[0]:
                pat += ' {}'.format(subpats[0])
            pat += ' "{}"'.format(node[1].val)
            if subpats[2]:
                pat += ' {}'.format(subpats[2])
            pat = '(' + pat + ')'
        elif label == 'clause':
            pat = label
            for i, subpat in enumerate(subpats):
                if subpat:
                    pat += ' {}'.format(subpats[i])
            return '(' + pat + ')'
        elif label == 'compound':
            if len(subpats) > 1 and subpats[1]:
                pat = label
                for i, subpat in enumerate(subpats):
                    pat += ' {}'.format(subpat)
                pat = '(' + pat + ')'
            else:
                return None
        elif label == 'head':
            pat = label
            pat += ' {}'.format(subpats[0])
            pat = '(' + pat + ')'
        elif label == 'list':
            pat = 'list '
            if subpats[0]:
                pat += '(h {})'.format(subpats[0])
            if subpats[0] and subpats[1]:
                pat += ' '
            if subpats[1]:
                pat += '(t {})'.format(subpats[1])
            pat = '(' + pat + ')'
        if not pat:
            for s in subpats:
                if s:
                    pat = s
                    break
    return pat

def get_patterns(tree):
    orig = tree
    if isinstance(tree, str):
        tree = prolog_parse(tree)
        if tree is None:
            return
    tree = ParentedTree.convert(tree)

    # get patterns separately for each clause
    for clause in tree:
        all_variables = []
        variables = collections.defaultdict(list)
        def walk(node):
            if isinstance(node, Tree):
                if node.label() == 'variable':
                    name = node[0].val
                    variables[name].append(node)
                    all_variables.append(node)
                else:
                    for child in node:
                        walk(child)
        walk(clause)

        # connecting pairs of nodes with same variable
        for var, nodes in variables.items():
            for selected in combinations(nodes, 2):
                pat = pattern2(clause, selected)
                if pat:
                    print(pat)
                    yield pat, selected
                #pat = connect(*selected)
                #if pat:
                #    pp = postprocess_simple(pat)
                #    if pp:
                #        yield pp, selected

        # add singletons
        for var, nodes in variables.items():
            if len(nodes) == 1: 
                pat = pattern2(clause, nodes)
                if pat:
                    print(pat)
                    yield pat, nodes
                #pat = pattern(tree, var)
                #if pat:
                #    pp = postprocess_simple(pat)
                #    if pp:
                #        yield pp, nodes
        
        # add patterns for literal/variable pairs
        literals = [node for node in clause.subtrees() if node.label() == 'literal']
        for literal in literals:
            top = literal
            while top != clause and top.parent().label() in {'compound', 'binop', 'unop', 'args', 'list'}:
                top = top.parent()
            variables = [node for node in top.subtrees() if node.label() == 'variable']
            for var in variables:
                pat = pattern2(clause, [var, literal])
                if pat:
                    yield pat, [var, literal]

#        # connecting pairs of nodes with variables
#        for selected in combinations(all_variables, 2):
#            pat = connect(selected[0], selected[1])
#            if pat:
#                yield postprocess(pat), selected

#        # using pairs of nodes with variables
#        for selected in combinations(all_variables, 2):
#            pat = pattern(tree, selected)
#            if pat:
#                yield postprocess(pat), selected

#        # using pairs of nodes with same variable
#        for var, nodes in variables.items():
#            for selected in combinations(nodes, 2):
#                pat = pattern(tree, selected)
#                if pat:
#                    yield postprocess(pat), selected

#       # using each variable separately
#       for var, nodes in variables.items():
#           pat = pattern(tree, nodes)
#           if pat:
#               yield postprocess(pat), nodes

#       # using each goal to select variables FIXME
#       goals = [s for s in tree.subtrees() if s.label() in {'compound', 'binop'}]
#       for goal in goals:
#           goal_vars = {n.val for n in goal.leaves() if n.type == 'VARIABLE'}
#           pat = pattern(tree, goal_vars)
#           if pat:
#               yield postprocess(pat)

# nltk.tgrep does not play nice with non-Tree leaves
def _tgrep_prepare(tree):
    if not isinstance(tree, ParentedTree):
        tree = ParentedTree.convert(tree)
    def prepare(node):
        if isinstance(node, Token) or isinstance(node, str):
            return ParentedTree(str(node), [])
        return ParentedTree(node.label(), [prepare(child) for child in node])
    return prepare(tree)

def find_motif(tree, motif):
    tree = _tgrep_prepare(tree)
    return tgrep_positions(motif, [tree])

# Extract edits and other data from existing traces for each problem.
# Run with: python3 -m monkey.patterns <problem ID> <solutions.pickle>
if __name__ == '__main__':
    # Ignore traces from these users.
    ignored_users = [
        1, # admin
        231, # test
        360, # test2
        358, # sasha
    ]

    pid = int(sys.argv[1])
    name = sys.argv[2]
    submissions = pickle.load(open('pickle/programs-{}.pickle'.format(pid), 'rb'))

    print('Analyzing programs for problem {}…'.format(pid))
    ndata = {
        'train': [],
        'test': []
    }
    # get all users
    users = set()
    for code, data in submissions.items():
        users |= data['users']
    users = list(users)
    users.sort()
    random.Random(0).shuffle(users)
    split = int(len(users)*0.7)
    learn_users = set(users[:split])
    test_users = set(users[split:])
    print(test_users)

    for code, data in submissions.items():
        if len(code) > 1000 or prolog_parse(code) is None:
            continue
        if name not in code:
            continue
        ndata['train'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & learn_users)
        ndata['test'] += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'] & test_users)
        #programs += [(code, data['n_tests'] == data['n_passed'])] * len(data['users'])

    print('correct: {} ({} unique)'.format(
        len([code for code, correct in ndata['train'] if correct]),
        len({code for code, correct in ndata['train'] if correct})))
    print('incorrect: {} ({} unique)'.format(
        len([code for code, correct in ndata['train'] if not correct]),
        len({code for code, correct in ndata['train'] if not correct})))

    #iprograms.sort()
    #random.Random(0).shuffle(programs)
    #split = int(len(programs)*0.7)
    #data = {
    #    'train': programs[:split],
    #    'test': programs[split:],
    #}
    data = ndata
    
    # extract attributes from training data
    patterns = collections.Counter()
    for code, correct in data['train']:
        for pat, nodes in get_patterns(code):
            patterns[pat] += 1

    attrs = []
    with open('data/attributes-{:03}.tab'.format(pid), 'w') as pattern_file:
        for i, (pat, count) in enumerate(patterns.most_common()):
            if count < 1:
                break
            attrs.append(pat)
            print('a{}\t{}'.format(i, pat), file=pattern_file)

    # check and write attributes for training/test data
    for t in ['train', 'test']:
        with open('data/programs-{:03}-{}.tab'.format(pid, t), 'w') as f:
            # print header
            print('\t'.join(['code', 'correct'] + ['a'+str(i) for i in range(len(attrs))]), file=f)
            print('\t'.join(['d'] * (len(attrs)+2)), file=f)
            print('meta\tclass', file=f)
            for code, correct in data[t]:
                record = '{}\t{}'.format(repr(code), 'T' if correct else 'F')

                ## check attributes by using tgrep to find patterns
                #tree = _tgrep_prepare(prolog_parse(code))
                #for pat in attrs:
                #    matches = list(tgrep_positions(pat, [tree]))
                #    record += '\t{}'.format(bool(matches[0]))

                # check attributes by enumerating patterns in code
                code_pats = [pat for pat, nodes in get_patterns(code)]
                for pat in attrs:
                    record += '\t{}'.format('T' if pat in code_pats else 'F')

                print(record, file=f)