From 420d2e988ef93e0117a006287cf68c6e107286f7 Mon Sep 17 00:00:00 2001 From: Timotej Lazar Date: Sun, 4 Feb 2018 12:30:43 +0100 Subject: Add introduction --- aied2018/aied2018.bib | 122 +++++++++++++++++++++++++++++++++++++++------- aied2018/aied2018.tex | 2 +- aied2018/introduction.tex | 19 ++++++++ aied2018/rules.tex | 1 + 4 files changed, 125 insertions(+), 19 deletions(-) create mode 100644 aied2018/introduction.tex diff --git a/aied2018/aied2018.bib b/aied2018/aied2018.bib index 207b11f..55d6498 100644 --- a/aied2018/aied2018.bib +++ b/aied2018/aied2018.bib @@ -1,12 +1,12 @@ -@article{Mozina:2007, - author = {Martin Mo\v{z}ina and Jure \v{Z}abkar and Ivan Bratko}, - title = {Argument-Based Machine Learning}, - journal = {Artificial Intelligence}, - volume = {171}, - number = {10/15}, - year = {2007}, - pages = {922-937} - } +@inproceedings{clarkECML1991, + author = "Peter Clark and Robin Boswell", + title = "Rule Induction with {CN}2: Some Recent Improvements", + booktitle = "Machine Learning - Proceeding of the Fifth Europen + Conference (EWSL-91)", + year = "1991", + address = "Berlin", + pages = "151-163" +} @article{demsar2013orange, author = {Janez Demšar and Tomaž Curk and Aleš Erjavec and Črt Gorup and Tomaž Hočevar and Mitar Milutinovič and Martin Možina and Matija Polajnar and Marko Toplak and Anže Starič and Miha Štajdohar and Lan Umek and Lan Žagar and Jure Žbontar and Marinka Žitnik and Blaž Zupan}, @@ -18,13 +18,99 @@ url = {http://jmlr.org/papers/v14/demsar13a.html} } +@article{glassman2015overcode, + title={{OverCode}: Visualizing variation in student solutions to programming problems at scale}, + author={Glassman, Elena L and Scott, Jeremy and Singh, Rishabh and Guo, Philip J and Miller, Robert C}, + journal={ACM Transactions on Computer-Human Interaction (TOCHI)}, + volume={22}, + number={2}, + pages={7}, + year={2015}, + publisher={ACM} +} -@INPROCEEDINGS{clarkECML1991, - AUTHOR = "Peter Clark and Robin Boswell", - TITLE = "Rule Induction with {CN}2: Some Recent Improvements", - BOOKTITLE = "Machine Learning - Proceeding of the Fifth Europen - Conference (EWSL-91)", - YEAR = "1991", - address = "Berlin", - pages = "151-163" -} \ No newline at end of file +@inproceedings{hovemeyer2016control, + title={Control-flow-only abstract syntax trees for analyzing students' programming progress}, + author={Hovemeyer, David and Hellas, Arto and Petersen, Andrew and Spacco, Jaime}, + booktitle={Proceedings of the 2016 ACM Conference on International Computing Education Research}, + pages={63--72}, + year={2016}, + organization={ACM} +} + +@inproceedings{huang2013syntactic, + title = {Syntactic and functional variability of a million code submissions in a machine learning {MOOC}}, + author = {Huang, Jonathan and Piech, Chris and Nguyen, Andy and Guibas, Leonidas}, + booktitle = {Proc. Workshops 16th Int'l Conf. Artificial Intelligence in Education (AIED 13)}, + pages = {25--32}, + year = {2013}, +} + +@inproceedings{jin2012program, + title={Program representation for automatic hint generation for a data-driven novice programming tutor}, + author={Jin, Wei and Barnes, Tiffany and Stamper, John and Eagle, Michael John and Johnson, Matthew W and Lehmann, Lorrie}, + booktitle={Proc. 11th Int'l Conf. Intelligent Tutoring Systems (ITS 12)}, + pages={304--309}, + year={2012}, +} + +@inproceedings{lazar2017automatic, + title={Automatic Extraction of AST Patterns for Debugging Student Programs}, + author={Lazar, Timotej and Možina, Martin and Bratko, Ivan}, + booktitle={International Conference on Artificial Intelligence in Education}, + pages={162--174}, + year={2017}, + organization={Springer} +} + +@inproceedings{levy2006tregex, + title={Tregex and Tsurgeon: tools for querying and manipulating tree data structures}, + author={Levy, Roger and Andrew, Galen}, + year={2006}, + booktitle={5th International Conference on Language Resources and Evaluation (LREC 2006)} +} + +@article{Mozina:2007, + author = {Martin Mo\v{z}ina and Jure \v{Z}abkar and Ivan Bratko}, + title = {Argument-Based Machine Learning}, + journal = {Artificial Intelligence}, + volume = {171}, + number = {10/15}, + year = {2007}, + pages = {922-937} +} + +@inproceedings{nguyen2014codewebs, + title={Codewebs: scalable homework search for massive open online programming courses}, + author={Nguyen, Andy and Piech, Christopher and Huang, Jonathan and Guibas, Leonidas}, + booktitle={Proc. 23rd Int'l World Wide Web Conf. (WWW 14)}, + pages={491--502}, + year={2014}, + COMMENTorganization={ACM}, +} + +@inproceedings{piech2015autonomously, + title={Autonomously generating hints by inferring problem solving policies}, + author={Piech, Chris and Sahami, Mehran and Huang, Jonathan and Guibas, Leonidas}, + booktitle={Proc. 2nd ACM Conference on Learning @ Scale (L@S 2015)}, + pages={195--204}, + year={2015}, + organization={ACM}, +} + +@article{piech2015learning, + title={Learning program embeddings to propagate feedback on student code}, + author={Piech, Chris and Huang, Jonathan and Nguyen, Andy and Phulsuksombati, Mike and Sahami, Mehran and Guibas, Leonidas}, + journal={arXiv preprint arXiv:1505.05969}, + year={2015} +} + +@article{rivers2015data-driven, + title={Data-Driven Hint Generation in Vast Solution Spaces: a Self-Improving {Python} Programming Tutor}, + author={Rivers, Kelly and Koedinger, Kenneth R}, + journal={International Journal of Artificial Intelligence in Education}, + pages={1--28}, + year={2015}, + doi={10.1007/s40593-015-0070-z}, + url={http://dx.doi.org/10.1007/s40593-015-0070-z}, +} diff --git a/aied2018/aied2018.tex b/aied2018/aied2018.tex index cbaae94..a028852 100644 --- a/aied2018/aied2018.tex +++ b/aied2018/aied2018.tex @@ -49,7 +49,7 @@ We propose using tree regular expressions to encode common patterns in programs. \textbf{Keywords:} Learning programming · Educational data analysis · Error diagnosis · Abstract syntax tree · Tree regular expressions \end{abstract} -%\input{introduction} +\input{introduction} %\input{background} \input{patterns} \input{rules} diff --git a/aied2018/introduction.tex b/aied2018/introduction.tex new file mode 100644 index 0000000..9c395a4 --- /dev/null +++ b/aied2018/introduction.tex @@ -0,0 +1,19 @@ +\section{Introduction} + +Providing feedback to students is among the most time-consuming tasks when teaching programming. In large courses with hundreds of students, feedback is therefore often limited to automated program testing. While test cases can reliably determine whether a program is correct or not, they cannot easily be associated with specific errors in the code. + +Prompt, specific feedback could however greatly benefit learning. Furthermore, analyzing the problems students have with programming exercises can allow teachers to improve the course. The main obstacle to such analysis is the large variability of student submissions: even for the simplest tasks, students can submit tens of thousands of different programs~\cite{huang2013syntactic,piech2015autonomously}. + +Several attempts have been made to automatically discover commonalities in a set of programs~\cite{jin2012program,rivers2015data-driven,nguyen2014codewebs,hovemeyer2016control}. This would allow a teacher to annotate a representative subset of submissions with feedback messages, which could then be automatically propagated to similar programs. These techniques are used for instance by the OverCode tool to visualize variations in student programs~\cite{glassman2015overcode}. + +This paper presents a new language for describing patterns in student code. Our approach is based on \emph{tree regular expressions} (TREs) used in natural language processing~\cite{levy2006tregex}. TREs are similar to ordinary regular expressions: they allow us to specify important patterns in a program’s abstract syntax tree (AST) while disregarding irrelevant parts. TREs are sufficiently expressive to represent the various concepts and errors in novice programs. + +We have previously demonstrated this approach with Prolog programs~\cite{lazar2017automatic}. Here we refine the definition of AST patterns, and show that they can be applied to Python -- representing a different programming paradigm -- with only a few language-specific modifications. We also demonstrate that rules learned from such patterns can be easily interpreted. + +Our approach is similar to CodeWebs, in which Nguyen et al. discover functionally equivalent program fragments -- subtrees of the AST that perform the same transformation from inputs to outputs~\cite{nguyen2014codewebs}. Piech et al. learn program embeddings to a similar end~\cite{piech2015learning}. Our work differs mainly in that TREs allow us to describe relations in (potentially disjoint) subparts of the program. We induce rules based on binary correct/incorrect labels and do not execute any program more than once. + +Jin et al. use linkage graphs to represent dependencies between individual program statements based on the variables they contain~\cite{jin2012program}. This allows them to form equivalence classes of programs by ignoring unrelated statements. While TREs can encode such attributes, we use smaller patterns that encode dependencies between pairs of variables. Hovemeyer et al. focus on control structures such as loops and conditionals~\cite{hovemeyer2016control}. Our AST patterns include TREs that describe control flow, expressing the same features. + +When comparing programs we must account for superficial differences such as different naming schemes. Rivers et al. canonicalize student programs using equivalency-preserving transformations (renaming variables, reordering binary expressions, inlining functions and so on)~\cite{rivers2015data-driven}. We use their canonicalization as a preprocessing step before extracting patterns. + +The next section describes TREs and AST patterns, and gives a brief quantitative evaluation to show that patterns can be used to discriminate between correct and incorrect programs. Section~\ref{sec:rules} describes our modified version of the CN2 rule-learning algorithm, and analyzes student solutions to two programming exercises in terms of such rules. The last section outlines the directions for our future work. diff --git a/aied2018/rules.tex b/aied2018/rules.tex index 7821ec4..572d261 100644 --- a/aied2018/rules.tex +++ b/aied2018/rules.tex @@ -1,4 +1,5 @@ \section{Rules} +\label{sec:rules} \subsection{The learning algorithm} The goal of learning rules in this paper is to extract and explain common approaches and mistakes in student programs. We use a rule learner called ABCN2e implemented within the Orange data mining library~\cite{demsar2013orange}. ABCN2e is an improvement of the classical CN2 algorithm~\cite{clarkECML1991} for learning unordered rules. The differences between CN2 and ABCN2e are described in a technical report found at \url{https://ailab.si/abml.} -- cgit v1.2.1