@proceedings{ai4ed-proc,
title = {Proceedings of the AI for Education Workshop at AAAI 2024: Bridging Innovation and Responsibility},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
editor = {Muktha Ananda and Debshila Basu Malick and Jill Burstein and Lydia T. Liu and Zitao Liu and James Sharpnack and Zichao Wang and Serena Wang},
name = {AI for Education Workshop},
shortname = {AI4Ed},
year = {2024},
volume = {257},
start = {2024-02-26},
end = {2024-02-27},
published = {2024-08-09},
address = {Vancouver Convention Center, Vancouver, Canada},
conference_url = {https://ai4ed.cc/workshops/aaai2024},
}
% booktitle: the citation to the volume (e.g., Proceedings of the Twenty-First International Conference on Machine Learning)
% name: the long name of the conference (e.g., International Conference on Machine Learning)
% shortname: the abbreviation for the conference (without the year, e.g., ICML)
% year: the year of the conference (e.g., 2013)
% editor: the editors' names (in "Lastname, Firstnames" format, separated by "and")
% volume: the PMLR number assigned to your conference/workshop
% start: the first day of the conference/workshop in YYYY-MM-DD format
% end: the last day of the conference/workshop in YYYY-MM-DD format
% address: the location of the conference in venue, city, country format
% conference_url: the URL of the conference website
% conference_number: if the conference is part of a series, give the number in the series here
% title: the title of the paper
% author: the paper's authors in "Lastname, Firstnames" format, separated by "and". Do not use Unicode characters; use the LaTeX equivalents.
% pages: the page numbers in "startpage–endpage" format
% abstract: the paper's abstract
% The identifiers should be in the form lastnameYY, where lastname is the lowercase last name of the first author and YY is the last two digits of the year of the conference. If two papers share the same identifier under this scheme, they must be disambiguated by appending one of a, b, c, etc. to the conflicting identifiers (e.g., reid12a and reid12b). Please do not use UTF-8 characters in these names (or the file names); ASCII only.
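% For illustration, a minimal hypothetical entry following the conventions above (identifier
% scheme, author-name format, and field set) is sketched in the commented-out template below;
% the key, names, title, and page numbers are placeholders, not a real paper in this volume:
% @inproceedings{reid24,
% author = {Reid, Alex and Doe, Jamie},
% title = {An Example Paper Title},
% booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
% year = {2024},
% pages = {1-10},
% abstract = {A one-paragraph abstract goes here.}
% }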
@inproceedings{ananda24,
author = {Muktha Ananda and Debshila Basu Malick and Jill Burstein and Lydia T. Liu and Zitao Liu and James Sharpnack and Zichao Wang and Serena Wang},
title = {AI for Education at AAAI 2024: Bridging Innovation and Responsibility},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {1-2},
abstract = {Volume preface}
}
@inproceedings{gorgun24,
author = {Gorgun, Guher and Bulut, Okan},
title = {Current Evaluation Methods are a Bottleneck in Automatic Question Generation},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {3-8},
abstract = {This study provides a comprehensive review of frequently used evaluation methods for assessing the quality of automatic question generation (AQG) systems based on computational linguistics techniques and large language models. As we present a comprehensive overview of the current state of evaluation methods, we discuss the advantages and limitations of each method. Furthermore, we elucidate the next steps for the full integration of automatic question generation systems in educational settings to achieve effective personalization and adaptation.}
}
@inproceedings{krupp24,
author = {Krupp, Lars and Steinert, Steffen and Kiefer-Emmanouilidis, Maximilian and Avila, Karina E and Lukowicz, Paul and Kuhn, Jochen and K{\"u}chemann, Stefan and Karolus, Jakob},
title = {Challenges and Opportunities of Moderating Usage of Large Language Models in Education},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {9-18},
abstract = {The increased presence of large language models (LLMs) in educational settings has ignited debates concerning negative repercussions, including overreliance and inadequate task reflection. Our work advocates moderated usage of such models, designed in a way that supports students and encourages critical thinking. We developed two moderated interaction methods with ChatGPT: hint-based assistance and presenting multiple answer choices. In a study with students (N=40) answering physics questions, we compared the effects of our moderated models against two baseline settings: unmoderated ChatGPT access and internet searches. We analyzed the interaction strategies and found that the moderated versions exhibited less unreflected usage (e.g., copy \& paste) compared to the unmoderated condition. However, neither ChatGPT-supported condition could match the ratio of reflected usage present in internet searches. Our research highlights the potential benefits of moderating language models, showing a research direction toward designing effective AI-supported educational strategies.}
}
@inproceedings{luzan24,
title = {Evaluation of the Instance Weighting Strategy for Transfer Learning of Educational Predictive Models},
author = {Luzan, Mariia and Brooks, Christopher},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {19-28},
abstract = {This work contributes to our understanding of how transfer learning can be used to improve educational predictive models across higher institution units. Specifically, we provide an empirical evaluation of the instance weighting strategy for transfer learning, whereby a model created from a source institution is modified based on the distribution characteristics of the target institution. In this work we demonstrated that this increases overall model goodness-of-fit, increases the goodness-of-fit for each demographic group considered, and reduces disparity between demographic groups when we consider a simulated institutional intervention that can only be deployed to 10\% of the student body.}
}
@inproceedings{silva24,
title = {Exploring the Relationship Between Feature Attribution Methods and Model Performance},
author = {Silva, Priscylla and Silva, Claudio and Nonato, Luis Gustavo},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {29-38},
abstract = {Machine learning and deep learning models are pivotal in educational contexts, particularly in predicting student success. Despite their widespread application, a significant gap persists in comprehending the factors influencing these models' predictions, especially in explainability within education. This work addresses this gap by employing nine distinct explanation methods and conducting a comprehensive analysis to explore the correlation between the agreement among these methods in generating explanations and the predictive model's performance. Applying Spearman's correlation, our findings reveal a very strong correlation between the model's performance and the level of agreement observed among the explanation methods.}
}
@inproceedings{qu24,
title = {Concept Prerequisite Relation Prediction by Using Permutation-Equivariant Directed Graph Neural Networks},
author = {Qu, Xiran and Shang, Xuequn and Zhang, Yupei},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {39-47},
abstract = {This paper studies the problem of CPRP, concept prerequisite relation prediction, which is a fundamental task in using AI for education. CPRP is usually formulated into a link-prediction task on a relationship graph of concepts and solved by training the graph neural network (GNN) model. However, current directed GNNs fail to manage graph isomorphism which refers to the invariance of non-isomorphic graphs, reducing the expressivity of resulting representations. We present a permutation-equivariant directed GNN model by introducing the Weisfeiler-Lehman test into directed GNN learning. Our method is then used for CPRP and evaluated on three public datasets. The experimental results show that our model delivers better prediction performance than the state-of-the-art methods.
}
}
@inproceedings{kim24,
title = {Problem-Solving Guide (PSG): Predicting the Algorithm Tags and Difficulty for Competitive Programming Problems},
author = {Kim, Juntae and Cho, Eunjung and Na, Dongbin},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {48-56},
abstract = {The recent program development industries have required problem-solving abilities for engineers, especially application developers. However, AI-based education systems to help solve computer algorithm problems have not yet attracted attention, while most big tech companies require the ability to solve algorithm problems including Google, Meta, and Amazon. The most useful guide to solving algorithm problems might be guessing the category (tag) of the facing problems. Therefore, our study addresses the task of predicting the algorithm tag as a useful tool for engineers and developers. Moreover, we also consider predicting the difficulty levels of algorithm problems, which can be used as useful guidance to calculate the required time to solve that problem. In this paper, we present a real-world algorithm problem multi-task dataset, AMT, by mainly collecting problem samples from the most famous and large competitive programming website Codeforces. To the best of our knowledge, our proposed dataset is the most large-scale dataset for predicting algorithm tags compared to previous studies. Moreover, our work is the first to address predicting the difficulty levels of algorithm problems. We present a deep learning-based novel method for simultaneously predicting algorithm tags and the difficulty levels of an algorithm problem given.
}
}
@inproceedings{liu24,
title = {LogicPrpBank: A Corpus for Logical Implication and Equivalence},
author = {Liu, Zhexiong and Zhang, Jing and Lu, Jiaying and Ma, Wenjing and Ho, Joyce C.},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {57-65},
abstract = {Logic reasoning has been critically needed in problem-solving and decision-making. Although Language Models (LMs) have demonstrated capabilities of handling multiple reasoning tasks (e.g., commonsense reasoning), their ability to reason complex mathematical problems, specifically propositional logic, remains largely underexplored. This lack of exploration can be attributed to the limited availability of annotated corpora. Here, we present a well-labeled propositional logic corpus, LogicPrpBank, containing 7093 Propositional Logic Statements (PLSs) across six mathematical subjects, to study a brand-new task of reasoning logical implication and equivalence. We benchmark LogicPrpBank with widely-used LMs to show that our corpus offers a useful resource for this challenging task and there is ample room for model improvement.
}
}
@inproceedings{han24,
author = {Han, Zifei F. and Lin, Jionghao and Gurung, Ashish and Thomas, Danielle R and Chen, Eason and Borchers, Conrad and Gupta, Shivang and Koedinger, Kenneth R},
title = {Improving Assessment of Tutoring Practices using Retrieval-Augmented Generation},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {66-76},
abstract = {One-on-one tutoring is an effective instructional method for enhancing learning, yet its efficacy hinges on tutor competencies. Novice math tutors often prioritize content-specific guidance, neglecting aspects such as social-emotional learning. Social-emotional learning promotes equity and inclusion and nurtures relationships with students, which is crucial for holistic student development. Assessing the competencies of tutors accurately and efficiently can drive the development of tailored tutor training programs. However, evaluating novice tutor ability during real-time tutoring remains challenging as it typically requires experts-in-the-loop. To address this challenge, this study harnesses Generative Pre-trained Transformers (GPT), such as GPT-3.5 and GPT-4, to automatically assess tutors’ ability of using social-emotional tutoring strategies. Moreover, this study also reports on the financial dimensions and considerations of employing these models in real-time and at scale for automated assessment. Four prompting strategies were assessed: two basic Zero-shot prompt strategies, Tree of Thought prompting, and Retrieval-Augmented Generator (RAG) prompting. The results indicate that RAG prompting demonstrated the most accurate performance (assessed by the level of hallucination and correctness in the generated assessment texts) and the lowest financial costs. These findings inform the development of personalized tutor training interventions to enhance the educational effectiveness of tutored learning.},
software = {https://tutorevaluation.vercel.app}
}
@inproceedings{kakarla24,
title = {Using Large Language Models to Assess Tutors' Performance in Reacting to Students Making Math Errors},
author = {Kakarla, Sanjit and Thomas, Danielle R and Lin, Jionghao and Gupta, Shivang and Koedinger, Kenneth R},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {77-84},
abstract = {Research suggests that tutors should adopt a strategic approach when addressing math errors made by low-efficacy students. Rather than drawing direct attention to the error, tutors should guide the students to identify and correct their mistakes on their own. While tutor lessons have introduced this pedagogical skill, human evaluation of tutors applying this strategy is arduous and time-consuming. Large language models (LLMs) show promise in providing real-time assessment to tutors during their actual tutoring sessions, yet little is known regarding their accuracy in this context. In this study, we investigate the capacity of generative AI to evaluate real-life tutors’ performance in responding to students making math errors. By analyzing 50 real-life tutoring dialogues, we find both GPT-3.5-Turbo and GPT-4 demonstrate proficiency in assessing the criteria related to reacting to students making errors. However, both models exhibit limitations in recognizing instances where the student made an error. Notably, GPT-4 tends to overidentify instances of students making errors, often attributing student uncertainty or inferring potential errors where human evaluators did not. Future work will focus on enhancing generalizability by assessing a larger dataset of dialogues and evaluating learning transfer. Specifically, we will analyze the performance of tutors in real-life scenarios when responding to students' math errors before and after lesson completion on this crucial tutoring skill.},
}
@inproceedings{lekan24,
author = {Lekan, Kasra and Pardos, Zachary A.},
title = {AI-Augmented Advising: A Comparative Study of GPT-4 and Advisor-based Major Recommendations},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {85-96},
abstract = {Choosing an undergraduate major is an important decision that impacts academic and career outcomes. We investigate using GPT-4, a state-of-the-art large language model (LLM), to augment human advising for major selection. Through a 3-phase survey, we compare GPT suggestions and responses for undeclared Freshmen and Sophomore students (n=33) to expert responses from university advisors (n=25). Undeclared students were first surveyed on their interests and goals. These responses were then given to both campus advisors and to GPT to produce a major recommendation for each student. In the case of GPT, information about the majors offered on campus was added to the prompt. Advisors, overall, rated the recommendations of GPT to be highly helpful and agreed with their recommendations 33\% of the time. Additionally, we observe more agreement with AI major recommendations when advisors see the AI recommendations before making their own. However, this result was not statistically significant. The results provide a first signal as to the viability of LLMs for personalized major recommendation and shed light on the promise and limitations of AI for advising support.}
}
@inproceedings{latif24,
author = {Latif, Ehsan and Zhai, Xiaoming},
title = {Automatic Scoring of Students’ Science Writing Using Hybrid Neural Network},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {97-106},
abstract = {This study explores the efficacy of a multi-perspective hybrid neural network (HNN) for scoring student responses in science education with an analytic rubric. We compared the accuracy of the HNN model with four ML approaches (BERT, ANN, Naive Bayes, and Logistic Regression). The results have shown that HNN achieved 8\%, 3\%, 1\%, and 0.12\% higher accuracy than Naive Bayes, Logistic Regression, ANN, and BERT, respectively, for five scoring aspects (p < 0.001). The overall HNN's perceived accuracy (M = 96.23\%, SD = 1.45\%) is comparable to the (training and inference) expensive BERT model's accuracy (M = 96.12\%, SD = 1.52\%). We also have observed that HNN is x2 more efficient in terms of training and inferencing than BERT and has comparable efficiency to the lightweight but less accurate Naive Bayes model. Our study confirmed the accuracy and efficiency of using HNN to automatically score students' science writing.}
}
@inproceedings{lekshmi-narayanan24,
title = {Explaining Code Examples in Introductory Programming Courses: LLM vs Humans},
author = {Lekshmi-Narayanan, Arun-Balajiee and Oli, Priti and Chapagain, Jeevan and Hassany, Mohammad and Banjade, Rabin and Brusilovsky, Peter and Rus, Vasile},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {107-117},
note = {Preprint: arXiv:2403.05538},
abstract = {Worked examples, which present an explained code for solving typical programming problems, are among the most popular types of learning content in programming classes. Most approaches and tools for presenting these examples to students are based on line-by-line explanations of the example code. However, instructors rarely have time to provide explanations for many examples typically used in a programming class. In this paper, we assess the feasibility of using LLMs to generate code explanations for passive and active example exploration systems. To achieve this goal, we compare the code explanations generated by ChatGPT with the explanations generated by both experts and students.}
}
@inproceedings{oli24,
title = {Automated Assessment of Students' Code Comprehension using LLM},
author = {Oli, Priti and Banjade, Rabin and Chapagain, Jeevan and Rus, Vasile},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {118-128},
abstract = {Assessing students' answers and in particular natural language answers is a crucial challenge in the field of education. Advances in transformer-based models such as Large Language Models (LLMs), have led to significant progress in various natural language tasks. Nevertheless, amidst the growing trend of evaluating LLMs across diverse tasks, evaluating LLMs in the realm of automated answer assessment has not received much attention. To address this gap, we explore the potential of using LLMs for automated assessment of student's short and open-ended answers in program comprehension tasks. Particularly, we use LLMs to compare students' explanations with expert explanations in the context of line-by-line explanations of computer programs. For comparison purposes, we assess both decoder-only Large Language Models (LLMs) and encoder-based Semantic Textual Similarity (STS) models in the context of assessing the correctness of students' explanation of computer code. Our findings indicate that decoder-only LLMs, when prompted in few-shot and chain-of-thought setting perform comparable to fine-tuned encoder-based models in evaluating students' short answers in the programming domain.},
}
%% Day 2
@inproceedings{tercan24,
title = {Synthesizing a Progression of Subtasks for Block-Based Visual Programming Tasks},
author = {Tercan, Alperen and Ghosh, Ahana and Eniser, Hasan Ferit and Christakis, Maria and Singla, Adish},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {129-138},
abstract = {Block-based visual programming environments play an increasingly important role in introducing computing concepts to K-12 students. The open-ended and conceptual nature of these visual programming tasks make them challenging for novice programmers. A natural approach to providing assistance for problem-solving is breaking down a complex task into a progression of simpler subtasks. However, this is not trivial given that the solution codes are typically nested and have non-linear execution behavior. In this paper, we formalize the problem of synthesizing such a progression for a given reference task in a visual programming domain. We propose a novel synthesis algorithm that generates a progression of subtasks that are high-quality, well-spaced in terms of their complexity, and solving this progression leads to solving the reference task. We conduct a user study to demonstrate that our synthesized progression of subtasks can assist a novice programmer in solving tasks from the Hour of Code: Maze Challenge by Code.org.}
}
@inproceedings{bauschard24,
title = {Augmented Debate-Centered Instruction: A Novel Research Agenda for Responsible AI Integration in Education},
author = {Bauschard, Stefan and Coverstone, Alan and Gonier, Devin and Hines, John and Rao, Anand},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {139-150},
abstract = {This paper puts forth a novel research agenda called Augmented Debate-Centered Instruction (ADCI) to address a concerning gap in understanding the responsible integration of AI in education. As assessments grounded in individual writing become unreliable with the proliferation of language models, most attention has focused on detecting AI cheating rather than exploring pedagogical adaptations. This oversight leaves open critical questions about how generative AI might be leveraged to augment instructional practices and provide alternative modes of assessment, especially for developing essential cognitive abilities and durable skills.}
}
@inproceedings{zhang24,
title = {Learning to Compare Hints: Combining Insights from Student Logs and Large Language Models},
author = {Zhang, Ted and Kumar, Harshith Arun and Schmucker, Robin and Azaria, Amos and Mitchell, Tom},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {162-169},
abstract = {We explore the general problem of learning to predict which teaching actions will result in the best learning outcomes for students in online courses. More specifically, we consider the problem of predicting which hint will most help a student who answers a practice question incorrectly, and who is about to make a second attempt to answer that question. In previous work we showed that log data from thousands of previous students could be used to learn empirically which of several pre-defined hints produces the best learning outcome. However, while that study utilized data from thousands of students submitting millions of responses, it did not consider the actual text of the question, the hint, or the answer. In this paper, we ask the follow-on question ``Can we train a machine learned model to examine the text of the question, the answer, and the text of hints, to predict which hint will lead to better learning outcomes?'' Our experimental results show that the answer is yes. This is important because the trained model can now be applied to new questions and hints covering related subject matter, to estimate which of the new hints will be most useful, even before testing it on students. Finally, we show that the pairs of hints for which the model makes most accurate predictions are the hint pairs where choosing the right hint has the biggest payoff (i.e., hint pairs for which the difference in learning outcomes is greatest).
}
}
@inproceedings{moringen24,
title = {Generating Piano Practice Policy with a Gaussian Process},
author = {Moringen, Alexandra and Vromen, Elad and Ritter, Helge and Friedman, Jason},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
pages = {151-161},
abstract = {A typical process of learning to play a piece on a piano consists of a progression through a series of practice units that focus on individual dimensions of the skill, the so-called practice modes. Practice modes in learning to play music comprise a particularly large set of possibilities, such as hand coordination, posture, articulation, ability to read a music score, correct timing or pitch, etc. Self-guided practice is known to be suboptimal, and a model that schedules optimal practice to maximize a learner’s progress still does not exist. Because we each learn differently and there are many choices for possible piano practice tasks and methods, the set of practice modes should be dynamically adapted to the human learner, a process typically guided by a teacher. However, having a human teacher guide individual practice is not always feasible since it is time-consuming, expensive, and often unavailable. In this work, we present a modeling framework to guide the human learner through the learning process by choosing the practice modes generated by a policy model. To this end, we present a computational architecture building on a Gaussian process that incorporates 1) the learner state, 2) a policy that selects a suitable practice mode, 3) performance evaluation, and 4) expert knowledge. The proposed policy model is trained to approximate the expert-learner interaction during a practice session. In our future work, we will test different Bayesian optimization techniques, e.g., different acquisition functions, and evaluate their effect on the learning progress.
}
}
@inproceedings{kumar24,
title = {Using Large Language Models for Student-Code Guided Test Case Generation in Computer Science Education},
author = {Ashok Kumar, Nischal and Lan, Andrew S.},
booktitle = {Proceedings of the 2024 AAAI Conference on Artificial Intelligence},
year = {2024},
volume = {257},
pages = {170-179},
abstract = {In computer science education, test cases are an integral part of programming assignments since they can be used as assessment items to test students' programming knowledge and provide personalized feedback on student-written code. The goal of our work is to propose a fully automated approach for test case generation that can accurately measure student knowledge, which is important for two reasons. First, manually constructing test cases requires expert knowledge and is a labor-intensive process. Second, developing test cases for students, especially those who are novice programmers, is significantly different from those oriented toward professional-level software developers. Therefore, we need an automated process for test case generation to assess student knowledge and provide feedback. In this work, we propose a large language model-based approach to automatically generate test cases and show that they are good measures of student knowledge, using a publicly available dataset that contains student-written Java code. We also discuss future research directions centered on using test cases to help students.
}
}