Skip to content

Commit 5885213

Browse files
committed
Add support for unit_test mode in evaluation function and tests
Introduces a `unit_test` mode to run teacher-defined test functions or `unittest.TestCase` classes. Updates the evaluation function, test suite, and Postman collection to support the new functionality. Includes error handling, timeout logic, and detailed feedback for passed, failed, and runtime error tests.
1 parent 435352c commit 5885213

3 files changed

Lines changed: 268 additions & 5 deletions

File tree

evaluation_function/evaluation.py

Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import os
23
import shutil
34
import subprocess
@@ -30,6 +31,50 @@ def _capture_plots():
3031

3132
_CAPTURE_CALL = "_capture_plots()\n"
3233

34+
_UNIT_RUNNER_TEMPLATE = """
35+
import json as _json
36+
import unittest as _ut
37+
38+
_unit_results = []
39+
40+
class _UnitTrackingResult(_ut.TestResult):
41+
def __init__(self):
42+
super().__init__()
43+
self.successes = []
44+
def addSuccess(self, test):
45+
super().addSuccess(test)
46+
self.successes.append(test)
47+
48+
_unit_tc_classes = [v for v in globals().values()
49+
if isinstance(v, type) and issubclass(v, _ut.TestCase) and v is not _ut.TestCase]
50+
if _unit_tc_classes:
51+
_unit_suite = _ut.TestSuite()
52+
for _unit_cls in _unit_tc_classes:
53+
_unit_suite.addTests(_ut.TestLoader().loadTestsFromTestCase(_unit_cls))
54+
_unit_tr = _UnitTrackingResult()
55+
_unit_suite.run(_unit_tr)
56+
for _unit_t in _unit_tr.successes:
57+
_unit_results.append({{'name': _unit_t.id().split('.')[-1], 'status': 'pass'}})
58+
for _unit_t, _unit_e in _unit_tr.failures:
59+
_unit_results.append({{'name': _unit_t.id().split('.')[-1], 'status': 'fail', 'message': _unit_e}})
60+
for _unit_t, _unit_e in _unit_tr.errors:
61+
_unit_results.append({{'name': _unit_t.id().split('.')[-1], 'status': 'error', 'message': _unit_e}})
62+
63+
_unit_plain = [(n, f) for n, f in sorted(globals().items())
64+
if n.startswith('test_') and callable(f) and not isinstance(f, type)]
65+
for _unit_name, _unit_fn in _unit_plain:
66+
try:
67+
_unit_fn()
68+
_unit_results.append({{'name': _unit_name, 'status': 'pass'}})
69+
except AssertionError as _unit_exc:
70+
_unit_results.append({{'name': _unit_name, 'status': 'fail', 'message': str(_unit_exc)}})
71+
except Exception as _unit_exc:
72+
_unit_results.append({{'name': _unit_name, 'status': 'error', 'message': repr(_unit_exc)}})
73+
74+
with open({results_path!r}, 'w') as _unit_f:
75+
_json.dump(_unit_results, _unit_f)
76+
"""
77+
3378

3479
def _run_code(code: str, stdin: str) -> tuple[str, str, bool, list[Image.Image]]:
3580
plot_dir = tempfile.mkdtemp()
@@ -143,13 +188,62 @@ def _evaluate_io(response: str, tests: list, result: Result) -> Result:
143188
return result
144189

145190

191+
def _evaluate_unit(response: str, test_code: str, result: Result) -> Result:
    """Run teacher-written tests against the student's code and report per-test feedback.

    The student's ``response``, the teacher's ``test_code`` and the unit-test
    runner (``_UNIT_RUNNER_TEMPLATE``) are concatenated into one program and
    executed through ``_run_code``.  The runner writes a JSON list of
    ``{"name", "status"[, "message"]}`` entries to a results file, which is
    read back here and turned into feedback on ``result``.

    Args:
        response: The student's source code.
        test_code: Teacher-supplied test source; blank input is rejected.
        result: Result object that receives the feedback; it is also returned.

    Returns:
        ``result``.  ``is_correct`` is set only when results were produced, and
        is true only if at least one test ran and every test passed.  On blank
        test code, timeout, or a run that produced no results file, a single
        "error" feedback entry is added instead.
    """
    if not test_code.strip():
        result.add_feedback("error", "No test code provided for unit_test mode.")
        return result

    # The results file lives in a freshly created private directory instead of
    # a name obtained from the deprecated, race-prone tempfile.mktemp().  The
    # file itself is still created only by the runner, so its existence keeps
    # meaning "the runner reached its final statement".
    with tempfile.TemporaryDirectory() as results_dir:
        results_path = os.path.join(results_dir, "results.json")
        runner = _UNIT_RUNNER_TEMPLATE.format(results_path=results_path)
        combined = response + "\n\n" + test_code + runner
        _, stderr, timed_out, _ = _run_code(combined, "")

        if timed_out:
            result.add_feedback("error", f"Code timed out after {_TIMEOUT}s.")
            return result
        if not os.path.exists(results_path):
            # The program died before the runner could write results
            # (e.g. an exception at import time); surface stderr if present.
            msg = _code_block("Error", stderr.strip()) if stderr else "Unknown error — no results produced."
            result.add_feedback("error", msg)
            return result
        with open(results_path) as f:
            test_results = json.load(f)

    passed = 0
    for r in test_results:
        name, status = r["name"], r["status"]
        if status == "pass":
            passed += 1
            result.add_feedback("pass", f"{name}: passed.")
        else:
            msg = r.get("message", "")
            body = f"{name}: {'failed' if status == 'fail' else 'runtime error'}."
            if msg:
                label = "AssertionError" if status == "fail" else "Error"
                body += f"\n\n{_code_block(label, msg.strip())}"
            result.add_feedback("fail", body)

    total = len(test_results)
    # Zero discovered tests must not count as a correct submission.
    result.is_correct = total > 0 and passed == total
    result.add_feedback("summary", f"{passed}/{total} tests passed.")
    return result
236+
237+
146238
def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
    """Evaluate a student's code according to ``params["mode"]``.

    Args:
        response: The student's submission; it is converted with ``str()``
            before being handed to the mode-specific evaluator.
        answer: Not used by this function.
        params: Evaluation parameters.  ``mode`` must be ``"demo"``,
            ``"io_test"`` or ``"unit_test"``; ``io_test`` reads ``tests`` and
            ``unit_test`` reads ``test_code``.

    Returns:
        The ``Result`` filled in by the selected evaluator, or a result
        carrying a single "error" feedback entry when ``mode`` is missing or
        unknown.
    """
    result = Result()
    mode = params.get("mode")
    if mode not in ("demo", "io_test", "unit_test"):
        result.add_feedback("error", f"Unknown or missing mode: {mode!r}. Expected 'demo', 'io_test', or 'unit_test'.")
        return result

    if mode == "demo":
        return _evaluate_demo(str(response), result)
    if mode == "io_test":
        return _evaluate_io(str(response), params.get("tests", []), result)

    # `.get(key, "")` only covers a missing key: an explicit JSON null would
    # arrive as None and crash on `.strip()` in the unit-test evaluator, so
    # normalise to a string here (None -> "" -> "No test code" feedback).
    test_code = params.get("test_code")
    return _evaluate_unit(str(response), "" if test_code is None else str(test_code), result)

evaluation_function/evaluation_test.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,4 +111,73 @@ def test_run_code_captures_images(self):
111111
_, _, timed_out, images = _run_code(_PLOT_CODE, "")
112112
self.assertFalse(timed_out)
113113
self.assertEqual(len(images), 1)
114-
self.assertEqual(images[0].format, "PNG")
114+
self.assertEqual(images[0].format, "PNG")
115+
116+
117+
# Student submissions used by the unit_test-mode tests: a correct `square`,
# an incorrect one, and code that raises as soon as it is executed.
_SQUARE_FN = "def square(n): return n * n"
_WRONG_SQUARE_FN = "def square(n): return n + 1"
_CRASH_FN = "raise ValueError('boom')"

# Teacher tests written as plain `test_*` functions.  The embedded source must
# be valid Python on its own, so each function body is indented one level.
_SQUARE_TESTS = (
    "def test_positive():\n"
    "    assert square(5) == 25, 'expected 25'\n"
    "def test_zero():\n"
    "    assert square(0) == 0\n"
)

# The same checks written as a `unittest.TestCase`.  Method bodies must be
# nested one level deeper than their `def` lines, otherwise the embedded
# source fails with IndentationError before any test runs.
_SQUARE_TESTS_UNITTEST = (
    "import unittest\n"
    "class SquareTest(unittest.TestCase):\n"
    "    def test_positive(self):\n"
    "        self.assertEqual(square(5), 25)\n"
    "    def test_zero(self):\n"
    "        self.assertEqual(square(0), 0)\n"
)
136+
137+
138+
def _unit_params(test_code):
    """Build the ``params`` mapping that selects unit_test mode for *test_code*."""
    params = dict(mode="unit_test", test_code=test_code)
    return params
140+
141+
142+
class TestUnitTestMode(unittest.TestCase):
    """Checks of ``evaluation_function`` when ``mode`` is ``"unit_test"``."""

    def _grade(self, submission, test_code):
        """Evaluate *submission* against *test_code* and return the result as a dict."""
        params = _unit_params(test_code)
        return evaluation_function(submission, None, params).to_dict()

    def test_all_pass(self):
        # A correct submission passes both plain test functions.
        outcome = self._grade(_SQUARE_FN, _SQUARE_TESTS)

        self.assertTrue(outcome["is_correct"])
        for fragment in (
            "2/2 tests passed",
            "test_positive: passed",
            "test_zero: passed",
        ):
            self.assertIn(fragment, outcome["feedback"])

    def test_assertion_fail(self):
        # A wrong submission fails both tests and the assert message is shown.
        outcome = self._grade(_WRONG_SQUARE_FN, _SQUARE_TESTS)

        self.assertFalse(outcome["is_correct"])
        for fragment in ("0/2 tests passed", "failed", "expected 25"):
            self.assertIn(fragment, outcome["feedback"])

    def test_unittest_style(self):
        # Tests supplied as a unittest.TestCase class are run as well.
        outcome = self._grade(_SQUARE_FN, _SQUARE_TESTS_UNITTEST)

        self.assertTrue(outcome["is_correct"])
        self.assertIn("2/2 tests passed", outcome["feedback"])

    def test_module_level_crash(self):
        # A submission that raises while executing is reported with its error.
        outcome = self._grade(_CRASH_FN, _SQUARE_TESTS)

        self.assertFalse(outcome["is_correct"])
        self.assertIn("boom", outcome["feedback"])

    def test_empty_test_code(self):
        # Empty teacher test code is rejected with an explanatory message.
        outcome = self._grade(_SQUARE_FN, "")

        self.assertFalse(outcome["is_correct"])
        self.assertIn("No test code", outcome["feedback"])

    def test_student_print_no_pollution(self):
        # Output printed by the submission must not disturb result collection.
        noisy_submission = _SQUARE_FN + "\nprint('hello')"
        outcome = self._grade(noisy_submission, _SQUARE_TESTS)

        self.assertTrue(outcome["is_correct"])
        self.assertIn("2/2 tests passed", outcome["feedback"])

postman/evaluatePython.postman_collection.json

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"info": {
33
"_postman_id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
44
"name": "evaluatePython",
5-
"description": "Test suite for the evaluatePython Lambda Feedback evaluation function.\n\nThe service is fronted by Shimmy, which exposes a single `POST /` HTTP endpoint on port 8080. The command (`eval`, `preview`, `healthcheck`) is selected via a `command` request header (absent = `eval`).\n\n## Running locally\n\nWith Shimmy installed:\n```\nshimmy -c python -a -m -a evaluation_function.main -i ipc\n```\n\nOr via Docker:\n```\ndocker run -p 8080:8080 <image>\n```\n\n## Collection variable\n\n`baseUrl` — default `http://localhost:8080`. Override to point at staging or production.\n\n## Modes\n\n`params.mode` is required for all eval requests:\n- `\"demo\"` — run code and show output, no test validation\n- `\"io_test\"` — run code against stdin/stdout test cases",
5+
"description": "Test suite for the evaluatePython Lambda Feedback evaluation function.\n\nThe service is fronted by Shimmy, which exposes a single `POST /` HTTP endpoint on port 8080. The command (`eval`, `preview`, `healthcheck`) is selected via a `command` request header (absent = `eval`).\n\n## Running locally\n\nWith Shimmy installed:\n```\nshimmy -c python -a -m -a evaluation_function.main -i ipc\n```\n\nOr via Docker:\n```\ndocker run -p 8080:8080 <image>\n```\n\n## Collection variable\n\n`baseUrl` — default `http://localhost:8080`. Override to point at staging or production.\n\n## Modes\n\n`params.mode` is required for all eval requests:\n- `\"demo\"` — run code and show output, no test validation\n- `\"io_test\"` — run code against stdin/stdout test cases\n- `\"unit_test\"` — run teacher-written `def test_xxx()` or `unittest.TestCase` functions against student code",
66
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
77
},
88
"variable": [
@@ -595,6 +595,106 @@
595595
}
596596
]
597597
},
598+
{
599+
"name": "22 – unit_test all pass",
600+
"request": {
601+
"method": "POST",
602+
"header": [
603+
{ "key": "Content-Type", "value": "application/json" }
604+
],
605+
"body": {
606+
"mode": "raw",
607+
"raw": "{\n \"response\": \"def square(n): return n * n\",\n \"answer\": \"\",\n \"params\": {\n \"mode\": \"unit_test\",\n \"test_code\": \"def test_positive():\\n assert square(5) == 25, 'expected 25'\\ndef test_zero():\\n assert square(0) == 0\\n\"\n }\n}",
608+
"options": { "raw": { "language": "json" } }
609+
},
610+
"url": { "raw": "{{baseUrl}}", "host": ["{{baseUrl}}"] }
611+
},
612+
"event": [
613+
{
614+
"listen": "prerequest",
615+
"script": {
616+
"type": "text/javascript",
617+
"exec": ["pm.request.timeout = 40000;"]
618+
}
619+
},
620+
{
621+
"listen": "test",
622+
"script": {
623+
"type": "text/javascript",
624+
"exec": [
625+
"pm.test('Status 200', () => {",
626+
" pm.expect(pm.response.code, 'Body: ' + pm.response.text()).to.equal(200);",
627+
"});",
628+
"",
629+
"pm.test('is_correct is true', () => {",
630+
" pm.expect(pm.response.json().result.is_correct).to.be.true;",
631+
"});",
632+
"",
633+
"pm.test('Feedback reports 2/2 tests passed', () => {",
634+
" pm.expect(pm.response.json().result.feedback).to.include('2/2 tests passed');",
635+
"});",
636+
"",
637+
"pm.test('Individual test names shown as passed', () => {",
638+
" const feedback = pm.response.json().result.feedback;",
639+
" pm.expect(feedback).to.include('test_positive: passed');",
640+
" pm.expect(feedback).to.include('test_zero: passed');",
641+
"});"
642+
]
643+
}
644+
}
645+
]
646+
},
647+
{
648+
"name": "23 – unit_test assertion failure",
649+
"request": {
650+
"method": "POST",
651+
"header": [
652+
{ "key": "Content-Type", "value": "application/json" }
653+
],
654+
"body": {
655+
"mode": "raw",
656+
"raw": "{\n \"response\": \"def square(n): return n + 1\",\n \"answer\": \"\",\n \"params\": {\n \"mode\": \"unit_test\",\n \"test_code\": \"def test_positive():\\n assert square(5) == 25, 'expected 25'\\ndef test_zero():\\n assert square(0) == 0\\n\"\n }\n}",
657+
"options": { "raw": { "language": "json" } }
658+
},
659+
"url": { "raw": "{{baseUrl}}", "host": ["{{baseUrl}}"] }
660+
},
661+
"event": [
662+
{
663+
"listen": "prerequest",
664+
"script": {
665+
"type": "text/javascript",
666+
"exec": ["pm.request.timeout = 40000;"]
667+
}
668+
},
669+
{
670+
"listen": "test",
671+
"script": {
672+
"type": "text/javascript",
673+
"exec": [
674+
"pm.test('Status 200', () => {",
675+
" pm.expect(pm.response.code, 'Body: ' + pm.response.text()).to.equal(200);",
676+
"});",
677+
"",
678+
"pm.test('is_correct is false', () => {",
679+
" pm.expect(pm.response.json().result.is_correct).to.be.false;",
680+
"});",
681+
"",
682+
"pm.test('Feedback reports 0/2 tests passed', () => {",
683+
" pm.expect(pm.response.json().result.feedback).to.include('0/2 tests passed');",
684+
"});",
685+
"",
686+
"pm.test('Feedback includes assertion message', () => {",
687+
" pm.expect(pm.response.json().result.feedback).to.include('expected 25');",
688+
"});",
689+
"",
690+
"pm.test('Feedback marks test as failed', () => {",
691+
" pm.expect(pm.response.json().result.feedback).to.include('failed');",
692+
"});"
693+
]
694+
}
695+
}
696+
]
697+
},
598698
{
599699
"name": "20 – Image generation (demo mode)",
600700
"request": {

0 commit comments

Comments
 (0)