Add support for unit_test mode in evaluation function and tests

m-messer · m-messer · commit 588521363983 · 2026-05-25T15:14:45.000+01:00
Introduces a `unit_test` mode that runs teacher-defined `test_*` functions or `unittest.TestCase` classes against the student's code. Updates the evaluation function, test suite, and Postman collection to support the new functionality. Includes error handling, timeout reporting, and detailed per-test feedback for passing tests, assertion failures, and runtime errors.
diff --git a/evaluation_function/evaluation.py b/evaluation_function/evaluation.py
@@ -1,3 +1,4 @@
+import json
 import os
 import shutil
 import subprocess
@@ -30,6 +31,50 @@ def _capture_plots():
 
 _CAPTURE_CALL = "_capture_plots()\n"
 
+_UNIT_RUNNER_TEMPLATE = """
+import json as _json
+import unittest as _ut
+
+_unit_results = []
+
+class _UnitTrackingResult(_ut.TestResult):
+    def __init__(self):
+        super().__init__()
+        self.successes = []
+    def addSuccess(self, test):
+        super().addSuccess(test)
+        self.successes.append(test)
+
+_unit_tc_classes = [v for v in globals().values()
+                    if isinstance(v, type) and issubclass(v, _ut.TestCase) and v is not _ut.TestCase]
+if _unit_tc_classes:
+    _unit_suite = _ut.TestSuite()
+    for _unit_cls in _unit_tc_classes:
+        _unit_suite.addTests(_ut.TestLoader().loadTestsFromTestCase(_unit_cls))
+    _unit_tr = _UnitTrackingResult()
+    _unit_suite.run(_unit_tr)
+    for _unit_t in _unit_tr.successes:
+        _unit_results.append({{'name': _unit_t.id().split('.')[-1], 'status': 'pass'}})
+    for _unit_t, _unit_e in _unit_tr.failures:
+        _unit_results.append({{'name': _unit_t.id().split('.')[-1], 'status': 'fail', 'message': _unit_e}})
+    for _unit_t, _unit_e in _unit_tr.errors:
+        _unit_results.append({{'name': _unit_t.id().split('.')[-1], 'status': 'error', 'message': _unit_e}})
+
+_unit_plain = [(n, f) for n, f in sorted(globals().items())
+               if n.startswith('test_') and callable(f) and not isinstance(f, type)]
+for _unit_name, _unit_fn in _unit_plain:
+    try:
+        _unit_fn()
+        _unit_results.append({{'name': _unit_name, 'status': 'pass'}})
+    except AssertionError as _unit_exc:
+        _unit_results.append({{'name': _unit_name, 'status': 'fail', 'message': str(_unit_exc)}})
+    except Exception as _unit_exc:
+        _unit_results.append({{'name': _unit_name, 'status': 'error', 'message': repr(_unit_exc)}})
+
+with open({results_path!r}, 'w') as _unit_f:
+    _json.dump(_unit_results, _unit_f)
+"""
+
 
 def _run_code(code: str, stdin: str) -> tuple[str, str, bool, list[Image.Image]]:
     plot_dir = tempfile.mkdtemp()
@@ -143,13 +188,86 @@ def _evaluate_io(response: str, tests: list, result: Result) -> Result:
     return result
 
 
+def _evaluate_unit(response: str, test_code: str, result: Result) -> Result:
+    """Run teacher-supplied unit tests against the student's response.
+
+    The response, the test code and the runner template are concatenated into
+    one script and executed with ``_run_code``. The runner writes one JSON
+    record per test to a results file, which is read back here and turned
+    into per-test feedback plus a summary line.
+
+    Args:
+        response: Student source code.
+        test_code: Teacher-written ``test_*`` functions and/or
+            ``unittest.TestCase`` classes.
+        result: Result object that receives the feedback.
+
+    Returns:
+        The same ``result`` object. ``is_correct`` is set to True only when at
+        least one test ran and every test passed.
+    """
+    if not test_code or not test_code.strip():
+        result.add_feedback("error", "No test code provided for unit_test mode.")
+        return result
+
+    # tempfile.mktemp() is deprecated and race-prone; a freshly created private
+    # directory guarantees nothing else can pre-create the results file.
+    results_dir = tempfile.mkdtemp()
+    results_path = os.path.join(results_dir, "results.json")
+    runner = _UNIT_RUNNER_TEMPLATE.format(results_path=results_path)
+    combined = response + "\n\n" + test_code + runner
+
+    test_results = None
+    try:
+        # Captured stdout is not used; outcomes come from the results file.
+        _, stderr, timed_out, _ = _run_code(combined, "")
+        if timed_out:
+            result.add_feedback("error", f"Code timed out after {_TIMEOUT}s.")
+        elif not os.path.exists(results_path):
+            msg = _code_block("Error", stderr.strip()) if stderr else "Unknown error — no results produced."
+            result.add_feedback("error", msg)
+        else:
+            try:
+                with open(results_path) as f:
+                    test_results = json.load(f)
+            except (OSError, json.JSONDecodeError):
+                # A truncated or tampered file must not crash the evaluator.
+                result.add_feedback("error", "Could not read unit test results.")
+    finally:
+        shutil.rmtree(results_dir, ignore_errors=True)
+
+    if test_results is None:
+        return result
+
+    passed = 0
+    for r in test_results:
+        name, status = r["name"], r["status"]
+        if status == "pass":
+            passed += 1
+            result.add_feedback("pass", f"{name}: passed.")
+        else:
+            msg = r.get("message", "")
+            body = f"{name}: {'failed' if status == 'fail' else 'runtime error'}."
+            if msg:
+                label = "AssertionError" if status == "fail" else "Error"
+                body += f"\n\n{_code_block(label, msg.strip())}"
+            result.add_feedback("fail", body)
+
+    total = len(test_results)
+    result.is_correct = total > 0 and passed == total
+    result.add_feedback("summary", f"{passed}/{total} tests passed.")
+    return result
+
+
 def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
     result = Result()
     mode = params.get("mode")
-    if mode not in ("demo", "io_test"):
-        result.add_feedback("error", f"Unknown or missing mode: {mode!r}. Expected 'demo' or 'io_test'.")
+    if mode not in ("demo", "io_test", "unit_test"):
+        result.add_feedback("error", f"Unknown or missing mode: {mode!r}. Expected 'demo', 'io_test', or 'unit_test'.")
         return result
 
     if mode == "demo":
         return _evaluate_demo(str(response), result)
-    return _evaluate_io(str(response), params.get("tests", []), result)
+    if mode == "io_test":
+        return _evaluate_io(str(response), params.get("tests", []), result)
+    return _evaluate_unit(str(response), params.get("test_code", ""), result)
diff --git a/evaluation_function/evaluation_test.py b/evaluation_function/evaluation_test.py
@@ -111,4 +111,73 @@ def test_run_code_captures_images(self):
         _, _, timed_out, images = _run_code(_PLOT_CODE, "")
         self.assertFalse(timed_out)
         self.assertEqual(len(images), 1)
-        self.assertEqual(images[0].format, "PNG")
+        self.assertEqual(images[0].format, "PNG")
+
+
+_SQUARE_FN = "def square(n): return n * n"
+_WRONG_SQUARE_FN = "def square(n): return n + 1"
+_CRASH_FN = "raise ValueError('boom')"
+
+_SQUARE_TESTS = (
+    "def test_positive():\n"
+    "    assert square(5) == 25, 'expected 25'\n"
+    "def test_zero():\n"
+    "    assert square(0) == 0\n"
+)
+
+_SQUARE_TESTS_UNITTEST = (
+    "import unittest\n"
+    "class SquareTest(unittest.TestCase):\n"
+    "    def test_positive(self):\n"
+    "        self.assertEqual(square(5), 25)\n"
+    "    def test_zero(self):\n"
+    "        self.assertEqual(square(0), 0)\n"
+)
+
+
+def _unit_params(test_code):
+    return {"mode": "unit_test", "test_code": test_code}
+
+
+class TestUnitTestMode(unittest.TestCase):
+
+    def test_all_pass(self):
+        result = evaluation_function(_SQUARE_FN, None, _unit_params(_SQUARE_TESTS)).to_dict()
+
+        self.assertTrue(result["is_correct"])
+        self.assertIn("2/2 tests passed", result["feedback"])
+        self.assertIn("test_positive: passed", result["feedback"])
+        self.assertIn("test_zero: passed", result["feedback"])
+
+    def test_assertion_fail(self):
+        result = evaluation_function(_WRONG_SQUARE_FN, None, _unit_params(_SQUARE_TESTS)).to_dict()
+
+        self.assertFalse(result["is_correct"])
+        self.assertIn("0/2 tests passed", result["feedback"])
+        self.assertIn("failed", result["feedback"])
+        self.assertIn("expected 25", result["feedback"])
+
+    def test_unittest_style(self):
+        result = evaluation_function(_SQUARE_FN, None, _unit_params(_SQUARE_TESTS_UNITTEST)).to_dict()
+
+        self.assertTrue(result["is_correct"])
+        self.assertIn("2/2 tests passed", result["feedback"])
+
+    def test_module_level_crash(self):
+        result = evaluation_function(_CRASH_FN, None, _unit_params(_SQUARE_TESTS)).to_dict()
+
+        self.assertFalse(result["is_correct"])
+        self.assertIn("boom", result["feedback"])
+
+    def test_empty_test_code(self):
+        result = evaluation_function(_SQUARE_FN, None, _unit_params("")).to_dict()
+
+        self.assertFalse(result["is_correct"])
+        self.assertIn("No test code", result["feedback"])
+
+    def test_student_print_no_pollution(self):
+        code_with_print = _SQUARE_FN + "\nprint('hello')"
+        result = evaluation_function(code_with_print, None, _unit_params(_SQUARE_TESTS)).to_dict()
+
+        self.assertTrue(result["is_correct"])
+        self.assertIn("2/2 tests passed", result["feedback"])
diff --git a/postman/evaluatePython.postman_collection.json b/postman/evaluatePython.postman_collection.json
@@ -2,7 +2,7 @@
   "info": {
     "_postman_id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
     "name": "evaluatePython",
-    "description": "Test suite for the evaluatePython Lambda Feedback evaluation function.\n\nThe service is fronted by Shimmy, which exposes a single `POST /` HTTP endpoint on port 8080. The command (`eval`, `preview`, `healthcheck`) is selected via a `command` request header (absent = `eval`).\n\n## Running locally\n\nWith Shimmy installed:\n```\nshimmy -c python -a -m -a evaluation_function.main -i ipc\n```\n\nOr via Docker:\n```\ndocker run -p 8080:8080 <image>\n```\n\n## Collection variable\n\n`baseUrl` — default `http://localhost:8080`. Override to point at staging or production.\n\n## Modes\n\n`params.mode` is required for all eval requests:\n- `\"demo\"` — run code and show output, no test validation\n- `\"io_test\"` — run code against stdin/stdout test cases",
+    "description": "Test suite for the evaluatePython Lambda Feedback evaluation function.\n\nThe service is fronted by Shimmy, which exposes a single `POST /` HTTP endpoint on port 8080. The command (`eval`, `preview`, `healthcheck`) is selected via a `command` request header (absent = `eval`).\n\n## Running locally\n\nWith Shimmy installed:\n```\nshimmy -c python -a -m -a evaluation_function.main -i ipc\n```\n\nOr via Docker:\n```\ndocker run -p 8080:8080 <image>\n```\n\n## Collection variable\n\n`baseUrl` — default `http://localhost:8080`. Override to point at staging or production.\n\n## Modes\n\n`params.mode` is required for all eval requests:\n- `\"demo\"` — run code and show output, no test validation\n- `\"io_test\"` — run code against stdin/stdout test cases\n- `\"unit_test\"` — run teacher-written `def test_xxx()` functions or `unittest.TestCase` classes against student code",
     "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
   },
   "variable": [
@@ -595,6 +595,106 @@
             }
           ]
         },
+        {
+          "name": "22 – unit_test all pass",
+          "request": {
+            "method": "POST",
+            "header": [
+              { "key": "Content-Type", "value": "application/json" }
+            ],
+            "body": {
+              "mode": "raw",
+              "raw": "{\n  \"response\": \"def square(n): return n * n\",\n  \"answer\": \"\",\n  \"params\": {\n    \"mode\": \"unit_test\",\n    \"test_code\": \"def test_positive():\\n    assert square(5) == 25, 'expected 25'\\ndef test_zero():\\n    assert square(0) == 0\\n\"\n  }\n}",
+              "options": { "raw": { "language": "json" } }
+            },
+            "url": { "raw": "{{baseUrl}}", "host": ["{{baseUrl}}"] }
+          },
+          "event": [
+            {
+              "listen": "prerequest",
+              "script": {
+                "type": "text/javascript",
+                "exec": ["pm.request.timeout = 40000;"]
+              }
+            },
+            {
+              "listen": "test",
+              "script": {
+                "type": "text/javascript",
+                "exec": [
+                  "pm.test('Status 200', () => {",
+                  "    pm.expect(pm.response.code, 'Body: ' + pm.response.text()).to.equal(200);",
+                  "});",
+                  "",
+                  "pm.test('is_correct is true', () => {",
+                  "    pm.expect(pm.response.json().result.is_correct).to.be.true;",
+                  "});",
+                  "",
+                  "pm.test('Feedback reports 2/2 tests passed', () => {",
+                  "    pm.expect(pm.response.json().result.feedback).to.include('2/2 tests passed');",
+                  "});",
+                  "",
+                  "pm.test('Individual test names shown as passed', () => {",
+                  "    const feedback = pm.response.json().result.feedback;",
+                  "    pm.expect(feedback).to.include('test_positive: passed');",
+                  "    pm.expect(feedback).to.include('test_zero: passed');",
+                  "});"
+                ]
+              }
+            }
+          ]
+        },
+        {
+          "name": "23 – unit_test assertion failure",
+          "request": {
+            "method": "POST",
+            "header": [
+              { "key": "Content-Type", "value": "application/json" }
+            ],
+            "body": {
+              "mode": "raw",
+              "raw": "{\n  \"response\": \"def square(n): return n + 1\",\n  \"answer\": \"\",\n  \"params\": {\n    \"mode\": \"unit_test\",\n    \"test_code\": \"def test_positive():\\n    assert square(5) == 25, 'expected 25'\\ndef test_zero():\\n    assert square(0) == 0\\n\"\n  }\n}",
+              "options": { "raw": { "language": "json" } }
+            },
+            "url": { "raw": "{{baseUrl}}", "host": ["{{baseUrl}}"] }
+          },
+          "event": [
+            {
+              "listen": "prerequest",
+              "script": {
+                "type": "text/javascript",
+                "exec": ["pm.request.timeout = 40000;"]
+              }
+            },
+            {
+              "listen": "test",
+              "script": {
+                "type": "text/javascript",
+                "exec": [
+                  "pm.test('Status 200', () => {",
+                  "    pm.expect(pm.response.code, 'Body: ' + pm.response.text()).to.equal(200);",
+                  "});",
+                  "",
+                  "pm.test('is_correct is false', () => {",
+                  "    pm.expect(pm.response.json().result.is_correct).to.be.false;",
+                  "});",
+                  "",
+                  "pm.test('Feedback reports 0/2 tests passed', () => {",
+                  "    pm.expect(pm.response.json().result.feedback).to.include('0/2 tests passed');",
+                  "});",
+                  "",
+                  "pm.test('Feedback includes assertion message', () => {",
+                  "    pm.expect(pm.response.json().result.feedback).to.include('expected 25');",
+                  "});",
+                  "",
+                  "pm.test('Feedback marks test as failed', () => {",
+                  "    pm.expect(pm.response.json().result.feedback).to.include('failed');",
+                  "});"
+                ]
+              }
+            }
+          ]
+        },
         {
           "name": "20 – Image generation (demo mode)",
           "request": {