ajaxwin
refactor: Reward clamping in graders
41a051f
"""
grader.py (Task 3 – Rule Checker)
------------------------------------
Deterministic grader for function-identification submissions.
Grade table
───────────
1     → submitted function is the exact target (case-insensitive)
0.50  → submitted function is a direct internal subfunction of the target
0.001 → anything else
"""
import json
import re
from math import exp
from typing import Dict, Any
class Task3Grader:
"""
Grades a Task 3 submit_function submission.
Parameters
----------
target_function : dict with at least 'name' and 'code' keys
property_specification : the property the target function violates
"""
REWARD_CORRECT = 1
REWARD_PARTIAL = 0.5
REWARD_WRONG = 0.001
def __init__(self, target_function: Dict[str, Any], property_specification: Dict | str, max_steps: int) -> None:
self.target_function = target_function
self.property_specification = property_specification
self.max_steps = max_steps
self._decay = 0.01
def _clamp(self, reward: float) -> float:
return max(0.001, min(0.999, reward))
def grade(self, submitted_function: str, steps: int, cummulative_cost: int) -> float:
"""Returns deterministic grade strictly in (0, 1)."""
norm = submitted_function.strip().lower()
reward = self.REWARD_WRONG
if norm == self.target_function["name"].strip().lower():
reward = self.REWARD_CORRECT
elif norm in self.target_function.get("code", "").strip().lower():
reward = self.REWARD_PARTIAL
penalty = self._decay ** (-(steps * cummulative_cost) / self.max_steps)
return self._clamp(reward * penalty)
def get_canonical_answer(self) -> Dict[str, Dict | str]:
"""For debugging / logging only β€” do not expose to the agent."""
return {
"target_function": self.target_function,
"property_specification": json.dumps(self.property_specification)
if isinstance(self.property_specification, dict) else self.property_specification,
}