diff --git a/composer_replication/datagen/repo_gate.py b/composer_replication/datagen/repo_gate.py new file mode 100644 index 0000000000000000000000000000000000000000..4b02474d6a9faf34d94cca58e3a08f42d576a4e4 --- /dev/null +++ b/composer_replication/datagen/repo_gate.py @@ -0,0 +1,361 @@ +"""repo_gate.py — Stage-0 ingest gate: license tiers + benchmark decontamination. + +Architecture step 1 of the dataset pipeline (research/deepread/ +13-synthesis-architecture.md Part B). Closes two verified findings: + + * V3 / D-5 — ZERO benchmark decontamination existed anywhere in code or + designs, while the pipeline trains on SWE-bench-family substrates and is + scored on SWE-bench Verified. ``is_eval_contaminated`` is the hard wall: + a repo on the eval list is NEVER admitted, regardless of license. + * V9 / D-13 — the only license filter was a lowercase substring match on a + task field (``substrates.py`` ``is_redistributable``), with no SPDX + detection at the repo-ingest path and no trainable-vs-redistributable + split. ``detect_license`` + ``license_tier`` replace the boolean with a + three-tier verdict. + +Why tiers, not a boolean (D-13): weak-copyleft repos (MPL/LGPL) are fine to +*train on* but we must not *redistribute* derivative diffs from them — a +boolean "redistributable?" gate either over-excludes them or leaks them into +published corpora. The tier travels with the verdict so downstream corpus +steps (step 6) can route TRAINABLE_ONLY rows away from any published split. + +Why title-anchored matching for the GNU family: GPL-3.0 §13 mentions the +"GNU Affero General Public License" by name and AGPL-3.0 §13 mentions the +"GNU General Public License" — naive full-body substring matching +misclassifies one as the other. 
We therefore classify the GNU/MPL/Apache +family from the document HEADER (first ~400 normalized chars, where the +license title lives) and only use full-body phrases for the short permissive +licenses whose titles are not distinctive (MIT/ISC/BSD/Unlicense). + +Stdlib-only on purpose: the gate must run before anything heavy is installed. +""" +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path + +# --------------------------------------------------------------------------- +# License detection (V9 / D-13) +# --------------------------------------------------------------------------- + +#: License files checked in order; first match wins (case-insensitive on name). +_LICENSE_FILENAMES: tuple[str, ...] = ("LICENSE", "LICENSE.txt", "LICENSE.md", "COPYING") + +#: Trove classifier / PEP 639 expression fragments → SPDX id. Secondary signal +#: only — the classifier cannot distinguish BSD-2 from BSD-3, so it maps to +#: BSD-3-Clause (the common case) and the LICENSE file is preferred when present. +_CLASSIFIER_MAP: tuple[tuple[str, str], ...] = ( + ("gnu affero general public license", "AGPL-3.0"), + ("gnu lesser general public license v3", "LGPL-3.0"), + ("gnu lesser general public license v2.1", "LGPL-2.1"), + ("gnu lesser general public license", "LGPL-3.0"), + ("gnu general public license v3", "GPL-3.0"), + ("gnu general public license v2", "GPL-2.0"), + ("mozilla public license 2.0", "MPL-2.0"), + ("apache software license", "Apache-2.0"), + ("mit license", "MIT"), + ("bsd license", "BSD-3-Clause"), + ("isc license", "ISC"), + ("the unlicense", "Unlicense"), +) + +#: Bare SPDX ids accepted from PEP 639 ``license = "<spdx-id>"`` in pyproject. 
+_SPDX_IDS: frozenset[str] = frozenset( + { + "MIT", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause", "ISC", + "GPL-2.0", "GPL-3.0", "AGPL-3.0", "LGPL-2.1", "LGPL-3.0", + "MPL-2.0", "Unlicense", + } +) +_SPDX_LOOKUP: dict[str, str] = {s.lower(): s for s in _SPDX_IDS} +# Common -only/-or-later suffixed forms normalize to the base id we tier on. +for _base in ("GPL-2.0", "GPL-3.0", "AGPL-3.0", "LGPL-2.1", "LGPL-3.0"): + _SPDX_LOOKUP[f"{_base.lower()}-only"] = _base + _SPDX_LOOKUP[f"{_base.lower()}-or-later"] = _base + + +@dataclass(frozen=True) +class LicenseInfo: + """Outcome of license detection: SPDX-ish id + which signal decided it.""" + + spdx_id: str # one of _SPDX_IDS or "unknown" + signal: str # "license_file" | "classifier" | "none" + source: str = "" # filename that supplied the winning signal + + +def _normalize_text(text: str) -> str: + return re.sub(r"\s+", " ", text).strip().lower() + + +#: Title strings for the families that cross-cite each other. NOTE: "gnu +#: affero general public license" does NOT contain "gnu general public +#: license" as a substring ("affero" splits it), so the titles are disjoint. +_HEADER_TITLES: tuple[tuple[str, str], ...] = ( + ("gnu affero general public license", "agpl"), + ("gnu lesser general public license", "lgpl"), + ("gnu general public license", "gpl"), + ("mozilla public license", "mpl"), + ("apache license", "apache"), +) + + +def _classify_header(header: str) -> str | None: + """Title-anchored families (GNU/MPL/Apache). 
The EARLIEST-occurring title + wins, because a license document's own title always precedes any + cross-citation — GPL-3 §13 names the AGPL and AGPL-3 §13 names the GPL, + so mere presence-matching misclassifies one as the other (the V9 trap).""" + hits = [(idx, family) for title, family in _HEADER_TITLES if (idx := header.find(title)) >= 0] + if not hits: + return None + family = min(hits)[1] + if family == "agpl": + return "AGPL-3.0" + if family == "lgpl": + return "LGPL-2.1" if "version 2.1" in header else "LGPL-3.0" + if family == "gpl": + return "GPL-2.0" if "version 2" in header and "version 3" not in header else "GPL-3.0" + if family == "mpl": + return "MPL-2.0" if "2.0" in header else None + return "Apache-2.0" if "version 2.0" in header else None + + +def _classify_body(body: str) -> str | None: + """Distinctive-phrase matching for the short permissive licenses. Order + matters: ISC's grant ("permission to use, copy, modify") is checked via + its unique "and/or distribute … with or without fee" wording so it can't + be shadowed by MIT's "permission is hereby granted" phrase.""" + if "free and unencumbered software released into the public domain" in body: + return "Unlicense" + # Apache boilerplate notice files ("Licensed under the Apache License, + # Version 2.0") carry the title mid-body, not in a header — the tricky + # Apache-vs-MIT case: both say "permission"/"license", only Apache names + # itself with a version. + if "apache license" in body and "version 2.0" in body: + return "Apache-2.0" + if "permission is hereby granted, free of charge, to any person obtaining a copy" in body: + return "MIT" + if "with or without fee" in body and "permission to use, copy, modify" in body: + return "ISC" + if "redistribution and use in source and binary forms" in body: + # The third clause ("Neither the name of …") is what separates 3- from 2-. 
+ return "BSD-3-Clause" if "neither the name of" in body else "BSD-2-Clause" + return None + + +def _classify_license_text(text: str) -> str: + norm = _normalize_text(text) + return _classify_header(norm[:400]) or _classify_body(norm) or "unknown" + + +def _classifier_signal(repo_root: Path) -> tuple[str, str] | None: + """Secondary signal: trove classifiers / PEP 639 license expression in + pyproject.toml or setup.py. Regex-scan, not a TOML parse — the gate must + not depend on packaging libs and classifiers are line-shaped in practice.""" + for name in ("pyproject.toml", "setup.py"): + path = repo_root / name + if not path.is_file(): + continue + try: + text = path.read_text(encoding="utf-8", errors="replace") + except OSError: + continue + # PEP 639: license = "Apache-2.0" (pyproject only, but harmless on setup.py). + m = re.search(r'license\s*=\s*["\']([A-Za-z0-9.+-]+)["\']', text) + if m and m.group(1).lower() in _SPDX_LOOKUP: + return _SPDX_LOOKUP[m.group(1).lower()], name + low = _normalize_text(text) + for fragment, spdx in _CLASSIFIER_MAP: + if f"license :: osi approved :: {fragment}" in low or ( + "license ::" in low and fragment in low + ): + return spdx, name + return None + + +def detect_license(repo_root: Path) -> LicenseInfo: + """Detect the repo license. LICENSE-file text is the primary signal; + packaging classifiers are secondary (used only when the file is absent or + unclassifiable). The winning signal is recorded so corpus manifests can + show provenance for the tier decision (V9 closure must be auditable).""" + for name in _LICENSE_FILENAMES: + path = repo_root / name + if not path.is_file(): + # Case-insensitive fallback (e.g. "License.md", "COPYING.txt" not + # matched here on purpose — only exact-name case variants). 
+ matches = [p for p in repo_root.glob("*") if p.is_file() and p.name.lower() == name.lower()] + path = matches[0] if matches else path + if path.is_file(): + try: + text = path.read_text(encoding="utf-8", errors="replace") + except OSError: + continue + spdx = _classify_license_text(text) + if spdx != "unknown": + return LicenseInfo(spdx_id=spdx, signal="license_file", source=path.name) + # File exists but unclassifiable → let the classifier signal try + # before giving up; remember we saw a file for the "none" case. + fallback = _classifier_signal(repo_root) + if fallback is not None: + return LicenseInfo(spdx_id=fallback[0], signal="classifier", source=fallback[1]) + return LicenseInfo(spdx_id="unknown", signal="license_file", source=path.name) + fallback = _classifier_signal(repo_root) + if fallback is not None: + return LicenseInfo(spdx_id=fallback[0], signal="classifier", source=fallback[1]) + return LicenseInfo(spdx_id="unknown", signal="none") + + +# --------------------------------------------------------------------------- +# License tiers (D-13: tiers, not a boolean) +# --------------------------------------------------------------------------- + + +class Tier(Enum): + """Three-way license verdict. 
TRAINABLE_ONLY exists because weak copyleft + (MPL/LGPL) permits training but redistribution of derivative diffs would + trigger copyleft obligations — collapsing this to a boolean either loses + training data or leaks copyleft material into published corpora (D-13).""" + + REDISTRIBUTABLE = "redistributable" + TRAINABLE_ONLY = "trainable_only" + EXCLUDED = "excluded" + + +_TIER_BY_SPDX: dict[str, Tier] = { + "MIT": Tier.REDISTRIBUTABLE, + "Apache-2.0": Tier.REDISTRIBUTABLE, + "BSD-2-Clause": Tier.REDISTRIBUTABLE, + "BSD-3-Clause": Tier.REDISTRIBUTABLE, + "ISC": Tier.REDISTRIBUTABLE, + "Unlicense": Tier.REDISTRIBUTABLE, + "MPL-2.0": Tier.TRAINABLE_ONLY, + "LGPL-2.1": Tier.TRAINABLE_ONLY, + "LGPL-3.0": Tier.TRAINABLE_ONLY, + # GPL/AGPL and unknown are EXCLUDED: strong copyleft would bind the model + # outputs' redistribution story, and "unknown" defaults closed (V9). + "GPL-2.0": Tier.EXCLUDED, + "GPL-3.0": Tier.EXCLUDED, + "AGPL-3.0": Tier.EXCLUDED, +} + + +def license_tier(info: LicenseInfo) -> Tier: + """Map detected license → tier. Anything unrecognized is EXCLUDED — the + gate fails closed, never open (V9: the old substring filter failed open).""" + return _TIER_BY_SPDX.get(info.spdx_id, Tier.EXCLUDED) + + +# --------------------------------------------------------------------------- +# Benchmark decontamination (V3 / D-5) +# --------------------------------------------------------------------------- + +#: The canonical 12 SWE-bench test repos (SWE-bench / -Lite / -Verified / +#: -Multimodal all draw eval instances from these). Training on ANY of them +#: contaminates every SWE-bench-family score we report (V3). Lowercase +#: "org/repo" form. Extend via a JSON file (list of "org/repo" strings) +#: passed to is_eval_contaminated(extra_list=...) — e.g. SWE-Gym eval splits. 
+DECONTAMINATION_LIST: frozenset[str] = frozenset( + { + "astropy/astropy", + "django/django", + "matplotlib/matplotlib", + "mwaskom/seaborn", + "pallets/flask", + "psf/requests", + "pydata/xarray", + "pylint-dev/pylint", + "pytest-dev/pytest", + "scikit-learn/scikit-learn", + "sphinx-doc/sphinx", + "sympy/sympy", + } +) + + +def load_decontamination_list(path: Path) -> frozenset[str]: + """Load an extension list from a JSON file: ``["org/repo", ...]``. This is + THE documented mechanism for adding eval repos (new SWE-bench releases, + SWE-Gym eval splits) without editing code.""" + entries = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(entries, list): + raise ValueError(f"{path}: decontamination JSON must be a list of 'org/repo' strings") + return frozenset(normalize_repo(str(e)) for e in entries) + + +def normalize_repo(repo: str) -> str: + """Reduce any repo spelling — full https/ssh GitHub URL, trailing ``.git``, + mixed case — to lowercase ``org/repo``. Decontamination must hit no matter + how the driver spells the repo (V3: a miss here is silent contamination).""" + r = repo.strip().lower() + r = re.sub(r"^(https?://|git@)", "", r) + r = re.sub(r"^[^/]*github\.com[:/]", "", r) + r = r.rstrip("/") + r = r.removesuffix(".git") + parts = [p for p in r.split("/") if p] + return "/".join(parts[:2]) if len(parts) >= 2 else r + + +def is_eval_contaminated(repo: str, extra_list: frozenset[str] | None = None) -> bool: + """True if ``repo`` is in the SWE-bench-family eval set (or the caller's + extension list). 
Case-insensitive; accepts URLs and bare org/repo.""" + key = normalize_repo(repo) + return key in DECONTAMINATION_LIST or (extra_list is not None and key in extra_list) + + +# --------------------------------------------------------------------------- +# The gate verdict — single entry point for the pipeline driver +# --------------------------------------------------------------------------- + + +@dataclass +class GateVerdict: + """Everything the driver needs to admit/reject a repo, with reasons kept + for the run manifest (step 6's lineage record).""" + + repo: str + license_info: LicenseInfo + tier: Tier + contaminated: bool + admitted: bool + reasons: list[str] = field(default_factory=list) + + +def gate_repo(repo: str, repo_root: Path | None, extra_decontamination: frozenset[str] | None = None) -> GateVerdict: + """Architecture step 1: the one call the pipeline driver makes per repo. + + Hard rules (in priority order): + 1. Contaminated (V3) → NEVER admitted, even if the license is permissive. + 2. Tier EXCLUDED (GPL/AGPL/unknown) → not admitted (V9: fail closed). + 3. Tier TRAINABLE_ONLY → admitted, with the do-not-redistribute + constraint recorded as a reason so step 6 can route the rows. 
+ """ + contaminated = is_eval_contaminated(repo, extra_decontamination) + info = detect_license(repo_root) if repo_root is not None else LicenseInfo("unknown", "none") + tier = license_tier(info) + + reasons: list[str] = [] + if contaminated: + reasons.append( + f"benchmark decontamination: {normalize_repo(repo)} is a SWE-bench-family eval repo (V3/D-5)" + ) + if repo_root is None: + reasons.append("no repo_root provided: license undetectable, failing closed (V9)") + if tier is Tier.EXCLUDED and not contaminated: + reasons.append(f"license tier EXCLUDED: spdx={info.spdx_id} (signal={info.signal})") + if tier is Tier.TRAINABLE_ONLY: + reasons.append( + f"license tier TRAINABLE_ONLY: spdx={info.spdx_id} — usable for training, " + "derivative diffs must NOT be redistributed (D-13)" + ) + + admitted = (not contaminated) and tier is not Tier.EXCLUDED + return GateVerdict( + repo=repo, + license_info=info, + tier=tier, + contaminated=contaminated, + admitted=admitted, + reasons=reasons, + ) diff --git a/composer_replication/datagen/rollout_harness.py b/composer_replication/datagen/rollout_harness.py new file mode 100644 index 0000000000000000000000000000000000000000..8e394e320098eb985002b1fa3cde45c3dd81b95b --- /dev/null +++ b/composer_replication/datagen/rollout_harness.py @@ -0,0 +1,214 @@ +"""rollout_harness.py — the agent loop over FeatureDeletionEnv (finding V2). + +THE critical missing component the design critic identified: nothing in the +repo ran an agent episode against `FeatureDeletionEnv` to completion, so the +SFT corpus had NO producer and the tree-of-work had no env-grounded seeds. +`collect_trajectory` is that producer: prompt → policy.act → env.step → … → +submit → `_grade()`, emitting a `CanonicalTrajectory` whose steps are real +executed environment transitions (the seeds the tree needs, fixing the +seed-trace/oracle disjointness of finding D-1 as a free byproduct). 
+ +The policy is pluggable (`RolloutPolicy` protocol): a scripted fake for tests, +a frontier API model for expert-trajectory collection (SWE-Gym/SWE-smith both +validated this recipe — 491 and 5,016 expert trajectories respectively), or a +local model later. +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Protocol, runtime_checkable + +from composer_replication.datagen.env import FeatureDeletionEnv, StepResult +from composer_replication.datagen.schema import FeatureDeletionTask +from composer_replication.datagen.trajectory import ( + CanonicalTrajectory, + ToolCall, + TrajectoryStep, +) + + +@runtime_checkable +class RolloutPolicy(Protocol): + """Anything that maps (observation, history) → the next action. + + Returning a `ToolCall` continues the episode (translated to an env action + dict); returning a plain `str` is the final message — the harness submits. + """ + + def act(self, observation: str, history: list[TrajectoryStep]) -> ToolCall | str: ... + + +@dataclass +class ScriptedPolicy: + """Test fake: replays a fixed action list, then submits.""" + + actions: list[ToolCall | str] + _i: int = 0 + + def act(self, observation: str, history: list[TrajectoryStep]) -> ToolCall | str: + if self._i >= len(self.actions): + return "done" # str → submit + a = self.actions[self._i] + self._i += 1 + return a + + +class OpenRouterPolicy: + """Frontier-API policy for expert-trajectory collection (thin stub). + + Mirrors `teacher_replay._call_teacher`'s payload shape (one chat call, + temperature 0.2). Lazy-deps on httpx so the module imports without it. + Deliberately minimal: real expert collection should evaluate adopting + mini-swe-agent/SWE-agent as the scaffold (deepread 11 finding 2) — this + class exists so the harness has a live-API path without a new framework. 
+ """ + + def __init__(self, model_slug: str, api_key: str | None = None, + max_tokens: int = 512) -> None: + try: + import httpx # noqa: F401, PLC0415 — lazy heavy dep + except ImportError as e: + raise ImportError( + "OpenRouterPolicy requires httpx (`pip install httpx` or the " + "[serverless] extra). For tests use ScriptedPolicy. Got: " + repr(e) + ) from e + from composer_replication.teacher_replay import _load_api_key + self.model_slug = model_slug + self.api_key = api_key or _load_api_key() + self.max_tokens = max_tokens + + def act(self, observation: str, history: list[TrajectoryStep]) -> ToolCall | str: + import httpx # noqa: PLC0415 + + from composer_replication.teacher_replay import OPENROUTER_URL + messages = [{"role": "user", "content": observation}] + r = httpx.post( + OPENROUTER_URL, + json={"model": self.model_slug, "messages": messages, + "max_tokens": self.max_tokens, "temperature": 0.2}, + headers={"Authorization": f"Bearer {self.api_key}"}, + timeout=120.0, + ) + r.raise_for_status() + return str(r.json()["choices"][0]["message"]["content"]) + + +def _to_env_action(call: ToolCall) -> dict: + """ToolCall → FeatureDeletionEnv action dict. + + CONVENTION (documented here, the single translation point): the env's + `step()` consumes ``{"type": <name>, **args}``; ``type=="submit"`` + triggers grading (env.py:67). A ToolCall named "submit" therefore ends the + episode through the same path as a plain-text final message. + """ + return {"type": call.name, **call.args} + + +def collect_trajectory( + env: FeatureDeletionEnv, + task: FeatureDeletionTask, + policy: RolloutPolicy, + *, + max_turns: int = 40, + budget_usd: float | None = None, + provenance: dict | None = None, +) -> CanonicalTrajectory: + """Run one episode and return the graded CanonicalTrajectory. 
+ + The episode ends when the policy emits a plain string (final message → + submit), a ToolCall named "submit", or `max_turns` is hit (the env grades + on its own turn limit too — we mirror it here so the harness's history + stays aligned with the env's accounting). + """ + obs = env.reset(task) + steps: list[TrajectoryStep] = [] + final: StepResult | None = None + + for _ in range(max_turns): + action = policy.act(obs, steps) + if isinstance(action, str) or action.name == "submit": + final = env.step({"type": "submit"}) + steps.append(TrajectoryStep( + observation=obs, action=action, result=final.observation, + tool_error=False, + )) + break + res = env.step(_to_env_action(action)) + tool_error = "error" in (res.observation or "").lower()[:200] + steps.append(TrajectoryStep( + observation=obs, action=action, result=res.observation, + tool_error=tool_error, + )) + if res.done: # env hit its own turn limit and graded + final = res + break + obs = res.observation + + if final is None: + # max_turns exhausted without submit — grade what exists. + final = env.step({"type": "submit"}) + + info = final.info or {} + return CanonicalTrajectory( + task_id=task.task_id, + steps=steps, + grade=float(final.reward) if final.reward is not None else None, + guard_ok=bool(info.get("guard_ok", True)), + hacked=bool(info.get("hacked", False)), + provenance={"source": "rollout_harness", + "policy": type(policy).__name__, + **(provenance or {})}, + ) + + +# --------------------------------------------------------------------- +# Admission — type the signal and route it (final report §4) +# --------------------------------------------------------------------- + + +@dataclass(frozen=True) +class AdmissionVerdict: + """Where a trajectory may go. 
Routing per the typed-train-on-all verdict: + clean full passes → SFT; clean near-misses → DPO-candidate (contrastive + rejected vs a winner, never raw negative gradient); everything else → + rejected (quarantine-side, full provenance kept for audit).""" + + sft_admitted: bool + dpo_candidate: bool + rejected: bool + reasons: tuple[str, ...] + + +def admit(traj: CanonicalTrajectory) -> AdmissionVerdict: + reasons: list[str] = [] + clean = traj.guard_ok and not traj.hacked + if not traj.guard_ok: + reasons.append("pass_to_pass guard broken") + if traj.hacked: + reasons.append("hack monitor flagged") + grade = traj.grade if traj.grade is not None else 0.0 + if traj.grade is None: + reasons.append("ungraded (no execution oracle)") + + sft = clean and traj.grade is not None and grade == 1.0 + dpo = clean and traj.grade is not None and 0.0 < grade < 1.0 + if sft: + reasons.append("clean full pass") + elif dpo: + reasons.append(f"clean near-miss (grade={grade:.2f})") + elif clean and grade == 0.0 and traj.grade is not None: + reasons.append("clean zero — no partial signal") + return AdmissionVerdict( + sft_admitted=sft, dpo_candidate=dpo, + rejected=not (sft or dpo), reasons=tuple(reasons), + ) + + +__all__ = [ + "RolloutPolicy", + "ScriptedPolicy", + "OpenRouterPolicy", + "collect_trajectory", + "AdmissionVerdict", + "admit", +] diff --git a/composer_replication/datagen/swesmith_adapter.py b/composer_replication/datagen/swesmith_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..6010a866b6e41545f0c5f9131c55460316c69f35 --- /dev/null +++ b/composer_replication/datagen/swesmith_adapter.py @@ -0,0 +1,269 @@ +"""swesmith_adapter.py — adapt SWE-smith instances into Feature-Deletion tasks. 
+ +THE BUY-VS-BUILD VERDICT (deepread finding V4 / D-6): `pip install swesmith` +(MIT) already ships what ADR-010's "Option B greenfield generator" would have +hand-built — env construction from arbitrary GitHub repos (ONE Docker image per +repo, ~500x more storage-efficient than per-task images), five bug-synthesis +strategies, issue-text generation, and validation-by-test-execution, at a +verified $1,360 + ~20 human-hours for 50k tasks. Its **PR Mirror strategy is +exactly this repo's gold-patch-reversion mechanic** and SWE-smith's own ablation +(Table 5, arXiv:2504.21798) shows PR-Mirror trajectories train the BEST models +of its five strategies — independent validation of ADR-010's core approach. +So SWE-smith is the synthesis ENGINE for "point at a repo"; this module is the +schema bridge into the existing `FeatureDeletionTask` world. + +THE SEMANTIC INVERSION (load-bearing — easy to get backwards): + * SWE-bench-shaped instances: `patch` is the GOLD FIX. broken = HEAD with the + fix reverted (`git apply -R patch`). `SweBenchAdapter` stores `patch` as + `golden_diff` directly. + * SWE-smith instances: `patch` INTRODUCES THE BUG. broken = HEAD with the + patch APPLIED. The fix — what the agent must produce, the validator's gate-4 + restoration diff — is the REVERSE of the bug patch. +This adapter therefore stores `golden_diff = reverse_unified_diff(bug_patch)`. +When mechanical reversal fails (exotic diff features), it falls back to the +original patch tagged with a provenance marker so downstream gate-4 validation +knows to use `git apply -R` instead of `git apply`. + +The adapter itself needs nothing beyond core deps. Live synthesis (building new +repo profiles / generating new bugs) needs the `swesmith` toolkit + Docker on +Linux — see the `[swesmith]` extra in pyproject. 
+""" +from __future__ import annotations + +import json +import re +from dataclasses import dataclass + +from composer_replication.datagen.schema import FeatureDeletionTask +from composer_replication.datagen.substrates import _as_tuple + +#: Marker prefixed to golden_diff when reverse_unified_diff could not invert the +#: bug patch mechanically. Consumers (gate-4 validation) must then apply the +#: remainder with `git apply -R` (it is the FORWARD bug patch, not the fix). +UNREVERSED_MARKER = "### UNREVERSED-BUG-PATCH (apply with -R) ###\n" + +#: instance_id substring patterns → synthesis strategy (SWE-smith §2.1 / §B). +#: Patterns follow the toolkit's naming: e.g. +#: pandas-dev__pandas.abc123.lm_modify__xyz +#: ...func_pm_ctrl_invert_if__..., ...combine_file__..., ...pr_1234 +_STRATEGY_PATTERNS: tuple[tuple[str, str], ...] = ( + ("lm_modify", "lm_modify"), + ("lm_rewrite", "lm_rewrite"), + ("func_pm", "procedural"), # procedural AST modifications (13 transform types) + ("func_basic", "procedural"), + ("combine_file", "combine"), + ("combine_module", "combine"), + ("combine", "combine"), + ("pr_", "pr_mirror"), +) + + +def parse_strategy(instance_id: str) -> str: + """Map a SWE-smith instance_id to its bug-synthesis strategy. + + Returns one of {lm_modify, lm_rewrite, procedural, combine, pr_mirror, + unknown}. The strategy matters because SWE-smith's Table 5 ablation found + trajectory quality differs sharply by strategy (PR Mirror best, LM Modify + steep drop-off) — we carry it as provenance so corpus builds can weight or + filter by strategy. + """ + iid = (instance_id or "").lower() + for pattern, strategy in _STRATEGY_PATTERNS: + if pattern in iid: + return strategy + return "unknown" + + +#: Heuristic cold-start difficulty priors per strategy, motivated by SWE-smith +#: Table 1 medians (PR Mirror: 3 median F2P but 14 lines edited; Combine: 15 +#: F2P / 11 lines = multi-site; procedural: 7 F2P / 5 lines, mechanical). 
+#: These only seed DifficultyCurriculum's p-hat before real rollouts exist. +_DIFFICULTY_PRIOR: dict[str, float] = { + "pr_mirror": 0.4, + "combine": 0.4, + "lm_rewrite": 0.45, + "lm_modify": 0.55, + "procedural": 0.6, + "unknown": 0.5, +} + + +_HUNK_RE = re.compile( + r"^@@ -(?P<old_start>\d+)(?:,(?P<old_count>\d+))? " + r"\+(?P<new_start>\d+)(?:,(?P<new_count>\d+))? @@(?P<tail>.*)$" +) + + +def reverse_unified_diff(patch: str) -> str | None: + """Mechanically invert a unified diff (swap additions and deletions). + + Handles the standard unified-diff features SWE-smith patches use: + ``diff --git`` headers, ``---``/``+++`` file lines, ``@@`` hunk headers + (old/new ranges swapped), ``+``/``-`` body lines (swapped), context lines, + and ``\\ No newline at end of file`` markers (kept in place). + + HONEST LIMITATIONS (returns None — caller falls back to UNREVERSED_MARKER): + * file mode changes (``old mode``/``new mode``), renames/copies + (``rename from``...), binary patches (``GIT binary patch``), and + ``index`` lines with mode suffixes are NOT inverted — reversing them + correctly requires git plumbing, not text surgery. + * Within a hunk, a reversed diff's line ORDER for paired -/+ runs is the + naive swap; `git apply` accepts it, but it is not byte-identical to + what `git diff` would emit for the reverse change. + """ + if not patch or "@@" not in patch: + return None + unsupported = ("old mode ", "new mode ", "rename from ", "rename to ", + "copy from ", "copy to ", "GIT binary patch") + if any(marker in patch for marker in unsupported): + return None + + out: list[str] = [] + for line in patch.splitlines(): + if line.startswith("diff --git "): + # `diff --git a/<old> b/<new>` → swap the two paths. 
+ m = re.match(r"^diff --git a/(?P<a>.+) b/(?P<b>.+)$", line) + if m: + out.append(f"diff --git a/{m.group('b')} b/{m.group('a')}") + else: + out.append(line) + elif line.startswith("--- "): + out.append("+++ " + line[4:].replace("a/", "b/", 1) + if line[4:].startswith("a/") else "+++ " + line[4:]) + elif line.startswith("+++ "): + out.append("--- " + line[4:].replace("b/", "a/", 1) + if line[4:].startswith("b/") else "--- " + line[4:]) + elif line.startswith("@@"): + m = _HUNK_RE.match(line) + if not m: + return None + old_start, old_count = m.group("old_start"), m.group("old_count") + new_start, new_count = m.group("new_start"), m.group("new_count") + oc = f",{old_count}" if old_count is not None else "" + nc = f",{new_count}" if new_count is not None else "" + out.append(f"@@ -{new_start}{nc} +{old_start}{oc} @@{m.group('tail')}") + elif line.startswith("+"): + out.append("-" + line[1:]) + elif line.startswith("-"): + out.append("+" + line[1:]) + else: + # context lines, `index ...`, `\ No newline...` pass through. + out.append(line) + return "\n".join(out) + ("\n" if patch.endswith("\n") else "") + + +@dataclass(frozen=True) +class SwesmithMeta: + """Sidecar provenance for a SWE-smith-derived task. + + Kept OUT of the frozen `FeatureDeletionTask` schema deliberately — the + schema is shared with SweBenchAdapter and the trainer; strategy provenance + is a corpus-construction concern, carried alongside (e.g. into the run + manifest), never into the policy-visible task row. + """ + + strategy: str # lm_modify | lm_rewrite | procedural | combine | pr_mirror | unknown + diff_reversed: bool # True if golden_diff is the mechanical reverse of the bug patch + source: str = "swesmith" + + +@dataclass +class SwesmithAdapter: + """Convert a SWE-smith instance dict into a FeatureDeletionTask. + + Mirrors `SweBenchAdapter`'s shape; differs in the patch semantics (see the + module docstring INVERSION note) and the per-REPO image convention. 
+ """ + + default_test_command: str = "python -m pytest -q" + + def image_for(self, instance: dict) -> str: + # SWE-smith publishes ONE image per repo (not per task). Rows carry + # `image_name`; some exports use `docker_image`. Fall back to the + # toolkit's naming convention derived from the repo slug. + for key in ("image_name", "docker_image"): + if instance.get(key): + return str(instance[key]) + repo = str(instance.get("repo", "unknown")).replace("/", "__").lower() + return f"swesmith.x86_64.{repo}:latest" + + def to_task(self, instance: dict) -> FeatureDeletionTask: + task, _meta = self.to_task_with_meta(instance) + return task + + def to_task_with_meta(self, instance: dict) -> tuple[FeatureDeletionTask, SwesmithMeta]: + iid = str(instance.get("instance_id") or instance.get("task_id") or "unknown") + strategy = parse_strategy(iid) + + bug_patch = str(instance.get("patch", "")) + fix = reverse_unified_diff(bug_patch) + if fix is not None: + golden_diff = fix + diff_reversed = True + else: + golden_diff = UNREVERSED_MARKER + bug_patch + diff_reversed = False + + ftp = _as_tuple(instance.get("FAIL_TO_PASS")) + ptp = _as_tuple(instance.get("PASS_TO_PASS")) + + task = FeatureDeletionTask( + task_id=iid, + repo=str(instance.get("repo", "unknown")), + base_commit=str(instance.get("base_commit", "")), + broken_image=self.image_for(instance), + test_command=str(instance.get("test_command") or self.default_test_command), + fail_to_pass=ftp, + pass_to_pass=ptp, + golden_diff=golden_diff, + granularity="feature", + # SWE-smith rows don't carry per-instance licenses; repo-level + # licensing is the repo_gate's job (deepread finding V9/D-13). 
+ upstream_license=str(instance.get("license_name", "unknown")), + difficulty_prior=_DIFFICULTY_PRIOR.get(strategy, 0.5), + ) + return task, SwesmithMeta(strategy=strategy, diff_reversed=diff_reversed) + + +def load_swesmith_instances( + path_or_hf_id: str, + *, + limit: int | None = None, +) -> list[dict]: + """Load SWE-smith instances from a local JSONL file or the HF dataset. + + Local ``.jsonl`` paths need no extra deps (used by tests/fixtures). HF ids + (e.g. ``SWE-bench/SWE-smith``) lazy-import `datasets` from the `[datagen]` + extra. + """ + if path_or_hf_id.endswith(".jsonl"): + rows: list[dict] = [] + with open(path_or_hf_id, encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + rows.append(json.loads(line)) + if limit is not None and len(rows) >= limit: + break + return rows + try: + from datasets import load_dataset # noqa: PLC0415 — lazy heavy dep + except ImportError as e: + raise RuntimeError( + "Loading SWE-smith from the HF Hub requires `datasets`; install " + "with `pip install -e .[datagen]`. Got: " + repr(e) + ) from e + split = load_dataset(path_or_hf_id, split="train") + rows = [dict(r) for i, r in enumerate(split) if limit is None or i < limit] + return rows[: limit if limit is not None else len(rows)] + + +__all__ = [ + "SwesmithAdapter", + "SwesmithMeta", + "UNREVERSED_MARKER", + "load_swesmith_instances", + "parse_strategy", + "reverse_unified_diff", +] diff --git a/composer_replication/datagen/tests/test_repo_gate.py b/composer_replication/datagen/tests/test_repo_gate.py new file mode 100644 index 0000000000000000000000000000000000000000..2c913bf44c934e4184e5069d81aac9282f306201 --- /dev/null +++ b/composer_replication/datagen/tests/test_repo_gate.py @@ -0,0 +1,419 @@ +"""Tests for the Stage-0 ingest gate (repo_gate.py) — architecture step 1. + +Coverage targets the two findings the module closes: + * V9/D-13 — SPDX detection from real license fixture texts (incl. 
the + tricky GNU-family cross-citation and Apache-vs-MIT phrasing) and the + three-tier mapping that replaces the old boolean substring filter. + * V3/D-5 — decontamination hits in exact, URL, and mixed-case forms, and + the hard never-admit rule in the composed verdict. + +CPU-only, stdlib + tmp_path fixtures — no network, no Docker. +""" +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from composer_replication.datagen.repo_gate import ( + DECONTAMINATION_LIST, + GateVerdict, + LicenseInfo, + Tier, + detect_license, + gate_repo, + is_eval_contaminated, + license_tier, + load_decontamination_list, + normalize_repo, +) + +# --------------------------------------------------------------------- +# License fixture texts — distinctive excerpts of the real license texts. +# --------------------------------------------------------------------- + +MIT_TEXT = """\ +MIT License + +Copyright (c) 2026 Example Org + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction... +""" + +# The tricky Apache case: the words "permission" and "license" appear in both +# MIT and Apache; only Apache names itself with a version. +APACHE_TEXT = """\ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. +""" + +BSD3_TEXT = """\ +BSD 3-Clause License + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice... +3. 
Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +""" + +BSD2_TEXT = """\ +BSD 2-Clause License + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice. +2. Redistributions in binary form must reproduce the above copyright notice. +""" + +ISC_TEXT = """\ +ISC License + +Copyright (c) 2026, Example Org + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. +""" + +# GPL-3.0 §13 cross-cites the AGPL by full name — the classic trap for +# full-body substring matchers. Header anchoring must win. +GPL3_TEXT = """\ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + + 13. Use with the GNU Affero General Public License. + Notwithstanding any other provision of this License, you have + permission to link or combine any covered work with a work licensed + under version 3 of the GNU Affero General Public License... +""" + +GPL2_TEXT = """\ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. +""" + +# AGPL-3.0 §13 reciprocally cites the plain GPL by name. +AGPL3_TEXT = """\ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + 13. Remote Network Interaction; Use with the GNU General Public License. + Notwithstanding any other provision of this License... 
+""" + +LGPL21_TEXT = """\ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. +""" + +MPL2_TEXT = """\ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. +""" + +UNLICENSE_TEXT = """\ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any means. +""" + + +def _repo_with_license(tmp_path: Path, text: str, filename: str = "LICENSE") -> Path: + (tmp_path / filename).write_text(text, encoding="utf-8") + return tmp_path + + +# --------------------------------------------------------------------- +# detect_license — SPDX classification from file text +# --------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + (MIT_TEXT, "MIT"), + (APACHE_TEXT, "Apache-2.0"), + (BSD3_TEXT, "BSD-3-Clause"), + (BSD2_TEXT, "BSD-2-Clause"), + (ISC_TEXT, "ISC"), + (GPL3_TEXT, "GPL-3.0"), + (GPL2_TEXT, "GPL-2.0"), + (AGPL3_TEXT, "AGPL-3.0"), + (LGPL21_TEXT, "LGPL-2.1"), + (MPL2_TEXT, "MPL-2.0"), + (UNLICENSE_TEXT, "Unlicense"), + ], + ids=["mit", "apache2", "bsd3", "bsd2", "isc", "gpl3", "gpl2", "agpl3", "lgpl21", "mpl2", "unlicense"], +) +def test_detect_license_spdx_ids(tmp_path: Path, text: str, expected: str): + info = detect_license(_repo_with_license(tmp_path, text)) + assert info.spdx_id == expected + assert info.signal == "license_file" + assert info.source == "LICENSE" + + +def test_gpl3_not_misread_as_agpl(tmp_path: Path): + """GPL-3.0 §13 names the AGPL in its body; header anchoring must keep + this classified as GPL-3.0 (the V9 
substring filter would have tripped).""" + info = detect_license(_repo_with_license(tmp_path, GPL3_TEXT)) + assert info.spdx_id == "GPL-3.0" + + +def test_apache_notice_without_header_still_apache(tmp_path: Path): + """The short 'Licensed under the Apache License, Version 2.0' boilerplate + has no canonical header — the body fallback must catch it, and must not + fall through to MIT despite shared 'permission' vocabulary.""" + notice = ( + "Copyright 2026 Example Org\n\n" + "Licensed under the Apache License, Version 2.0 (the \"License\");\n" + "you may not use this file except in compliance with the License.\n" + ) + info = detect_license(_repo_with_license(tmp_path, notice)) + assert info.spdx_id == "Apache-2.0" + + +def test_detect_license_alternate_filenames(tmp_path: Path): + info = detect_license(_repo_with_license(tmp_path, GPL2_TEXT, filename="COPYING")) + assert info.spdx_id == "GPL-2.0" + assert info.source == "COPYING" + info2 = detect_license(_repo_with_license(tmp_path, MIT_TEXT, filename="LICENSE.md")) + # LICENSE.md is also present in tmp_path now alongside COPYING; first + # filename in priority order (LICENSE/LICENSE.txt/LICENSE.md) wins over COPYING. + assert info2.spdx_id == "MIT" + assert info2.source == "LICENSE.md" + + +def test_detect_license_unknown_text(tmp_path: Path): + info = detect_license(_repo_with_license(tmp_path, "All rights reserved. 
Ask legal.")) + assert info.spdx_id == "unknown" + + +def test_detect_license_no_files(tmp_path: Path): + info = detect_license(tmp_path) + assert info == LicenseInfo(spdx_id="unknown", signal="none") + + +def test_classifier_secondary_signal(tmp_path: Path): + """No LICENSE file, but pyproject carries a trove classifier — the + classifier signal must win and be recorded as such.""" + (tmp_path / "pyproject.toml").write_text( + '[project]\nname = "x"\nclassifiers = [\n' + ' "License :: OSI Approved :: MIT License",\n]\n', + encoding="utf-8", + ) + info = detect_license(tmp_path) + assert info.spdx_id == "MIT" + assert info.signal == "classifier" + assert info.source == "pyproject.toml" + + +def test_classifier_pep639_expression(tmp_path: Path): + (tmp_path / "pyproject.toml").write_text( + '[project]\nname = "x"\nlicense = "Apache-2.0"\n', encoding="utf-8" + ) + info = detect_license(tmp_path) + assert info.spdx_id == "Apache-2.0" + assert info.signal == "classifier" + + +def test_license_file_beats_classifier(tmp_path: Path): + """When both signals exist and the file is classifiable, the file wins — + the classifier is secondary by design (it can't tell BSD-2 from BSD-3).""" + _repo_with_license(tmp_path, GPL3_TEXT) + (tmp_path / "pyproject.toml").write_text( + 'classifiers = ["License :: OSI Approved :: MIT License"]\n', encoding="utf-8" + ) + info = detect_license(tmp_path) + assert info.spdx_id == "GPL-3.0" + assert info.signal == "license_file" + + +def test_unclassifiable_file_falls_back_to_classifier(tmp_path: Path): + _repo_with_license(tmp_path, "Custom corporate license, see legal dept.") + (tmp_path / "pyproject.toml").write_text( + 'classifiers = ["License :: OSI Approved :: ISC License"]\n', encoding="utf-8" + ) + info = detect_license(tmp_path) + assert info.spdx_id == "ISC" + assert info.signal == "classifier" + + +# --------------------------------------------------------------------- +# license_tier — tiers, not a boolean (D-13) +# 
--------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("spdx", "tier"), + [ + ("MIT", Tier.REDISTRIBUTABLE), + ("Apache-2.0", Tier.REDISTRIBUTABLE), + ("BSD-2-Clause", Tier.REDISTRIBUTABLE), + ("BSD-3-Clause", Tier.REDISTRIBUTABLE), + ("ISC", Tier.REDISTRIBUTABLE), + ("Unlicense", Tier.REDISTRIBUTABLE), + ("MPL-2.0", Tier.TRAINABLE_ONLY), + ("LGPL-2.1", Tier.TRAINABLE_ONLY), + ("LGPL-3.0", Tier.TRAINABLE_ONLY), + ("GPL-2.0", Tier.EXCLUDED), + ("GPL-3.0", Tier.EXCLUDED), + ("AGPL-3.0", Tier.EXCLUDED), + ("unknown", Tier.EXCLUDED), + ("WTFPL", Tier.EXCLUDED), # unrecognized id → fail closed + ], +) +def test_license_tier_mapping(spdx: str, tier: Tier): + assert license_tier(LicenseInfo(spdx_id=spdx, signal="license_file")) is tier + + +# --------------------------------------------------------------------- +# Decontamination (V3 / D-5) +# --------------------------------------------------------------------- + + +def test_decontamination_list_has_the_canonical_12(): + assert len(DECONTAMINATION_LIST) == 12 + assert "django/django" in DECONTAMINATION_LIST + assert "sympy/sympy" in DECONTAMINATION_LIST + + +@pytest.mark.parametrize( + "repo", + [ + "django/django", # exact + "Django/Django", # case + "https://github.com/django/django", # https URL + "https://github.com/django/django.git", # URL + .git + "git@github.com:django/django.git", # ssh URL + "https://github.com/django/django/", # trailing slash + ], +) +def test_is_eval_contaminated_hits(repo: str): + assert is_eval_contaminated(repo) is True + + +@pytest.mark.parametrize( + "repo", + [ + "pandas-dev/pandas", + "https://github.com/torvalds/linux", + "someuser/django", # fork-org differs: NOT the eval repo + ], +) +def test_is_eval_contaminated_misses(repo: str): + assert is_eval_contaminated(repo) is False + + +def test_normalize_repo_forms(): + assert normalize_repo("git@github.com:PSF/Requests.git") == "psf/requests" + assert 
normalize_repo("https://github.com/pydata/xarray/tree/main") == "pydata/xarray" + + +def test_extension_list_from_json(tmp_path: Path): + """The documented extension mechanism: extra eval repos load from JSON + and hit through the same normalized matching.""" + extra_path = tmp_path / "extra.json" + extra_path.write_text(json.dumps(["SWE-Gym/Extra-Repo"]), encoding="utf-8") + extra = load_decontamination_list(extra_path) + assert is_eval_contaminated("https://github.com/swe-gym/extra-repo", extra_list=extra) + assert not is_eval_contaminated("swe-gym/other-repo", extra_list=extra) + + +def test_extension_list_rejects_non_list(tmp_path: Path): + bad = tmp_path / "bad.json" + bad.write_text('{"repo": "a/b"}', encoding="utf-8") + with pytest.raises(ValueError): + load_decontamination_list(bad) + + +# --------------------------------------------------------------------- +# gate_repo — verdict composition +# --------------------------------------------------------------------- + + +def test_gate_admits_permissive_clean_repo(tmp_path: Path): + v = gate_repo("example/clean", _repo_with_license(tmp_path, MIT_TEXT)) + assert isinstance(v, GateVerdict) + assert v.admitted is True + assert v.tier is Tier.REDISTRIBUTABLE + assert v.contaminated is False + assert v.reasons == [] + + +def test_gate_contaminated_never_admitted_even_if_permissive(tmp_path: Path): + """V3 hard rule: an eval repo with an MIT license is STILL rejected — + decontamination outranks license.""" + v = gate_repo("https://github.com/pallets/flask", _repo_with_license(tmp_path, MIT_TEXT)) + assert v.contaminated is True + assert v.admitted is False + assert any("decontamination" in r for r in v.reasons) + # license detection still ran and is recorded for the manifest + assert v.license_info.spdx_id == "MIT" + + +def test_gate_excluded_tier_never_admitted(tmp_path: Path): + v = gate_repo("example/agpl-repo", _repo_with_license(tmp_path, AGPL3_TEXT)) + assert v.tier is Tier.EXCLUDED + assert v.admitted is 
False + assert any("EXCLUDED" in r for r in v.reasons) + + +def test_gate_trainable_only_admitted_with_reason(tmp_path: Path): + """D-13: weak copyleft is admitted for training, but the verdict must + carry the do-not-redistribute constraint for step 6 to route on.""" + v = gate_repo("example/mpl-repo", _repo_with_license(tmp_path, MPL2_TEXT)) + assert v.tier is Tier.TRAINABLE_ONLY + assert v.admitted is True + assert any("TRAINABLE_ONLY" in r for r in v.reasons) + assert any("redistributed" in r for r in v.reasons) + + +def test_gate_no_repo_root_fails_closed(): + """No repo_root → license undetectable → unknown → EXCLUDED → rejected + (V9: the gate must default closed, never open).""" + v = gate_repo("example/unfetched", None) + assert v.license_info.spdx_id == "unknown" + assert v.tier is Tier.EXCLUDED + assert v.admitted is False + assert any("failing closed" in r for r in v.reasons) + + +def test_gate_extra_decontamination_list(tmp_path: Path): + extra = frozenset({"my-eval/secret-benchmark"}) + v = gate_repo( + "https://github.com/My-Eval/Secret-Benchmark.git", + _repo_with_license(tmp_path, MIT_TEXT), + extra_decontamination=extra, + ) + assert v.contaminated is True + assert v.admitted is False diff --git a/composer_replication/datagen/tests/test_rollout_harness.py b/composer_replication/datagen/tests/test_rollout_harness.py new file mode 100644 index 0000000000000000000000000000000000000000..0f4f3d329c11b2e0b0c4dc2ffa4e10f35036c8bc --- /dev/null +++ b/composer_replication/datagen/tests/test_rollout_harness.py @@ -0,0 +1,103 @@ +"""Tests for the rollout harness (deepread finding V2 — the SFT-corpus producer).""" +from __future__ import annotations + +from composer_replication.datagen.env import FeatureDeletionEnv +from composer_replication.datagen.rollout_harness import ( + ScriptedPolicy, + admit, + collect_trajectory, +) +from composer_replication.datagen.sandbox import FakeSandbox +from composer_replication.datagen.schema import FeatureDeletionTask +from 
composer_replication.datagen.trajectory import CanonicalTrajectory, ToolCall + + +def _task() -> FeatureDeletionTask: + return FeatureDeletionTask( + task_id="t1", repo="org/repo", base_commit="abc", + broken_image="img:1", test_command="pytest -q", + fail_to_pass=("t/a.py::t1", "t/a.py::t2"), + pass_to_pass=("t/a.py::keep",), + ) + + +def _env(outcomes: dict[str, bool]) -> FeatureDeletionEnv: + return FeatureDeletionEnv(FakeSandbox(test_outcomes=outcomes)) + + +def test_collect_trajectory_full_pass(): + """Policy 'fixes' the repo via the FakeSandbox set_outcome pseudo-action, + then submits — grade 1.0, steps record real env transitions.""" + env = _env({"t/a.py::keep": True}) + policy = ScriptedPolicy(actions=[ + ToolCall("set_outcome", {"outcomes": {"t/a.py::t1": True, "t/a.py::t2": True}}), + "final answer: implemented the feature", + ]) + traj = collect_trajectory(env, _task(), policy) + assert isinstance(traj, CanonicalTrajectory) + assert traj.grade == 1.0 + assert traj.guard_ok is True and traj.hacked is False + assert len(traj.steps) == 2 + assert isinstance(traj.steps[0].action, ToolCall) + assert traj.steps[0].result == "ok" # env.step observation recorded + assert traj.provenance["source"] == "rollout_harness" + + +def test_collect_trajectory_guard_broken_zeroes_reward(): + env = _env({"t/a.py::keep": False}) # functional guard broken + policy = ScriptedPolicy(actions=[ + ToolCall("set_outcome", {"outcomes": {"t/a.py::t1": True, "t/a.py::t2": True, + "t/a.py::keep": False}}), + "done", + ]) + traj = collect_trajectory(env, _task(), policy) + assert traj.grade == 0.0 + assert traj.guard_ok is False + + +def test_collect_trajectory_near_miss(): + env = _env({"t/a.py::keep": True}) + policy = ScriptedPolicy(actions=[ + ToolCall("set_outcome", {"outcomes": {"t/a.py::t1": True}}), # 1 of 2 + "done", + ]) + traj = collect_trajectory(env, _task(), policy) + assert traj.grade == 0.5 + assert traj.guard_ok is True + + +def 
test_collect_trajectory_max_turns_grades_anyway(): + env = _env({"t/a.py::keep": True}) + looping = ScriptedPolicy(actions=[ToolCall("bash", {"command": "ls"})] * 50) + traj = collect_trajectory(env, _task(), looping, max_turns=3) + assert traj.grade is not None # graded despite never submitting + + +# --------------------------------------------------------------------- +# Admission routing (typed train-on-all, final report §4) +# --------------------------------------------------------------------- + + +def _t(grade, guard_ok=True, hacked=False) -> CanonicalTrajectory: + return CanonicalTrajectory(task_id="x", grade=grade, guard_ok=guard_ok, hacked=hacked) + + +def test_admit_routes_clean_pass_to_sft(): + v = admit(_t(1.0)) + assert v.sft_admitted and not v.dpo_candidate and not v.rejected + + +def test_admit_routes_near_miss_to_dpo(): + v = admit(_t(0.5)) + assert v.dpo_candidate and not v.sft_admitted and not v.rejected + + +def test_admit_rejects_hacked_even_at_full_grade(): + v = admit(_t(1.0, hacked=True)) + assert v.rejected and "hack monitor flagged" in v.reasons + + +def test_admit_rejects_guard_broken_and_ungraded(): + assert admit(_t(1.0, guard_ok=False)).rejected + assert admit(_t(None)).rejected + assert admit(_t(0.0)).rejected diff --git a/composer_replication/datagen/tests/test_swesmith_adapter.py b/composer_replication/datagen/tests/test_swesmith_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..2068bfe97eb457bd046ea6c9f159ce7ff7a19078 --- /dev/null +++ b/composer_replication/datagen/tests/test_swesmith_adapter.py @@ -0,0 +1,165 @@ +"""Tests for the SWE-smith adapter (deepread finding V4 — buy-vs-build). + +The load-bearing coverage: the PATCH-SEMANTICS INVERSION (SWE-smith's patch +introduces the bug; golden_diff must be its reverse) and the mechanical +reverse_unified_diff round-trip. 
+""" +from __future__ import annotations + +import json + +import pytest + +from composer_replication.datagen.schema import FeatureDeletionTask +from composer_replication.datagen.substrates import SweBenchAdapter +from composer_replication.datagen.swesmith_adapter import ( + UNREVERSED_MARKER, + SwesmithAdapter, + load_swesmith_instances, + parse_strategy, + reverse_unified_diff, +) + +BUG_PATCH = """\ +diff --git a/pkg/mod.py b/pkg/mod.py +index 1111111..2222222 100644 +--- a/pkg/mod.py ++++ b/pkg/mod.py +@@ -1,4 +1,3 @@ + def add(a, b): +- return a + b ++ return a - b + # trailing context +""" + + +def _instance(**over) -> dict: + base = { + "instance_id": "getmoto__moto.abc1234.lm_modify__1a2b", + "repo": "getmoto/moto", + "base_commit": "abc1234", + "patch": BUG_PATCH, + "FAIL_TO_PASS": json.dumps(["tests/test_mod.py::test_add"]), + "PASS_TO_PASS": json.dumps(["tests/test_mod.py::test_other"]), + "image_name": "swesmith.x86_64.getmoto__moto:latest", + } + base.update(over) + return base + + +# --------------------------------------------------------------------- +# Strategy parsing +# --------------------------------------------------------------------- + + +@pytest.mark.parametrize("iid,expected", [ + ("r__x.abc.lm_modify__1", "lm_modify"), + ("r__x.abc.lm_rewrite__1", "lm_rewrite"), + ("r__x.abc.func_pm_ctrl_invert_if__1", "procedural"), + ("r__x.abc.func_basic__1", "procedural"), + ("r__x.abc.combine_file__1", "combine"), + ("r__x.abc.combine_module__2", "combine"), + ("r__x.abc.pr_1234", "pr_mirror"), + ("r__x.abc.mystery__1", "unknown"), +]) +def test_parse_strategy(iid, expected): + assert parse_strategy(iid) == expected + + +# --------------------------------------------------------------------- +# reverse_unified_diff +# --------------------------------------------------------------------- + + +def test_reverse_swaps_adds_and_removes(): + rev = reverse_unified_diff(BUG_PATCH) + assert rev is not None + # The bug ADDED "return a - b"; the reverse must 
REMOVE it. + assert "- return a - b" in rev + assert "+ return a + b" in rev + # Hunk header ranges swapped: -1,4 +1,3 → -1,3 +1,4 + assert "@@ -1,3 +1,4 @@" in rev + # Context lines untouched. + assert " def add(a, b):" in rev + assert " # trailing context" in rev + + +def test_reverse_round_trip_is_identity_on_body(): + rev = reverse_unified_diff(BUG_PATCH) + rev2 = reverse_unified_diff(rev) + # Round trip restores hunks and +/- bodies (headers may normalize). + orig_body = [ln for ln in BUG_PATCH.splitlines() if ln[:1] in "+-@" and not ln.startswith(("+++", "---"))] + rt_body = [ln for ln in rev2.splitlines() if ln[:1] in "+-@" and not ln.startswith(("+++", "---"))] + assert orig_body == rt_body + + +def test_reverse_refuses_renames_and_binary(): + assert reverse_unified_diff("diff --git a/x b/y\nrename from x\nrename to y\n") is None + assert reverse_unified_diff("diff --git a/x b/x\nGIT binary patch\nliteral 5\n") is None + assert reverse_unified_diff("") is None + assert reverse_unified_diff("no hunks here") is None + + +# --------------------------------------------------------------------- +# Adapter +# --------------------------------------------------------------------- + + +def test_to_task_golden_diff_is_the_fix_not_the_bug(): + """THE semantic inversion: golden_diff must restore the feature.""" + task, meta = SwesmithAdapter().to_task_with_meta(_instance()) + assert isinstance(task, FeatureDeletionTask) + assert meta.diff_reversed is True + assert meta.strategy == "lm_modify" + # The FIX restores `a + b` (adds it back) and removes the bug. 
+ assert "+ return a + b" in task.golden_diff + assert "- return a - b" in task.golden_diff + assert UNREVERSED_MARKER not in task.golden_diff + + +def test_to_task_unreversible_patch_gets_marker(): + inst = _instance(patch="diff --git a/x b/y\nrename from x\nrename to y\n@@ -1 +1 @@\n-a\n+b\n") + task, meta = SwesmithAdapter().to_task_with_meta(inst) + assert meta.diff_reversed is False + assert task.golden_diff.startswith(UNREVERSED_MARKER) + + +def test_image_resolution_prefers_instance_field_then_convention(): + a = SwesmithAdapter() + assert a.image_for(_instance()) == "swesmith.x86_64.getmoto__moto:latest" + assert a.image_for(_instance(image_name=None, docker_image="custom:tag")) == "custom:tag" + inst = _instance(image_name=None) + inst.pop("docker_image", None) + assert a.image_for(inst) == "swesmith.x86_64.getmoto__moto:latest" + + +def test_f2p_p2p_tuple_handling_matches_swebench_semantics(): + task = SwesmithAdapter().to_task(_instance( + FAIL_TO_PASS=["t/a.py::t1", "t/a.py::t2"], # real list, not JSON string + PASS_TO_PASS=json.dumps([]), + )) + assert task.fail_to_pass == ("t/a.py::t1", "t/a.py::t2") + assert task.pass_to_pass == () + + +def test_difficulty_priors_by_strategy(): + pr = SwesmithAdapter().to_task(_instance(instance_id="r__x.abc.pr_99")) + proc = SwesmithAdapter().to_task(_instance(instance_id="r__x.abc.func_pm_remove_loop__1")) + assert pr.difficulty_prior < proc.difficulty_prior # PR Mirror harder prior + + +def test_redistributable_filter_interplay(): + """repo_gate owns repo-level licensing, but the per-instance filter from + SweBenchAdapter still composes when a license field IS present.""" + task = SwesmithAdapter().to_task(_instance(license_name="GPL-3.0")) + assert SweBenchAdapter.is_redistributable(task) is False + task2 = SwesmithAdapter().to_task(_instance(license_name="MIT")) + assert SweBenchAdapter.is_redistributable(task2) is True + + +def test_load_local_jsonl(tmp_path): + p = tmp_path / "fixtures.jsonl" + 
p.write_text("\n".join(json.dumps(_instance(instance_id=f"r__x.abc.pr_{i}")) for i in range(5))) + rows = load_swesmith_instances(str(p), limit=3) + assert len(rows) == 3 + assert rows[0]["instance_id"] == "r__x.abc.pr_0" diff --git a/composer_replication/datagen/tests/test_trajectory.py b/composer_replication/datagen/tests/test_trajectory.py new file mode 100644 index 0000000000000000000000000000000000000000..cdc58a1a8fb97b5a4cc47f42774857a3f156f55d --- /dev/null +++ b/composer_replication/datagen/tests/test_trajectory.py @@ -0,0 +1,127 @@ +"""Tests for the canonical trajectory IR (deepread findings V2/D-11/D-8). + +The load-bearing test is the SENTINEL leak guard: `to_policy_row` must never +emit golden_diff/deleted_symbols, even though the task dataclass carries them. +""" +from __future__ import annotations + +import json + +from composer_replication.datagen.schema import FeatureDeletionTask +from composer_replication.datagen.trajectory import ( + CanonicalTrajectory, + ToolCall, + TrajectoryStep, + from_trace_states, + to_policy_row, + to_sft_messages, +) +from composer_replication.teacher_replay import TraceState + + +def _task(**over) -> FeatureDeletionTask: + base = dict( + task_id="t1", repo="org/repo", base_commit="abc", + broken_image="img:1", test_command="pytest -q", + fail_to_pass=("t/a.py::t1",), pass_to_pass=("t/a.py::t2",), + golden_diff="SENTINEL_NEVER_LEAK", + deleted_symbols=("secret_fn",), + ) + base.update(over) + return FeatureDeletionTask(**base) + + +def _traj() -> CanonicalTrajectory: + return CanonicalTrajectory( + task_id="t1", + steps=[ + TrajectoryStep(observation="repo is broken", action=ToolCall("bash", {"command": "pytest"}), + result="2 failed", tool_error=False), + TrajectoryStep(observation="2 failed", action="here is my final patch", + result="graded", tool_error=False), + ], + grade=1.0, guard_ok=True, hacked=False, + provenance={"source": "test"}, + ) + + +# --------------------------------------------------------------------- 
+# ToolCall canonical form — the v1 divergence algebra +# --------------------------------------------------------------------- + + +def test_canonical_form_is_order_insensitive_on_args(): + a = ToolCall("edit", {"path": "x.py", "content": "y"}) + b = ToolCall("edit", {"content": "y", "path": "x.py"}) + assert a.canonical_form() == b.canonical_form() + + +def test_canonical_form_distinguishes_name_and_args(): + assert ToolCall("bash", {"command": "ls"}).canonical_form() != \ + ToolCall("bash", {"command": "ls -la"}).canonical_form() + assert ToolCall("read", {"f": "x"}).canonical_form() != \ + ToolCall("write", {"f": "x"}).canonical_form() + + +# --------------------------------------------------------------------- +# THE leak guard (finding D-8) +# --------------------------------------------------------------------- + + +def test_policy_row_never_contains_golden_diff_or_deleted_symbols(): + row = to_policy_row(_traj(), _task()) + blob = json.dumps(row) + assert "SENTINEL_NEVER_LEAK" not in blob + assert "secret_fn" not in blob + assert "golden_diff" not in blob + assert "deleted_symbols" not in blob + # And the row still carries what the policy MAY see. 
+ assert row["repo"] == "org/repo" + assert row["fail_to_pass"] == ["t/a.py::t1"] + assert row["grade"] == 1.0 + + +# --------------------------------------------------------------------- +# IR ↔ SFT messages +# --------------------------------------------------------------------- + + +def test_to_sft_messages_alternates_roles(): + msgs = to_sft_messages(_traj()) + assert msgs[0] == {"role": "user", "content": "repo is broken"} + assert msgs[1]["role"] == "assistant" + assert "[TOOL_USE] name=bash" in msgs[1]["content"] + assert msgs[2] == {"role": "user", "content": "2 failed"} + + +# --------------------------------------------------------------------- +# Claude Code → IR adapter +# --------------------------------------------------------------------- + + +def test_from_trace_states_parses_single_tool_use_and_error_flag(): + states: list[TraceState] = [ + { + "state_id": "sess1::0000", + "messages": [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "[TOOL_RESULT (ERROR)] (id=x)\nboom", + "tool_error": True}, + ], + "student_action": '[TOOL_USE] name=Bash input={"command":"ls"}', + }, + { + "state_id": "sess1::0001", + "messages": [{"role": "user", "content": "plain prompt"}], + "student_action": "I think the fix is...\n\n[TOOL_USE] name=Edit input={\"p\":1}\n\n[TOOL_USE] name=Bash input={\"c\":2}", + }, + ] + traj = from_trace_states(states) + assert traj.task_id == "sess1" + assert traj.grade is None # ungraded — Claude Code traces have no oracle + s0, s1 = traj.steps + assert isinstance(s0.action, ToolCall) and s0.action.name == "Bash" + assert s0.tool_error is True + # Multi-tool turn stays as the raw string (honest, not guessed). 
+ assert isinstance(s1.action, str) + assert s1.tool_error is False diff --git a/composer_replication/datagen/trajectory.py b/composer_replication/datagen/trajectory.py new file mode 100644 index 0000000000000000000000000000000000000000..9f891233807c57334e3a9129986be50a89846582 --- /dev/null +++ b/composer_replication/datagen/trajectory.py @@ -0,0 +1,203 @@ +"""trajectory.py — the canonical trajectory IR (deepread findings V2/D-11/D-8). + +THE GAP THIS CLOSES: the repo had 3 (heading to 5) incompatible trajectory +shapes — Claude Code TraceState text serialization, Bedrock `.jsonl.out` rows, +the planned tree/rollout/OpenHands shapes — with no shared schema, and the +divergence gate rested on a whitespace-collapse string normalizer +(`teacher_replay._normalize_action`, self-admitted skeleton). This module is the +single intermediate representation every producer adapts INTO and every corpus +writer reads FROM. + +SECURITY INVARIANT (finding D-8): `FeatureDeletionTask.golden_diff` uses +`repr=False`, but `dataclasses.asdict()` and naive JSON serialization still +include it. `to_policy_row()` is the ONE serializer allowed to produce +policy-visible rows, and its output is unit-tested to never contain +`golden_diff` or `deleted_symbols`. +""" +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field +from typing import Any + +from composer_replication.datagen.schema import FeatureDeletionTask +from composer_replication.teacher_replay import TraceState + +#: Bump when the IR shape changes; carried on every trajectory + corpus row. +SCHEMA_VERSION = "1" + + +@dataclass(frozen=True) +class ToolCall: + """One structured tool invocation — the unit of the action algebra. + + `canonical_form()` is the v1 divergence-gate algebra (finding D-3): two + actions are "the same" iff their canonical forms match (tool name + sorted, + JSON-normalized args). 
This replaces the whitespace-collapse stub that made + the divergence gate fire on noise. v2 will need arg-level normalization + (path equivalence, whitespace-insensitive code args); keep that evolution + HERE so every consumer inherits it. + """ + + name: str + args: dict = field(default_factory=dict) + + def canonical_form(self) -> str: + try: + args_json = json.dumps(self.args, sort_keys=True, separators=(",", ":")) + except (TypeError, ValueError): + args_json = str(self.args) + return f"{self.name}:{args_json}" + + +@dataclass +class TrajectoryStep: + """One agent turn: what it saw, what it did, what came back.""" + + observation: str + action: ToolCall | str # str = plain text / final message + result: str | None = None # tool output observed AFTER the action + tool_error: bool = False + + +@dataclass +class CanonicalTrajectory: + """The IR: an episode (or trace) as a typed step list + outcome + provenance.""" + + task_id: str + steps: list[TrajectoryStep] = field(default_factory=list) + grade: float | None = None # _grade() pass-fraction; None = ungraded trace + guard_ok: bool = True + hacked: bool = False + provenance: dict = field(default_factory=dict) # source, policy id, cost_usd, run_id + schema_version: str = SCHEMA_VERSION + + +# --------------------------------------------------------------------- +# Producers → IR +# --------------------------------------------------------------------- + +# ClaudeCodeIngester serializes tool calls as "[TOOL_USE] name=<name> input=<json>". +_TOOL_USE_RE = re.compile(r"^\[TOOL_USE\] name=(?P<name>\S+) input=(?P<input>\{.*\})$") + + +def _parse_action(student_action: str) -> ToolCall | str: + """Parse a TraceState student_action back into a ToolCall where possible. + + Claude Code assistant turns serialize as newline-joined blocks; if exactly + one [TOOL_USE] block is present we recover the structured call (the common + case ADR-002 chose one-node-per-turn for).
Multi-tool turns and pure-text / + thinking turns stay as the raw string — honest about what we can't + structure rather than guessing. + """ + blocks = [b for b in student_action.split("\n\n") if b.strip()] + tool_blocks = [b for b in blocks if b.startswith("[TOOL_USE]")] + if len(tool_blocks) == 1: + m = _TOOL_USE_RE.match(tool_blocks[0].strip()) + if m: + try: + return ToolCall(name=m.group("name"), args=json.loads(m.group("input"))) + except (json.JSONDecodeError, ValueError): + pass + return student_action + + +def from_trace_states( + states: list[TraceState], + *, + task_id: str = "", + provenance: dict | None = None, +) -> CanonicalTrajectory: + """Adapt a Claude Code trace (TraceState list) into the IR. + + HONEST CAPABILITY NOTE (finding D-1): these traces carry no executable + environment — no broken_image, no FAIL_TO_PASS — so the resulting + trajectory is UNGRADED (`grade=None`) and is admissible only for flat + Channel-3 / SFT-style uses, never as a tree seed. Env-grounded rollouts + (rollout_harness.collect_trajectory) are the graded producers. + """ + steps: list[TrajectoryStep] = [] + for s in states: + # The observation for step t is the last user message before the turn. 
+ obs = "" + tool_error = False + for msg in reversed(s["messages"]): + if msg.get("role") == "user": + obs = str(msg.get("content", "")) + tool_error = bool(msg.get("tool_error", False)) + break + steps.append(TrajectoryStep( + observation=obs, + action=_parse_action(s["student_action"]), + result=None, + tool_error=tool_error, + )) + prov = {"source": "claude_code", **(provenance or {})} + return CanonicalTrajectory(task_id=task_id or (states[0]["state_id"].split("::")[0] if states else ""), + steps=steps, grade=None, provenance=prov) + + +# --------------------------------------------------------------------- +# IR → consumers +# --------------------------------------------------------------------- + + +def _action_text(action: ToolCall | str) -> str: + if isinstance(action, ToolCall): + return f"[TOOL_USE] name={action.name} input=" + json.dumps( + action.args, separators=(",", ":") + ) + return action + + +def to_sft_messages(traj: CanonicalTrajectory) -> list[dict]: + """IR → OpenAI-style messages for SFT (obs→user, action→assistant).""" + messages: list[dict] = [] + for step in traj.steps: + if step.observation: + messages.append({"role": "user", "content": step.observation}) + messages.append({"role": "assistant", "content": _action_text(step.action)}) + if step.result: + messages.append({"role": "user", "content": step.result}) + return messages + + +#: Task fields the POLICY may see. Everything else (golden_diff, +#: deleted_symbols) is construction-side and must never reach a corpus row. +_POLICY_VISIBLE_TASK_FIELDS = ( + "task_id", "repo", "base_commit", "test_command", + "fail_to_pass", "pass_to_pass", "granularity", "difficulty_prior", +) + + +def to_policy_row(traj: CanonicalTrajectory, task: FeatureDeletionTask) -> dict: + """THE policy-visible corpus serializer (finding D-8 — the leak guard). + + Builds the row field-by-field from an allowlist; never `asdict(task)`, + which would include `golden_diff` despite its `repr=False`. 
Unit-tested + with a sentinel to prove the absence. + """ + row: dict[str, Any] = { + "schema_version": traj.schema_version, + "messages": to_sft_messages(traj), + "grade": traj.grade, + "guard_ok": traj.guard_ok, + "hacked": traj.hacked, + "provenance": dict(traj.provenance), + } + for f in _POLICY_VISIBLE_TASK_FIELDS: + v = getattr(task, f) + row[f] = list(v) if isinstance(v, tuple) else v + return row + + +__all__ = [ + "SCHEMA_VERSION", + "ToolCall", + "TrajectoryStep", + "CanonicalTrajectory", + "from_trace_states", + "to_sft_messages", + "to_policy_row", +] diff --git a/composer_replication/diloco/__init__.py b/composer_replication/diloco/__init__.py index 3b912ad9d27dda9129821fa2c68c82c832472698..5fe9f276efd3d971fe0ca774d5129fe0fa959bf6 100644 --- a/composer_replication/diloco/__init__.py +++ b/composer_replication/diloco/__init__.py @@ -4,9 +4,12 @@ Wraps `torchft.local_sgd.DiLoCo` with the framework's conventions: - Sign convention is documented LOUDLY here once and tested via Spike 008. - The wrapper exposes the same constructor shape as torchft's DiLoCo so a future swap-in of the upstream class is a one-line change. -- Vanilla DiLoCo (Douillard et al. 2023) = `fragment_sync_delay=0`, single - fragment. Streaming DiLoCo (Liu et al. 2025) = non-zero delay, multiple - fragments. Spike 008 uses vanilla; Streaming is configured by the same API. +- Vanilla DiLoCo (Douillard et al. 2023, arXiv:2311.08105) = + `fragment_sync_delay=0`, single fragment. Streaming DiLoCo (Douillard et + al., arXiv:2501.18512 "Streaming DiLoCo with overlapping communication"; + the separate Eager-Updates work is Kale et al., arXiv:2502.12996 — citation + corrected per deepread finding V7) = non-zero delay, multiple fragments. + Spike 008 uses vanilla; Streaming is configured by the same API. Reference: `docs/adrs/ADR-003-diloco-impl.md`. 
diff --git a/composer_replication/opsd.py b/composer_replication/opsd.py index cd0e45275cdfc3030837796aef1179bd5f7c5f32..c795c388e17c84feb2f6c854adf0c99a2b35b3f7 100644 --- a/composer_replication/opsd.py +++ b/composer_replication/opsd.py @@ -10,17 +10,23 @@ Mathematical reference: - OPSD paper: Zhao et al., "Self-Distilled Reasoner: On-Policy Self-Distillation for LLMs", arXiv:2601.18734. - SDPO paper: Hübotter et al., "Reinforcement Learning via Self-Distillation", - arXiv:2601.20802 (formalizes the same loss as Composer 2.5's "Targeted RL with - Textual Feedback"). + arXiv:2601.20802. PROVENANCE (corrected per deepread finding V1): Cursor's + blog cites SDPO/OPSD only as *background* ("For more background on this + approach see…"), NOT as its mechanism. Published SDPO distills over the FULL + rollout with feedback in the prefix and an EMA-regularized teacher; this + repo's channel is a turn-localized hint-splice with a live (stop-grad, + non-EMA) teacher — a third, blog-inspired design, neither verbatim SDPO nor + confirmed-Composer. The kernel below matches OPSD's generalized JSD math. The loss computes JSD/KL divergence between a teacher distribution (model conditioned on privileged information / a hint) and a student distribution (model on the original context). Both come from the SAME model — the teacher is just "the model with hint inserted into context." -Composer 2.5 uses this with the privileged information being a "hint" inserted -at the error-turn site. We use the same loss; the data collator constructs -ctx_teacher = ctx_student + hint_at_error_turn for us. +Composer 2.5's blog describes inserting a "hint" at the error-turn site and +distilling the student toward the hint-conditioned distribution "for that turn +only". The data collator constructs ctx_teacher = ctx_student + +hint_at_error_turn for us. 
""" from __future__ import annotations diff --git a/composer_replication/pipeline/__init__.py b/composer_replication/pipeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4388cf11938b7098c5d3291301745ae37ffb1485 --- /dev/null +++ b/composer_replication/pipeline/__init__.py @@ -0,0 +1,38 @@ +"""composer_replication.pipeline — the Stage-0 dataset-pipeline contract + driver. + +THE single reconciled dataset contract (supersedes the two divergent layouts in +research/design-F1 and design-F2 — deepread finding V8/D-7), the pragmatic +near-duplicate detector, and the local stage-driver that turns +(tasks, env, policy) into a carded, deduped, holdout-split corpus. +""" +from composer_replication.pipeline.build_corpus import build_corpus +from composer_replication.pipeline.dedup import ( + dedup, + find_near_duplicates, + jaccard_estimate, + minhash_signature, +) +from composer_replication.pipeline.s3_contract import ( + RunLayout, + RunManifest, + write_dataset_card, + write_dpo_rows, + write_sft_rows, + write_tasks, + write_tasks_full, +) + +__all__ = [ + "RunLayout", + "RunManifest", + "build_corpus", + "dedup", + "find_near_duplicates", + "jaccard_estimate", + "minhash_signature", + "write_dataset_card", + "write_dpo_rows", + "write_sft_rows", + "write_tasks", + "write_tasks_full", +] diff --git a/composer_replication/pipeline/build_corpus.py b/composer_replication/pipeline/build_corpus.py new file mode 100644 index 0000000000000000000000000000000000000000..4c214ed5fdeb3f526b8724b29e8b8d7f0524ea23 --- /dev/null +++ b/composer_replication/pipeline/build_corpus.py @@ -0,0 +1,137 @@ +"""build_corpus.py — the local Stage-0 stage-driver (architecture step 6-7). 
+ +One function wires the whole local pipeline: holdout-split the task pool +(holdout tasks are NEVER rolled out — they are the eval anchor), roll out a +policy over each train task, admit + type + route trajectories +(sft / dpo-candidate / quarantine), dedup the SFT rows (within-run AND against +a prior generation's signatures), and write everything through the single +s3_contract layout with a manifest + dataset card. + +Deliberately LOCAL-first (finding D-9): the five-service AWS orchestration is +Stage 4; this driver must produce one real corpus end-to-end on a laptop with a +FakeSandbox before anything is distributed. Write-once per layout +(finding D-21): refuses to run if the manifest already exists. +""" +from __future__ import annotations + +from typing import Callable, Sequence + +from composer_replication.datagen.env import FeatureDeletionEnv +from composer_replication.datagen.rollout_harness import ( + RolloutPolicy, + admit, + collect_trajectory, +) +from composer_replication.datagen.schema import FeatureDeletionTask +from composer_replication.datagen.trajectory import to_policy_row +from composer_replication.pipeline import s3_contract +from composer_replication.pipeline.dedup import dedup +from composer_replication.pipeline.s3_contract import RunLayout, RunManifest +from composer_replication.safety.holdout import HeldoutSplit + + +def build_corpus( + source_tasks: Sequence[FeatureDeletionTask], + env_factory: Callable[[], FeatureDeletionEnv], + policy_factory: Callable[[], RolloutPolicy], + layout: RunLayout, + manifest: RunManifest, + *, + holdout_frac: float = 0.2, + holdout_seed: int = 0, + max_tasks: int | None = None, + cost_per_rollout_usd: float = 0.0, + prior_signatures: Sequence[Sequence[int]] | None = None, + dedup_threshold: float = 0.85, +) -> RunManifest: + """Run the Stage-0 pipeline over `source_tasks`; returns the final manifest. 
+ + Args: + source_tasks: gate_repo-admitted FeatureDeletionTasks (the caller runs + `datagen.repo_gate.gate_repo` BEFORE this — the driver assumes the + license/decontamination gates already passed). + env_factory: fresh `FeatureDeletionEnv` per rollout (a sandbox is + stateful; sharing one across episodes leaks trajectory state). + policy_factory: fresh policy per rollout (ScriptedPolicy is stateful). + manifest: a `RunManifest` with run_id/created_at/budget set by the + caller (created_at is caller-passed for reproducibility). + cost_per_rollout_usd: accounting hook — API policies should report + real cost; the driver enforces `manifest.budget_usd` with it. + prior_signatures: previous generation's MinHash signatures + (cross-generation dedup, finding D-12). + + Raises: + FileExistsError: if the layout already has a manifest (write-once). + """ + if s3_contract.manifest_exists(layout): + raise FileExistsError( + f"Run layout already has a manifest at {layout.manifest_path} — " + "runs are write-once per (root, run_id); mint a new run_id " + "(finding D-21 idempotency)." + ) + + # 1. Holdout split FIRST — held-out tasks are never rolled out, so no + # training signal can derive from them (the HeldoutSplit discipline). + split = HeldoutSplit.split(source_tasks, holdout_frac=holdout_frac, + seed=holdout_seed, check_content=True) + by_id = {t.task_id: t for t in source_tasks} + holdout_tasks = [by_id[i] for i in sorted(split.holdout_ids)] + train_tasks = [by_id[i] for i in sorted(split.train_ids)] + if max_tasks is not None: + train_tasks = train_tasks[:max_tasks] + + # 2. Rollouts + admission routing. 
+ sft_rows: list[dict] = [] + dpo_rows: list[dict] = [] + quarantine_rows: list[dict] = [] + traj_rows: list[dict] = [] + partial = False + for task in train_tasks: + if manifest.over_budget: + partial = True + break + traj = collect_trajectory(env_factory(), task, policy_factory(), + provenance={"run_id": manifest.run_id}) + manifest.spend(cost_per_rollout_usd) + verdict = admit(traj) + row = to_policy_row(traj, task) + traj_rows.append({**row, "admission": list(verdict.reasons)}) + if verdict.sft_admitted: + sft_rows.append(row) + elif verdict.dpo_candidate: + dpo_rows.append(row) + else: + quarantine_rows.append({**row, "reasons": list(verdict.reasons)}) + + # 3. Dedup the SFT corpus (within-run + cross-generation). + def _key(r: dict) -> str: + return " ".join(m.get("content", "") for m in r.get("messages", [])) + + sft_rows, dedup_stats = dedup(sft_rows, _key, dedup_threshold, + prior_signatures=prior_signatures) + + # 4. Write everything through the contract. + s3_contract.write_tasks(layout, train_tasks) + s3_contract.write_tasks_full(layout, train_tasks) + s3_contract.write_holdout(layout, holdout_tasks) + s3_contract.write_trajectories(layout, traj_rows) + s3_contract.write_sft_rows(layout, sft_rows) + s3_contract.write_dpo_rows(layout, dpo_rows) + s3_contract.write_quarantine(layout, quarantine_rows) + + manifest.counts = { + "tasks_train": len(train_tasks), + "tasks_holdout": len(holdout_tasks), + "rollouts": len(traj_rows), + "sft_rows": len(sft_rows), + "dpo_rows": len(dpo_rows), + "quarantined": len(quarantine_rows), + **{f"dedup_{k}": v for k, v in dedup_stats.items()}, + } + manifest.status = "partial" if partial else "building" + manifest.write(layout) + s3_contract.write_dataset_card(layout, manifest, dedup_stats=dedup_stats) + return manifest + + +__all__ = ["build_corpus"] diff --git a/composer_replication/pipeline/dedup.py b/composer_replication/pipeline/dedup.py new file mode 100644 index 
0000000000000000000000000000000000000000..30f1ea5ab539d6edc9f1e3ee70f0e774bc33a0c7 --- /dev/null +++ b/composer_replication/pipeline/dedup.py @@ -0,0 +1,138 @@ +"""dedup.py — MinHash near-duplicate detection (finding D-12). + +Cross-generation dedup is a flywheel-collapse mitigation: a self-training loop +that re-ingests its own outputs accumulates near-identical rows, and per-batch +`document_deduplicator` (the only dedup the old designs had) never sees across +runs. This module computes MinHash signatures over word 5-shingles so a run can +(a) dedup within itself and (b) accept the PRIOR run's signature file and dedup +against it (lineage threaded by `RunManifest.parent_run_id`). + +Pragmatic v1: seeded-md5 permutation MinHash with N=64 seeds, no banding/LSH +(O(n^2) pair scan — fine for Stage-0 corpus sizes, thousands of rows). +`datasketch` (MinHashLSH) is the documented upgrade path when row counts make +the pair scan bite. + +NOTE on hash stability: Python's builtin `hash()` over str is salted per +process (PYTHONHASHSEED), which would make signatures non-portable across +runs — and portable signatures are exactly what cross-generation dedup needs. We therefore use md5-based +hashing (stable everywhere) despite the small speed cost.
+""" +from __future__ import annotations + +import hashlib +import json +import re +from typing import IO, Callable, Iterable, Sequence + +N_PERMUTATIONS = 64 +_SHINGLE_W = 5 +_WORD_RE = re.compile(r"\w+") +_MAX64 = (1 << 64) - 1 + + +def _shingles(text: str, w: int = _SHINGLE_W) -> set[str]: + words = _WORD_RE.findall(text.lower()) + if len(words) <= w: + return {" ".join(words)} if words else set() + return {" ".join(words[i:i + w]) for i in range(len(words) - w + 1)} + + +def _stable_hash(s: str, seed: int) -> int: + h = hashlib.md5(f"{seed}:{s}".encode()).digest() + return int.from_bytes(h[:8], "big") + + +def minhash_signature(text: str, n_perm: int = N_PERMUTATIONS) -> tuple[int, ...]: + """MinHash signature: per-seed minimum over the shingle set.""" + sh = _shingles(text) + if not sh: + return tuple([_MAX64] * n_perm) + return tuple(min(_stable_hash(s, seed) for s in sh) for seed in range(n_perm)) + + +def jaccard_estimate(sig_a: Sequence[int], sig_b: Sequence[int]) -> float: + """Estimated Jaccard similarity = fraction of agreeing signature slots.""" + if len(sig_a) != len(sig_b) or not sig_a: + raise ValueError("signatures must be equal-length and non-empty") + return sum(1 for a, b in zip(sig_a, sig_b) if a == b) / len(sig_a) + + +def find_near_duplicates( + rows: Sequence[dict], + key_fn: Callable[[dict], str], + threshold: float = 0.85, + *, + prior_signatures: Sequence[Sequence[int]] | None = None, +) -> list[tuple[int, int]]: + """All (i, j) index pairs whose estimated Jaccard >= threshold. + + `prior_signatures` (from a previous run) participate as virtual rows with + negative indices -(k+1), so a pair (i, -1) means "row i duplicates prior + signature 0" — the cross-generation case. 
+ """ + sigs = [minhash_signature(key_fn(r)) for r in rows] + pairs: list[tuple[int, int]] = [] + for i in range(len(sigs)): + for j in range(i + 1, len(sigs)): + if jaccard_estimate(sigs[i], sigs[j]) >= threshold: + pairs.append((i, j)) + for k, prior in enumerate(prior_signatures or []): + if jaccard_estimate(sigs[i], prior) >= threshold: + pairs.append((i, -(k + 1))) + return pairs + + +def dedup( + rows: Sequence[dict], + key_fn: Callable[[dict], str], + threshold: float = 0.85, + *, + prior_signatures: Sequence[Sequence[int]] | None = None, +) -> tuple[list[dict], dict]: + """Keep-first dedup. Returns (kept_rows, stats). + + A row duplicating a PRIOR-run signature is dropped outright (the prior run + already owns it); within-run duplicates keep the earliest occurrence. + """ + pairs = find_near_duplicates(rows, key_fn, threshold, + prior_signatures=prior_signatures) + drop: set[int] = set() + for i, j in pairs: + if j < 0: + drop.add(i) # duplicates a prior-run row + else: + drop.add(j) # keep-first within this run + kept = [r for i, r in enumerate(rows) if i not in drop] + return kept, { + "rows_in": len(rows), + "rows_kept": len(kept), + "dropped_within_run": len({j for _, j in pairs if j >= 0} & drop), + "dropped_cross_generation": len({i for i, j in pairs if j < 0} & drop), + "threshold": threshold, + } + + +def signatures_to_jsonl(rows: Sequence[dict], key_fn: Callable[[dict], str], + fh: IO[str]) -> int: + """Persist this run's signatures so the NEXT generation can dedup against + them (pass the loaded list as `prior_signatures`).""" + n = 0 + for r in rows: + fh.write(json.dumps(list(minhash_signature(key_fn(r)))) + "\n") + n += 1 + return n + + +def load_signatures(fh: IO[str]) -> list[tuple[int, ...]]: + return [tuple(json.loads(line)) for line in fh if line.strip()] + + +__all__ = [ + "N_PERMUTATIONS", + "dedup", + "find_near_duplicates", + "jaccard_estimate", + "load_signatures", + "minhash_signature", + "signatures_to_jsonl", +] diff --git 
a/composer_replication/pipeline/s3_contract.py b/composer_replication/pipeline/s3_contract.py new file mode 100644 index 0000000000000000000000000000000000000000..87a490dbde5886ab134dc30ff81682cb0e1e9fbd --- /dev/null +++ b/composer_replication/pipeline/s3_contract.py @@ -0,0 +1,287 @@ +"""s3_contract.py — THE single dataset layout + manifest (finding V8/D-7/D-8). + +Supersedes BOTH prior contracts: design-F1's `runs/<run_id>/{sft_corpus,dpo_pairs, +rl_task_pool,divergence_pairs,wm_tuples,holdout,diloco_rendezvous}` and +design-F2's `{traces,tasks,replay,task_grades,corpus}/v1/run_id=<run_id>` — the two +were never reconciled and coexisted in the grounding doc. One layout, one +manifest, two explicit serializers with a unit-tested leak guard. + +Deliberate exclusions from the run layout: + * `diloco_rendezvous/` — training-comms state, not dataset; lives in its own + prefix/bucket (finding D-19). + * `wm_tuples/` — emitted only when the P4 world-model ablation is scheduled + (finding D-14); not part of Stage 0.
+ +Layout (root = any local path or fsspec URI): + <root>/runs/<run_id>/ + tasks/manifest.jsonl policy-safe task rows (golden_diff -> sha256) + tasks_full/manifest.jsonl construction-side full rows (RESTRICTED prefix) + traj/*.jsonl CanonicalTrajectory records (audit trail) + corpus_sft/rows.jsonl admitted SFT rows (to_policy_row output) + corpus_dpo/rows.jsonl DPO-candidate rows + holdout/tasks.jsonl held-out task ids+rows (never rolled out) + quarantine/*.jsonl rejected trajectories w/ reasons (audit) + manifest.json RunManifest + DATASET_CARD.md human-readable card +""" +from __future__ import annotations + +import dataclasses +import hashlib +import json +from dataclasses import dataclass, field +from typing import IO, Iterable + +from composer_replication.datagen.schema import FeatureDeletionTask + +SCHEMA_VERSION = "1" + + +def _is_local(root: str) -> bool: + return "://" not in root or root.startswith("file://") + + +def _open(path: str, mode: str = "w") -> IO[str]: + """Open a path for text IO; plain `open` locally, fsspec for s3:// etc. + + fsspec is lazy so the module (and all local-corpus runs) need no extra dep. + """ + if _is_local(path): + import os + local = path.removeprefix("file://") + os.makedirs(os.path.dirname(local), exist_ok=True) + return open(local, mode, encoding="utf-8") + try: + import fsspec # noqa: PLC0415 — lazy heavy dep + except ImportError as e: + raise RuntimeError( + "Non-local corpus roots require fsspec; install with " + "`pip install -e .[serverless]`.
Got: " + repr(e) + ) from e + return fsspec.open(path, mode, encoding="utf-8").open() + + +def _exists(path: str) -> bool: + if _is_local(path): + import os + return os.path.exists(path.removeprefix("file://")) + import fsspec # noqa: PLC0415 + fs, _, paths = fsspec.get_fs_token_paths(path) + return bool(fs.exists(paths[0])) + + +@dataclass(frozen=True) +class RunLayout: + """Pure-path logic for one run's prefixes — testable without any IO.""" + + root: str + run_id: str + + def _p(self, *parts: str) -> str: + base = self.root.rstrip("/") + return f"{base}/runs/{self.run_id}/" + "/".join(parts) + + @property + def tasks_path(self) -> str: + return self._p("tasks", "manifest.jsonl") + + @property + def tasks_full_path(self) -> str: + # RESTRICTED prefix: carries golden_diff/deleted_symbols. On S3 this + # prefix gets a deny-by-default policy; locally it is still separated + # so a naive `corpus_*` glob can never sweep it up. + return self._p("tasks_full", "manifest.jsonl") + + @property + def traj_path(self) -> str: + return self._p("traj", "trajectories.jsonl") + + @property + def sft_path(self) -> str: + return self._p("corpus_sft", "rows.jsonl") + + @property + def dpo_path(self) -> str: + return self._p("corpus_dpo", "rows.jsonl") + + @property + def holdout_path(self) -> str: + return self._p("holdout", "tasks.jsonl") + + @property + def quarantine_path(self) -> str: + return self._p("quarantine", "rejected.jsonl") + + @property + def manifest_path(self) -> str: + return self._p("manifest.json") + + @property + def card_path(self) -> str: + return self._p("DATASET_CARD.md") + + +@dataclass +class RunManifest: + """Run-level metadata: counts, cost, lineage, budget, acceptance status. + + `created_at` is CALLER-passed (never datetime.now() in here) so manifests + are reproducible in tests. `parent_run_id` threads flywheel lineage so + cross-generation dedup (finding D-12) can find prior signatures. 
+ """ + + run_id: str + created_at: str + source: str = "" + counts: dict = field(default_factory=dict) + cost_usd: float = 0.0 + parent_run_id: str | None = None + schema_version: str = SCHEMA_VERSION + status: str = "building" # building | accepted | rejected | partial + budget_usd: float | None = None + + def spend(self, usd: float) -> None: + self.cost_usd += usd + + @property + def over_budget(self) -> bool: + return self.budget_usd is not None and self.cost_usd >= self.budget_usd + + def write(self, layout: RunLayout) -> None: + with _open(layout.manifest_path) as f: + json.dump(dataclasses.asdict(self), f, indent=2) + + @classmethod + def read(cls, layout: RunLayout) -> RunManifest: + with _open(layout.manifest_path, "r") as f: + return cls(**json.load(f)) + + +# --------------------------------------------------------------------- +# Writers — the leak guard lives here (finding D-8) +# --------------------------------------------------------------------- + + +def _task_row_policy_safe(task: FeatureDeletionTask) -> dict: + """Task row with the construction-side secrets REPLACED, not just hidden. + + `asdict()` includes `golden_diff` despite `repr=False` — that is exactly + the leak D-8 flagged. We keep provenance via a sha256 (verifiable, not + recoverable) and drop `deleted_symbols` entirely (they name the answer). + """ + row = dataclasses.asdict(task) + gold = row.pop("golden_diff", "") + row.pop("deleted_symbols", None) + row["golden_diff_sha256"] = hashlib.sha256(gold.encode()).hexdigest() if gold else "" + return row + + +def write_tasks(layout: RunLayout, tasks: Iterable[FeatureDeletionTask]) -> int: + """Write the POLICY-SAFE task manifest (the default everything reads).""" + n = 0 + with _open(layout.tasks_path) as f: + for t in tasks: + f.write(json.dumps(_task_row_policy_safe(t)) + "\n") + n += 1 + return n + + +def write_tasks_full(layout: RunLayout, tasks: Iterable[FeatureDeletionTask]) -> int: + """Write FULL task rows (incl. 
golden_diff) to the RESTRICTED prefix. + + Only the validator/monitor side reads this; never corpus consumers. + """ + n = 0 + with _open(layout.tasks_full_path) as f: + for t in tasks: + f.write(json.dumps(dataclasses.asdict(t)) + "\n") + n += 1 + return n + + +def _write_jsonl(path: str, rows: Iterable[dict]) -> int: + n = 0 + with _open(path) as f: + for r in rows: + f.write(json.dumps(r) + "\n") + n += 1 + return n + + +def write_sft_rows(layout: RunLayout, rows: Iterable[dict]) -> int: + return _write_jsonl(layout.sft_path, rows) + + +def write_dpo_rows(layout: RunLayout, rows: Iterable[dict]) -> int: + return _write_jsonl(layout.dpo_path, rows) + + +def write_quarantine(layout: RunLayout, rows: Iterable[dict]) -> int: + return _write_jsonl(layout.quarantine_path, rows) + + +def write_holdout(layout: RunLayout, tasks: Iterable[FeatureDeletionTask]) -> int: + return _write_jsonl(layout.holdout_path, (_task_row_policy_safe(t) for t in tasks)) + + +def write_trajectories(layout: RunLayout, rows: Iterable[dict]) -> int: + return _write_jsonl(layout.traj_path, rows) + + +def write_dataset_card(layout: RunLayout, manifest: RunManifest, + *, license_tiers: dict[str, int] | None = None, + dedup_stats: dict | None = None, + decontamination_note: str = "") -> None: + """A small human-readable dataset card (finding D-18).""" + lines = [ + f"# Dataset card — run `{manifest.run_id}`", + "", + f"- **created:** {manifest.created_at}", + f"- **source:** {manifest.source}", + f"- **status:** {manifest.status}", + f"- **schema_version:** {manifest.schema_version}", + f"- **cost (USD):** {manifest.cost_usd:.2f}" + + (f" / budget {manifest.budget_usd:.2f}" if manifest.budget_usd else ""), + f"- **lineage:** parent_run_id={manifest.parent_run_id or 'none'}", + "", + "## Counts", + "", + ] + for k, v in sorted(manifest.counts.items()): + lines.append(f"- {k}: {v}") + if license_tiers: + lines += ["", "## License tiers seen", ""] + lines += [f"- {k}: {v}" for k, v in 
sorted(license_tiers.items())] + lines += ["", "## Decontamination", "", + decontamination_note or + "All source repos checked against the SWE-bench-family eval list " + "(datagen.repo_gate.DECONTAMINATION_LIST) at ingest."] + if dedup_stats: + lines += ["", "## Dedup", ""] + lines += [f"- {k}: {v}" for k, v in sorted(dedup_stats.items())] + lines += ["", "Policy-safe rows only: `golden_diff` is sha256-hashed and " + "`deleted_symbols` dropped in `tasks/`, `corpus_*/`, `holdout/` " + "(full rows live in the restricted `tasks_full/`).", ""] + with _open(layout.card_path) as f: + f.write("\n".join(lines)) + + +def manifest_exists(layout: RunLayout) -> bool: + """Write-once guard for the driver (finding D-21 idempotency).""" + return _exists(layout.manifest_path) + + +__all__ = [ + "SCHEMA_VERSION", + "RunLayout", + "RunManifest", + "manifest_exists", + "write_dataset_card", + "write_dpo_rows", + "write_holdout", + "write_quarantine", + "write_sft_rows", + "write_tasks", + "write_tasks_full", + "write_trajectories", +] diff --git a/composer_replication/pipeline/tests/__init__.py b/composer_replication/pipeline/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/composer_replication/pipeline/tests/test_pipeline.py b/composer_replication/pipeline/tests/test_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..38a189a38351b19d356d970b0b792b96c5768181 --- /dev/null +++ b/composer_replication/pipeline/tests/test_pipeline.py @@ -0,0 +1,223 @@ +"""Tests for the Stage-0 pipeline: contract, dedup, driver. + +Load-bearing coverage: the sentinel leak guard on write_tasks (finding D-8), +holdout exclusion + budget stop + idempotency in build_corpus (D-21), and the +cross-generation dedup path (D-12). 
+""" +from __future__ import annotations + +import io +import json +from pathlib import Path + +import pytest + +from composer_replication.datagen.env import FeatureDeletionEnv +from composer_replication.datagen.rollout_harness import ScriptedPolicy +from composer_replication.datagen.sandbox import FakeSandbox +from composer_replication.datagen.schema import FeatureDeletionTask +from composer_replication.datagen.trajectory import ToolCall +from composer_replication.pipeline.build_corpus import build_corpus +from composer_replication.pipeline.dedup import ( + dedup, + find_near_duplicates, + jaccard_estimate, + load_signatures, + minhash_signature, + signatures_to_jsonl, +) +from composer_replication.pipeline.s3_contract import ( + RunLayout, + RunManifest, + write_dataset_card, + write_tasks, + write_tasks_full, +) + + +def _task(i: int, **over) -> FeatureDeletionTask: + base = dict( + task_id=f"task-{i:03d}", repo="org/repo", base_commit="abc", + broken_image="img:1", test_command="pytest -q", + fail_to_pass=(f"t/a.py::t{i}",), pass_to_pass=("t/a.py::keep",), + golden_diff="SENTINEL_NEVER_LEAK", deleted_symbols=("secret_fn",), + ) + base.update(over) + return FeatureDeletionTask(**base) + + +# --------------------------------------------------------------------- +# RunLayout / RunManifest +# --------------------------------------------------------------------- + + +def test_layout_paths_are_pure_and_namespaced(): + lay = RunLayout(root="/data/corpora", run_id="run42") + assert lay.sft_path == "/data/corpora/runs/run42/corpus_sft/rows.jsonl" + assert lay.manifest_path == "/data/corpora/runs/run42/manifest.json" + s3 = RunLayout(root="s3://bucket/prefix/", run_id="r") + assert s3.tasks_path == "s3://bucket/prefix/runs/r/tasks/manifest.jsonl" + + +def test_manifest_round_trip_and_budget(tmp_path): + lay = RunLayout(root=str(tmp_path), run_id="r1") + m = RunManifest(run_id="r1", created_at="2026-06-09T00:00:00Z", + source="test", budget_usd=1.0) + m.spend(0.4) + 
assert not m.over_budget + m.spend(0.6) + assert m.over_budget + m.write(lay) + m2 = RunManifest.read(lay) + assert m2.cost_usd == pytest.approx(1.0) + assert m2.budget_usd == 1.0 + + +# --------------------------------------------------------------------- +# THE leak guard (finding D-8) +# --------------------------------------------------------------------- + + +def test_write_tasks_never_leaks_golden_diff(tmp_path): + lay = RunLayout(root=str(tmp_path), run_id="r1") + write_tasks(lay, [_task(1)]) + blob = Path(lay.tasks_path).read_text() + assert "SENTINEL_NEVER_LEAK" not in blob + assert "secret_fn" not in blob + row = json.loads(blob.splitlines()[0]) + assert row["golden_diff_sha256"] # provenance preserved as a hash + # The restricted full writer DOES carry it (construction side only). + write_tasks_full(lay, [_task(1)]) + assert "SENTINEL_NEVER_LEAK" in Path(lay.tasks_full_path).read_text() + + +# --------------------------------------------------------------------- +# MinHash dedup +# --------------------------------------------------------------------- + +_TEXT_A = "the quick brown fox jumps over the lazy dog and then runs far away home tonight" +_TEXT_A2 = "the quick brown fox jumps over the lazy dog and then runs far away home today" +_TEXT_B = "import numpy as np def main(): return np.zeros(10) print(main()) totally different content here" + + +def test_jaccard_estimate_near_duplicates_high_disjoint_low(): + sa, sa2, sb = (minhash_signature(t) for t in (_TEXT_A, _TEXT_A2, _TEXT_B)) + assert jaccard_estimate(sa, sa2) > 0.5 + assert jaccard_estimate(sa, sb) < 0.2 + assert jaccard_estimate(sa, sa) == 1.0 + + +def test_dedup_keeps_first_and_drops_near_dup(): + rows = [{"text": _TEXT_A}, {"text": _TEXT_A2}, {"text": _TEXT_B}] + kept, stats = dedup(rows, lambda r: r["text"], threshold=0.5) + assert [r["text"] for r in kept] == [_TEXT_A, _TEXT_B] + assert stats["dropped_within_run"] == 1 + + +def test_cross_generation_dedup_via_signature_file(): + prior_rows = 
[{"text": _TEXT_A}] + buf = io.StringIO() + signatures_to_jsonl(prior_rows, lambda r: r["text"], buf) + buf.seek(0) + prior_sigs = load_signatures(buf) + + rows = [{"text": _TEXT_A2}, {"text": _TEXT_B}] + kept, stats = dedup(rows, lambda r: r["text"], threshold=0.5, + prior_signatures=prior_sigs) + assert [r["text"] for r in kept] == [_TEXT_B] + assert stats["dropped_cross_generation"] == 1 + + +def test_find_near_duplicates_pairs(): + rows = [{"t": _TEXT_A}, {"t": _TEXT_A2}] + assert find_near_duplicates(rows, lambda r: r["t"], 0.5) == [(0, 1)] + + +# --------------------------------------------------------------------- +# build_corpus end-to-end (FakeSandbox + ScriptedPolicy) +# --------------------------------------------------------------------- + + +def _passing_policy(): + # Flips both this task's F2P tests green generically: FakeSandbox's + # set_outcome takes explicit test names, so the fixture tasks share names + # via the same fail_to_pass tuple pattern; we set a superset. + outcomes = {f"t/a.py::t{i}": True for i in range(20)} + outcomes["t/a.py::keep"] = True + return ScriptedPolicy(actions=[ToolCall("set_outcome", {"outcomes": outcomes}), "done"]) + + +def _failing_policy(): + return ScriptedPolicy(actions=["gave up immediately"]) + + +def _env(): + return FeatureDeletionEnv(FakeSandbox(test_outcomes={"t/a.py::keep": True})) + + +def test_build_corpus_end_to_end(tmp_path): + tasks = [_task(i) for i in range(6)] + lay = RunLayout(root=str(tmp_path), run_id="e2e") + manifest = RunManifest(run_id="e2e", created_at="2026-06-09T00:00:00Z", source="fixture") + + out = build_corpus(tasks, _env, _passing_policy, lay, manifest, + holdout_frac=0.34, holdout_seed=7) + + # Holdout exclusion: holdout tasks were never rolled out. 
+ assert out.counts["tasks_holdout"] >= 1 + assert out.counts["rollouts"] == out.counts["tasks_train"] + # Full passes routed to SFT (post-dedup near-identical rows collapse — + # the fixture tasks produce near-identical messages, which is itself a + # realistic dedup scenario). + assert out.counts["sft_rows"] >= 1 + assert out.counts["quarantined"] == 0 + # Files exist and the SFT corpus never leaks the sentinel. + sft_blob = Path(lay.sft_path).read_text() + assert "SENTINEL_NEVER_LEAK" not in sft_blob + assert Path(lay.card_path).exists() + assert Path(lay.holdout_path).exists() + + +def test_build_corpus_quarantines_failures(tmp_path): + tasks = [_task(i) for i in range(3)] + lay = RunLayout(root=str(tmp_path), run_id="fail") + manifest = RunManifest(run_id="fail", created_at="2026-06-09T00:00:00Z", source="fixture") + out = build_corpus(tasks, _env, _failing_policy, lay, manifest, + holdout_frac=0.34, holdout_seed=7) + assert out.counts["sft_rows"] == 0 + assert out.counts["quarantined"] == out.counts["rollouts"] > 0 + + +def test_build_corpus_budget_stop_marks_partial(tmp_path): + tasks = [_task(i) for i in range(6)] + lay = RunLayout(root=str(tmp_path), run_id="budget") + manifest = RunManifest(run_id="budget", created_at="2026-06-09T00:00:00Z", + source="fixture", budget_usd=0.25) + out = build_corpus(tasks, _env, _passing_policy, lay, manifest, + holdout_frac=0.2, holdout_seed=7, + cost_per_rollout_usd=0.1) + assert out.status == "partial" + assert out.counts["rollouts"] < out.counts["tasks_train"] + + +def test_build_corpus_is_write_once(tmp_path): + tasks = [_task(i) for i in range(3)] + lay = RunLayout(root=str(tmp_path), run_id="once") + m1 = RunManifest(run_id="once", created_at="2026-06-09T00:00:00Z", source="fixture") + build_corpus(tasks, _env, _passing_policy, lay, m1, holdout_frac=0.34) + m2 = RunManifest(run_id="once", created_at="2026-06-09T00:00:01Z", source="fixture") + with pytest.raises(FileExistsError, match="write-once"): + 
build_corpus(tasks, _env, _passing_policy, lay, m2, holdout_frac=0.34) + + +def test_dataset_card_contents(tmp_path): + lay = RunLayout(root=str(tmp_path), run_id="card") + m = RunManifest(run_id="card", created_at="2026-06-09T00:00:00Z", + source="fixture", counts={"sft_rows": 3}) + write_dataset_card(lay, m, license_tiers={"REDISTRIBUTABLE": 3}, + dedup_stats={"rows_kept": 3}) + card = Path(lay.card_path).read_text() + assert "run `card`" in card + assert "sft_rows: 3" in card + assert "REDISTRIBUTABLE: 3" in card + assert "Decontamination" in card diff --git a/composer_replication/teacher_replay.py b/composer_replication/teacher_replay.py index 6cbb3cb4b04cfa1bff71b635dc0205a2ba19dd5f..d6d210924a2157ec544baf713c270b98ded0a95d 100644 --- a/composer_replication/teacher_replay.py +++ b/composer_replication/teacher_replay.py @@ -4,8 +4,12 @@ This is channel 3 of the integrated trainer: at each step of a frozen agentic trace, query N pre-trained external teachers (frontier models from different labs) and convert teacher disagreement into preference pairs for DPO loss. -Generalized from spike-001's `replay.py`. Verified economic floor (✅ spike 001): -$0.98 mean per-trace cost ungated, $0.30/trace projected with VOI gating. +Generalized from spike-001's `replay.py`. Cost calibration (✅ spike 001, +relabeled per deepread finding V11): $0.98 mean per-TRACE cost ungated was +measured on a ~50-state SYNTHETIC trace at N=3 teachers; real Claude Code +sessions run 125–2,830 tool-use messages (ADR-002), so a full real session is +~2 orders of magnitude more (~$70–80 flat, before VOI gating). $0.30/trace +projected with VOI gating, same synthetic basis. 
Usage: from teacher_replay import replay_trace, extract_dpo_pairs diff --git a/composer_replication/trainer/kl_in_reward.py b/composer_replication/trainer/kl_in_reward.py index a6db8fc58dea5bbd98285cce68e21f687e41c89b..108bc709079f4a1d2f8e2d76bf10b7fa83e87ad8 100644 --- a/composer_replication/trainer/kl_in_reward.py +++ b/composer_replication/trainer/kl_in_reward.py @@ -9,7 +9,9 @@ literature says this is not cosmetic: * arXiv:2512.21852 ("A Comedy of Estimators") — k1-in-reward improves OOD generalization; k3-in-reward can collapse. - * verl adopted k1-in-reward as its *only* reverse-KL option. + * verl ships k1-in-reward as its default/recommended reverse-KL option + (it also supports a k3-family "low_var_kl" — wording corrected per + deepread finding V13). * TRL issue #4967 tracks the same divergence. OOD generalization is exactly the "take any model to the next level" axis, so diff --git a/docs/COMPOSER_RECIPE_MAPPING.md b/docs/COMPOSER_RECIPE_MAPPING.md index 9d000e984d266c180d224a2dcb3568ea19042079..6435c9db9625e435f253ef0ddd2c06bb828da338 100644 --- a/docs/COMPOSER_RECIPE_MAPPING.md +++ b/docs/COMPOSER_RECIPE_MAPPING.md @@ -22,7 +22,7 @@ The Cursor blog discusses **only three** training innovations explicitly. Everyt **Cited prior art** (Cursor's footnote 1): - **OPSD: Self-Distilled Reasoner — On-Policy Self-Distillation for LLMs** (Zhao et al., 2026, [arXiv:2601.18734](https://arxiv.org/abs/2601.18734), [GitHub: siyan-zhao/OPSD](https://github.com/siyan-zhao/OPSD)). The original on-policy-self-distillation framework: single LLM, teacher conditioned on privileged information (e.g. ground-truth answer), student sees only the question, loss = per-token KL on student's own rollouts. -- **SDPO: Reinforcement Learning via Self-Distillation** (Hübotter et al., 2026, [arXiv:2601.20802](https://arxiv.org/abs/2601.20802), ICLR 2026 Scaling Post-training Workshop). 
Generalizes OPSD to RL with rich feedback: *"SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy."* This is **mathematically the same** as Composer's targeted-textual-feedback method. **There is published code.** Comparison table from the SDPO paper: +- **SDPO: Reinforcement Learning via Self-Distillation** (Hübotter et al., 2026, [arXiv:2601.20802](https://arxiv.org/abs/2601.20802), ICLR 2026 Scaling Post-training Workshop). Generalizes OPSD to RL with rich feedback: *"SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy."* Cursor's blog cites this paper only as **background** ("For more background on this approach see…") — NOT as its mechanism; SDPO's published loss is full-rollout with feedback-in-prefix and an EMA-regularized teacher, while Composer's blog describes a turn-localized hint splice. Closely related, **not verified identical** (deepread finding V1). **There is published code.** Comparison table from the SDPO paper: | Method | Sampling | Signal | Feedback | |---|---|---|---| @@ -72,7 +72,7 @@ This is **infrastructure, not algorithm**. It only matters at MoE-1T scale; for | Composer 2.5 stage | Blog mechanism | Our replication target | v0.0 | v0.1 | v0.2 | |---|---|---|---|---|---| | **(a)** Continued pretraining on code | Standard pretraining, code-weighted | Skip — start from already-code-tuned `Qwen3-Coder-7B` or `Qwen3-Coder-30B-A3B` | ✗ | ✗ | ✗ | -| **(b)** Synthetic data at scale | Feature Deletion + 24 other (unnamed) generators | Build 1 generator (Feature Deletion) as OpenEnv-compatible env. Use SWE-bench-lite and SWE-Gym as drop-in alternatives. 
| ✗ (use SWE-bench-lite only) | ✓ (build Feature Deletion) | scale generator suite | +| **(b)** Synthetic data at scale | Feature Deletion + an unspecified number of other generators (the blog says only "a range of approaches" — the old "24" was a back-formation from the 25x task multiplier; deepread finding V5) | Build 1 generator (Feature Deletion) as OpenEnv-compatible env. Use SWE-bench-lite and SWE-Gym as drop-in alternatives. | ✗ (use SWE-bench-lite only) | ✓ (build Feature Deletion) | scale generator suite | | **(c)** Realistic-environment RL (RLVR) | Async sandboxes, same tool harness as production | TRL `GRPOTrainer` + verifiers + OpenEnv; SWE-bench-lite env in v0.0; build sandboxed code execution env in v0.1 | ✓ baseline | ✓ + DAPO patches | + decentralized rollouts | | **(d)** Targeted RL w/ textual feedback (Composer's secret sauce) | Same-model self-distill: insert hint into context → teacher; original → student; on-policy KL at the turn | **Lift the OPSD/SDPO loss directly from `siyan-zhao/OPSD`** (published code, MIT). Generate hints via templates (v0.1) or LLM (v0.2). 
| ✗ (deferred) | ✓ (this is the Composer-recipe channel) | + learned hint generator | | **(e)** Trace-replay multi-teacher distill (NOVEL — our addition) | N/A (not in Composer) | N=3 teachers (Opus 4.7, GPT-5, DeepSeek V4 Pro) replay each step; disagreement → DPO pairs | ✓ (this is the v0.0 novelty bet) | ✓ + VOI gating | + tiered teachers | @@ -148,7 +148,7 @@ Primary sources for each Composer-2.5 component, post-audit: - **Cursor blog** — [Introducing Composer 2.5](https://cursor.com/blog/composer-2-5) (2026) - **Cursor blog** — [Composer 2 technical report](https://cursor.com/blog/composer-2-technical-report) (predecessor; named the "Anyrun" environment per subagent — verify if needed) - **OPSD paper** — Zhao et al., *Self-Distilled Reasoner: On-Policy Self-Distillation for LLMs*, [arXiv:2601.18734](https://arxiv.org/abs/2601.18734), code at [siyan-zhao/OPSD](https://github.com/siyan-zhao/OPSD). MIT. -- **SDPO paper** — Hübotter et al., *Reinforcement Learning via Self-Distillation*, [arXiv:2601.20802](https://arxiv.org/abs/2601.20802), ICLR 2026 Scaling Post-training Workshop. The direct formalization of Composer's hint-distill. +- **SDPO paper** — Hübotter et al., *Reinforcement Learning via Self-Distillation*, [arXiv:2601.20802](https://arxiv.org/abs/2601.20802), ICLR 2026 Scaling Post-training Workshop. The closest published formalization; cited by Cursor only as background (deepread finding V1). - **Self-Distillation continual-learning** — [arXiv:2601.19897](https://arxiv.org/abs/2601.19897). Cited by Cursor; less directly relevant. - **Moonshot Kimi K2.5** — base model, [HF model card](https://huggingface.co/moonshotai/Kimi-K2-Thinking). 
diff --git a/docs/adrs/ADR-016-stage0-dataset-pipeline.md b/docs/adrs/ADR-016-stage0-dataset-pipeline.md new file mode 100644 index 0000000000000000000000000000000000000000..f1ba7a7061216aca4b7ce230da8b8c730af69dba --- /dev/null +++ b/docs/adrs/ADR-016-stage0-dataset-pipeline.md @@ -0,0 +1,119 @@ +--- +status: accepted +date: 2026-06-09 +deciders: [Codeseys, ARIA] +--- + +# ADR-016: Stage-0 dataset-generation pipeline — SWE-smith engine + rollout harness + ingest gates + single contract + +## Context and Problem Statement + +The user asked to "architect and build a pipeline that builds out a dataset like +the Composer 2.5 blog mentions, with our vision enhancements — point to an +open-source repo and use that to build the dataset, or use traces or other +datasets and enhance them." + +Before building, a full-source critical review re-read every foundational paper +and blog (8 source clusters, `research/deepread/01-08`), ground-mapped the repo +(`00`), ran adversarial fidelity + design critics, and independently VERIFIED +every finding (`12` — 0 refuted). The verified verdict: the envisioned pipeline +had four structural breaks (seed-trace/oracle disjointness; no rollout harness — +the SFT corpus had NO producer; an uncomputable divergence gate; no +`Sandbox.fork()`), several missing controls (zero benchmark decontamination, +no secrets gate, a `golden_diff` serialization leak, two unreconciled S3 +contracts, no cross-generation dedup), and a buy-vs-build inversion (the +planned image-builder duplicates `pip install swesmith`, whose PR-Mirror +strategy IS this repo's gold-patch-reversion mechanic and is validated best-of- +five by SWE-smith's own ablation, Table 5 of arXiv:2504.21798). + +## Decision + +Build **Stage 0 local-first** (architecture: `research/deepread/13-synthesis-architecture.md`): + +1. 
**SWE-smith is the synthesis engine** for "point at a repo" (`[swesmith]` + extra; `datagen/swesmith_adapter.py` bridges its instances into + `FeatureDeletionTask`, handling the patch-semantics INVERSION — SWE-smith's + patch introduces the bug, so `golden_diff` = `reverse_unified_diff(patch)`). + `SweBenchAdapter` remains the bridge for SWE-bench-shaped substrates. +2. **Ingest gates before anything else** (`datagen/repo_gate.py`): SPDX-ish + license detection → three tiers (REDISTRIBUTABLE / TRAINABLE_ONLY / + EXCLUDED, fail-closed) + **benchmark decontamination** against the + SWE-bench-family eval-repo list (hard fail). +3. **The rollout harness is the corpus producer** (`datagen/rollout_harness.py`): + `collect_trajectory(env, task, policy)` runs a pluggable policy + (ScriptedPolicy for tests; OpenRouterPolicy stub; mini-swe-agent/SWE-agent + adoption is the documented upgrade) through `FeatureDeletionEnv` to + `_grade()`. Its env-grounded trajectories are ALSO the tree-of-work's seed + nodes — fixing the seed/oracle disjointness as a byproduct. `admit()` routes + typed signal: clean full pass → SFT; clean near-miss → DPO candidate; + guard-broken/hacked → quarantine (never raw negative gradient). +4. **One canonical trajectory IR** (`datagen/trajectory.py`): `ToolCall` (whose + `canonical_form()` is the v1 divergence-gate action algebra, replacing the + whitespace stub), `CanonicalTrajectory`, adapters from Claude Code traces + (explicitly UNGRADED — demoted to flat/SFT uses), and `to_policy_row()` — + the ONE policy-visible serializer, unit-tested to never emit + `golden_diff`/`deleted_symbols` (sentinel test). +5. **One reconciled dataset contract** (`pipeline/s3_contract.py`, supersedes + design-F1's and design-F2's divergent layouts): `runs/{run_id}/{tasks, + tasks_full(RESTRICTED), traj, corpus_sft, corpus_dpo, holdout, quarantine}` + + `RunManifest` (counts, cost, budget, `parent_run_id` lineage, status) + + dataset card. 
Policy-safe task rows carry `golden_diff_sha256`, never the + diff. DiLoCo rendezvous and `wm_tuples/` are deliberately OUT (separate + concern; ablation-gated respectively). +6. **Cross-generation dedup** (`pipeline/dedup.py`): stable-hash MinHash over + word 5-shingles; a run can dedup against the prior generation's signature + file (flywheel-collapse mitigation). datasketch/LSH is the upgrade path. +7. **The local stage-driver** (`pipeline/build_corpus.py`): holdout-split FIRST + (held-out tasks never rolled out), rollouts under a budget ceiling + (partial-marking), typed routing, dedup, write-once-per-run idempotency. + +## Fidelity corrections shipped with this ADR (deepread findings, all verified) + +- **V1:** "SDPO is mathematically the same as Composer's mechanism" corrected + in `opsd.py` + `COMPOSER_RECIPE_MAPPING.md` — Cursor cites SDPO/OPSD as + *background*; our channel is a third, blog-inspired design (turn-localized + hint splice, live stop-grad teacher, no EMA). +- **V5:** fabricated numbers struck/tagged: 69.3%/Terminal-Bench parity (no + primary source), "24 other generators" (back-formed), "85% post-training + compute" (community speculation) — `research/01`, mapping doc, `research/06`, + `research/09`. +- **V7:** Streaming DiLoCo citation fixed (`diloco/__init__.py`): 2501.18512 = + Douillard et al.; Eager Updates = Kale et al. 2502.12996. +- **V11:** `teacher_replay.py` cost docstring relabeled ($0.98 = 50-state + synthetic trace; real sessions ~2 OOM more). +- **V13:** `kl_in_reward.py` "verl's only reverse-KL option" → default/ + recommended (verl also ships a k3-family option). + +## What is deliberately NOT in Stage 0 + +- AWS orchestration (Glue/EMR/Batch/Bedrock-batch/Step Functions) — Stage 4, + only after local runs are routine (finding D-9). +- Tree depth>1 — gated on a `Sandbox.fork()` spike + a measured divergence-gate + firing rate (findings D-3/D-4). Depth-1 multi-candidate rollouts need no fork. 
+- World-model `wm_tuples/` emission — gated on the P4 ablation being scheduled + (finding D-14; CWM evidence is mid-training, not RL-time aux head — V6). +- Secrets/PII scrub at trace ingest (finding V9) — REQUIRED before any raw + Claude Code session is uploaded to shared storage; tracked as the next + pipeline item. Local-only runs are unaffected. + +## Acceptance gate + +- [x] `repo_gate`: 53 tests (license tiers, decontamination, gate verdicts). +- [x] `swesmith_adapter`: 18 tests (patch INVERSION semantics, reverse round-trip, + strategy provenance, image conventions). +- [x] `trajectory` + `rollout_harness`: 13 tests (IR round-trips, SENTINEL leak + guard, env-grounded episode to grade 1.0 / guard-broken / near-miss, + admission routing). +- [x] `pipeline`: 12 tests (layout, manifest+budget, leak guard at the writer, + MinHash within-run + cross-generation, build_corpus e2e with holdout + exclusion + budget stop + write-once). +- [x] Full suite green: 511 passed / 66 skipped. +- [ ] Live swesmith synthesis on a real pointed-at repo (needs Docker+Linux) — + the documented `[~]` gate, same shape as ADR-010's Docker e2e. + +## More Information + +- `research/deepread/13-synthesis-architecture.md` — the architecture this implements. +- `research/deepread/12-verified-findings.md` — the verified finding ledger (V1–V15). +- `research/deepread/02-swe-task-synthesis.md` — the SWE-smith/R2E-Gym/SWE-Gym deep-read. +- ADR-010 (the substrate-inversion base this extends), ADR-002 (trace source). diff --git a/pyproject.toml b/pyproject.toml index 11a211bf2667587d1ebaa26694170594bc97392d..0e097b6c6eb035f05596e9f003ce52a7e5feb8b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,14 @@ aws = [ "boto3>=1.34", "sagemaker>=2.200,<3", ] +# SWE-smith task-synthesis engine (deepread finding V4 buy-vs-build verdict): +# the swesmith toolkit builds env images from arbitrary GitHub repos and +# synthesizes bugs (PR Mirror = this repo's gold-patch-reversion mechanic). 
+# LIVE synthesis needs Docker on Linux (the toolkit does not support macOS/ +# Windows officially); the SwesmithAdapter itself needs nothing beyond core. +swesmith = [ + "swesmith>=0.1", +] # Replaysim dataset normalization (per ADR-004) # # NOTE: data-juicer is intentionally NOT pinned as an extra. The package diff --git a/research/01-composer-2.5.md b/research/01-composer-2.5.md index f2d535bd8d9971855b65b584f0206dec161ba405..427b16c65e1eb573b0becf5d6dfe2d0b1ec07d55 100644 --- a/research/01-composer-2.5.md +++ b/research/01-composer-2.5.md @@ -11,7 +11,7 @@ > The targeted-textual-feedback method is correctly described, but this file does **not** cite the three self-distillation papers Cursor cites in footnote 1 (OPSD `arXiv:2601.18734`, SDPO `arXiv:2601.20802`, Self-Distillation Continual Learning `arXiv:2601.19897`). The mapping document does. ## Overview -Cursor's Composer 2.5 is an advanced agentic coding model that powers the Cursor IDE. Released in mid-May 2026, it represents a massive leap in agentic capabilities, particularly for long-running, multi-file software engineering tasks. While the base weights are Moonshot AI's open-source **Kimi K2.5** model, roughly 85% of the total compute budget for Composer 2.5 was spent on Cursor's proprietary post-training and Reinforcement Learning (RL) pipeline. +Cursor's Composer 2.5 is an advanced agentic coding model that powers the Cursor IDE. Released in mid-May 2026, it represents a massive leap in agentic capabilities, particularly for long-running, multi-file software engineering tasks. While the base weights are Moonshot AI's open-source **Kimi K2.5** model, a large share of the compute budget went to Cursor's proprietary post-training/RL pipeline (the widely-circulated "85%" figure is community speculation, in NO primary source — deepread finding V5). The resulting model is highly optimized for the exact constraints and tools of the Cursor environment (file edits, terminal usage, LSP interaction). 
Composer 2.5 is praised for having fewer "false-start" tool calls, avoiding prompt-baiting, and demonstrating a much calmer, more effective collaboration loop than its predecessors. @@ -60,8 +60,8 @@ During post-training, Cursor employs **Sharded Muon** and **Dual Mesh HSDP (Hybr ## Performance Characteristics Cursor claims Composer 2.5 achieves a Pareto-optimal tradeoff between intelligence and inference cost compared to frontier models (Opus 4.5/4.6, GPT-5.4/5.5). -* **Intelligence Improvements**: On Cursor's internal *CursorBench* (which tests sweeping, multi-file edits with ambiguous prompts), Composer 2.5 scored 69.3% (or ~61-63% depending on the specific benchmark version cited), a massive jump from Composer 1.5's ~44% and Composer 2's ~52%. -* **Frontier Parity**: On public agentic benchmarks like *Terminal-Bench 2.0*, it hit 69.3%. On *SWE-bench Multilingual*, it achieved parity with or slightly surpassed OpenAI's GPT-5.5. +* **Intelligence Improvements**: On Cursor's internal *CursorBench* (which tests sweeping, multi-file edits with ambiguous prompts), Composer 2.5's score is NOT in any primary source (the circulating 69.3% figure appears in neither the 2.5 blog nor the Composer 2 techreport — deepread finding V5; the techreport's Table 1 gives Composer 2 = 61.3 CursorBench). Treat all 2.5 benchmark numbers as unverified. +* **Frontier Parity**: Claims of Terminal-Bench 2.0 / SWE-bench Multilingual parity circulate in secondary commentary only; neither primary source contains benchmark numbers for 2.5 (deepread finding V5). * **Cost Efficiency**: * Standard Tier: $0.50 per 1M input / $2.50 per 1M output tokens. * Fast Tier: $3.00 per 1M input / $15.00 per 1M output tokens. 
diff --git a/research/06-feature-deletion-datagen.md b/research/06-feature-deletion-datagen.md index 44e8bd937ec28bfe3ac83a73ba7aef7ea5fa26fa..d18b00b01bc7b1cbfe7ff05772d360fd0c9d4997 100644 --- a/research/06-feature-deletion-datagen.md +++ b/research/06-feature-deletion-datagen.md @@ -327,7 +327,7 @@ Feature-Deletion is **embarrassingly parallel and CPU-bound** — no GPU in the 1. **Deletion-target selection heuristic** — blog silent (`research/09` §1 "NO CHANGE"). We propose coverage-selectivity (§5 Path B); Cursor's actual heuristic is unknown. 2. **Deleter model vs. program** — blog implies an agent deletes ("asked to delete code… such that the codebase remains functional"); we default to *programmatic* deletion (cheaper, deterministic, no second model). An LLM-deleter is a v0.2 escalation. -3. **The other ~24 generators** — Feature Deletion is "one synthetic approach… a range of approaches"; the rest are unnamed. Out of scope here; this brief delivers the one named generator. +3. **The other generators (count UNKNOWN)** — Feature Deletion is "one synthetic approach… a range of approaches"; the rest are unnamed and uncounted (the old "~24" was a back-formation from the 25x task multiplier — deepread finding V5). Out of scope here; this brief delivers the one named generator. 4. **"Agentic monitoring tools" internals** — unspecified; our §3c monitor is a best-effort programmatic stand-in. 5. **Composer2.pdf (arXiv:2603.24477)** — flagged by `research/09` action-item #1 as the likely home of data-mix % and generator inventory; **not yet extracted**. Recommend a follow-up pull before scaling the generator suite. 
diff --git a/research/09-composer-blog-delta-2026.md b/research/09-composer-blog-delta-2026.md index f26972dde93c9bc694e0106540699b68e5583407..f35a88ef9f89cc7d3886f47e1c9ff8695252230e 100644 --- a/research/09-composer-blog-delta-2026.md +++ b/research/09-composer-blog-delta-2026.md @@ -20,7 +20,7 @@ The **2.5 blog body is byte-for-byte unchanged** from what the mapping doc captu **DELTAS (not in / under-stated in COMPOSER_RECIPE_MAPPING.md):** -- **[DELTA — new emphasis]** The phrase *"we both **select for** and **create** harder tasks **dynamically throughout the run**"* is a **dynamic curriculum / online task-selection** signal. The mapping doc captured "Feature Deletion + 24 unnamed generators" but did **not** flag that task difficulty is filtered *online* (the model "begins to get most training problems correct," so hard tasks are up-weighted live). This is a data-*mix*/curriculum detail with direct replication impact: our generator suite needs a difficulty filter / pass-rate gate, not just a static task bank. +- **[DELTA — new emphasis]** The phrase *"we both **select for** and **create** harder tasks **dynamically throughout the run**"* is a **dynamic curriculum / online task-selection** signal. The mapping doc captured "Feature Deletion + other unnamed generators" (its old "24" count was a back-formation — deepread finding V5) but did **not** flag that task difficulty is filtered *online* (the model "begins to get most training problems correct," so hard tasks are up-weighted live). This is a data-*mix*/curriculum detail with direct replication impact: our generator suite needs a difficulty filter / pass-rate gate, not just a static task bank. 
- **[DELTA — new authoritative source for CPT data mix]** The Composer 2 technical-report blog states the CPT data mix explicitly: *"continued pretraining on a data mix that **emphasizes code** to deepen the base model's coding knowledge"* and *"We find that **reducing pretraining loss improves downstream RL performance**, with better base knowledge reliably translating into a better agent."* The mapping doc marked "continued pretraining on heavily code-weighted data" as `[BLOG-VERIFIED]` from the 2.5 Muon section — but the **causal claim (CPT loss ↓ ⇒ RL performance ↑)** is new and is the stated *justification* for doing CPT at all. Relevant to our "skip CPT, start from Qwen3-Coder" decision: Cursor's own evidence says base-knowledge quality gates RL ceiling, which strengthens the case for starting from an already-code-tuned base. - **[DELTA — new artifact]** There is now a **full Composer 2 arXiv technical report: [arXiv:2603.24477](https://arxiv.org/abs/2603.24477)** and a downloadable PDF at **`https://cursor.com/resources/Composer2.pdf`** (authored by Sasha Rush et al.). The report explicitly *"covers... ablations on the training recipe, our approach to agent behavior shaping, and the design of our evaluation suite."* The mapping doc cited only the blog stub and never the arXiv ID/PDF. **This PDF is the most likely place to resolve the data-mix weighting %, the RL algorithm name, and the hint-generation mechanism — none of which are in either blog.** → Recommend a dedicated follow-up extraction of Composer2.pdf. - **[CONFIRM — "Anyrun"]** Mapping doc flagged "Anyrun" as possibly not Cursor-sourced. **Confirmed real:** the Composer 2 report blog says *"**Anyrun**, our internal compute platform for running hundreds of thousands of sandboxed coding environments."* It is a Composer-**2** artifact (carried into 2.5), correctly attributed. Resolves the mapping doc's open flag. 
diff --git a/research/notes/230406767-raft-reward-ranked-finetuning-for-generative-foundation-model-alignmen.md b/research/notes/230406767-raft-reward-ranked-finetuning-for-generative-foundation-model-alignmen.md new file mode 100644 index 0000000000000000000000000000000000000000..245e87604212ad52b628ff1ee7a6a6b050108b18 --- /dev/null +++ b/research/notes/230406767-raft-reward-ranked-finetuning-for-generative-foundation-model-alignmen.md @@ -0,0 +1,224 @@ +--- +title: '[2304.06767] RAFT: Reward rAnked FineTuning for Generative Foundation Model + Alignment' +id: 230406767-raft-reward-ranked-finetuning-for-generative-foundation-model-alignmen +tags: +- deepread +created: '2026-06-10T00:31:18.566124Z' +source: https://arxiv.org/abs/2304.06767 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:31:18.565918Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2304.06767] RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment +Computer Science > Machine Learning +arXiv:2304.06767 +(cs) +[Submitted on 13 Apr 2023 ( +v1 +), last revised 1 Dec 2023 (this version, v4)] +Title: +RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment +Authors: +Hanze Dong +, +Wei Xiong +, +Deepanshu Goyal +, +Yihan Zhang +, +Winnie Chow +, +Rui Pan +, +Shizhe Diao +, +Jipeng Zhang +, +Kashun Shum +, +Tong Zhang +View a PDF of the paper titled RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment, by Hanze Dong and 9 other authors +View PDF +HTML (experimental) +Abstract: +Generative foundation models are susceptible to implicit biases that can arise from extensive unsupervised training data. Such biases can produce suboptimal samples, skewed outcomes, and unfairness, with potentially serious consequences. 
Consequently, aligning these models with human ethics and preferences is an essential step toward ensuring their responsible and effective deployment in real-world applications. Prior research has primarily employed Reinforcement Learning from Human Feedback (RLHF) to address this problem, where generative models are fine-tuned with RL algorithms guided by a human-feedback-informed reward model. However, the inefficiencies and instabilities associated with RL algorithms frequently present substantial obstacles to the successful alignment, necessitating the development of a more robust and streamlined approach. To this end, we introduce a new framework, Reward rAnked FineTuning (RAFT), designed to align generative models effectively. Utilizing a reward model and a sufficient number of samples, our approach selects the high-quality samples, discarding those that exhibit undesired behavior, and subsequently enhancing the model by fine-tuning on these filtered samples. Our studies show that RAFT can effectively improve the model performance in both reward learning and other automated metrics in both large language models and diffusion models. 
+Comments: +29 pages, 12 figures, Published in Transactions on Machine Learning Research (TMLR) +Subjects: +Machine Learning (cs.LG) +; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (stat.ML) +Cite as: +arXiv:2304.06767 +[cs.LG] +(or +arXiv:2304.06767v4 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2304.06767 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Hanze Dong [ +view email +] +[v1] +Thu, 13 Apr 2023 18:22:40 UTC (62,967 KB) +[v2] +Thu, 25 May 2023 06:27:31 UTC (42,022 KB) +[v3] +Wed, 30 Aug 2023 01:25:29 UTC (33,955 KB) +[v4] +Fri, 1 Dec 2023 14:28:06 UTC (34,049 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment, by Hanze Dong and 9 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2023-04 +Change to browse by: +cs +cs.AI +cs.CL +cs.CV +stat +stat.ML +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? 
+) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models-2.md b/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models-2.md new file mode 100644 index 0000000000000000000000000000000000000000..0a6136f6d7b1ffa8f18204c4d776d9f8030ee5f1 --- /dev/null +++ b/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models-2.md @@ -0,0 +1,2735 @@ +--- +title: '[2305.10601] Tree of Thoughts: Deliberate Problem Solving with Large Language + Models' +id: 230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models-2 +tags: +- deepread +created: '2026-06-10T00:41:12.876142Z' +source: https://ar5iv.labs.arxiv.org/html/2305.10601 +source_domain: ar5iv.labs.arxiv.org +fetched_at: '2026-06-10T00:41:12.875985Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2305.10601] Tree of Thoughts: Deliberate Problem Solving with Large Language Models +Tree of Thoughts: Deliberate Problem Solving +with Large Language Models +Shunyu Yao +Princeton University +Dian Yu +Google DeepMind +Jeffrey Zhao +Google DeepMind +Izhak Shafran +Google DeepMind +Thomas L. Griffiths +Princeton University +Yuan Cao +Google DeepMind +Karthik Narasimhan +Princeton University +Abstract +Language models are increasingly being deployed for general problem solving across a wide range of tasks, but are still confined to token-level, left-to-right decision-making processes during inference. This means they can fall short in tasks that require exploration, strategic lookahead, or where initial decisions play a pivotal role. 
+To surmount these challenges, we introduce a new framework for language model inference, “Tree of Thoughts” (ToT), which generalizes over the popular “Chain of Thought” approach to prompting language models, and enables exploration over coherent units of text (“thoughts”) that serve as intermediate steps toward problem solving. +ToT allows LMs to perform deliberate decision making by considering multiple different reasoning paths and self-evaluating choices to decide the next course of action, as well as looking ahead or backtracking when necessary to make global choices. +Our experiments show that ToT significantly enhances language models’ problem-solving abilities on three novel tasks requiring non-trivial planning or search: Game of 24, Creative Writing, and Mini Crosswords. +For instance, in Game of 24, while GPT-4 with chain-of-thought prompting only solved 4% of tasks, our method achieved a success rate of 74%. Code repo with all prompts: +https://github.com/princeton-nlp/tree-of-thought-llm +. +1 +Introduction +Originally designed to generate text, scaled-up versions of language models (LMs) such as GPT +[ +25 +, +26 +, +1 +, +23 +] +and PaLM +[ +5 +] +have been shown to be increasingly capable of performing an ever wider range of tasks requiring mathematical, symbolic, commonsense, and knowledge reasoning. It is perhaps surprising that underlying all this progress is still the original autoregressive mechanism for generating text, which makes token-level decisions one by one and in a left-to-right fashion. +Is such a simple mechanism sufficient for a LM to be built toward a general problem solver? +If not, what problems would challenge the current paradigm, and what should be alternative mechanisms? +The literature on human cognition provides some clues to answer these questions. 
+Research on “dual process” models suggests that people have two modes in which they engage with decisions – a fast, automatic, unconscious mode (“System 1”) and a slow, deliberate, conscious mode (“System 2”) +[ +30 +, +31 +, +16 +, +15 +] +. +These two modes have previously been connected to a variety of mathematical models used in machine learning. For example, research on reinforcement learning in humans and other animals has explored the circumstances under which they engage in associative “model free” learning or more deliberative “model based” planning +[ +7 +] +. +The simple associative token-level choices of LMs are also reminiscent of “System 1”, and thus might benefit from augmentation by a more deliberate “System 2” planning process that (1) maintains and explores diverse alternatives for current choices instead of just picking one, and (2) evaluates its current status and actively looks ahead or backtracks to make more global decisions. +To design such a planning process, we return to the origins of artificial intelligence (and cognitive science), drawing inspiration from the planning processes explored by Newell, Shaw, and Simon starting in the 1950s +[ +21 +, +22 +] +. Newell and colleagues characterized +problem solving +[ +21 +] +as search through a combinatorial problem space, represented as a tree. We thus propose the Tree of Thoughts (ToT) framework for general problem solving with language models. As Figure +1 +illustrates, while existing methods (detailed below) sample continuous language sequences for problem solving, ToT actively maintains a tree of thoughts, where each +thought +is a coherent language sequence that serves as an intermediate step toward problem solving (Table +1 +). Such a high-level semantic unit allows the LM to self-evaluate the progress different intermediate thoughts make towards solving the problem through a deliberate reasoning process that is also instantiated in language (Figures +2 +, +4 +, +6 +). 
This implementation of search heuristics via LM self-evaluation and deliberation is novel, as previous search heuristics are either programmed or learned. Finally, we combine this language-based capability to generate and evaluate diverse thoughts with search algorithms, such as breadth-first search (BFS) or depth-first search (DFS), which allow systematic exploration of the tree of thoughts with lookahead and backtracking. +Empirically, we propose three new problems that challenge existing LM inference methods even with the state-of-the-art language model, GPT-4 [23]: Game of 24, Creative Writing, and Crosswords (Table 1). +These tasks require deductive, mathematical, commonsense, lexical reasoning abilities, and a way to incorporate systematic planning or search. +We show ToT obtains superior results on all three tasks by being general and flexible enough to support different levels of thoughts, different ways to generate and evaluate thoughts, and different search algorithms that adapt to the nature of different problems. We also analyze how such choices affect model performances via systematic ablations and discuss future directions to better train and use LMs. +Figure 1: +Schematic illustrating various approaches to problem solving with LLMs. Each rectangle box represents a +thought +, which is a coherent language sequence that serves as an intermediate step toward problem solving. See concrete examples of how thoughts are generated, evaluated, and searched in Figures 2, 4, 6. +2 +Background +We first formalize some existing methods that use large language models for problem-solving, which our approach is inspired by and later compared with. +We use $p_{\theta}$ to denote a pre-trained LM with parameters $\theta$, and +lowercase letters $x,y,z,s,\cdots$ to denote a language sequence +, i.e. 
+$x=(x[1],\cdots,x[n])$ where each $x[i]$ is a token, so that $p_{\theta}(x)=\prod_{i=1}^{n}p_{\theta}(x[i]|x[1...i])$. We use uppercase letters $S,\cdots$ to denote a collection of language sequences. +Input-output (IO) prompting +is the most common way to turn a problem input $x$ into output $y$ with LM: $y\sim p_{\theta}(y|\texttt{prompt}_{IO}(x))$, where $\texttt{prompt}_{IO}(x)$ wraps input $x$ with task instructions and/or few-shot input-output examples. For simplicity, let us denote $p_{\theta}^{{\rm prompt}}(\texttt{output}\mid\texttt{input})=p_{\theta}(\texttt{output}\mid\texttt{prompt}(\texttt{input}))$, so that IO prompting can be formulated as $y\sim p_{\theta}^{IO}(y|x)$. +Chain-of-thought (CoT) prompting [38] was proposed to address cases where the mapping of input $x$ to output $y$ is non-trivial (e.g. when $x$ is a math question and $y$ is the final numerical answer). 
The key idea is to introduce a chain of +thoughts $z_{1},\cdots,z_{n}$ to bridge $x$ and $y$, where each $z_{i}$ is a coherent language sequence that serves as a meaningful intermediate step toward problem solving (e.g. $z_{i}$ could be an intermediate equation for math QA). To solve problems with CoT, each thought $z_{i}\sim p_{\theta}^{CoT}(z_{i}\mid x,z_{1\cdots i-1})$ is sampled sequentially, then the output $y\sim p_{\theta}^{CoT}(y|x,z_{1\cdots n})$. In practice, $[z_{1\cdots n},y]\sim p_{\theta}^{CoT}(z_{1\cdots n},y|x)$ is sampled as a continuous language sequence, and the +decomposition +of thoughts (e.g. is each $z_{i}$ a phrase, a sentence, or a paragraph) is left ambiguous. +Self-consistency with CoT (CoT-SC) [36] is an ensemble approach that samples $k$ i.i.d. 
chains of thought: $[z^{(i)}_{1\cdots n},y^{(i)}]\sim p_{\theta}^{CoT}(z_{1\cdots n},y|x)\ (i=1\cdots k)$, then returns the most frequent output: $\arg\max_{y}\#\{i\mid y^{(i)}=y\}$. CoT-SC improves upon CoT, because there are generally different thought processes for the same problem (e.g. different ways to prove the same theorem), and the output decision can be more faithful by exploring a richer set of thoughts. However, within each chain there is no local exploration of different thought steps, and the “most frequent” heuristic only applies when the output space is limited (e.g. multi-choice QA). +3 +Tree of Thoughts: Deliberate Problem Solving with LM +A genuine problem-solving process involves the repeated use of available information to initiate exploration, which discloses, in turn, more information until a way to attain the solution is finally discovered.—— +Newell et al. [21] +Research on human problem-solving suggests that people search through a combinatorial problem-space – a tree where the nodes represent partial solutions, and the branches correspond to operators that modify them [21, 22]. Which branch to take is determined by heuristics that help to navigate the problem-space and guide the problem-solver towards a solution. This perspective highlights two key shortcomings of existing approaches that use LMs to solve general problems: 1) Locally, they do not explore +different +continuations within a thought process – the branches of the tree. 
2) Globally, they do not incorporate any type of planning, lookahead, or backtracking to help evaluate these different options – the kind of heuristic-guided search that seems characteristic of human problem-solving. +To address these shortcomings, we introduce +Tree of Thoughts (ToT) +, a paradigm that allows LMs to explore multiple reasoning paths over thoughts (Figure 1(c)). ToT frames any problem as a search over a tree, where each node is a +state $s=[x,z_{1\cdots i}]$ representing a partial solution with the input and the sequence of thoughts so far. A specific instantiation of ToT involves answering four questions: 1. How to +decompose +the intermediate process into thought steps; 2. How to +generate +potential thoughts from each state; 3. How to heuristically +evaluate +states; 4. What +search +algorithm to use. +1. Thought decomposition. +While CoT samples thoughts coherently without explicit decomposition, ToT leverages problem properties to design and decompose intermediate thought steps. As Table 1 shows, depending on different problems, a thought could be a couple of words (Crosswords), a line of equation (Game of 24), or a whole paragraph of writing plan (Creative Writing). In general, a thought should be “small” enough so that LMs can generate promising and diverse samples (e.g. generating a whole book is usually too “big” to be coherent), yet “big” enough so that LMs can evaluate its prospect toward problem solving (e.g. generating one token is usually too “small” to evaluate). +2. Thought generator $G(p_{\theta},s,k)$. +Given a tree state $s=[x,z_{1\cdots i}]$, we consider two strategies to generate $k$ candidates for the next thought step: +(a) +Sample +i.i.d. 
thoughts from a CoT prompt (Creative Writing, Figure 4): $z^{(j)}\sim p_{\theta}^{CoT}(z_{i+1}|s)=p_{\theta}^{CoT}(z_{i+1}|x,z_{1\cdots i})\ (j=1\cdots k)$. This works better when the thought space is rich (e.g. each thought is a paragraph), and i.i.d. samples lead to diversity; +(b) +Propose +thoughts sequentially using a “propose prompt” (Game of 24, Figure 2; Crosswords, Figure 6): $[z^{(1)},\cdots,z^{(k)}]\sim p_{\theta}^{propose}(z_{i+1}^{(1\cdots k)}\mid s)$. This works better when the thought space is more constrained (e.g. each thought is just a word or a line), so proposing different thoughts in the same context avoids duplication. +3. State evaluator $V(p_{\theta},S)$. +Given a frontier of different states, the state evaluator evaluates the progress they make towards solving the problem, serving as a +heuristic +for the search algorithm to determine which states to keep exploring and in which order. While heuristics are a standard approach to solving search problems, they are typically either programmed (e.g. DeepBlue [3]) or learned (e.g. AlphaGo [29]). We propose a third alternative, by using the LM to deliberately reason about states. 
When applicable, such a deliberate heuristic can be more flexible than programmed rules, and more sample-efficient than learned models. +Similar to the thought generator, we consider two strategies to evaluate states either independently or together: +(a) +Value +each state independently: $V(p_{\theta},S)(s)\sim p_{\theta}^{value}(v|s)\ \forall s\in S$, where a value prompt reasons about the state $s$ to generate a scalar value $v$ (e.g. 1-10) or a classification (e.g. sure/likely/impossible) that could be heuristically turned into a value. The basis of such evaluative reasoning can vary across problems and thought steps. In this work, we explore evaluation via few +lookahead +simulations (e.g. quickly confirm that 5, 5, 14 can reach 24 via 5 + 5 + 14, or “hot_l” can mean “inn” via filling “e” in “_”) plus commonsense (e.g. 1 2 3 are too small to reach 24, or no word can start with “tzxc”). While the former might promote “good” states, the latter could help eliminate “bad” states. Such valuations do not need to be perfect, and only need to be approximately helpful for decision making. +(b) +Vote +across states: $V(p_{\theta},S)(s)=\mathbb{1}[s=s^{*}]$, where a “good” state $s^{*}\sim p_{\theta}^{vote}(s^{*}|S)$ is voted out based on deliberately comparing different states in $S$ in a vote prompt. +When problem success is harder to directly value (e.g. 
passage coherency), it is natural to instead compare different partial solutions and vote for the most promising one. This is similar in spirit to a “step-wise” self-consistency strategy, i.e. cast “which state to explore” as a multi-choice QA, and use LM samples to vote for it. +For both strategies, we could prompt the LM multiple times to aggregate the value or vote results to trade time/resource/cost for more faithful/robust heuristics. +Algorithm 1 ToT-BFS($x,p_{\theta},G,k,V,T,b$) +Input $x$, LM $p_{\theta}$, thought generator $G()$ & size limit $k$, states evaluator $V()$, step limit $T$, breadth limit $b$. +$S_{0}\leftarrow\{x\}$ +for $t=1,\cdots,T$ do +$S^{\prime}_{t}\leftarrow\{[s,z]\mid s\in S_{t-1},z_{t}\in G(p_{\theta},s,k)\}$ +$V_{t}\leftarrow V(p_{\theta},S^{\prime}_{t})$ +$S_{t}\leftarrow\arg\max_{S\subset S^{\prime}_{t},|S|=b}\sum_{s\in S}V_{t}(s)$ +end for +return $G(p_{\theta},\arg\max_{s\in S_{T}}V_{T}(s),1)$ 
+Algorithm 2 ToT-DFS($s,t,p_{\theta},G,k,V,T,v_{th}$) +Current state $s$, step $t$, LM $p_{\theta}$, thought generator $G()$ and size limit $k$, states evaluator $V()$, step limit $T$, threshold $v_{th}$ +if $t>T$ then record output $G(p_{\theta},s,1)$ +end if +for $s^{\prime}\in G(p_{\theta},s,k)$ do $\triangleright$ sorted candidates +if $V(p_{\theta},\{s^{\prime}\})(s)>v_{th}$ then $\triangleright$ pruning +DFS$(s^{\prime},t+1)$ +end if +end for +4. Search algorithm. +Finally, within the ToT framework, one can plug and play different search algorithms depending on the tree structure. We explore two relatively simple search algorithms and leave more advanced ones (e.g. A* [11], MCTS [2]) for future work: +(a) +Breadth-first search (BFS) +(Algorithm 1) maintains a set of the $b$ most promising states per step. This is used for Game of 24 and Creative Writing where the tree depth is limited ($T\leq 3$), and initial thought steps can be evaluated and pruned to a small set ($b\leq 5$). 
+(b) +Depth-first search (DFS) +(Algorithm 2) explores the most promising state first, until the final output is reached ($t>T$), or the state evaluator deems it impossible to solve the problem from the current $s$ ($V(p_{\theta},\{s\})(s)\leq v_{th}$ for a value threshold $v_{th}$). In the latter case, the subtree from $s$ is +pruned +to trade exploration for exploitation. In both cases, DFS +backtracks +to the parent state of $s$ to continue exploration. +Conceptually, ToT has several benefits as a method for general problem-solving with LMs: (1) +Generality. +IO, CoT, CoT-SC, and self-refinement can be seen as special cases of ToT (i.e. trees of limited depth and breadth; Figure 1). (2) +Modularity. +The base LM, as well as the thought decomposition, generation, evaluation, and search procedures can all be varied independently. (3) +Adaptability +. Different problem properties, LM capabilities, and resource constraints can be accommodated. (4) +Convenience. +No extra training is needed, just a pre-trained LM is sufficient. The next section will show how these conceptual benefits translate to strong empirical performance in different problems. +4 +Experiments +Game of 24 +Creative Writing +5x5 Crosswords +Input +4 numbers +(4 9 10 13) +4 random sentences +10 clues +(h1. presented;..) +Output +An equation to reach 24 +(13-9)*(10-4)=24 +A passage of 4 paragraphs ending in the 4 sentences +5x5 letters: +SHOWN; WIRRA; AVAIL; … +Thoughts +3 intermediate equations +(13-9=4 (left 4,4,10); 10-4=6 (left 4,6); 4*6=24) +A short writing plan +(1. Introduce a book that connects…) +Words to fill in for clues: +(h1. shown; v5. naled; …) +#ToT steps +3 +1 +5-10 (variable) +Table 1: +Task overview. Input, output, thought examples are in blue. 
+We propose three tasks that are hard even when sampling from the state-of-the-art language model, GPT-4 +[ +23 +] +, using standard IO prompting or chain-of-thought (CoT) prompting. We show how deliberate search in trees of thoughts (ToT) produces better results, and more importantly, interesting and promising new ways to use language models to solve problems requiring search or planning. +Unless otherwise stated, we perform experiments using a Chat Completion mode GPT-4 +1 +1 +1 +Experiments were done between May 5-16, 2023. +with a sampling temperature of 0.7. +4.1 +Game of 24 +Game of 24 is a mathematical reasoning challenge, where the goal is to use 4 numbers and basic arithmetic operations (+-*/) to obtain 24. +For example, given input “4 9 10 13”, a solution output could be “(10 - 4) * (13 - 9) = 24”. +Figure 2: +ToT in a game of 24. The LM is prompted for (a) thought generation and (b) valuation. +Method +Success +IO prompt +7.3% +CoT prompt +4.0% +CoT-SC +(k=100) +9.0% +ToT (ours) +(b=1) +45% +ToT (ours) +(b=5) +74% +IO + Refine +(k=10) +27% +IO +(best of 100) +33% +CoT +(best of 100) +49% +Table 2: +Game of 24 Results. +Figure 3: +Game of 24 (a) scale analysis & (b) error analysis. +Task Setup. +We scrape data from +4nums.com +, which has 1,362 games that are sorted from easy to hard by human solving time, and use a subset of relatively hard games indexed 901-1,000 for testing. For each task, we consider the output as success if it is a valid equation that equals 24 and uses the input numbers each exactly once. We report the success rate across 100 games as the metric. +Baselines. +We use a standard input-output (IO) prompt with 5 in-context examples. For chain-of-thought (CoT) prompting, we augment each input-output pair with 3 intermediate equations, each operating on two remaining numbers. For example, given input “4 9 10 13”, the thoughts could be “13 - 9 = 4 (left: 4 4 10); 10 - 4 = 6 (left: 4 6); 4 * 6 = 24 (left: 24)”. 
For each game, we sample IO and CoT prompting for 100 times for average performance. +We also consider a CoT self-consistency baseline, which takes the majority output from 100 CoT samples, and an iterative-refine approach on top of an IO sample for at most $10$ iterations. At each iteration, the LM is conditioned on all previous history to “reflect on your mistakes and generate a refined answer” if the output is incorrect. Note that it uses groundtruth feedback signals about equation correctness. +ToT Setup. +To frame Game of 24 into ToT, it is natural to decompose the thoughts into 3 steps, each an intermediate equation. As shown in Figure 2(a), at each tree node, we extract the remaining numbers and prompt the LM to propose some possible next steps. +The same “propose prompt” is used for all 3 thought steps, though it only has one example with 4 input numbers. +We perform a breadth-first search (BFS) in ToT, where at each step we keep the best $b=5$ candidates. +To perform deliberate BFS in ToT, as shown in Figure 2(b), we prompt LM to evaluate each thought candidate as “sure/maybe/impossible” with regard to reaching 24. The aim is to promote correct partial solutions that can be verdicted within few lookahead trials, and eliminate impossible partial solutions based on “too big/small” commonsense, and keep the rest “maybe”. We sample values $3$ times for each thought. +Results. +As shown in Table 2, IO, CoT, and CoT-SC prompting methods perform badly on the task, achieving only 7.3%, 4.0%, and 9.0% success rates. In contrast, ToT with a breadth of $b=1$ already achieves a success rate of $45\%$, while $b=5$ achieves $74\%$. +We also consider an oracle setup for IO/CoT, by calculating the success rate using best of $k$ samples $(1\leq k\leq 100)$. 
To compare IO/CoT (best of $k$) with ToT, we consider calculating the tree nodes visited per task in ToT across $b=1\cdots 5$, and map the 5 success rates in Figure 3(a), treating IO/CoT (best of $k$) as visiting $k$ nodes in a bandit. Not surprisingly, CoT scales better than IO, and best of 100 CoT samples achieve a success rate of $49\%$, but still much worse than exploring more nodes in ToT ($b>1$). +Error analysis. +Figure 3(b) breaks down at which step CoT and ToT samples fail the task, i.e. the thought (in CoT) or all $b$ thoughts (in ToT) are invalid or impossible to reach 24. Notably, around 60% of CoT samples already failed the task after generating the first step, or equivalently, the first three words (e.g. “$4+9$”). This highlights the issues with direct left-to-right decoding. +4.2 +Creative writing +Next, we invent a creative writing task where the input is 4 random sentences and the output should be a coherent passage with 4 paragraphs that end in the 4 input sentences respectively. +Such a task is open-ended and exploratory, and challenges creative thinking as well as high-level planning. +Task setup. +We sample random sentences from randomwordgenerator.com to form 100 inputs, and there is no groundtruth passage for each input constraint. As we find that GPT-4 can follow the input constraints most of the time, we focus on evaluating passage coherency in two ways: using a GPT-4 zero-shot prompt to provide a 1-10 scalar score, or using human judgments to compare pairs of outputs from different methods. For the former, we sample 5 scores and average them for each task output, and we find these 5 scores usually consistent, with a standard deviation of around $0.56$ on average across outputs. For the latter, we employ a subset of the authors in a blind study to compare the coherency of CoT vs. 
ToT generated passage pairs, where the order of passages is randomly flipped over 100 inputs. +Baselines. +Given the creative nature of the task, both IO and CoT prompts are zero-shot. While the former prompts the LM to directly generate a coherent passage given input constraints, the latter prompts the LM to first make a brief plan then write the passage, i.e. the plan serves as the intermediate thought step. We generate 10 IO and CoT samples per task. +We also consider an iterative-refine ($k\leq 5$) method on top of a random IO sample for each task, where the LM is conditioned on input constraints and the last generated passage to decide if the passage is already “perfectly coherent”, and if not generate a refined one. +ToT setup. +We build a ToT with depth 2 (and only 1 intermediate thought step) — the LM first generates $k=5$ plans and votes for the best one (Figure 4), then similarly generates $k=5$ passages based on the best plan and votes for the best one. Here the breadth limit is $b=1$, as only one choice is kept per step. A simple zero-shot vote prompt (“analyze choices below, then conclude which is most promising for the instruction”) is used to sample 5 votes at both steps. +Results. +Figure 5(a) shows average GPT-4 scores across 100 tasks, where ToT (7.56) is deemed to generate more coherent passages than IO (6.19) and CoT (6.93) on average. While such an automatic metric might be noisy, Figure 5(b) confirms the finding by showing that humans prefer ToT over CoT in 41 out of 100 passage pairs, while only preferring CoT over ToT in 21 (the other 38 pairs are found “similarly coherent”). Lastly, iterative-refine is more effective on this natural language task, where it improves IO coherency score from 6.19 to 7.67, and ToT coherency score from 7.56 to 7.91. 
+We believe it could be thought of as a third approach to thought generation in the ToT framework, where new thoughts can arise from refining old thoughts instead of being i.i.d. or sequentially generated. +Figure 4: +A step of deliberate search in a randomly picked Creative Writing task. Given the input, the LM samples 5 different plans, then votes 5 times to decide which plan is best. The majority choice is used to consequently write the output passage with the same sample-vote procedure. +Figure 5: +Creative Writing results. +| Method | Letter success (%) | Word success (%) | Game success (%) | +|---|---|---|---| +| IO | 38.7 | 14 | 0 | +| CoT | 40.6 | 15.6 | 1 | +| ToT (ours) | 78 | 60 | 20 | +| +best state | 82.4 | 67.5 | 35 | +| -prune | 65.4 | 41.5 | 5 | +| -backtrack | 54.6 | 20 | 5 | +Table 3: +Mini Crosswords results. +4.3 +Mini crosswords +Figure 6: +In Mini Crosswords, (a) how thoughts are proposed and aggregated in a priority queue for depth-first search (DFS), and (b) how a state is evaluated based on the possibility of filling in each remaining word clue, and pruned if any remaining clue is deemed not possible to fill by the LM. Then DFS backtracks to the parent state and explores the next promising thought for a clue. +In Game of 24 and Creative Writing, ToT is relatively shallow — at most 3 thought steps are needed to reach the final output. Here we explore $5\times 5$ mini crosswords as a harder search problem involving natural language. Again, the goal is not just to solve the task, as more general crosswords can be readily solved with specialized NLP pipelines [34] that leverage large-scale retrieval instead of LM. Rather, we aim to explore the limit of LM as a general problem solver that explores its own thoughts and guides its own exploration with deliberate reasoning as heuristics. +Task setup. +We scrape data from GooBix, which contains 156 games of $5\times 5$ mini crosswords. 
As we observe adjacent games contain similar clues, we use 20 games with indices $1,6,\cdots,91,96$ for testing, and games $136,141,146,151,156$ for prompting. +For each task, the input describes the 5 horizontal clues and 5 vertical clues, and the output should be a board of $5\times 5=25$ letters to solve the crosswords. For evaluation, we consider three levels of success: the portion of correct letters (25 per game), words (10 per game), and games. +Baselines. +We provide 5 example input-output pairs in the IO prompt, and in the CoT prompt additionally include intermediate words in the order h1..5 then v1..5. We run each prompt for 10 samples and average the results. +ToT setup. +We leverage a depth-first search (Algorithm 2) that keeps exploring the most promising subsequent word clue until the state is no longer promising, then backtracks to the parent state to explore alternative thoughts. +To make search tractable, subsequent thoughts are constrained not to change any filled words or letters, so that the ToT has at most 10 intermediate steps. +For thought generation, at each state we translate all existing thoughts (e.g. “h2.motor; h1.tasks” for the state in Figure 6(a)) into letter constraints for remaining clues (e.g. “v1.To heap: tm___;…”) and prompt a proposal prompt $5$ times to come up with candidates for where and what to fill in the next word. Importantly, we also prompt the LM to give a confidence level for different thoughts, and aggregate these across proposals to obtain a sorted list of next thoughts to explore (Figure 6(a)). +For state evaluations, we similarly translate each state into letter constraints for remaining clues, then evaluate for each clue if it is possible to fill given the constraints. If any remaining clue is deemed “impossible” to fill in (e.g. “v1. 
To heap: tm_s_”), then the exploration of the state’s subtree is pruned and DFS backtracks to its parent to explore the next promising thought. We limit DFS search steps to 100, and simply render the deepest explored state (the first explored one if multiple) into the final output. +Results. +As shown in Table 3, IO and CoT prompting methods perform poorly with a word-level success rate less than $16\%$, while ToT significantly improves all metrics, achieving a word-level success rate of $60\%$ and solving 4 out of 20 games. Such an improvement is not surprising, given IO and CoT lack mechanisms to try different clues, make changes to decisions, or backtrack. +Oracle and ablation studies. +When outputting from the oracle best DFS state (instead of the heuristically determined best state) per task, ToT performance is even higher and actually solves 7/20 games (Table 3, “+best state”), indicating our simple output heuristics can be readily improved. Interestingly, sometimes when the crosswords game is actually solved, the state evaluator might still deem some words as “impossible” and prune — possibly because $5\times 5$ crosswords by design have some rare or obsolete words that GPT-4 cannot recognize (footnote 2: for example, “agend” is an obsolete form of “agendum”, but GPT-4 deems it a typo for “agenda”; external retrieval or web interaction could augment LM for problem solving under knowledge uncertainty). +Given the state evaluation as a pruning heuristic is imperfect, we also explore ablating the pruning, and find the performance generally worse (Table 3, “-prune”). However, it could actually find the correct solution for 4/20 games (though only outputting 1 via heuristic), 3 of which are games ToT+pruning cannot solve within 100 steps. Thus, better heuristics for DFS pruning are critical for problem solving in this case. 
+Lastly, we confirm the importance of backtracking by running an ablation that keeps filling the most promising clue for at most 20 steps, allowing overwrites. This is similar to a “greedy” BFS search with breadth limit of $b=1$, and performs poorly with a word level success of only $20\%$ (Table 3, “-backtrack”). +5 +Related Work +Planning and decision making. +Smart planning and decision making are critical to achieving predefined goals. As they are trained on a vast amount of world knowledge and human examples, LMs are known to have already absorbed rich commonsense that makes it possible to propose reasonable plans conditioned on problem setting and environmental states [12, 42, 37, 13, 35, 41, 40]. Our proposed ToT approach extends existing planning formulations by considering multiple potentially feasible plans simultaneously at each problem-solving step, and proceeding with the most promising ones. The integration between thought sampling and value feedback organically integrates planning and decision-making mechanisms, enabling effective search inside a solution tree. On the other hand, traditional decision-making procedures usually require training dedicated reward and policy models as in reinforcement learning (for example CHAI [33]), whereas we use the LM itself to provide the value estimates for decision making. +RAP [9] is a concurrent work that treats language model reasoning as planning with its internal world model, and proposes an MCTS-based method similar to ToT. However, its tasks are simpler than ours, and its framework lacks the modularity to incorporate different tree search algorithms. +Self-reflection. +Using LLMs to assess the viability of their own predictions is becoming an increasingly important procedure in problem solving. +[28, 20, 24] introduced the “self-reflection” mechanism, in which LMs provide feedback to their generation candidates. 
+[4] improves LMs’ code generation accuracy by injecting feedback messages generated by the LM itself based on its code execution results. Similarly, [17] also introduces “critic” or review steps over the actions and states, deciding the next action to take in solving computer operation tasks. Another recent work very relevant to ours is “self-eval guided decoding” [39]. Similar to our method, self-eval decoding also follows a tree-search procedure with leaves sampled from stochastic beam search decoding, which are then evaluated by the LLM itself with carefully prepared self-eval prompts. Their approach, however, uses the PAL formulation [8] which represents thoughts as codes, which makes it difficult to tackle challenging tasks like creative writing which we consider in this paper. Our Tree-of-Thought formulation is thus more versatile and handles challenging tasks on which GPT-4 only achieves very low accuracy with standard prompts. +Program-guided LLM generation. +Our proposal is also related to recent advancements that organize LM’s behavior with systematic procedures [14, 44, 6, 43] or symbolic program guidance. For example, Schlag et al. [27] embed LMs in an algorithmic search procedure to help solve problems like question answering step-by-step, in which the search trees are expanded by relevant paragraphs that might provide answers. This approach, however, differs from ours in that trees are expanded by sampling external paragraphs instead of the LM’s own thoughts, and there are no reflection or voting steps. Another approach, LLM+P [18], goes one step further and delegates the actual planning process to a classical planner. +Classical search methods. +Last but not least, our approach can be treated as a modern rendition of classical search methods for problem solving. 
For example, it can be considered as a heuristic search algorithm like A* [10], in which the heuristic at each search node is provided by the LM’s self-assessment. From this perspective, our method is also related to NeuroLogic A*esque decoding [19], which is inspired by A* search but introduces look-ahead heuristics that are efficient for LMs to improve the beam-search or top-k sampling decoding. This method, however, is constrained to sentence generation tasks, whereas our framework is designed for complex, multi-step problem solving guided by value feedback. +6 +Discussion +Limitations and future directions. +Deliberate search such as ToT might not be necessary for many existing tasks that GPT-4 already excels at (see Appendix B.1), and as an initial step this work only explores three relatively simple tasks that challenge GPT-4 (see Appendix B.2 for some GPT-3.5 experiment results) and call for better search and planning abilities incorporated with LMs. However, as we begin to deploy LMs for more real-world decision making applications (e.g. coding, data analysis, robotics, etc.), more complex tasks could emerge and present new opportunities to study these research questions. Also, search methods like ToT require more resources (e.g. GPT-4 API cost) than sampling methods in order to improve task performances, but the modular flexibility of ToT allows users to customize such performance-cost tradeoffs, and ongoing open-source efforts [32] should readily reduce such costs in the near future. More details about cost and efficiency are in Appendix B.3. Lastly, this work focuses on using an off-the-shelf LM, and fine-tuning LMs using a ToT-style high-level counterfactual decision making (e.g. deliberating over potential choices for the next paragraph, instead of predicting the next token) might present opportunities to enhance the problem-solving capabilities of LMs. +Conclusion. 
+The associative “System 1” of LMs can be beneficially augmented by a “System 2” based on searching a tree of possible paths to the solution to a problem. The Tree of Thoughts framework provides a way to translate classical insights about problem-solving into actionable methods for contemporary LMs. At the same time, LMs address a weakness of these classical methods, providing a way to solve complex problems that are not easily formalized, such as creative writing. We see this intersection of LMs with classical approaches to AI as an exciting direction. +Broader Impact +ToT is a framework that empowers LMs to more autonomously and intelligently make decisions and solve problems. While current tasks are limited to reasoning and search problems, future applications involving interaction with external environments or humans could bring potential danger, e.g. facilitating harmful uses of LMs. On the other hand, ToT also improves the interpretability of model decisions and the opportunity for human alignment, as the resulting representations are readable, high-level language reasoning instead of implicit, low-level token values. +Acknowledgements +SY and KN acknowledge support from an Oracle Collaborative Research award and the National Science Foundation under Grant No. 2239363. Any opinions, findings, conclusions, or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation. SY is also supported by the Harold W. Dodds Fellowship from Princeton. +References +Brown et al. [2020] +T. Brown, B. Mann, N. Ryder, M. Subbiah, J. D. Kaplan, P. Dhariwal, +A. Neelakantan, P. Shyam, G. Sastry, A. Askell, et al. +Language models are few-shot learners. +Advances in neural information processing systems +, +33:1877–1901, 2020. +Browne et al. [2012] +C. Browne, E. J. Powley, D. Whitehouse, S. M. M. Lucas, P. I. Cowling, +P. Rohlfshagen, S. Tavener, D. P. Liebana, S. Samothrakis, and S. Colton. 
+A survey of monte carlo tree search methods. +IEEE Transactions on Computational Intelligence and AI in +Games +, 4:1–43, 2012. +Campbell et al. [2002] +M. Campbell, A. J. Hoane Jr, and F.-h. Hsu. +Deep blue. +Artificial intelligence +, 134(1-2):57–83, +2002. +Chen et al. [2023] +X. Chen, M. Lin, N. Schärli, and D. Zhou. +Teaching large language models to self-debug, 2023. +Chowdhery et al. [2022] +A. Chowdhery, S. Narang, J. Devlin, M. Bosma, G. Mishra, A. Roberts, P. Barham, +H. W. Chung, C. Sutton, S. Gehrmann, et al. +Palm: Scaling language modeling with pathways. +arXiv preprint arXiv:2204.02311 +, 2022. +Creswell and Shanahan [2022] +A. Creswell and M. Shanahan. +Faithful reasoning using large language models. +arXiv preprint arXiv:2208.14271 +, 2022. +Daw et al. [2005] +N. D. Daw, Y. Niv, and P. Dayan. +Uncertainty-based competition between prefrontal and dorsolateral +striatal systems for behavioral control. +Nature neuroscience +, 8(12):1704–1711, +2005. +Gao et al. [2023] +L. Gao, A. Madaan, S. Zhou, U. Alon, P. Liu, Y. Yang, J. Callan, and G. Neubig. +Pal: Program-aided language models, 2023. +Hao et al. [2023] +S. Hao, Y. Gu, H. Ma, J. J. Hong, Z. Wang, D. Z. Wang, and Z. Hu. +Reasoning with language model is planning with world model. +arXiv preprint arXiv:2305.14992 +, 2023. +Hart et al. [1968a] +P. E. Hart, N. J. Nilsson, and B. Raphael. +A formal basis for the heuristic determination of minimum cost paths. +IEEE Transactions on Systems Science and Cybernetics +, +4(2):100–107, 1968a. +doi: +10.1109/TSSC.1968.300136 +. +Hart et al. [1968b] +P. E. Hart, N. J. Nilsson, and B. Raphael. +A formal basis for the heuristic determination of minimum cost paths. +IEEE transactions on Systems Science and Cybernetics +, +4(2):100–107, 1968b. +Huang et al. [2022a] +W. Huang, P. Abbeel, D. Pathak, and I. Mordatch. +Language models as zero-shot planners: Extracting actionable +knowledge for embodied agents, 2022a. +Huang et al. [2022b] +W. Huang, F. Xia, T. 
Xiao, H. Chan, J. Liang, P. Florence, A. Zeng, J. Tompson, +I. Mordatch, Y. Chebotar, et al. +Inner monologue: Embodied reasoning through planning with language +models. +arXiv preprint arXiv:2207.05608 +, 2022b. +Jung et al. [2022] +J. Jung, L. Qin, S. Welleck, F. Brahman, C. Bhagavatula, R. L. Bras, and +Y. Choi. +Maieutic prompting: Logically consistent reasoning with recursive +explanations. +arXiv preprint arXiv:2205.11822 +, 2022. +Kahneman [2011] +D. Kahneman. +Thinking, fast and slow +. +Macmillan, 2011. +Kahneman et al. [2002] +D. Kahneman, S. Frederick, et al. +Representativeness revisited: Attribute substitution in intuitive +judgment. +Heuristics and biases: The psychology of intuitive judgment +, +49(49-81):74, 2002. +Kim et al. [2023] +G. Kim, P. Baldi, and S. McAleer. +Language models can solve computer tasks, 2023. +Liu et al. [2023] +B. Liu, Y. Jiang, X. Zhang, Q. Liu, S. Zhang, J. Biswas, and P. Stone. +Llm+p: Empowering large language models with optimal planning +proficiency, 2023. +Lu et al. [2021] +X. Lu, S. Welleck, P. West, L. Jiang, J. Kasai, D. Khashabi, R. L. Bras, +L. Qin, Y. Yu, R. Zellers, N. A. Smith, and Y. Choi. +Neurologic a*esque decoding: Constrained text generation with +lookahead heuristics. +In +North American Chapter of the Association for Computational +Linguistics +, 2021. +Madaan et al. [2023] +A. Madaan, N. Tandon, P. Gupta, S. Hallinan, L. Gao, S. Wiegreffe, U. Alon, +N. Dziri, S. Prabhumoye, Y. Yang, S. Welleck, B. P. Majumder, S. Gupta, +A. Yazdanbakhsh, and P. Clark. +Self-refine: Iterative refinement with self-feedback, 2023. +Newell et al. [1959] +A. Newell, J. C. Shaw, and H. A. Simon. +Report on a general problem solving program. +In +IFIP congress +, volume 256, page 64. Pittsburgh, PA, 1959. +Newell et al. [1972] +A. Newell, H. A. Simon, et al. +Human problem solving +. +Prentice-Hall, 1972. +OpenAI [2023] +OpenAI. +Gpt-4 technical report. +ArXiv +, abs/2303.08774, 2023. +Paul et al. [2023] +D. Paul, M. 
Ismayilzada, M. Peyrard, B. Borges, A. Bosselut, R. West, and +B. Faltings. +Refiner: Reasoning feedback on intermediate representations, 2023. +Radford et al. [2018] +A. Radford, K. Narasimhan, T. Salimans, I. Sutskever, et al. +Improving language understanding by generative pre-training. +OpenAI blog +, 2018. +Radford et al. [2019] +A. Radford, J. Wu, R. Child, D. Luan, D. Amodei, I. Sutskever, et al. +Language models are unsupervised multitask learners. +OpenAI blog +, 1(8):9, 2019. +Schlag et al. [2023] +I. Schlag, S. Sukhbaatar, A. Celikyilmaz, W. tau Yih, J. Weston, +J. Schmidhuber, and X. Li. +Large language model programs, 2023. +Shinn et al. [2023] +N. Shinn, B. Labash, and A. Gopinath. +Reflexion: an autonomous agent with dynamic memory and +self-reflection, 2023. +Silver et al. [2017] +D. Silver, J. Schrittwieser, K. Simonyan, I. Antonoglou, A. Huang, A. Guez, +T. Hubert, L. Baker, M. Lai, A. Bolton, et al. +Mastering the game of go without human knowledge. +nature +, 550(7676):354–359, 2017. +Sloman [1996] +S. A. Sloman. +The empirical case for two systems of reasoning. +Psychological bulletin +, 119(1):3, 1996. +Stanovich [1999] +K. E. Stanovich. +Who is rational? Studies of individual differences in +reasoning +. +Psychology Press, 1999. +Touvron et al. [2023] +H. Touvron, T. Lavril, G. Izacard, X. Martinet, M.-A. Lachaux, T. Lacroix, +B. Rozière, N. Goyal, E. Hambro, F. Azhar, et al. +Llama: Open and efficient foundation language models. +arXiv preprint arXiv:2302.13971 +, 2023. +Verma et al. [2022] +S. Verma, J. Fu, S. Yang, and S. Levine. +Chai: A chatbot ai for task-oriented dialogue with offline +reinforcement learning. +In +Proceedings of the 2022 Conference of the North American +Chapter of the Association for Computational Linguistics: Human Language +Technologies +, pages 4471–4491, 2022. +Wallace et al. [2022] +E. Wallace, N. Tomlin, A. Xu, K. Yang, E. Pathak, M. Ginsberg, and D. Klein. +Automated crossword solving. 
+arXiv preprint arXiv:2205.09665 +, 2022. +Wang et al. [2023a] +L. Wang, W. Xu, Y. Lan, Z. Hu, Y. Lan, R. K.-W. Lee, and E.-P. Lim. +Plan-and-solve prompting: Improving zero-shot chain-of-thought +reasoning by large language models, 2023a. +Wang et al. [2022] +X. Wang, J. Wei, D. Schuurmans, Q. Le, E. Chi, and D. Zhou. +Self-consistency improves chain of thought reasoning in language +models. +arXiv preprint arXiv:2203.11171 +, 2022. +Wang et al. [2023b] +Z. Wang, S. Cai, A. Liu, X. Ma, and Y. Liang. +Describe, explain, plan and select: Interactive planning with large +language models enables open-world multi-task agents, 2023b. +Wei et al. [2022] +J. Wei, X. Wang, D. Schuurmans, M. Bosma, E. Chi, Q. Le, and D. Zhou. +Chain of thought prompting elicits reasoning in large language +models. +arXiv preprint arXiv:2201.11903 +, 2022. +Xie et al. [2023] +Y. Xie, K. Kawaguchi, Y. Zhao, X. Zhao, M.-Y. Kan, J. He, and Q. Xie. +Decomposition enhances reasoning via self-evaluation guided decoding, +2023. +Yang et al. [2023] +S. Yang, O. Nachum, Y. Du, J. Wei, P. Abbeel, and D. Schuurmans. +Foundation models for decision making: Problems, methods, and +opportunities, 2023. +Yao et al. [2022] +S. Yao, J. Zhao, D. Yu, N. Du, I. Shafran, K. Narasimhan, and Y. Cao. +ReAct: Synergizing reasoning and acting in language models. +arXiv preprint arXiv:2210.03629 +, 2022. +Zhang et al. [2023] +S. Zhang, Z. Chen, Y. Shen, M. Ding, J. B. Tenenbaum, and C. Gan. +Planning with large language models for code generation. +In +The Eleventh International Conference on Learning +Representations +, 2023. +URL +https://openreview.net/forum?id=Lr8cOOtYbfL +. +Zhou et al. [2022] +D. Zhou, N. Schärli, L. Hou, J. Wei, N. Scales, X. Wang, D. Schuurmans, +C. Cui, O. Bousquet, Q. Le, et al. +Least-to-most prompting enables complex reasoning in large language +models. +arXiv preprint arXiv:2205.10625 +, 2022. +Zhu et al. [2022] +X. Zhu, J. Wang, L. Zhang, Y. Zhang, R. Gan, J. Zhang, and Y. Yang. 
+Solving math word problem via cooperative reasoning induced language +models. +arXiv preprint arXiv:2210.16257 +, 2022. +Appendix A +Code, Prompts, Trajectories +All code is available at +https://github.com/princeton-nlp/tree-of-thought-llm +. +All prompts are available at +https://github.com/princeton-nlp/tree-of-thought-llm/tree/master/src/tot/prompts +. +Trajectories are available at +https://github.com/princeton-nlp/tree-of-thought-llm/tree/master/logs +. +Appendix B +Additional Experiment Results +Given the motivation of exploring and extending the capability frontier of language models, our experiments in the main paper have focused on a setup with the state-of-the-art language model (GPT-4), and three hard tasks invented to challenge it. Here, we report additional experiments with weaker LLM or easier tasks, and discuss cost and efficiency. +GSM8K +StrategyQA +IO +51 +73 +CoT +86 +82 +ToT +90 +83 +Table 4: +New tasks with +zero-shot ToT and GPT-4. +GPT-4 +GPT-3.5 +IO +7.3% +6% +CoT +4.0% +3% +ToT +74% +19% +Table 5: +Game of 24 with +GPT-4 vs GPT-3.5. +GPT-4 +GPT-3.5 +IO +6.19 +4.47 +CoT +6.93 +5.16 +ToT +7.56 +6.62 +Table 6: +Creative Writing with +GPT-4 vs. GPT-3.5. +B.1 +Extension to new tasks (GSM8k, StrategyQA) with zero-shot ToT +While more common NLP tasks might be too easy for GPT-4 and do not require ToT (which is why we considered harder new tasks), we believe applying ToT to new tasks could be straightforward. 
For example, we implemented a simple and generic zero-shot ToT-BFS similar to creative writing (sample 5 problem solving strategies then vote for the best one; then sample 5 solutions based on the best strategy then vote for the best one) for GSM8K and StrategyQA with a few extra lines of code: +# define the answer format of new tasks +gsm8k_format = ‘"the answer is n" where n is a number’ +strategyqa_format = ‘either "the answer is yes" or "the answer is no"’ + +# define zero-shot io prompting +standard_prompt = ‘Answer the following question with {format}: {input}’ + +# define thought format for zero-shot cot and zero-shot tot +cot_prompt = ‘‘‘Answer the following question: {input} + +Make a strategy then write. Your output should be of the following format: + +Strategy: +Your strategy about how to answer the question. + +Answer: +Your answer to the question. It should end with {format}. +’’’ + +# define zero-shot voting used for zero-shot tot +vote_prompt = ‘‘‘Given an instruction and several choices, +decide which choice is most promising. +Analyze each choice in detail, then conclude in the last line +"The best choice is {s}", where s the integer id of the choice. +’’’ +We evaluated on a subset of 100 random GSM8K test and StrategyQA dev questions. As shown in Table 4 and as expected, ToT improves over CoT on both tasks (but only slightly, given GPT-4 + CoT is already very good on such tasks, and StrategyQA’s bottleneck is external knowledge, not reasoning). Considering computational costs, it is more suitable to try smaller LLMs + ToT for traditional NLP tasks, or GPT-4 + ToT for hard tasks that challenge GPT-4 + CoT’s reasoning. +B.2 +Extension to new LMs (GPT-3.5) +To understand how ToT works with other LLMs, we also ran GPT-3.5-turbo for Creative Writing (Table 6) and Game of 24 (Table 5). +On both tasks, “ToT $>$ CoT $>$ IO” remains true for GPT-3.5. 
+On Creative Writing, we find GPT-3.5+ToT outperforms GPT-4+IO, and is similar to GPT-4+CoT, which suggests ToT could also work well on weaker language models. +On Game of 24 (we changed 1-shot proposal prompt to 3-shot to make it work), GPT-3.5+ToT’s 19% is far worse than GPT-4+ToT’s 74%. To further understand the importance of generation vs. evaluation, we ran GPT-4 generation + GPT-3.5 evaluation (64%) and GPT-3.5 generation + GPT-4 evaluation (31%). This suggests the game’s bottleneck is thought generation, and different generation/evaluation language models might attain decent results while reducing costs. +B.3 +Cost and efficiency +Running ToT requires significantly more computations than IO or CoT prompting. For example, in Game of 24 (Table 7 below), solving a problem with ToT requires 5.5k completion tokens, close to 100 CoT trials (6.7k tokens). But the performance of ToT is better than best of 100 independent CoT trials. +| Game of 24 | Generate/Prompt tokens | Cost per case | Success | +|---|---|---|---| +| IO (best of 100) | 1.8k / 1.0k | \$0.13 | 33% | +| CoT (best of 100) | 6.7k / 2.2k | \$0.47 | 49% | +| ToT | 5.5k / 1.4k | \$0.74 | 74% | +Table 7: +Cost analysis on Game of 24. +On Creative Writing (Table 8 below), we found ToT takes around 5x completion tokens and money cost, which is intuitive as $b=5$ and most tokens are generated passages. +| Creative Writing | Generate/Prompt tokens | Cost per case | +|---|---|---| +| IO | 0.9k / 0.4k | \$0.06 | +| CoT | 0.9k / 0.4k | \$0.07 | +| ToT | 4k / 2.9k | \$0.32 | +Table 8: +Cost analysis on Creative Writing. +So completing Game of 24 and Creative Writing’s main ToT experiments cost around $0.74\times 100+0.32\times 100=106$ dollars. Crosswords’ DFS experiments should also be within $100$ dollars. In general, cost and efficiency of ToT highly depend on the prompts and search algorithms used, and could require 5-100 times more generated tokens than CoT. 
Some actionable insights: +• +We recommend using ToT on tasks requiring deliberate reasoning, on which CoT struggles. +• +Flexibility of ToT allows some performance-cost tradeoff, e.g., change beam size or vote number in BFS, few-shot vs. zero-shot prompting, GPT-3.5 vs. GPT-4, etc. One could configure the setup based on some resource constraints or performance goal. +• +There is much space for improving efficiency, e.g., BFS could early stop when solution is found, or trim down beam size to when some thoughts are ”impossible”. +• +We believe that more computation is indeed required in order for the model to achieve stronger intelligence, and this should not become a blocking issue as in the long run, (open-source) LMs will become much cheaper and more efficient. It is also a great direction how to better train/finetune LMs for thought generation and/or evaluation. +◄ +Feeling +lucky? +Conversion +report +Report +an issue +View original +on arXiv +► \ No newline at end of file diff --git a/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models.md b/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models.md new file mode 100644 index 0000000000000000000000000000000000000000..8112d1616db6d69cf8f81687e4e0653027333c1a --- /dev/null +++ b/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models.md @@ -0,0 +1,213 @@ +--- +title: '[2305.10601] Tree of Thoughts: Deliberate Problem Solving with Large Language + Models' +id: 230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models +tags: +- deepread +created: '2026-06-10T00:39:55.627654Z' +source: https://arxiv.org/abs/2305.10601 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:39:55.627506Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2305.10601] Tree of Thoughts: Deliberate Problem Solving with Large 
Language Models +Computer Science > Computation and Language +arXiv:2305.10601 +(cs) +[Submitted on 17 May 2023 ( +v1 +), last revised 3 Dec 2023 (this version, v2)] +Title: +Tree of Thoughts: Deliberate Problem Solving with Large Language Models +Authors: +Shunyu Yao +, +Dian Yu +, +Jeffrey Zhao +, +Izhak Shafran +, +Thomas L. Griffiths +, +Yuan Cao +, +Karthik Narasimhan +View a PDF of the paper titled Tree of Thoughts: Deliberate Problem Solving with Large Language Models, by Shunyu Yao and 6 other authors +View PDF +HTML (experimental) +Abstract: +Language models are increasingly being deployed for general problem solving across a wide range of tasks, but are still confined to token-level, left-to-right decision-making processes during inference. This means they can fall short in tasks that require exploration, strategic lookahead, or where initial decisions play a pivotal role. To surmount these challenges, we introduce a new framework for language model inference, Tree of Thoughts (ToT), which generalizes over the popular Chain of Thought approach to prompting language models, and enables exploration over coherent units of text (thoughts) that serve as intermediate steps toward problem solving. ToT allows LMs to perform deliberate decision making by considering multiple different reasoning paths and self-evaluating choices to decide the next course of action, as well as looking ahead or backtracking when necessary to make global choices. Our experiments show that ToT significantly enhances language models' problem-solving abilities on three novel tasks requiring non-trivial planning or search: Game of 24, Creative Writing, and Mini Crosswords. For instance, in Game of 24, while GPT-4 with chain-of-thought prompting only solved 4% of tasks, our method achieved a success rate of 74%. Code repo with all prompts: +this https URL +. +Comments: +NeurIPS 2023 camera ready version. 
Code repo with all prompts: +this https URL +Subjects: +Computation and Language (cs.CL) +; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) +Cite as: +arXiv:2305.10601 +[cs.CL] +(or +arXiv:2305.10601v2 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2305.10601 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Shunyu Yao [ +view email +] +[v1] +Wed, 17 May 2023 23:16:17 UTC (609 KB) +[v2] +Sun, 3 Dec 2023 22:50:35 UTC (623 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Tree of Thoughts: Deliberate Problem Solving with Large Language Models, by Shunyu Yao and 6 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2023-05 +Change to browse by: +cs +cs.AI +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +5 blog links +( +what is this? +) +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? 
+) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-2.md b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-2.md new file mode 100644 index 0000000000000000000000000000000000000000..3704173207d5ba9f751b4ab7e1bc04b1dcdef9a4 --- /dev/null +++ b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-2.md @@ -0,0 +1,4095 @@ +--- +title: '[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning + in Language Models' +id: 231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-2 +tags: +- deepread +created: '2026-06-10T00:40:44.608943Z' +source: https://ar5iv.labs.arxiv.org/html/2310.04406 +source_domain: ar5iv.labs.arxiv.org +fetched_at: '2026-06-10T00:40:44.608803Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2310.04406] Language 
Agent Tree Search Unifies Reasoning Acting and Planning in Language Models +Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models +Andy Zhou +University of Illinois at Urbana-Champaign +AI@UIUC +Kai Yan +University of Illinois at Urbana-Champaign +Michal Shlapentokh-Rothman +University of Illinois at Urbana-Champaign +Haohan Wang +University of Illinois at Urbana-Champaign +Yu-Xiong Wang +University of Illinois at Urbana-Champaign +Abstract +While large language models (LLMs) have demonstrated impressive performance on a range of decision-making tasks, they rely on simple acting processes and fall short of broad deployment as autonomous agents. We introduce LATS (Language Agent Tree Search), a general framework that synergizes the capabilities of LLMs in planning, acting, and reasoning. Drawing inspiration from Monte Carlo tree search commonly used in model-based reinforcement learning, LATS employs LLMs as agents, value functions, and optimizers, repurposing their latent strengths for enhanced decision-making. What is crucial in this method is the use of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that moves beyond the limitations of existing techniques. Our experimental evaluation across diverse domains, such as programming, HotPotQA, and WebShop, illustrates the applicability of LATS for decision-making while maintaining competitive reasoning performance. In particular, LATS achieves 94.4% for programming on HumanEval with GPT-4 and an average score of 75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness and generality of our method. +1 +Introduction +General autonomous agents capable of reasoning and decision-making in a variety of environments +(Wooldridge & Jennings, +1995 +) +have been of longstanding interest in the field of artificial intelligence. 
While this has traditionally been studied in reinforcement learning, the recent rise of large language models (LLMs) +(Brown et al., +2020 +; Chowdhery et al., +2022 +; Touvron et al., +2023 +; OpenAI, +2023 +) +with strong reasoning and general adaptability offers an alternative paradigm. Not only have LLMs excelled on standard NLP tasks such as text summarization +(Nallapati et al., +2016 +) +or natural language inference +(Bowman et al., +2015 +) +, but they have been adapted to an increasingly diverse set of tasks that often require advanced common-sense reasoning or quantitative skills +(Cobbe et al., +2021 +; Saparov & He, +2022 +) +. LLMs are also capable of performing in complex environments that involve knowledge and reasoning, such as web navigation +(Yao et al., +2022 +; Deng et al., +2023 +) +, tool-use +(Schick et al., +2023 +) +, or open-ended games +(Fan et al., +2022 +) +. +Figure 1: +An overview of LATS. LATS uses an external environment and self-reflection to improve reasoning and decision-making. +Reasoning and acting abilities have also been improved by prompting techniques that augment LLMs with feedback or observations from an external environment +(Yao et al., +2023b +; Gao et al., +2022 +; Shinn et al., +2023 +) +. This eliminates the need to rely entirely on the base abilities of the Language Model (LM), enhancing it through external tools or semantic feedback. Despite this strength, these methods are reflexive and fall short of humans’ deliberate and thoughtful decision-making characteristics to solve problems +(Sloman, +1996 +; Evans, +2010 +) +. In particular, such methods fail to consider multiple reasoning paths or to plan ahead. Recent search-guided LLM works +(Xie et al., +2023 +; Yao et al., +2023a +; Hao et al., +2023 +) +address this issue by searching over multiple reasoning chains. While these methods enable planning, these methods operate in isolation and do not incorporate external feedback that can improve reasoning. 
+To help address these issues, we propose LATS (Language Agent Tree Search), a general framework for decision-making and reasoning with language models. LATS unifies LM planning, acting, and reasoning strategies by expanding ReAct +(Yao et al., +2023b +) +into a search over a combinatorial space of possible reasoning and acting steps. We adapt Monte Carlo tree search (MCTS) from model-based reinforcement learning +(Silver et al., +2017 +; Anthony et al., +2017 +; Jiang et al., +2018 +) +to language agents, repurposing a pretrained LLM as an agent, value function, and optimizer. Utilizing the strong natural language understanding and in-context learning ability of modern LMs, we use text as an interface between each component of the framework, allowing LATS to adapt planning to environmental conditions without additional training. To the best of our knowledge, +LATS is the first framework that combines reasoning, acting, and planning to enhance LLMs +. Notably, LATS doubles the performance of GPT-3.5 on HotPotQA +(Yang et al., +2018 +) +over ReAct +(Yao et al., +2023b +) +and raises the average score by +22.1 +on WebShop +(Yao et al., +2022 +) +. When used with GPT-4, LATS achieves a +94.4 +Pass@1 rate for programming on HumanEval +(Chen et al., +2021 +) +, setting the state of the art. To summarize, our +contributions +are the following: +• +We introduce an LM-based Monte Carlo tree search variant to deliberately construct the best trajectory from sampled actions, enabling more flexible and adaptive problem-solving compared to reflexive prompting methods. This is guided by heuristics from the LM. +• +By integrating external feedback and self-reflection, LATS enhances model sensibility and enables agents to learn from experience, surpassing reasoning-based search methods.
+• +Through experiments across diverse domains like programming, interactive QA, and web navigation, we demonstrate the versatility of LATS in harnessing LLMs for autonomous reasoning and decision-making. +2 +Related Work +Approach +Reasoning +Acting +Planning +Self-Reflection +External Memory +CoT +(Wei et al., +2022 +) +✓ +× +× +× +× +ReAct +(Yao et al., +2023b +) +✓ +✓ +× +× +× +ToT +(Yao et al., +2023a +) +✓ +× +✓ +✓ +✓ +RAP +(Hao et al., +2023 +) +✓ +× +✓ +× +✓ +Self-Refine +(Madaan et al., +2023 +) +✓ +× +× +✓ +× +Beam Search +(Xie et al., +2023 +) +✓ +× +× +✓ +× +Reflexion +(Shinn et al., +2023 +) +✓ +✓ +× +✓ +✓ +LATS (Ours) +✓ +✓ +✓ +✓ +✓ +Table 1: +A summary of related work on reasoning, acting, and planning. LATS is the first work incorporating designs from all three domains, allowing use in all corresponding tasks. We refer to planning as the use of a search algorithm, self-reflection as the use of LM-generated feedback, and external memory as storing past text context for future updates of the solution. +a) Tree-of-Thoughts +b) Reasoning via Planning +c) Language Agent Tree Search +Figure 2: +An overview of the differences between LATS and recently proposed LM search algorithms ToT +(Yao et al., +2023a +) +and RAP +(Hao et al., +2023 +) +. LATS leverages environmental feedback and self-reflection to further adapt search and improve performance. +LLMs for reasoning. +For LLMs, reasoning typically involves decomposing complex inputs into sequential intermediate steps towards a final answer +(Cobbe et al., +2021 +) +, demonstrated with Chain-of-Thought (CoT) prompting +(Wei et al., +2022 +) +and its variants +(Wei et al., +2022 +; Kojima et al., +2022 +; Wang et al., +2022 +) +.
However, these methods, which create chains autoregressively in a single step, often suffer from error propagation as the number of steps increases +(Guo et al., +2018 +; Chen et al., +2022b +) +due to compound errors. Various advancements aim to mitigate this issue; some approaches, such as Self-Consistency +(Wang et al., +2022 +) +, employ majority voting over sampled chains, while others focus on multi-step decomposition, such as least-to-most prompting +(Zhou et al., +2022 +) +, or use of external tools such as a scratchpad +(Nye et al., +2021 +) +or compiler +(Gao et al., +2022 +) +. Recently, CoT has been improved with search algorithms +(Yao et al., +2023a +; Hao et al., +2023 +; Besta et al., +2023 +) +that can sample trajectories more effectively. Tree-of-thought (ToT) prompting +(Yao et al., +2023a +) +uses DFS or BFS-based search guided by an LM-generated heuristic while Reasoning via Planning (RAP) +(Hao et al., +2023 +) +uses MCTS with rollouts simulated by the LM. However, they rely solely on LM internal knowledge and cannot adapt to useful external feedback. +LLMs for acting. +The strong reasoning and common-sense abilities of LLMs have also been adapted for decision-making or acting tasks as a policy model in interactive environments. In the realm of robotics LLMs have been employed as high-level controllers of control policies +(Ahn et al., +2022 +; Huang et al., +2022 +; Driess et al., +2023 +) +. Similar work +(Baker et al., +2022 +; Wang et al., +2023 +; Zhu et al., +2023 +) +has also adapted LLM agents to complex multimodal games such as Minecraft +(Guss et al., +2019 +; Fan et al., +2022 +) +. LLMs are particularly useful in text-based environments +(Liu et al., +2018 +; Shridhar et al., +2020 +; Liu et al., +2023 +) +, where acting-based prompting techniques such as ReAct +(Yao et al., +2023b +) +have seen success. Similar to CoT, ReAct is limited by its simplicity and cannot effectively adapt to environment conditions. 
Many extensions have been proposed to address this, including Self-refine +(Madaan et al., +2023 +) +and Reflexion +(Shinn et al., +2023 +; Yao et al., +2023c +) +, which uses self-reflection to enhance reasoning and decision-making, and AdaPlanner +(Sun et al., +2023 +) +, which incorporates both positive and negative environmental feedback. However these methods focus on refining an individual plan or trajectory and do not consider alternative choices at each step. In addition, recent work +(Huang et al., +2023 +) +has suggested LLMs cannot self-correct their internal reasoning, making it critical to use external feedback. Alternatively to pure decision-making environments, the reasoning and practical abilities of LLMs have been enhanced by access to external tools, such as APIs, search engines, calculators, or other models +(Schick et al., +2023 +; Shen et al., +2023 +; Surís et al., +2023 +) +. Contrary to reasoning-based approaches, these methods have not been improved with planning, limiting their effectiveness. We summarize them in Tab. +1 +. +Tree-based search. +Tree-based search, where multiple branches of outcomes are explored during search, is widely used in many planning algorithms +(Świechowski et al., +2023 +; LaValle et al., +2001 +) +and Reinforcement Learning (RL) +(Hafner et al., +2019 +; Du et al., +2023 +; Wu et al., +2023 +) +algorithms for its good exploration-exploitation trade-off. Though tree-based search requires an environment model that can expand from arbitrary state +(Vodopivec et al., +2017 +) +, which often requires extra training in RL +(Hafner et al., +2023 +) +, such problem does not exist for LM tasks as we can conveniently backup to any state by setting the input to be the context and corresponding previous output by the LM. 
Thus, we work on the tree-based framework and use MCTS +(Świechowski et al., +2023 +) +to fully release the potential of LMs, while avoiding the cost of training a value function over language descriptions by leveraging the in-context learning +(Brown et al., +2020 +) +abilities of LLMs. +3 +Preliminaries +3.1 +Problem Setting and Prompting +Before describing LATS, we first define our problem and outline a few established methods that leverage large language models for reasoning or decision-making. In LM reasoning or decision making, we are given an input +$x$ +in natural language and a pretrained language model +$p_{\theta}(x)$ +parameterized by +$\theta$ +; our goal is to generate a final output +$y\sim p_{\theta}(x)$ +that corresponds to the answer (reasoning) or completes the task (decision-making). Both +$x$ +and +$y$ +are language +sequences +, which are comprised of a list of +tokens +(the basic elements of natural language, often words), denoted as +$x=(x[1],\dots,x[n])$ +and +$y=(y[1],\dots,y[n])$ +. The LM decodes text autoregressively, i.e., without other inputs, the probability for an LM to generate a sequence +$x$ +is given by +$p_{\theta}(x)=\prod_{i=1}^{n}p_{\theta}(x[i]|x[1\dots i-1])$ +. Usually, to improve the LM, +prompts +are provided along with the input +$x$ +, which are specific instructions or few-shot input-output examples.
We denote the generic process where an input +$x$ +is transformed into an output +$y$ +by LM: +$y\sim p_{\theta}(y|\texttt{prompt}_{IO}(x))$ +, where +$\texttt{prompt}_{IO}(x)$ +denotes the input +$x$ +. +Chain-of-thought (CoT) Prompting +(Wei et al., +2022 +) +was introduced to cater to scenarios where direct mapping from +$x$ +to +$y$ +is intricate, such as when +$x$ +is from a mathematical query or challenging question. This method hinges on creating +thoughts +$z_{1},\dots,z_{n}$ +that act as stepping stones between +$x$ +and +$y$ +; each thought +$z_{i}$ +is a language sequence. To employ CoT prompting, thoughts are extracted sequentially as +$z_{i}\sim p_{\theta}^{CoT}(z_{i}|x,z_{1\cdots i-1})$ +, with the final output being +$y\sim p_{\theta}^{CoT}(y|x,z_{1\cdots n})$ +. +Tree-of-thought (ToT) Prompting +(Yao et al., +2023a +) +extends CoT prompting by exploring multiple reasoning paths over thoughts. It frames problems as a search over a tree where each node +$s=[x,z_{1\cdots i}]$ +represents a partial solution state comprising the original input +$x$ +and thought sequence +$z_{1\cdots i}$ +.
Thoughts +$z_{i}$ +are generated by proposal or sampling with CoT +$z_{i}\sim p_{\theta}^{CoT}(z_{i}|x,z_{1\cdots i-1})$ +. Deliberate search algorithms like breadth-first or depth-first search are used to systematically explore the tree, guided by heuristics based on language model evaluations +$V(s)$ +of each state. +Reasoning via Planning +(RAP) +(Hao et al., +2023 +) +is similar to ToT, except that MCTS is used over DFS or BFS. Heuristics are designed from an LM, such as the likelihood or confidence of an action, and the LM is used as a world model to predict subsequent states during the simulation step. +ReAct +(Yao et al., +2023b +) +extends language models to tasks where the mapping from +$x$ +to +$y$ +is enhanced by or requires interactions with an external environment, such as a game or API. This technique constructs an action space +$\hat{A}=A\cup Z$ +that adds permissible actions +$a$ +to the reasoning traces +$z$ +from CoT. Observations +$o$ +from the environment are used to improve both reasoning and acting.
To solve problems with ReAct, after each observation, actions are generated from +$p_{\theta}$ +sequentially as +$a_{i}\sim p_{\theta}^{ReAct}(a_{i}|x,o_{1\cdots i-1},a_{1\cdots i-1})$ +, with the final output being +$y\sim p_{\theta}^{ReAct}(y|x,o_{1\cdots n},a_{1\cdots n})$ +. +While the previously described prompting techniques improve LM performance on reasoning tasks, they falter on difficult tasks that involve multifaceted decision-making due to several shortcomings: 1) +Flexibility +: Base prompting methods (CoT or ReAct) autoregressively sample from the LM, neglecting potential alternative continuations from specific states. 2) +Sensibility +: Reasoning-based methods (CoT, RAP, or ToT) rely solely on the internal representations of the LM and cannot consider external observations. This dependency risks fact hallucination and error propagation while setting a performance ceiling. 3) +Adaptability +: Current planning frameworks (RAP or ToT) use simple search algorithms such as BFS or cannot leverage environmental feedback to improve planning. Additionally, the agent is static and cannot reuse previous experience or learn from trial and error. While RAP also adopts MCTS, it is constrained to tasks where the LM can become a world model and accurately predict states. These shortcomings limit the ability of LMs to be deployed as general problem-solving agents and form the motivation for LATS.
+3.2 +Monte-Carlo Tree Search (MCTS) +Monte-Carlo Tree Search (MCTS) is a heuristic search algorithm that has proved successful in many decision-making environments such as Atari +(Ye et al., +2021 +) +and Go +(Silver et al., +2016 +) +. MCTS builds a decision tree where every node in the tree is a state and every edge is an action. MCTS runs for +$k$ +episodes; for each episode, it starts from the root (i.e., initial state) and iteratively conducts two steps to expand the tree: 1) +Expansion +, where multiple children states +$s$ +are explored from the current parent state +$p$ +by sampling +$n$ +actions, and 2) +Selection +, where the child with the highest UCT +(Upper Confidence bounds applied to Trees) +(Kocsis & Szepesvári, +2006 +) +value is selected for the next iteration. The UCT of a child state +$s$ +is calculated as follows: +$$UCT(s)=V(s)+w\sqrt{\frac{\ln N(p)}{N(s)}}, \tag{1}$$ +where +$N(s)$ +is the number of visits to a node +$s$ +, +$V(s)$ +is the value function (expected return) from the subtree of +$s$ +, +$w$ +is the exploration weight, and +$p$ +is the parent node of +$s$ +. The child node with the highest UCT value is selected for expansion in the next iteration. When the end of an episode is reached, a +backpropagation +is carried out: the return +$r$ +is used for updating every +$V(s)$ +along the path +with the formula +$V(s)=\frac{V_{\text{old}}(s)(N(s)-1)+r}{N(s)}$ +, where +$V_{\text{old}}(s)$ +is the old value function.
Normally, the major shortcoming of MCTS is that it requires an environment model to undo previous steps and form a searching tree, which is often a strong assumption. However, such a limitation does not exist for LMs, as we can conveniently reset to any step by simply copy-pasting historical text input. Such a special property is the key motivation of our work. +4 +Unifying Planning, Reasoning, and Acting +4.1 +LM Agent +LATS supports sequential reasoning or decision-making tasks on the basis of ReAct. At time step +$t$ +, an agent receives an observation +$o_{t}\in O$ +from the environment and takes an action +$a_{t}\in A$ +following some policy +$\pi(a_{t}|x,o_{1\cdots i-1},a_{1\cdots i-1})$ +, where +$x$ +consists of the task instruction and a number of few-shot examples. We initialize the agent with +$p_{\theta}$ +to leverage the useful language representations of an LM as a base decision-maker. We follow the ReAct instantiation in which the action space +$\hat{A}=A\cup Z$ +consists of both the space of permissible actions +$A$ +and language space of reasoning traces +$Z$ +. Actions directly affect the environment and result in observation, while thoughts are used to formalize decisions by organizing information, planning future actions, or injecting internal knowledge. The exact instantiation of the action space depends on the particular environment; for decision-making tasks actions might consist of commands on a website while for reasoning tasks the action space might be limited to a few external tools or APIs. +Instead of greedily decoding one trajectory or solution, we sample +$n$ +actions from +$p_{\theta}$ +using the current state.
This is based on the intuition that for complex decision-making tasks, there is likely to be a range of potential trajectories or reasoning paths that are correct +(Evans, +2010 +) +. Sampling a diverse set of candidates at each step mitigates the stochastic nature of LM text generation and enables greater exploration in both the decision-making and reasoning space. We wrap +$p_{\theta}$ +within our proposed search algorithm to deliberately construct the best trajectory from sampled actions. +4.2 +LATS +Figure 3: +An overview of the six operations of LATS. A node is +selected +, +expanded +, +evaluated +, then +simulated +until a terminal node is reached, then the resulting value is +backpropagated +. If the trajectory fails, a +reflection +is generated and used as additional context for future trials. These operations are performed in succession until the budget is reached or the task is successful. +The main component of LATS is a search algorithm that controls the overall problem-solving process with deliberate planning. To find the most promising trajectory and systematically balance exploration with exploitation, we adopt a variant of Monte Carlo Tree Search (MCTS) that frames decision-making as a tree search, in which each node +$s=[x,a_{1\cdots i},o_{1\cdots i}]$ +represents a state comprising the original input +$x$ +, action sequence +$a_{1\cdots i}$ +, and observation sequence +$o_{1\cdots i}$ +. +To adapt MCTS for language agents, LATS repurposes +$p_{\theta}$ +as an agent, state evaluator, and feedback generator, leveraging the useful language priors of modern LMs to facilitate planning. While standard MCTS and RAP +Hao et al.
( +2023 +) +rely on internal dynamics models to facilitate simulation, LATS is model-free and uses environment interaction. LATS consists of a series of operations, +selection, expansion, evaluation, simulation, backpropagation, and reflection +, performed in succession until the task is successfully completed or a computational limit is reached. The full pseudocode of LATS can be found in Sec. +A +in the Appendix. +Selection. +In the first operation, the algorithm identifies a segment of the current tree most suitable for subsequent expansion. Starting from the root node, denoted as the initial state +$s_{0}$ +, a child node is selected at each tree level until a leaf node is reached. To balance exploration and exploitation, we use the UCT algorithm as shown in Eq. +1 +. +Expansion. +After selecting a node, the second operation expands the tree by sampling +$n$ +actions from +$p_{\theta}$ +, as described in the prior section. The environment receives each action and returns corresponding feedback as an observation. This results in +$n$ +new child nodes added to the tree. This tree is stored in an external long-term memory structure. +Evaluation. +The third operation assigns a scalar value to each new child node to be used for selection and backpropagation. This value effectively quantifies the agent’s progress in task completion, serving as a heuristic to steer the search algorithm towards the most promising regions of the tree. Following +Yao et al. ( +2023a +) +we repurpose +$p_{\theta}$ +into a value function by prompting it to reason about a given state. To obtain a scalar value, we instruct +$p_{\theta}$ +to end its reasoning trace with a score indicating the correctness of the trajectory. This method offers enhanced flexibility over programmed heuristics +(Campbell et al., +2002 +) +and greater efficiency than learned heuristics +(Silver et al., +2017 +) +.
+Simulation. +The fourth operation expands the currently selected node until a terminal state is reached. At each depth level we sample and evaluate nodes with the same operations, but prioritize nodes of highest value. Reaching a terminal state provides objective feedback on the correctness of a trajectory. If the task is completed successfully, then LATS terminates the search. If the solution is partially successful or unsuccessful, then we perform two additional operations as described below. +Backpropagation. +This operation updates the values of the tree based on the outcome of a trajectory. For each node +$s_{0},s_{1},\dots,s_{n}$ +in the trajectory from root (initial state +$s_{0}$ +) of the searching tree to leaf (terminal state +$s_{n}$ +), its value is updated to reflect the outcome of the simulation by +$N(s_{i})=N_{\text{old}}(s_{i})+1$ +and +$V(s_{i})=\frac{r+N_{\text{old}}(s_{i})V_{\text{old}}(s_{i})}{N(s_{i})}$ +, where +$r$ +is the return and +$N_{\text{old}},V_{\text{old}}$ +are the old number of visits and value function. These updated values are used in the UCT formula (Eq. +1 +) to guide the selection of the next node for exploration. +Reflection. +In addition to the environmental feedback, we also leverage +self-reflection +to further refine the decision-making process +(Shinn et al., +2023 +; Madaan et al., +2023 +) +.
Upon encountering an unsuccessful terminal node, +$p_{\theta}$ +is prompted with the trajectory and final reward to provide a verbal self-reflection that summarizes the errors in the reasoning or acting process and proposes superior alternatives. We store both failed trajectories and corresponding reflections in the memory. In subsequent iterations, these are integrated as additional context to the agent and value function, refining both through in-context learning. This imparts a semantic gradient signal more useful than a scalar value, enabling the agent to learn from trial and error without the cost of expensive optimization processes such as reinforcement learning. +Conceptually, LATS has the following advantages as a general framework for reasoning and decision-making with LM agents. +(1) +Generality +: LATS supports both reasoning and decision-making tasks by defining a shared space of thoughts and actions. (2) +Deliberate +: The use of MCTS and LM value function ensures a principled search that selects options with high value while exploring promising alternatives. (3) +Adaptability +: LATS is designed around the use of external feedback through observations and self-reflection, enabling greater adaptation during problem-solving. (4) +Flexibility +: LATS can accommodate different scenarios, environments, and resource stipulations by modifying state design and tree dimensions. (5) +Modularity +: The base LM agent, reflection generator, and value function can be independently altered and adapted to individual LM properties. +5 +Experiments +To demonstrate the general applicability of LATS, we evaluate our method on a variety of decision-making domains that require both reasoning and acting ability: programming +(Chen et al., +2021 +; Austin et al., +2021 +) +, HotPotQA +(Yang et al., +2018 +) +, and WebShop +(Yao et al., +2022 +) +. 
+5.1 +HotPotQA +For a task that can be approached with both reasoning-based and acting-based strategies, we consider HotPotQA +(Yang et al., +2018 +) +, a multi-hop question-answering benchmark that requires retrieval over two or more Wikipedia passages. For the action space, in addition to LM thoughts we follow the setup from +Yao et al. ( +2023b +) +, which provides the agent with API calls to search and look up information. The output of these API calls and self-generated reflections form the observation space. We use a subset of 100 questions and three few-shot examples for each method. For ToT, we use DFS as the base search algorithm and scoring with the LM as the heuristic. For all methods that involve sampling, including LATS, we sample +$k=50$ +trajectories. More details and prompts can be found in Sec. +D +and Sec. +E +in the Appendix. +We evaluate internal reasoning strategies by removing actions and observations from the context, corresponding to CoT +(Wei et al., +2022 +) +and its variants, CoT-SC +(Wang et al., +2022 +) +, ToT +(Yao et al., +2023a +) +, and RAP +(Hao et al., +2023 +) +. These methods rely solely on the agent’s existing knowledge to answer the question. We also consider acting-based methods ReAct, Reflexion, and LATS, which augment the agent with the interactive API environment and primarily evaluate its information retrieval abilities. While LATS is designed for scenarios where external feedback can enhance reasoning, we also implement a reasoning-only version with CoT as the base prompt. We also combine internal and external reasoning in LATS by first prompting with a CoT-based prompt, then switching to a ReAct-based prompt upon failure. This is closer to how humans might approach this task, by using tools to look up additional information only when the answer is not already known. 
+Prompt Method +HotpotQA (EM) +I/O +0.32 +CoT +(Wei et al., +2022 +) +0.34 +CoT - SC +(Wang et al., +2022 +) +0.38 +ToT +(Yao et al., +2023a +) +0.55 +RAP +(Hao et al., +2023 +) +0.60 +RAP (n = 10) +0.60 +LATS (CoT) +0.60 +Prompt Method +HotpotQA (EM) +ReAct +(Yao et al., +2023b +) +0.32 +ReAct (best of k) +0.38 +Reflexion +(Shinn et al., +2023 +) +0.51 +LATS +0.61 +LATS (n = 3) +0.56 +LATS (n = 10) +0.64 +LATS (CoT + ReAct) +0.71 +Table 2: +GPT-3.5 reasoning-based prompting (left) and acting-based prompting (right) results on HotpotQA. LATS achieves the highest exact match (EM) for acting and is competitive on reasoning. Unless otherwise specified, we sample +$n=5$ +nodes during expansion and +$k=50$ +trajectories. +Results. +We observe in Tab. +2 +that both internal reasoning and external retrieval strategies perform well on HotPotQA. Due to their large-scale training corpus, modern LLMs already encode factual knowledge and can often directly answer the question correctly. While CoT can slightly enhance performance on questions requiring reasoning, larger gains are observed with search methods ToT and RAP, which can sample and explore more outputs. We observe similar results for acting-based methods. LATS surpasses ReAct, even when sampling the same number of trajectories, by expanding more nodes with principled search (see Fig. +5 +in Appendix +D +for a qualitative sample). This is demonstrated when modifying +$n$ +, the number of nodes expanded during each iteration. Increasing +$n$ +can consistently improve performance, although at greater computational and inference costs. LATS is also competitive to RAP on internal reasoning but performs worse than acting. Combining internal and external reasoning in LATS results in the highest performance, indicating the importance of external feedback in augmenting reasoning even in tasks the base LM can already perform. 
+5.2 +Programming +Prompt Method +Model +Pass@1 +CoT +(Wei et al., +2022 +) +GPT-3.5 +46.9 +ReAct +(Yao et al., +2023b +) +GPT-3.5 +56.9 +Reflexion +(Shinn et al., +2023 +) +GPT-3.5 +68.1 +ToT +(Yao et al., +2023a +) +GPT-3.5 +54.4 +RAP +(Hao et al., +2023 +) +GPT-3.5 +63.1 +LATS (Ours) +GPT-3.5 +83.8 +I/O +GPT-4 +80.1 +Reflexion +GPT-4 +91.0 +LATS +GPT-4 +94.4 +Prompt Method +Pass@1 +CoT +(Wei et al., +2022 +) +54.9 +ReAct +(Yao et al., +2023b +) +67.0 +Reflexion +(Shinn et al., +2023 +) +70.0 +ToT +(Yao et al., +2023a +) +65.8 +RAP +(Hao et al., +2023 +) +71.4 +LATS (Ours) +81.1 +Table 3: +GPT-3.5 and GPT-4 Pass@1 accuracy on HumanEval +(Chen et al., +2021 +) +and MBPP +(Austin et al., +2021 +) +. Prompting with LATS achieves the highest performance. We sample 5 solutions during expansion for +8 +iterations. +To demonstrate the importance of external observations for complex reasoning tasks, we evaluate the baselines and LATS on programming with HumanEval +(Chen et al., +2021 +) +and MBPP +(Austin et al., +2021 +) +. Both datasets measure the correctness of synthesized programs in Python from natural language docstrings. We use individual solutions as the action space and test suite and compiler feedback as the external observation. We follow +Chen et al. ( +2022a +) +and use an LLM to generate a synthetic test suite of syntactically valid “assert” statements for each question. For each step, the solution is evaluated on this test suite, and the results including successful and failed tests and compiler output, are added to the context as an observation. We use the same test suite for Reflexion. +For this task, the reasoning and acting baselines share an action space, but acting methods are able to incorporate observations as additional context. For LATS, since each action corresponds to a complete solution, we skip the simulation step of LATS and directly use the percentage of passed tests as the backpropagated reward. 
We use +$k=8$ +iterations, set the number of generated tests at +$4$ +, and sample +$n=5$ +solutions during expansion. After the search is completed, we select the solution with the highest value and evaluate it on the real test suite for the pass@1 accuracy evaluation. More details and prompts can be found in Sec. +D +and Sec. +F +in the Appendix. +Results. +We find in Tab. +3 +that both search and semantic feedback are crucial for better performance. Despite not using observations, ToT and RAP are competitive with Reflexion. LATS has the highest performance on both datasets. Since RAP uses a similar search algorithm as LATS, this reveals the importance of external feedback for difficult reasoning tasks such as programming. With GPT-4, using LATS sets the state of the art for HumanEval, showing LATS can be used with more advanced LLMs for higher performance. +5.3 +Webshop +For a complex decision-making environment with practical applications, we consider WebShop +(Yao et al., +2022 +) +, an online shopping environment composed of a website with 1.18M real-world products and 12k human instructions. Agents must navigate a website through a variety of commands to purchase an item matching a user specification. We use the preconstructed action space of search and click commands and browser feedback and reflections for the observation. The performance is gauged using two metrics: an average score, reflecting the percentage of user-specified attributes met by the selected product, and a success rate, indicating the frequency with which the chosen product fulfills all given conditions. We compare against acting-based prompting methods and RL-based approaches. We evaluate on 50 instructions, expand +$n=5$ +children for LATS, and set +$k=30$ +for LATS, ReAct best of +$k$ +, and Reflexion. More details and prompts are in Appendix +D +and +G +. +Results. +We find in Tab. 
+4 +that GPT-3.5 with ReAct is competitive to imitation learning, and can exceed reinforcement learning techniques with stronger prompting strategies. Sampling +$k=30$ +trajectories with ReAct and Reflexion results in a similar performance, suggesting the semantic feedback is not as helpful in complex environments like WebShop. Indeed like in +Shinn et al. ( +2023 +) +, we find that generated reflections are often generic and do not provide useful feedback, resulting in a tendency for the agent to become stuck in local minima. However, using LATS indeed results in a noticeable improvement, indicating a more effective exploration for the same number of iterations. +5.4 +Additional Observations +Method +Score +SR +ReAct +(Yao et al., +2023b +) +53.8 +28.0 +ReAct (best of k) +59.1 +32.0 +Reflexion +(Shinn et al., +2023 +) +64.2 +35.0 +LATS +75.9 +38.0 +IL +59.9 +29.1 +IL+RL +62.4 +28.7 +Fine-tuning +(Furuta et al., +2023 +) +67.5 +45.0 +Expert +82.1 +59.6 +Table 4: +Score and success rate (SR) on Webshop. Table is separated into prompting, RL-based training, and human performance. For the same number of iterations, LATS improves both score and success rate, and surpasses RL-based training. IL/IL+RL taken from +Yao et al. ( +2022 +) +. +Prompt Method +HotPotQA (EM) +ToT (ReAct) +0.39 +RAP (ReAct) +0.54 +LATS (No LM Heuristic) +0.37 +LATS (DFS) +0.42 +LATS (No Reflection) +0.56 +LATS +0.61 +Table 5: +Ablation results on LATS and baseline variants in HotPotQA; we use ReAct as the base prompt and sample +$n=5$ +children and +$k=50$ +maximum trajectories. LATS requires every component and operation for optimal performance. +We also conduct additional experiments on HotPotQA to demonstrate the effect of each component of LATS. We also design versions of ToT and RAP that use the ReAct prompt and can handle external observations. 
We use HotPotQA as our setup incorporates both reasoning (through thoughts) and acting (through API calls); the results are shown in Tab. +5 +. More ablations for token consumption on HotPotQA are in Tab. +7 +in Appendix +C +. Note that baselines generally perform worse than the reasoning-only setting of HotPotQA, which indicates that the acting-based setting is more challenging and adaptation of search algorithms to decision-making scenarios is non-trivial. +Self-reflection. +We use self-reflection to provide additional semantic signals for the agent. We observe a +$0.05$ +performance drop when removed from LATS, suggesting this is useful. This is a smaller gain than Reflexion +(Shinn et al., +2023 +) +observes over ReAct +(Yao et al., +2023b +) +as shown in Tab. +2 +, suggesting overlap between the types of questions where there is an improvement with self-reflection and search. This variant outperforms RAP-ReAct, reflecting our improvements to MCTS. +Search Algorithm. +MCTS is a more principled search algorithm than variants like A* or DFS search and the basis for observed performance gains. We observe the effects of using DFS, and incorporate the LM-based heuristic used in ToT +(Yao et al., +2023a +) +in which branches with low values are pruned. This removes the selection and backpropagation operations, and we observe a +$0.08$ +drop in performance when sampling the same number of nodes, but outperforms ToT-ReAct. +6 +Conclusion +In this work, we introduce Language Agent Tree Search (LATS), the first framework to unify planning, acting, and reasoning for enhanced LLM problem solving. By deliberately constructing trajectories with search algorithms, incorporating external feedback, and enabling agents to learn from experience, LATS addresses key limitations of prior prompting techniques. 
Our evaluations demonstrate the ability of LATS to harness LLM capabilities for a variety of decision-making tasks while keeping its reasoning ability without additional training. The proposed synergies between search, interaction, and reflection offer a versatile approach to autonomous decision-making, highlighting the potential of LLMs as generalist agents. A full discussion of the limitations and broader impacts is in Appendix +B +. +References +Ahn et al. (2022) +Michael Ahn, Anthony Brohan, Noah Brown, Yevgen Chebotar, Omar Cortes, Byron David, Chelsea Finn, Chuyuan Fu, Keerthana Gopalakrishnan, Karol Hausman, Alex Herzog, Daniel Ho, Jasmine Hsu, Julian Ibarz, Brian Ichter, Alex Irpan, Eric Jang, Rosario Jauregui Ruano, Kyle Jeffrey, Sally Jesmonth, Nikhil J Joshi, Ryan Julian, Dmitry Kalashnikov, Yuheng Kuang, Kuang-Huei Lee, Sergey Levine, Yao Lu, Linda Luu, Carolina Parada, Peter Pastor, Jornell Quiambao, Kanishka Rao, Jarek Rettinghouse, Diego Reyes, Pierre Sermanet, Nicolas Sievers, Clayton Tan, Alexander Toshev, Vincent Vanhoucke, Fei Xia, Ted Xiao, Peng Xu, Sichun Xu, Mengyuan Yan, and Andy Zeng. +Do as i can, not as i say: Grounding language in robotic affordances. +arXiv:2204.01691 +, 2022. +Anthony et al. (2017) +T. Anthony, Z. Tian, and D. Barber. +Thinking fast and slow with deep learning and tree search. +In +NIPS +, 2017. +Austin et al. (2021) +Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, et al. +Program synthesis with large language models. +arXiv:2108.07732 +, 2021. +Baker et al. (2022) +Bowen Baker, Ilge Akkaya, Peter Zhokhov, Joost Huizinga, Jie Tang, Adrien Ecoffet, Brandon Houghton, Raul Sampedro, and Jeff Clune. +Video pretraining (vpt): Learning to act by watching unlabeled online videos. +arXiv:2206.11795 +, 2022. +Besta et al. 
(2023) +Maciej Besta, Nils Blach, Ales Kubicek, Robert Gerstenberger, Lukas Gianinazzi, Joanna Gajda, Tomasz Lehmann, Michal Podstawski, Hubert Niewiadomski, Piotr Nyczyk, and Torsten Hoefler. +Graph of thoughts: Solving elaborate problems with large language models. +arXiv:2308.09687 +, 2023. +Bowman et al. (2015) +Samuel R Bowman, Gabor Angeli, Christopher Potts, and Christopher D Manning. +A large annotated corpus for learning natural language inference. +In +EMNLP +, 2015. +Brown et al. (2020) +Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. +Language models are few-shot learners. +In +NeurIPS +, 2020. +Campbell et al. (2002) +Murray Campbell, A Joseph Hoane Jr, and Feng-hsiung Hsu. +Deep blue. +Artificial intelligence +, 2002. +Chen et al. (2022a) +Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen. +Codet: Code generation with generated tests. +arXiv:2207.10397 +, 2022a. +Chen et al. (2021) +Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. +Evaluating large language models trained on code. +arXiv:2107.03374 +, 2021. +Chen et al. (2022b) +Wenhu Chen, Xueguang Ma, Xinyi Wang, and William W Cohen. +Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. +arXiv preprint arXiv:2211.12588 +, 2022b. +Chowdhery et al. 
(2022) +Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. +Palm: Scaling language modeling with pathways. +arXiv:2204.02311 +, 2022. +Cobbe et al. (2021) +Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al. +Training verifiers to solve math word problems. +arXiv:2110.14168 +, 2021. +Deng et al. (2023) +Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Samuel Stevens, Boshi Wang, Huan Sun, and Yu Su. +Mind2web: Towards a generalist agent for the web. +arXiv:2306.06070 +, 2023. +Driess et al. (2023) +Danny Driess, Fei Xia, Mehdi S. M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Pete Florence. +Palm-e: An embodied multimodal language model. +arXiv:2303.03378 +, 2023. +Du et al. (2023) +Yilun Du, Mengjiao Yang, Bo Dai, Hanjun Dai, Ofir Nachum, Joshua B. Tenenbaum, Dale Schuurmans, and Pieter Abbeel. +Learning universal policies via text-guided video generation. +arXiv:2302.00111 +, 2023. +Evans (2010) +Jonathan St BT Evans. +Intuition and reasoning: A dual-process perspective. +Psychological Inquiry +, 2010. +Fan et al. (2022) +Linxi Fan, Guanzhi Wang, Yunfan Jiang, Ajay Mandlekar, Yuncong Yang, Haoyi Zhu, Andrew Tang, De-An Huang, Yuke Zhu, and Anima Anandkumar. +Minedojo: Building open-ended embodied agents with internet-scale knowledge. +In +NeurIPS Datasets and Benchmarks Track +, 2022. +Furuta et al. (2023) +Hiroki Furuta, Ofir Nachum, Kuang-Huei Lee, Yutaka Matsuo, Shixiang Shane Gu, and Izzeddin Gur. +Multimodal web navigation with instruction-finetuned foundation models. 
+arXiv preprint arXiv:2305.11854 +, 2023. +Gao et al. (2022) +Luyu Gao, Aman Madaan, Shuyan Zhou, Uri Alon, Pengfei Liu, Yiming Yang, Jamie Callan, and Graham Neubig. +Pal: Program-aided language models. +arXiv preprint arXiv:2211.10435 +, 2022. +Guo et al. (2018) +Jiaxian Guo, Sidi Lu, Han Cai, Weinan Zhang, Yong Yu, and Jun Wang. +Long text generation via adversarial training with leaked information. +AAAI +, 2018. +Guss et al. (2019) +William H. Guss, Brandon Houghton, Nicholay Topin, Phillip Wang, Cayden Codel, Manuela Veloso, and Ruslan Salakhutdinov. +Minerl: A large-scale dataset of minecraft demonstrations. +In +IJCAI +, 2019. +Hafner et al. (2019) +Danijar Hafner, Timothy Lillicrap, Ian Fischer, Ruben Villegas, David Ha, Honglak Lee, and James Davidson. +Learning latent dynamics for planning from pixels. +In +ICML +, 2019. +Hafner et al. (2023) +Danijar Hafner, Jurgis Pasukonis, Jimmy Ba, and Timothy Lillicrap. +Mastering diverse domains through world models. +arXiv:2301.04104 +, 2023. +Hao et al. (2023) +Shibo Hao, Yi Gu, Haodi Ma, Joshua Jiahua Hong, Zhen Wang, Daisy Zhe Wang, and Zhiting Hu. +Reasoning with language model is planning with world model. +arXiv:2305.14992 +, 2023. +Huang et al. (2023) +Jie Huang, Xinyun Chen, Swaroop Mishra, Huaixiu Steven Zheng, Adams Wei Yu, Xinying Song, and Denny Zhou. +Large language models cannot self-correct reasoning yet. +arXiv:2310.01798 +, 2023. +Huang et al. (2022) +Wenlong Huang, Fei Xia, Ted Xiao, Harris Chan, Jacky Liang, Pete Florence, Andy Zeng, Jonathan Tompson, Igor Mordatch, Yevgen Chebotar, et al. +Inner monologue: Embodied reasoning through planning with language models. +arXiv:2207.05608 +, 2022. +Jiang et al. (2018) +D. Jiang, E. Ekwedike, and H. Liu. +Feedback-based tree search for reinforcement learning. +In +ICML +, 2018. +Kocsis & Szepesvári (2006) +Levente Kocsis and Csaba Szepesvári. +Bandit based monte-carlo planning. +In +ECML +, 2006. +Kojima et al. 
(2022) +Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. +Large language models are zero-shot reasoners. +arXiv:2205.11916 +, 2022. +LaValle et al. (2001) +Steven M LaValle, James J Kuffner, BR Donald, et al. +Rapidly-exploring random trees: Progress and prospects. +Algorithmic and computational robotics: new directions +, 2001. +Liu et al. (2018) +Evan Zheran Liu, Kelvin Guu, Panupong Pasupat, Tianlin Shi, and Percy Liang. +Reinforcement learning on web interfaces using workflow-guided exploration. +In +ICLR +, 2018. +Liu et al. (2023) +Xiao Liu, Hao Yu, Hanchen Zhang, Yifan Xu, Xuanyu Lei, Hanyu Lai, Yu Gu, Hangliang Ding, Kaiwen Men, Kejuan Yang, Shudan Zhang, Xiang Deng, Aohan Zeng, Zhengxiao Du, Chenhui Zhang, Sheng Shen, Tianjun Zhang, Yu Su, Huan Sun, Minlie Huang, Yuxiao Dong, and Jie Tang. +Agentbench: Evaluating llms as agents. +arXiv:2308.03688 +, 2023. +Madaan et al. (2023) +Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, Shashank Gupta, Bodhisattwa Prasad Majumder, Katherine Hermann, Sean Welleck, Amir Yazdanbakhsh, and Peter Clark. +Self-refine: Iterative refinement with self-feedback. +arXiv:2303.17651 +, 2023. +Nallapati et al. (2016) +Ramesh Nallapati, Bowen Zhou, Cicero dos Santos, Caglar Gulcehre, and Bing Xiang. +Abstractive text summarization using sequence-to-sequence rnns and beyond. +In +SIGNLL +, 2016. +Nye et al. (2021) +Maxwell Nye, Anders Johan Andreassen, Guy Gur-Ari, Henryk Michalewski, Jacob Austin, David Bieber, David Dohan, Aitor Lewkowycz, Maarten Bosma, David Luan, et al. +Show your work: Scratchpads for intermediate computation with language models. +arXiv:2112.00114 +, 2021. +OpenAI (2023) +OpenAI. +Gpt-4 technical report. +arXiv:2303.08774 +, 2023. +Saparov & He (2022) +Abulhair Saparov and He He. +Language models are greedy reasoners: A systematic formal analysis of chain-of-thought. 
+arXiv:2210.01240 +, 2022. +Schick et al. (2023) +Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta Raileanu, Maria Lomeli, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. +Toolformer: Language models can teach themselves to use tools. +arXiv:2302.04761 +, 2023. +Shen et al. (2023) +Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, and Yueting Zhuang. +Hugginggpt: Solving ai tasks with chatgpt and its friends in huggingface. +arXiv:2303.17580 +, 2023. +Shinn et al. (2023) +Noah Shinn, Federico Cassano, Beck Labash, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao. +Reflexion: Language agents with verbal reinforcement learning. +arXiv:2303.11366 +, 2023. +Shridhar et al. (2020) +Mohit Shridhar, Xingdi Yuan, Marc-Alexandre Côté, Yonatan Bisk, Adam Trischler, and Matthew Hausknecht. +Alfworld: Aligning text and embodied environments for interactive learning. +arXiv:2010.03768 +, 2020. +Silver et al. (2016) +David Silver, Aja Huang, Chris J Maddison, Arthur Guez, Laurent Sifre, George Van Den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al. +Mastering the game of go with deep neural networks and tree search. +nature +, 2016. +Silver et al. (2017) +David Silver, Julian Schrittwieser, Karen Simonyan, Ioannis Antonoglou, Aja Huang, Arthur Guez, Thomas Hubert, Lucas baker, Matthew Lai, Adrian Bolton, Yutian Chen, Timothy P. Lillicrap, Fan Hui, L. Sifre, George van den Driessche, Thore Graepel, and Demis Hassabis. +Mastering the game of go without human knowledge. +Nature +, 2017. +Sloman (1996) +Steven A. Sloman. +The empirical case for two systems of reasoning. +Psychological Bulletin +, 1996. +Sun et al. (2023) +Haotian Sun, Yuchen Zhuang, Lingkai Kong, Bo Dai, and Chao Zhang. +Adaplanner: Adaptive planning from feedback with language models. +arXiv:2305.16653 +, 2023. +Surís et al. (2023) +Dídac Surís, Sachit Menon, and Carl Vondrick. +Vipergpt: Visual inference via python execution for reasoning. 
+arXiv preprint arXiv:2303.08128 +, 2023. +Świechowski et al. (2023) +Maciej Świechowski, Konrad Godlewski, Bartosz Sawicki, and Jacek Mańdziuk. +Monte carlo tree search: A review of recent modifications and applications. +Artificial Intelligence Review +, 2023. +Touvron et al. (2023) +Hugo Touvron, Louis Martin, Kevin R. Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Daniel M. Bikel, Lukas Blecher, Cristian Cantón Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony S. Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel M. Kloumann, A. V. Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, R. Subramanian, Xia Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zhengxu Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and +Thomas Scialom. +Llama 2: Open foundation and fine-tuned chat models. +arXiv:2307.09288 +, 2023. +Vodopivec et al. (2017) +Tom Vodopivec, Spyridon Samothrakis, and Branko Ster. +On monte carlo tree search and reinforcement learning. +Journal of Artificial Intelligence Research +, 2017. +Wang et al. (2023) +Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, and Anima Anandkumar. +Voyager: An open-ended embodied agent with large language models. +arXiv:2305.16291 +, 2023. +Wang et al. (2022) +Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, and Denny Zhou. +Self-consistency improves chain of thought reasoning in language models. 
+arXiv:2203.11171 +, 2022. +Wei et al. (2022) +Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Chi, Quoc Le, and Denny Zhou. +Chain of thought prompting elicits reasoning in large language models. +arXiv:2201.11903 +, 2022. +Wooldridge & Jennings (1995) +Michael Wooldridge and Nicholas R Jennings. +Intelligent agents: Theory and practice. +The knowledge engineering review +, 1995. +Wu et al. (2023) +Philipp Wu, Alejandro Escontrela, Danijar Hafner, Pieter Abbeel, and Ken Goldberg. +Daydreamer: World models for physical robot learning. +In +CoRL +. PMLR, 2023. +Xie et al. (2023) +Yuxi Xie, Kenji Kawaguchi, Yiran Zhao, Xu Zhao, Min-Yen Kan, Junxian He, and Qizhe Xie. +Decomposition enhances reasoning via self-evaluation guided decoding. +arXiv:2305.00633 +, 2023. +Yang et al. (2018) +Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning. +Hotpotqa: A dataset for diverse, explainable multi-hop question answering. +arXiv:1809.09600 +, 2018. +Yao et al. (2022) +Shunyu Yao, Howard Chen, John Yang, and Karthik R Narasimhan. +Webshop: Towards scalable real-world web interaction with grounded language agents. +In +NeurIPS +, 2022. +Yao et al. (2023a) +Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Thomas L. Griffiths, Yuan Cao, and Karthik Narasimhan. +Tree of thoughts: Deliberate problem solving with large language models. +arXiv:2305.10601 +, 2023a. +Yao et al. (2023b) +Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. +ReAct: Synergizing reasoning and acting in language models. +In +ICLR +, 2023b. +Yao et al. (2023c) +Weiran Yao, Shelby Heinecke, Juan Carlos Niebles, Zhiwei Liu, Yihao Feng, Le Xue, Rithesh Murthy, Zeyuan Chen, Jianguo Zhang, Devansh Arpit, Ran Xu, Phil Mui, Huan Wang, Caiming Xiong, and Silvio Savarese. +Retroformer: Retrospective large language agents with policy gradient optimization. +arXiv preprint arXiv:2308.02151 +, 2023c. 
+Ye et al. (2021) +Weirui Ye, Shaohuai Liu, Thanard Kurutach, Pieter Abbeel, and Yang Gao. +Mastering atari games with limited data. +In +NeurIPS +, 2021. +Zhou et al. (2022) +Denny Zhou, Nathanael Schärli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Olivier Bousquet, Quoc Le, and Ed Chi. +Least-to-most prompting enables complex reasoning in large language models. +arXiv:2205.10625 +, 2022. +Zhu et al. (2023) +Xizhou Zhu, Yuntao Chen, Hao Tian, Chenxin Tao, Weijie Su, Chenyu Yang, Gao Huang, Bin Li, Lewei Lu, Xiaogang Wang, Yu Qiao, Zhaoxiang Zhang, and Jifeng Dai. +Ghost in the minecraft: Generally capable agents for open-world environments via large language models with text-based knowledge and memory. +arXiv:2305.17144 +, 2023. +7 +Appendix +The appendix is organized as follows. First in Sec. +A +, we show the pseudocode of our proposed algorithm, LATS; then in Sec. +B +, we provide further discussion of our method and its limitations, future direction and broader impact; then in Sec. +C +we provide additional experimental results; then in Sec. +D +, we specify the environment details in our experiments; finally, we list our prompts used for the three environments in Sec. +E +(HotPotQA), Sec. +F +(Programming) and Sec. +G +(Webshop) respectively. +Appendix A +LATS Pseudocode +Alg. +1 +shows the pseudocode of our algorithm LATS. Nodes are stored explicitly in the memory. Unless otherwise specified, in all experiments we use +$n=5$ +and +$w=1$ +. 
+Algorithm 1 +$\operatorname{LATS}(S_{0},p_{\theta},p_{V},p_{\text{ref}},d,k,n,w)$ +Initial state +$s_{1}$ +, action generator +$p_{\theta}$ +, value function +$p_{V}$ +, reflection generator +$p_{\text{ref}}$ +, number of generated actions +$n$ +, depth limit +$L$ +, number of roll-outs +$K$ +, context +$c$ +, and exploration weight +$w$ +Initialize action space +$A$ +, observation space +$O$ +Initialize the state-action value function +$p_{V}:S\times A\mapsto\mathbb{R}$ +and visit counter +$N:S\mapsto\mathbb{N}$ +to zero +for +$k\leftarrow 0,\dots,K-1$ +do +for +$t\leftarrow 0,\dots,L-1$ +do +if +$s_{t}$ +not terminal +then +$\triangleright$ +Expansion & Simulation +for +$i\leftarrow 1,\dots,n$ +do +Sample +$a_{t}^{(i)}\sim p_{\theta}(a\mid s_{t})$ +Get +$o_{t}^{(i)}$ +from environment, +$s_{t+1}^{(i)}\leftarrow(c_{t}^{(i)},o_{t}^{(i)},a_{t}^{(i)})$ +, +$c_{t+1}^{(i)}\leftarrow(o_{t}^{(i)},a_{t}^{(i)})$ 
+Evaluate +$V_{t}^{(i)}\sim p_{V}(s_{t}^{(i)})$ +$\triangleright$ +Evaluation +$V(s_{t})\leftarrow V_{t}^{(i)}$ +Add +$s_{t}^{(i)}$ +to children +end +for +end +if +if +$s_{t}$ +is terminal +then +$\triangleright$ +Reflection +Get +$r$ +from environment +if +$r$ +not success +then +$\text{reflection}\leftarrow p_{\text{ref}}(c_{t})$ +$c\leftarrow\text{reflection}$ +end +if +end +if +$a_{t}\leftarrow\arg\max_{a\in e(s_{t})}\left[V(s_{t})+w\sqrt{\frac{\ln N(s_{t-1})}{N(s_{t})}}\right]$ +$\triangleright$ +Selection +$N(s_{t+1})\leftarrow N(s_{t+1})+1$ +if +$a_{t}$ +is an output action +then +break +end +for +$T\leftarrow$ +the actual number of steps +for +$t\leftarrow T-1,\dots,0$ +do +$\triangleright$ +Backpropagation +$V(s_{t})\leftarrow\frac{V(s_{t})(N(s_{t})-1)+r}{N(s_{t})}$ +end +for +end +for +Appendix B +Discussion +Limitations. 
+Although LATS can improve reasoning and decision-making, this comes at a higher computational cost relative to simpler prompting methods like ReAct or Reflexion. The search process takes more time than standard prompting or simpler techniques, and requires greater inference costs. While such an issue is mitigated by the fact that the number of nodes $n$ expanded at every step provides a natural trade-off between performance and efficiency (setting $n=1$ makes the method as efficient as ReAct with multiple trials or CoT-SC), in practice we recommend using LATS for difficult tasks like programming or for situations where performance is prioritized over efficiency. We hope that continued advancements in LLMs will reduce costs and increase the practicality of LATS. +Additionally, the benchmarks we use in this paper are relatively simple and focused on decision-making, compared to the complexity of real-world interactive environments. In addition, some environments might not easily support rollbacks to previous states. However, the design of LATS is flexible and can be adjusted to various resource constraints. Using planning-based prompting methods like LATS in environments like Minecraft (Fan et al., 2022) and more reasoning benchmarks would be interesting avenues for future work. +Broader impact. +LATS is a framework that enhances LLM performance through interactions with an environment. This improvement in autonomous decision-making may facilitate harmful uses of LLMs. Alternatively, LATS enhances interpretability and the potential for greater alignment, as it generates understandable, high-level linguistic reasoning and actions through several rounds of decision-making and reflection, rather than relying on implicit, low-level token values.
+Appendix C +Ablations +Prompt Method +HotpotQA (EM) +LATS (w=0.5) +0.55 +LATS (w=2.0) +0.61 +LATS (d=4) +0.58 +LATS (CoT) +0.60 +LATS (No LM Heuristic) +0.37 +LATS +0.61 +Table 6: +Ablation results on LATS and baseline variants in HotPotQA measured by Exact Match (EM). We test different depth $d$, exploration factor $w$, and versions of LATS using CoT and without the LM value function. We sample $n=5$ and $k=50$ trajectories. +Figure 4: +Performance over successive iterations on HumanEval with GPT-3.5. +In this section, we ablate various designs of LATS. Experiments are conducted on HotPotQA with a maximum of $k=50$ trajectories and sampling size of $n=5$ and HumanEval with a maximum of $k=8$ trajectories and sampling size of $n=5$. The result for HotPotQA is shown in Tab. 6 and HumanEval in Fig. 4. +Exploration weight. +We find that there is lower performance on HotPotQA when the exploration weight $w$ in the selection formula is decreased to $0.5$, suggesting that this reduces the effectiveness of the search. Increasing $w$ to $2.0$ does not lead to a performance improvement, but we tend to observe faster convergence. The optimal setting depends on the particular environment and complexity of the state space. +Depth. +In our main experiments we use a maximum depth of $d=7$ on HotPotQA for all methods, following previous work (Yao et al., 2023b). We ablate the effect on LATS after reducing it to $d=4$. This results in only a slight drop in performance. We find that most questions can be answered within four steps, and using a greater number of steps tends to force the agent into local minima and rarely improves success. +LM value function. +The LM value function scores states based on expected future reward.
Without this heuristic, the only signal to guide search would be from environment rewards for completed trajectories, which are scarce and often binary. When we remove the evaluation operation, we observe a dramatic $0.24$ drop in performance. +Performance over time. +To see the effects of increasing the number of trajectories sampled, we change $k$ to different values. We conduct this experiment on HumanEval, which has a more noticeable difference due to sampling fewer trajectories. The results are shown in Fig. 4, in which LATS scales better with more iterations than Reflexion. +Sample complexity and Token cost. +One possible concern of LATS is that the tree-structured search might consume many more tokens than existing methods. To further study the computational cost of LATS compared to prior methods, we examine the sample complexity (i.e. asymptotic token cost) of all methods considered in this paper, and count the average number of nodes expanded by our method and other tree-structured methods (ToT and RAP) upon successful search on HotPotQA. We present the results in Tab. 7; the result shows that our method has the same sample complexity as other tree-based search methods, and has a lower average number of nodes expanded upon success, which indicates lower token cost. The token cost gap will be even larger when taking failed trajectories into account, since our method has a higher success rate and reaches the computational budget limit less often. +Method +Performance ($\uparrow$) +Sample complexity ($\downarrow$) +Avg.
#nodes upon success ($\downarrow$) +ReAct (Best $k=250$) +$0.42$ +$O(k)$ +N/A +CoT-SC ($n=1,k=250$) +$0.40$ +$O(k)$ +N/A +LATS ($n=1,k=50$) +$0.48$ +$O(k)$ +N/A +ToT (ReAct) +$0.49$ +$O(kn)$ +$84.05$ +RAP (ReAct) +$0.54$ +$O(kn)$ +$70.60$ +LATS ($n=5,k=50$) +$0.61$ +$O(kn)$ +$66.65$ +Table 7: +The performance, sample complexity of different methods and average number of nodes expanded upon success by methods with tree-based search. $n$ is the number of child nodes expanded at every step and $k$ is the number of trajectories. Our method has the same sample complexity as other methods with tree-based search and expands fewer nodes upon success, which indicates lower token cost. +Appendix D +Environment Details +D.1 +HotPotQA +Figure 5: +Example trajectories on HotPotQA for ReAct (left) and LATS (right). LATS can sample more actions and avoid failure from previous mistakes by evaluating states with an LM to guide the search toward promising areas of the tree. +HotPotQA (Yang et al., 2018) is a question-answering dataset that requires reasoning over multiple supporting documents to answer questions. It contains 113k Wikipedia-based question-answer pairs crafted by crowdworkers to be diverse, multi-hop, and explainable. Questions cover a range of types like entities, locations, dates, and comparison of shared properties between two entities. Crowdworkers also provide supporting facts from the documents that justify the answer. We use the HotPotQA benchmark setting with all the Wikipedia paragraphs to test retrieval.
We use a randomly selected subset of 100 questions for our experiments and a maximum depth limit of 6. Fig. 5 illustrates how ReAct and LATS work on an example task of HotPotQA, and gives a qualitative example of how LATS outperforms ReAct on the task. +Action Space. +We adopt the Wikipedia web API proposed in Yao et al. (2023b), with three types of actions to support interactive information retrieval: +(1) search[entity], which returns the first 5 sentences from the corresponding entity wiki page if it exists, or else suggests top-5 similar entities from the Wikipedia search engine, +(2) lookup[string], which returns the next sentence in the page containing string, +(3) finish[answer], which finishes the current task with answer. +These API calls and free-form thoughts form the action space for this environment. +D.2 +Programming +The HumanEval dataset (Chen et al., 2021) is a collection of 164 handwritten programming problems introduced to evaluate the functional correctness of models for synthesizing programs from natural language descriptions. Each problem includes a function signature, docstring description, reference implementation, and multiple unit tests, with an average of 7.7 tests per problem. The programming tasks assess comprehension of natural language, reasoning, algorithms, and basic mathematics, at a difficulty level comparable to simple software interview questions. Pass rates are evaluated with the pass@k metric, where k samples are generated per problem and a problem is considered solved if any sample passes all tests. We use all 164 problems for our experiments and a maximum depth limit of 8. +The Mostly Basic Programming Problems (MBPP) (Austin et al., 2021) benchmark contains 974 short Python functions designed to evaluate program synthesis techniques. The dataset was constructed by crowdsourcing from workers with basic Python knowledge.
Each data point consists of a natural language description of a programming task, a reference solution implementation, and three test cases for functional correctness. The natural language prompts are typically short, one-sentence descriptions. Solutions cover common programming constructs including mathematical operations, list processing, string manipulation, and usage of the Python standard library. On average, solutions are 6.8 lines of code. The dataset is also supplemented with an additional set of 426 problems that were manually verified for unambiguous specifications, standard function signatures, and accurate test cases. We use a randomly selected subset of 397 problems for our experiments. +D.3 +WebShop +WebShop +(Yao et al., +2022 +) +is an interactive web-based environment designed to evaluate agents on grounded language understanding and decision-making. It simulates an e-commerce shopping task by providing agents with over 1 million real-world products scraped from Amazon, spanning 5 categories and 113 subcategories. These products contain rich linguistic information, with an average text length of 262 words and a vocabulary size of 224k. In addition, there are over 800k unique product options available for customization. The environment renders webpages in two modes: HTML mode provides pixel-level observations with interactive elements, while simple mode converts the raw HTML into a structured text observation more amenable for training agents. The action space consists of query searches and button clicks, which transition between 4 page types: search, results, item and item-detail. Instructions are crowdsourced natural language specifying product attributes and options, with a total of 12k collected. Automatic rewards are computed by comparing the product purchased by the agent against the attributes and options specified in the instruction, using both lexical matching and semantic similarity metrics. 
+Type +Argument +State +→ +→ +\rightarrow +Next State +search +[ +Query +] +Search +→ +→ +\rightarrow +Results +choose +Back to search +∗ +* +→ +→ +\rightarrow +Search +choose +Prev/Next page +Results +→ +→ +\rightarrow +Results +choose +[ +Product title +] +Results +→ +→ +\rightarrow +Item +choose +[ +Option +] +Item +→ +→ +\rightarrow +Item +choose +Desc/Overview +Item +→ +→ +\rightarrow +Item-Detail +choose +Previous +Item-Detail +→ +→ +\rightarrow +Item +choose +Buy +Item +→ +→ +\rightarrow +Episode End +Table 8: +Action space of webshop. +There are two evaluation metrics used in WebShop: (1) +Task Score +: defined as +( +100 +× +avg. reward +) +100 +avg. reward +(100\times\text{avg. reward}) +, which captures the average reward obtained across episodes; and (2) +Success Rate (SR) +defined as the portion of instructions where +r += +1 +𝑟 +1 +r=1 +. The reward is calculated based on the number of attributes satisfied by the selected item. We use 50 environments for our experiments and a maximum depth limit of 15. +Appendix E +HotPotQA Prompts +E.1 +Base Acting Prompt +Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: +(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. +(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. +(3) Finish[answer], which returns the answer and finishes the task. +After each observation, provide the next Thought and next Action. +Here are some examples: +Question: Which magazine was started first Arthur’s Magazine or First for Women? +Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first. 
+Action 1: Search[Arthur’s Magazine] +Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3] +Thought 2: Arthur’s Magazine was started in 1844. I need to search First for Women next. +Action 2: Search[First for Women] +Observation 2: First for Women is a woman’s magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. It is based in Englewood Cliffs, New Jersey.[2] In 2011 the circulation of the magazine was 1,310,696 copies.[3] +Thought 3: First for Women was started in 1989. 1844 (Arthur’s Magazine) < 1989 (First for Women), so Arthur’s Magazine was started first. +Action 3: Finish[Arthur’s Magazine] +(examples) +You have attempted to answer the following question before and failed. The following reflection(s) give a plan to avoid failing to answer the question in the same way you did previously. Use them to improve your strategy of correctly answering the given question. +(trajectories) +(input) +E.2 +Base Reasoning Prompt +Solve a question answering task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task. You will be given context that you should use to help you answer the question. Start your response with either Action or an indexed Thought +Here are some examples: +Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into? +Let’s think step by step. +Thought 1: The eastern sector of Colorado orogeny extends into the High Plains. +Thought 2: High Plains rise in elevation from around 1,800 to 7,000 ft +Thought 3: The answer is 1,800 to 7,000 ft.
+Action: Finish[1,800 to 7,000 ft] +(examples) +Previous trial: +(trajectories) +(input) +E.3 +Value Function Prompt +Analyze the trajectories of a solution to a question answering task. The trajectories are labeled by environmental observations about the situation, thoughts that can reason about the current situation and actions that can be three types: +(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. +(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. +(3) Finish[answer], which returns the answer and finishes the task. +Given a question and a trajectory, evaluate its correctness and provide your reasoning and analysis in detail. Focus on the latest thought, action, and observation. Incomplete trajectories can be correct if the thoughts and actions so far are correct, even if the answer is not found yet. Do not generate additional thoughts or actions. Then at the last line conclude "Thus the correctness score is s", where s is an integer from 1 to 10. +Question: Which magazine was started first Arthur’s Magazine or First for Women? +Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first. +Action 1: Search[Arthur’s Magazine] +Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3] +This trajectory is correct as it is reasonable to search for the first magazine provided in the question. It is also better to have simple searches corresponding to a single entity, making this the best action.
+Thus the correctness score is 10 +(other examples) +(failed trajectories) +(context) +E.4 +Reflection Prompt +Analyze the trajectories of a solution to a question answering task. The trajectories are labeled by environmental observations about the situation, thoughts that can reason about the current situation and actions that can be three types: +(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. +(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. +(3) Finish[answer], which returns the answer and finishes the task. +Given a question and a trajectory, evaluate its correctness and provide your reasoning and analysis in detail. Focus on the latest thought, action, and observation. Incomplete trajectories can be correct if the thoughts and actions so far are correct, even if the answer is not found yet. Do not generate additional thoughts or actions. Then at the last line conclude "Thus the correctness score is s", where s is an integer from 1 to 10. +Question: Which magazine was started first Arthur’s Magazine or First for Women? +Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first. +Action 1: Search[Arthur’s Magazine] +Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3] +This trajectory is correct as it is reasonable to search for the first magazine provided in the question. It is also better to have simple searches corresponding to a single entity, making this the best action.
+Thus the correctness score is 10 +(other examples) +(failed trajectories) +(context) +Appendix F +Programming Prompts +F.1 +HumanEval function implementation example +Sample function signature: +⬇ +def +minSubArraySum +( +nums +): +Given +an +array +of +integers +nums +, +find +the +minimum +sum +of +any +non +- +empty +sub +- +array +of +nums +. +Example +minSubArraySum +([2, +3, +4, +1, +2, +4]) +== +1 +minSubArraySum +([-1, +-2, +-3]) +== +-6 +Sample function body implementation: +⬇ +min_sum += +float +(’ +inf +’) +for +i +in +range +( +len +( +nums +)): +current_sum += +0 +for +j +in +range +( +i +, +len +( +nums +)): +current_sum ++= +nums +[ +j +] +if +current_sum +< +min_sum +: +min_sum += +current_sum +return +min_sum +F.2 +Base Acting/Reasoning Prompt +You are an AI Python assistant. You will be given your previous implementation of a function, a series of unit tests results, and your self-reflection on your previous implementation. Write your full implementation (restate the function signature). +Example 1: +[previous impl]: +⬇ +def +add +( +a +: +int +, +b +: +int +) +-> +int +: +””” +Given +integers +a +and +b +, +return +the +total +value +of +a +and +b +. +””” +return +a +- +b +[unit test results from previous impl]: +Tested passed: +Tests failed: +assert add(1, 2) == 3 # output: -1 +assert add(1, 2) == 4 # output: -1 +[reflection on previous impl]: +The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from ‘-‘ to ‘+‘ in the return statement. This will ensure that the function returns the correct output for the given input. +[improved impl]: +⬇ +def +add +( +a +: +int +, +b +: +int +) +-> +int +: +””” +Given +integers +a +and +b +, +return +the +total +value +of +a +and +b +. 
+””” +return +a ++ +b +F.3 +Reflection Prompt +You are a Python programming assistant. You will be given a function implementation and a series of unit test results. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as guidance when you try again later. Only provide the few sentence description in your answer, not the implementation. You will be given a few examples by the user. +Example 1: +[previous impl]: +⬇ +def +add +( +a +: +int +, +b +: +int +) +-> +int +: +””” +Given +integers +a +and +b +, +return +the +total +value +of +a +and +b +. +””” +return +a +- +b +[unit test results from previous impl]: +Tested passed: +Tests failed: +assert add(1, 2) == 3 # output: -1 +assert add(1, 2) == 4 # output: -1 +[reflection on previous impl]: +The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from ‘-‘ to ‘+‘ in the return statement. This will ensure that the function returns the correct output for the given input. +F.4 +Test Case Generation Prompt +You are an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring. 
+(input) +Appendix G +WebShop Prompts +G.1 +Acting Prompt +Webshop +Instruction: +i would like a 3 ounce bottle of bright citrus deodorant for sensitive skin, and price lower than 50.00 dollars +[Search] +Action: search[3 ounce bright citrus deodorant sensitive skin] +Observation: +[Back to Search] +Page 1 (Total results: 50) +[Next +] +[B078GWRC1J] +Bright Citrus Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce +$10.99 +[B078GTKVXY] +Ginger Fresh Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce +$10.99 +[B08KBVJ4XN] +Barrel and Oak - Aluminum-Free Deodorant, Deodorant for Men, Essential Oil-Based Scent, 24-Hour Odor Protection, Cedar & Patchouli Blend, Gentle on Sensitive Skin (Mountain Sage, 2.7 oz, 2-Pack) +$15.95 +Action: think[B078GWRC1J and B078GTKVXY are bright citrus deodorant less then 50 dollars. I can check B078GWRC1J first.] +Observation: OK. +Action: click[B078GWRC1J] +Observation: +[Back to Search] +[Prev] +scent [assorted scents][bright citrus][calming lavender][ginger fresh][simply non-scents] +size [travel set (4-pack)][3 ounce (pack of 1)][3-ounce (2-pack)] +Bright Citrus Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce +Price: $10.99 +Rating: N.A. +[Description] +[Features] +[Reviews] +[Buy Now] +Action: think[For 3 ounce bottle of bright citrus deodorant for sensitive skin, the item has options ’bright citrus’ and ’3 ounce (pack of 1)’ and seems good to buy.] +Observation: OK. +Action: click[bright citrus] +Observation: You have clicked bright citrus. +Action: click[3 ounce (pack of 1)] +Observation: You have clicked 3 ounce (pack of 1). 
+Action: click[Buy Now] +G.2 +Value Function Prompt +Given an item to purchase and a trajectory that aims to buy an item that exactly matches the specification, which corresponds to the ideal score of 1.0, analyze the following trajectory, then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10. +Here are some examples +Webshop +Instruction: +i am looking for dairy free and apple variety pack of chips, and price lower than 30.00 dollars +[Search] +Action: search[dairy free and apple variety pack of chips] +Observation: +[Back to Search] +Page 1 (Total results: 50) +[Next] +[B07HRFJWP8] +Enjoy Life Foods Soft Baked Ovals, Breakfast Bars, Nut Free Bars, Soy Free, Dairy Free, Non GMO, Gluten Free, Vegan, Variety Pack, 4 Boxes (20 Bars Total) +$100.0 +[B01KMHY5PG] +Enjoy Life Soft Baked Chewy Bars, Variety Pack, Nut Free Bars, Soy Free, Dairy Free, Gluten Free, 6 Boxes (30 Total Bars) +$21.49 +[B008D2X8C4] +Enjoy Life Lentil Chips Variety Pack, Dairy Free Chips, Soy Free, Nut Free, Non GMO, Vegan, Gluten Free, 24 Bags (0.8 oz) +$100.0 +Action: think[The first two items are not what I want. The third item is the dairy free and apple variety pack of chips I want.] +Observation: OK. +Action: click[B008D2X8C4] +Observation: +[Back to Search] +[Prev] +flavor name [dill and sour cream][garlic & parmesan][light sea salt][margherita pizza][thai chili lime][variety pack] +size [0.8 ounce (pack of 24)][4 ounce (pack of 12)] +Price: $100.0 +Rating: N.A. +[Description] +[Features] +[Reviews] +[Buy Now] +Action: think[The item has options ’variety pack’ and ’0.8 ounce (pack of 24)’ and seems good to buy.] +Observation: OK. +Action: click[variety pack] +Observation: You have clicked variety pack. +Action: click[0.8 ounce (pack of 24)] +Observation: You have clicked 0.8 ounce (pack of 24). +Action: click[Buy Now] +Reflection: In this attempt, I was unsuccessful. 
I accidentally bought a product that was $100, which is more than my budget of $30. Either way, the initial search results were not good. Next time, I will do search[”variety pack of chips”] and then check if the results meet the dairy free and the $30 budget constraints. I will continue to refine my searches so that I can find more products. +Thus the correctness score is 5 +(input) +G.3 +Reflection Prompt +Given an item to purchase and a trajectory that aims to buy an item that exactly matches the specification, which corresponds to the ideal score of 1.0, analyze the following trajectory, then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10. +Here are some examples +Webshop +Instruction: +i am looking for dairy free and apple variety pack of chips, and price lower than 30.00 dollars +[Search] +Action: search[dairy free and apple variety pack of chips] +Observation: +[Back to Search] +Page 1 (Total results: 50) +[Next] +[B07HRFJWP8] +Enjoy Life Foods Soft Baked Ovals, Breakfast Bars, Nut Free Bars, Soy Free, Dairy Free, Non GMO, Gluten Free, Vegan, Variety Pack, 4 Boxes (20 Bars Total) +$100.0 +[B01KMHY5PG] +Enjoy Life Soft Baked Chewy Bars, Variety Pack, Nut Free Bars, Soy Free, Dairy Free, Gluten Free, 6 Boxes (30 Total Bars) +$21.49 +[B008D2X8C4] +Enjoy Life Lentil Chips Variety Pack, Dairy Free Chips, Soy Free, Nut Free, Non GMO, Vegan, Gluten Free, 24 Bags (0.8 oz) +$100.0 +Action: think[The first two items are not what I want. The third item is the dairy free and apple variety pack of chips I want.] +Observation: OK. +Action: click[B008D2X8C4] +Observation: +[Back to Search] +[Prev] +flavor name [dill and sour cream][garlic & parmesan][light sea salt][margherita pizza][thai chili lime][variety pack] +size [0.8 ounce (pack of 24)][4 ounce (pack of 12)] +Price: $100.0 +Rating: N.A. 
+[Description] +[Features] +[Reviews] +[Buy Now] +Action: think[The item has options ’variety pack’ and ’0.8 ounce (pack of 24)’ and seems good to buy.] +Observation: OK. +Action: click[variety pack] +Observation: You have clicked variety pack. +Action: click[0.8 ounce (pack of 24)] +Observation: You have clicked 0.8 ounce (pack of 24). +Action: click[Buy Now] +Reflection: In this attempt, I was unsuccessful. I accidentally bought a product that was $100, which is more than my budget of $30. Either way, the initial search results were not good. Next time, I will do search[”variety pack of chips”] and then check if the results meet the dairy free and the $30 budget constraints. I will continue to refine my searches so that I can find more products. +(input) +Reflection: +◄ +Feeling +lucky? +Conversion +report +Report +an issue +View original +on arXiv +► \ No newline at end of file diff --git a/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-3.md b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-3.md new file mode 100644 index 0000000000000000000000000000000000000000..5b8107c035d219f37354da26d53aab293e7415c2 --- /dev/null +++ b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-3.md @@ -0,0 +1,4095 @@ +--- +title: '[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning + in Language Models' +id: 231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-3 +tags: +- deepread +created: '2026-06-10T00:40:52.405072Z' +source: https://ar5iv.labs.arxiv.org/html/2310.04406 +source_domain: ar5iv.labs.arxiv.org +fetched_at: '2026-06-10T00:40:52.404928Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models +Language Agent Tree 
Search Unifies Reasoning Acting and Planning in Language Models +Andy Zhou +University of Illinois at Urbana-Champaign +AI@UIUC +Kai Yan +University of Illinois at Urbana-Champaign +Michal Shlapentokh-Rothman +University of Illinois at Urbana-Champaign +Haohan Wang +University of Illinois at Urbana-Champaign +Yu-Xiong Wang +University of Illinois at Urbana-Champaign +Abstract +While large language models (LLMs) have demonstrated impressive performance on a range of decision-making tasks, they rely on simple acting processes and fall short of broad deployment as autonomous agents. We introduce LATS (Language Agent Tree Search), a general framework that synergizes the capabilities of LLMs in planning, acting, and reasoning. Drawing inspiration from Monte Carlo tree search commonly used in model-based reinforcement learning, LATS employs LLMs as agents, value functions, and optimizers, repurposing their latent strengths for enhanced decision-making. What is crucial in this method is the use of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that moves beyond the limitations of existing techniques. Our experimental evaluation across diverse domains, such as programming, HotPotQA, and WebShop, illustrates the applicability of LATS for decision-making while maintaining competitive reasoning performance. In particular, LATS achieves 94.4% for programming on HumanEval with GPT-4 and an average score of 75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness and generality of our method. +1 +Introduction +General autonomous agents capable of reasoning and decision-making in a variety of environments +(Wooldridge & Jennings, +1995 +) +have been of longstanding interest in the field of artificial intelligence. 
While this has traditionally been studied in reinforcement learning, the recent rise of large language models (LLMs) +(Brown et al., +2020 +; Chowdhery et al., +2022 +; Touvron et al., +2023 +; OpenAI, +2023 +) +with strong reasoning and general adaptability offers an alternative paradigm. Not only have LLMs excelled on standard NLP tasks such as text summarization +(Nallapati et al., +2016 +) +or natural language inference +(Bowman et al., +2015 +) +, but they have been adapted to an increasingly diverse set of tasks that often require advanced common-sense reasoning or quantitative skills +(Cobbe et al., +2021 +; Saparov & He, +2022 +) +. LLMs are also capable of performing in complex environments that involve knowledge and reasoning, such as web navigation +(Yao et al., +2022 +; Deng et al., +2023 +) +, tool-use +(Schick et al., +2023 +) +, or open-ended games +(Fan et al., +2022 +) +. +Figure 1: +An overview of LATS. LATS uses an external environment and self-reflection to improve reasoning and decision-making. +Reasoning and acting abilities have also been improved by prompting techniques that augment LLMs with feedback or observations from an external environment +(Yao et al., +2023b +; Gao et al., +2022 +; Shinn et al., +2023 +) +. This eliminates the need to rely entirely on the base abilities of the Language Model (LM), enhancing it through external tools or semantic feedback. Despite this strength, these methods are reflexive and fall short of humans’ deliberate and thoughtful decision-making characteristics to solve problems +(Sloman, +1996 +; Evans, +2010 +) +. In particular, such methods fail to consider multiple reasoning paths or to plan ahead. Recent search-guided LLM works +(Xie et al., +2023 +; Yao et al., +2023a +; Hao et al., +2023 +) +address this issue by searching over multiple reasoning chains. While these methods enable planning, these methods operate in isolation and do not incorporate external feedback that can improve reasoning. 
+To help address these issues, we propose LATS (Language Agent Tree Search), a general framework for decision-making and reasoning with language models. LATS unifies LM planning, acting, and reasoning strategies by expanding ReAct (Yao et al., 2023b) into a search over a combinatorial space of possible reasoning and acting steps. We adapt Monte Carlo tree search (MCTS) from model-based reinforcement learning (Silver et al., 2017; Anthony et al., 2017; Jiang et al., 2018) to language agents, repurposing a pretrained LLM as an agent, value function, and optimizer. Utilizing the strong natural language understanding and in-context learning ability of modern LMs, we use text as an interface between each component of the framework, allowing LATS to adapt planning to environmental conditions without additional training. To the best of our knowledge, LATS is the first framework that combines reasoning, acting, and planning to enhance LLMs. Notably, LATS doubles the performance of GPT-3.5 on HotPotQA (Yang et al., 2018) over ReAct (Yao et al., 2023b) and raises the average score by $22.1$ on WebShop (Yao et al., 2022). When used with GPT-4, LATS achieves a $94.4$ Pass@1 rate for programming on HumanEval (Chen et al., 2021), setting the state of the art. To summarize, our contributions are the following: +• We introduce an LM-based Monte Carlo tree search variant to deliberately construct the best trajectory from sampled actions, enabling more flexible and adaptive problem-solving compared to reflexive prompting methods. This is guided by heuristics from the LM. +• By integrating external feedback and self-reflection, LATS enhances model sensibility and enables agents to learn from experience, surpassing reasoning-based search methods.
+• Through experiments across diverse domains like programming, interactive QA, and web navigation, we demonstrate the versatility of LATS in harnessing LLMs for autonomous reasoning and decision-making. +2 +Related Work +| Approach | Reasoning | Acting | Planning | Self Reflection | External Memory | +|---|---|---|---|---|---| +| CoT (Wei et al., 2022) | ✓ | × | × | × | × | +| ReAct (Yao et al., 2023b) | ✓ | ✓ | × | × | × | +| ToT (Yao et al., 2023a) | ✓ | × | ✓ | ✓ | ✓ | +| RAP (Hao et al., 2023) | ✓ | × | ✓ | × | ✓ | +| Self-Refine (Madaan et al., 2023) | ✓ | × | × | ✓ | × | +| Beam Search (Xie et al., 2023) | ✓ | × | × | ✓ | × | +| Reflexion (Shinn et al., 2023) | ✓ | ✓ | × | ✓ | ✓ | +| LATS (Ours) | ✓ | ✓ | ✓ | ✓ | ✓ | +Table 1: A summary of related work on reasoning, acting, and planning. LATS is the first work incorporating designs from all three domains, allowing use in all corresponding tasks. We refer to planning as the use of a search algorithm, self-reflection as the use of LM-generated feedback, and external memory as storing past text context for future updates of the solution. +a) Tree-of-Thoughts +b) Reasoning via Planning +c) Language Agent Tree Search +Figure 2: An overview of the differences between LATS and recently proposed LM search algorithms ToT (Yao et al., 2023a) and RAP (Hao et al., 2023). LATS leverages environmental feedback and self-reflection to further adapt search and improve performance. +LLMs for reasoning. +For LLMs, reasoning typically involves decomposing complex inputs into sequential intermediate steps towards a final answer (Cobbe et al., 2021), demonstrated with Chain-of-Thought (CoT) prompting (Wei et al., 2022) and its variants (Wei et al., 2022; Kojima et al., 2022; Wang et al., 2022).
However, these methods, which create chains autoregressively in a single step, often suffer from error propagation as the number of steps increases +(Guo et al., +2018 +; Chen et al., +2022b +) +due to compound errors. Various advancements aim to mitigate this issue; some approaches, such as Self-Consistency +(Wang et al., +2022 +) +, employ majority voting over sampled chains, while others focus on multi-step decomposition, such as least-to-most prompting +(Zhou et al., +2022 +) +, or use of external tools such as a scratchpad +(Nye et al., +2021 +) +or compiler +(Gao et al., +2022 +) +. Recently, CoT has been improved with search algorithms +(Yao et al., +2023a +; Hao et al., +2023 +; Besta et al., +2023 +) +that can sample trajectories more effectively. Tree-of-thought (ToT) prompting +(Yao et al., +2023a +) +uses DFS or BFS-based search guided by an LM-generated heuristic while Reasoning via Planning (RAP) +(Hao et al., +2023 +) +uses MCTS with rollouts simulated by the LM. However, they rely solely on LM internal knowledge and cannot adapt to useful external feedback. +LLMs for acting. +The strong reasoning and common-sense abilities of LLMs have also been adapted for decision-making or acting tasks as a policy model in interactive environments. In the realm of robotics LLMs have been employed as high-level controllers of control policies +(Ahn et al., +2022 +; Huang et al., +2022 +; Driess et al., +2023 +) +. Similar work +(Baker et al., +2022 +; Wang et al., +2023 +; Zhu et al., +2023 +) +has also adapted LLM agents to complex multimodal games such as Minecraft +(Guss et al., +2019 +; Fan et al., +2022 +) +. LLMs are particularly useful in text-based environments +(Liu et al., +2018 +; Shridhar et al., +2020 +; Liu et al., +2023 +) +, where acting-based prompting techniques such as ReAct +(Yao et al., +2023b +) +have seen success. Similar to CoT, ReAct is limited by its simplicity and cannot effectively adapt to environment conditions. 
Many extensions have been proposed to address this, including Self-refine +(Madaan et al., +2023 +) +and Reflexion +(Shinn et al., +2023 +; Yao et al., +2023c +) +, which uses self-reflection to enhance reasoning and decision-making, and AdaPlanner +(Sun et al., +2023 +) +, which incorporates both positive and negative environmental feedback. However these methods focus on refining an individual plan or trajectory and do not consider alternative choices at each step. In addition, recent work +(Huang et al., +2023 +) +has suggested LLMs cannot self-correct their internal reasoning, making it critical to use external feedback. Alternatively to pure decision-making environments, the reasoning and practical abilities of LLMs have been enhanced by access to external tools, such as APIs, search engines, calculators, or other models +(Schick et al., +2023 +; Shen et al., +2023 +; Surís et al., +2023 +) +. Contrary to reasoning-based approaches, these methods have not been improved with planning, limiting their effectiveness. We summarize them in Tab. +1 +. +Tree-based search. +Tree-based search, where multiple branches of outcomes are explored during search, is widely used in many planning algorithms +(Świechowski et al., +2023 +; LaValle et al., +2001 +) +and Reinforcement Learning (RL) +(Hafner et al., +2019 +; Du et al., +2023 +; Wu et al., +2023 +) +algorithms for its good exploration-exploitation trade-off. Though tree-based search requires an environment model that can expand from arbitrary state +(Vodopivec et al., +2017 +) +, which often requires extra training in RL +(Hafner et al., +2023 +) +, such problem does not exist for LM tasks as we can conveniently backup to any state by setting the input to be the context and corresponding previous output by the LM. 
Thus, we work on the tree-based framework and use MCTS (Świechowski et al., 2023) to fully release the potential of LMs, while avoiding the cost of training a value function over language descriptions by leveraging the in-context learning (Brown et al., 2020) abilities of LLMs. +3 +Preliminaries +3.1 +Problem Setting and Prompting +Before describing LATS, we first define our problem and outline a few established methods that leverage large language models for reasoning or decision-making. In LM reasoning or decision making, we are given an input $x$ in natural language and a pretrained language model $p_{\theta}(x)$ parameterized by $\theta$; our goal is to generate a final output $y\sim p_{\theta}(x)$ that corresponds to the answer (reasoning) or completes the task (decision-making). Both $x$ and $y$ are language sequences, which are comprised of a list of tokens (the basic elements of natural language, often words), denoted as $x=(x[1],\dots,x[n])$ and $y=(y[1],\dots,y[n])$. The LM decodes text autoregressively, i.e., without other inputs, the probability for an LM to generate a sequence $x$ is given by $p_{\theta}(x)=\prod_{i=1}^{n}p_{\theta}(x[i] \mid x[1\dots i-1])$. Usually, to improve the LM, prompts are provided along with the input $x$, which are specific instructions or few-shot input-output examples.
We denote the generic process where an input $x$ is transformed into an output $y$ by LM: $y\sim p_{\theta}(y \mid \texttt{prompt}_{IO}(x))$, where $\texttt{prompt}_{IO}(x)$ denotes the input $x$. +Chain-of-thought (CoT) Prompting (Wei et al., 2022) was introduced to cater to scenarios where direct mapping from $x$ to $y$ is intricate, such as when $x$ is from a mathematical query or challenging question. This method hinges on creating thoughts $z_{1},\dots,z_{n}$ that act as stepping stones between $x$ and $y$; each thought $z_{i}$ is a language sequence. To employ CoT prompting, thoughts are extracted sequentially as $z_{i}\sim p_{\theta}^{CoT}(z_{i} \mid x,z_{1\cdots i-1})$, with the final output being $y\sim p_{\theta}^{CoT}(y \mid x,z_{1\cdots n})$. +Tree-of-thought (ToT) Prompting (Yao et al., 2023a) extends CoT prompting by exploring multiple reasoning paths over thoughts. It frames problems as a search over a tree where each node $s=[x,z_{1\cdots i}]$ represents a partial solution state comprising the original input $x$ and thought sequence $z_{1\cdots i}$.
Thoughts $z_{i}$ are generated by proposal or sampling with CoT $z_{i}\sim p_{\theta}^{CoT}(z_{i} \mid x,z_{1\cdots i-1})$. Deliberate search algorithms like breadth-first or depth-first search are used to systematically explore the tree, guided by heuristics based on language model evaluations $V(s)$ of each state. +Reasoning via Planning (RAP) (Hao et al., 2023) is similar to ToT, except that MCTS is used over DFS or BFS. Heuristics are designed from an LM, such as the likelihood or confidence of an action, and the LM is used as a world model to predict subsequent states during the simulation step. +ReAct (Yao et al., 2023b) extends language models to tasks where the mapping from $x$ to $y$ is enhanced by or requires interactions with an external environment, such as a game or API. This technique constructs an action space $\hat{A}=A\cup Z$ that adds permissible actions $a$ to the reasoning traces $z$ from CoT. Observations $o$ from the environment are used to improve both reasoning and acting.
To solve problems with ReAct, after each observation, actions are generated from $p_{\theta}$ sequentially as $a_{i}\sim p_{\theta}^{ReAct}(a_{i} \mid x,o_{1\cdots i-1},a_{1\cdots i-1})$, with the final output being $y\sim p_{\theta}^{ReAct}(y \mid x,o_{1\cdots n},a_{1\cdots n})$. +While the previously described prompting techniques improve LM performance on reasoning tasks, they falter on difficult tasks that involve multifaceted decision-making due to several shortcomings: 1) Flexibility: Base prompting methods (CoT or ReAct) autoregressively sample from the LM, neglecting potential alternative continuations from specific states. 2) Sensibility: Reasoning-based methods (CoT, RAP, or ToT) rely solely on the internal representations of the LM and cannot consider external observations. This dependency risks fact hallucination and error propagation while setting a performance ceiling. 3) Adaptability: Current planning frameworks (RAP or ToT) use simple search algorithms such as BFS or cannot leverage environmental feedback to improve planning. Additionally, the agent is static and cannot reuse previous experience or learn from trial and error. While RAP also adopts MCTS, it is constrained to tasks where the LM can become a world model and accurately predict states. These shortcomings limit the ability of LMs to be deployed as general problem-solving agents and form the motivation for LATS.
+3.2 +Monte-Carlo Tree Search (MCTS) +Monte-Carlo Tree Search (MCTS) is a heuristic search algorithm that has proved successful in many decision-making environments such as Atari (Ye et al., 2021) and Go (Silver et al., 2016). MCTS builds a decision tree where every node in the tree is a state and every edge is an action. MCTS runs for $k$ episodes; for each episode, it starts from the root (i.e., initial state) and iteratively conducts two steps to expand the tree: 1) Expansion, where multiple children states $s$ are explored from the current parent state $p$ by sampling $n$ actions, and 2) Selection, where the child with the highest UCT (Upper Confidence bounds applied to Trees) (Kocsis & Szepesvári, 2006) value is selected for the next iteration. The UCT of a child state $s$ is calculated as follows: +$$UCT(s)=V(s)+w\sqrt{\frac{\ln N(p)}{N(s)}}, \tag{1}$$ +where $N(s)$ is the number of visits to a node $s$, $V(s)$ is the value function (expected return) from the subtree of $s$, $w$ is the exploration weight, and $p$ is the parent node of $s$. The child node with the highest UCT value is selected for expansion in the next iteration. When the end of an episode is reached, a backpropagation is carried out: the return $r$ is used for updating every $V(s)$ along the path with the formula $V(s)=\frac{V_{\text{old}}(s)(N(s)-1)+r}{N(s)}$, where $V_{\text{old}}(s)$ is the old value function.
Normally, the major shortcoming of MCTS is that it requires an environment model to undo previous steps and form a searching tree, which is often a strong assumption. However, such a limitation does not exist for LMs, as we can conveniently reset to any step by simply copy-pasting historical text input. Such a special property is the key motivation of our work. +4 +Unifying Planning, Reasoning, and Acting +4.1 +LM Agent +LATS supports sequential reasoning or decision-making tasks on the basis of ReAct. At time step $t$, an agent receives an observation $o_{t}\in O$ from the environment and takes an action $a_{t}\in A$ following some policy $\pi(a_{t} \mid x,o_{1\cdots i-1},a_{1\cdots i-1})$, where $x$ consists of the task instruction and a number of few-shot examples. We initialize the agent with $p_{\theta}$ to leverage the useful language representations of an LM as a base decision-maker. We follow the ReAct instantiation in which the action space $\hat{A}=A\cup Z$ consists of both the space of permissible actions $A$ and language space of reasoning traces $Z$. Actions directly affect the environment and result in observation, while thoughts are used to formalize decisions by organizing information, planning future actions, or injecting internal knowledge. The exact instantiation of the action space depends on the particular environment; for decision-making tasks actions might consist of commands on a website while for reasoning tasks the action space might be limited to a few external tools or APIs. +Instead of greedily decoding one trajectory or solution, we sample $n$ actions from $p_{\theta}$ using the current state.
This is based on the intuition that for complex decision-making tasks, there is likely to be a range of potential trajectories or reasoning paths that are correct (Evans, 2010). Sampling a diverse set of candidates at each step mitigates the stochastic nature of LM text generation and enables greater exploration in both the decision-making and reasoning space. We wrap $p_{\theta}$ within our proposed search algorithm to deliberately construct the best trajectory from sampled actions. +4.2 +LATS +Figure 3: An overview of the six operations of LATS. A node is selected, expanded, evaluated, then simulated until a terminal node is reached, then the resulting value is backpropagated. If the trajectory fails, a reflection is generated and used as additional context for future trials. These operations are performed in succession until the budget is reached or task is successful. +The main component of LATS is a search algorithm that controls the overall problem-solving process with deliberate planning. To find the most promising trajectory and systematically balance exploration with exploitation, we adopt a variant of Monte Carlo Tree Search (MCTS) that frames decision-making as a tree search, in which each node $s=[x,a_{1\cdots i},o_{1\cdots i}]$ represents a state comprising the original input $x$, action sequence $a_{1\cdots i}$, and observation sequence $o_{1\cdots i}$. +To adapt MCTS for language agents, LATS repurposes $p_{\theta}$ as an agent, state evaluator, and feedback generator, leveraging the useful language priors of modern LMs to facilitate planning. While standard MCTS and RAP +Hao et al.
(2023) rely on internal dynamics models to facilitate simulation, LATS is model-free and uses environment interaction. LATS consists of a series of operations, selection, expansion, evaluation, simulation, backpropagation, and reflection, performed in succession until the task is successfully completed or a computational limit is reached. The full pseudocode of LATS can be found in Sec. A in the Appendix. +Selection. +In the first operation, the algorithm identifies a segment of the current tree most suitable for subsequent expansion. Starting from the root node, denoted as the initial state $s_{0}$, a child node is selected at each tree level until a leaf node is reached. To balance exploration and exploitation, we use the UCT algorithm as shown in Eq. 1. +Expansion. +After selecting a node, the second operation expands the tree by sampling $n$ actions from $p_{\theta}$, as described in the prior section. The environment receives each action and returns corresponding feedback as an observation. This results in $n$ new child nodes added to the tree. This tree is stored in an external long-term memory structure. +Evaluation. +The third operation assigns a scalar value to each new child node to be used for selection and backpropagation. This value effectively quantifies the agent’s progress in task completion, serving as a heuristic to steer the search algorithm towards the most promising regions of the tree. Following Yao et al. (2023a) we repurpose $p_{\theta}$ into a value function by prompting it to reason about a given state. To obtain a scalar value, we instruct $p_{\theta}$ to end its reasoning trace with a score indicating the correctness of the trajectory. This method offers enhanced flexibility over programmed heuristics (Campbell et al., 2002) and greater efficiency than learned heuristics (Silver et al., 2017).
+Simulation. +The fourth operation expands the currently selected node until a terminal state is reached. At each depth level we sample and evaluate nodes with the same operations, but prioritize nodes of highest value. Reaching a terminal state provides objective feedback on the correctness of a trajectory. If the task is completed successfully, then LATS terminates the search. If the solution is partially successful or unsuccessful, then we perform two additional operations as described below. +Backpropagation. +This operation updates the values of the tree based on the outcome of a trajectory. For each node $s_{0},s_{1},\dots,s_{n}$ in the trajectory from root (initial state $s_{0}$) of the searching tree to leaf (terminal state $s_{n}$), its value is updated to reflect the outcome of the simulation by $N(s_{i})=N_{\text{old}}(s_{i})+1$ and $V(s_{i})=\frac{r+N_{\text{old}}(s_{i})V_{\text{old}}(s_{i})}{N(s_{i})}$, where $r$ is the return and $N_{\text{old}},V_{\text{old}}$ are the old number of visits and value function. These updated values are used in the UCT formula (Eq. 1) to guide the selection of the next node for exploration. +Reflection. +In addition to the environmental feedback, we also leverage self-reflection to further refine the decision-making process (Shinn et al., 2023; Madaan et al., 2023).
Upon encountering an unsuccessful terminal node, $p_{\theta}$ is prompted with the trajectory and final reward to provide a verbal self-reflection that summarizes the errors in the reasoning or acting process and proposes superior alternatives. We store both failed trajectories and corresponding reflections in the memory. In subsequent iterations, these are integrated as additional context to the agent and value function, refining both through in-context learning. This imparts a semantic gradient signal more useful than a scalar value, enabling the agent to learn from trial and error without the cost of expensive optimization processes such as reinforcement learning. +Conceptually, LATS has the following advantages as a general framework for reasoning and decision-making with LM agents. +(1) Generality: LATS supports both reasoning and decision-making tasks by defining a shared space of thoughts and actions. (2) Deliberate: The use of MCTS and LM value function ensures a principled search that selects options with high value while exploring promising alternatives. (3) Adaptability: LATS is designed around the use of external feedback through observations and self-reflection, enabling greater adaptation during problem-solving. (4) Flexibility: LATS can accommodate different scenarios, environments, and resource stipulations by modifying state design and tree dimensions. (5) Modularity: The base LM agent, reflection generator, and value function can be independently altered and adapted to individual LM properties. +5 +Experiments +To demonstrate the general applicability of LATS, we evaluate our method on a variety of decision-making domains that require both reasoning and acting ability: programming (Chen et al., 2021; Austin et al., 2021), HotPotQA (Yang et al., 2018), and WebShop (Yao et al., 2022).
+5.1 +HotPotQA +For a task that can be approached with both reasoning-based and acting-based strategies, we consider HotPotQA (Yang et al., 2018), a multi-hop question-answering benchmark that requires retrieval over two or more Wikipedia passages. For the action space, in addition to LM thoughts we follow the setup from Yao et al. (2023b), which provides the agent with API calls to search and lookup information. The output of these API calls and self-generated reflections form the observation space. We use a subset of 100 questions and three few-shot examples for each method. For ToT, we use DFS as the base search algorithm and scoring with the LM as the heuristic. For all methods that involve sampling, including LATS, we sample $k=50$ trajectories. More details and prompts can be found in Sec. D and Sec. E in the Appendix. +We evaluate internal reasoning strategies by removing actions and observations from the context, corresponding to CoT (Wei et al., 2022) and its variants, CoT-SC (Wang et al., 2022), ToT (Yao et al., 2023a), and RAP (Hao et al., 2023). These methods rely solely on the agent’s existing knowledge to answer the question. We also consider acting-based methods ReAct, Reflexion, and LATS, which augment the agent with the interactive API environment and primarily evaluate its information retrieval abilities. While LATS is designed for scenarios where external feedback can enhance reasoning, we also implement a reasoning-only version with CoT as the base prompt. We also combine internal and external reasoning in LATS by first prompting with a CoT-based prompt, then switching to a ReAct-based prompt upon failure. This is closer to how humans might approach this task, by using tools to lookup additional information only when the answer is not already known.
+| Prompt Method | HotpotQA (EM) | +|---|---| +| I/O | 0.32 | +| CoT (Wei et al., 2022) | 0.34 | +| CoT - SC (Wang et al., 2022) | 0.38 | +| ToT (Yao et al., 2023a) | 0.55 | +| RAP (Hao et al., 2023) | 0.60 | +| RAP (n = 10) | 0.60 | +| LATS (CoT) | 0.60 | + +| Prompt Method | HotpotQA (EM) | +|---|---| +| ReAct (Yao et al., 2023b) | 0.32 | +| ReAct (best of k) | 0.38 | +| Reflexion (Shinn et al., 2023) | 0.51 | +| LATS | 0.61 | +| LATS (n = 3) | 0.56 | +| LATS (n = 10) | 0.64 | +| LATS (CoT + ReAct) | 0.71 | +Table 2: GPT-3.5 reasoning-based prompting (left) and acting-based prompting (right) results on HotpotQA. LATS achieves the highest exact match (EM) for acting and is competitive on reasoning. Unless otherwise specified, we sample $n=5$ nodes during expansion and $k=50$ trajectories. +Results. +We observe in Tab. 2 that both internal reasoning and external retrieval strategies perform well on HotPotQA. Due to their large-scale training corpus, modern LLMs already encode factual knowledge and can often directly answer the question correctly. While CoT can slightly enhance performance on questions requiring reasoning, larger gains are observed with search methods ToT and RAP, which can sample and explore more outputs. We observe similar results for acting-based methods. LATS surpasses ReAct, even when sampling the same number of trajectories, by expanding more nodes with principled search (see Fig. 5 in Appendix D for a qualitative sample). This is demonstrated when modifying $n$, the number of nodes expanded during each iteration. Increasing $n$ can consistently improve performance, although at greater computational and inference costs. LATS is also competitive to RAP on internal reasoning but performs worse than acting. Combining internal and external reasoning in LATS results in the highest performance, indicating the importance of external feedback in augmenting reasoning even in tasks the base LM can already perform.
+5.2 +Programming +Prompt Method +Model +Pass@1 +CoT +(Wei et al., +2022 +) +GPT-3.5 +46.9 +ReAct +(Yao et al., +2023b +) +GPT-3.5 +56.9 +Reflexion +(Shinn et al., +2023 +) +GPT-3.5 +68.1 +ToT +(Yao et al., +2023a +) +GPT-3.5 +54.4 +RAP +(Hao et al., +2023 +) +GPT-3.5 +63.1 +LATS (Ours) +GPT-3.5 +83.8 +I/O +GPT-4 +80.1 +Reflexion +GPT-4 +91.0 +LATS +GPT-4 +94.4 +Prompt Method +Pass@1 +CoT +(Wei et al., +2022 +) +54.9 +ReAct +(Wei et al., +2022 +) +67.0 +Reflexion +(Shinn et al., +2023 +) +70.0 +ToT +(Yao et al., +2023a +) +65.8 +RAP +(Hao et al., +2023 +) +71.4 +LATS (Ours) +81.1 +Table 3: +GPT-3.5 and GPT-4 Pass@1 accuracy on HumanEval +(Chen et al., +2021 +) +and MBPP +(Austin et al., +2021 +) +. Prompting with LATS achieves the highest performance. We sample 5 solutions during expansion for +8 +iterations. +To demonstrate the importance of external observations for complex reasoning tasks, we evaluate the baselines and LATS on programming with Humaneval +(Chen et al., +2021 +) +and MBPP +(Austin et al., +2021 +) +. Both datasets measure the correctness of synthesized programs in Python from natural language docstrings. We use individual solutions as the action space and test suite and compiler feedback as the external observation. We follow +Chen et al. ( +2022a +) +and use an LLM to generate a synthetic test suite of syntactically valid “assert” statements for each question. For each step, the solution is evaluated on this test suite, and the results including successful and failed tests and compiler output, are added to the context as an observation. We use the same test suite for Reflexion. +For this task, the reasoning and acting baselines share an action space, but acting methods are able to incorporate observations as additional context. For LATS, since each action corresponds to a complete solution, we skip the simulation step of LATS and directly use the percentage of passed tests as the backpropagated reward. 
We use $k=8$ iterations, set the number of generated tests at $4$, and sample $n=5$ solutions during expansion. After the search is completed, we select the solution with the highest value and evaluate it on the real test suite for the pass@1 accuracy evaluation. More details and prompts can be found in Sec. D and Sec. F in the Appendix. +Results. We find in Tab. 3 that both search and semantic feedback are crucial for better performance. Despite not using observations, ToT and RAP are competitive with Reflexion. LATS has the highest performance on both datasets. Since RAP uses a similar search algorithm as LATS, this reveals the importance of external feedback for difficult reasoning tasks such as programming. With GPT-4, using LATS sets the state of the art for HumanEval, showing LATS can be used with more advanced LLMs for higher performance. +5.3 Webshop +For a complex decision-making environment with practical applications, we consider WebShop (Yao et al., 2022), an online shopping environment composed of a website with 1.18M real-world products and 12k human instructions. Agents must navigate a website through a variety of commands to purchase an item matching a user specification. We use the preconstructed action space of search and click commands and browser feedback and reflections for the observation. The performance is gauged using two metrics: an average score, reflecting the percentage of user-specified attributes met by the selected product, and a success rate, indicating the frequency with which the chosen product fulfills all given conditions. We compare against acting-based prompting methods and RL-based approaches. We evaluate on 50 instructions, expand $n=5$ children for LATS, and set $k=30$ for LATS, ReAct best of $k$, and Reflexion. More details and prompts are in Appendix D and G. +Results. We find in Tab.
+4 that GPT-3.5 with ReAct is competitive with imitation learning, and can exceed reinforcement learning techniques with stronger prompting strategies. Sampling $k=30$ trajectories with ReAct and Reflexion results in a similar performance, suggesting the semantic feedback is not as helpful in complex environments like WebShop. Indeed, as in Shinn et al. (2023), we find that generated reflections are often generic and do not provide useful feedback, resulting in a tendency for the agent to become stuck in local minima. However, using LATS indeed results in a noticeable improvement, indicating a more effective exploration for the same number of iterations. +5.4 Additional Observations +Method +Score +SR +ReAct (Yao et al., 2023b) +53.8 +28.0 +ReAct (best of k) +59.1 +32.0 +Reflexion (Shinn et al., 2023) +64.2 +35.0 +LATS +75.9 +38.0 +IL +59.9 +29.1 +IL+RL +62.4 +28.7 +Fine-tuning (Furuta et al., 2023) +67.5 +45.0 +Expert +82.1 +59.6 +Table 4: Score and success rate (SR) on Webshop. Table is separated into prompting, RL-based training, and human performance. For the same number of iterations, LATS improves both score and success rate, and surpasses RL-based training. IL/IL+RL taken from Yao et al. (2022). +Prompt Method +HotPotQA (EM) +ToT (ReAct) +0.39 +RAP (ReAct) +0.54 +LATS (No LM Heuristic) +0.37 +LATS (DFS) +0.42 +LATS (No Reflection) +0.56 +LATS +0.61 +Table 5: Ablation results on LATS and baseline variants in HotPotQA; we use ReAct as the base prompt and sample $n=5$ children and $k=50$ maximum trajectories. LATS requires every component and operation for optimal performance. +We also conduct additional experiments on HotPotQA to demonstrate the effect of each component of LATS. We also design versions of ToT and RAP that use the ReAct prompt and can handle external observations.
We use HotPotQA because our setup incorporates both reasoning (through thoughts) and acting (through API calls); the results are shown in Tab. 5. More ablations for token consumption on HotPotQA are in Tab. 7 in Appendix C. Note that baselines generally perform worse than the reasoning-only setting of HotPotQA, which indicates that the acting-based setting is more challenging and adaptation of search algorithms to decision-making scenarios is non-trivial. +Self-reflection. We use self-reflection to provide additional semantic signals for the agent. We observe a $0.05$ performance drop when it is removed from LATS, suggesting this is useful. This is a smaller gain than Reflexion (Shinn et al., 2023) observes over ReAct (Yao et al., 2023b) as shown in Tab. 2, suggesting overlap between the types of questions where there is an improvement with self-reflection and search. This variant outperforms RAP-ReAct, reflecting our improvements to MCTS. +Search Algorithm. MCTS is a more principled search algorithm than variants like A* or DFS search and the basis for observed performance gains. We observe the effects of using DFS, and incorporate the LM-based heuristic used in ToT (Yao et al., 2023a) in which branches with low values are pruned. This removes the selection and backpropagation operations, and we observe a $0.08$ drop in performance when sampling the same number of nodes, although this variant still outperforms ToT-ReAct. +6 Conclusion +In this work, we introduce Language Agent Tree Search (LATS), the first framework to unify planning, acting, and reasoning for enhanced LLM problem solving. By deliberately constructing trajectories with search algorithms, incorporating external feedback, and enabling agents to learn from experience, LATS addresses key limitations of prior prompting techniques.
Our evaluations demonstrate the ability of LATS to harness LLM capabilities for a variety of decision-making tasks while keeping its reasoning ability without additional training. The proposed synergies between search, interaction, and reflection offer a versatile approach to autonomous decision-making, highlighting the potential of LLMs as generalist agents. A full discussion of the limitations and broader impacts is in Appendix +B +. +References +Ahn et al. (2022) +Michael Ahn, Anthony Brohan, Noah Brown, Yevgen Chebotar, Omar Cortes, Byron David, Chelsea Finn, Chuyuan Fu, Keerthana Gopalakrishnan, Karol Hausman, Alex Herzog, Daniel Ho, Jasmine Hsu, Julian Ibarz, Brian Ichter, Alex Irpan, Eric Jang, Rosario Jauregui Ruano, Kyle Jeffrey, Sally Jesmonth, Nikhil J Joshi, Ryan Julian, Dmitry Kalashnikov, Yuheng Kuang, Kuang-Huei Lee, Sergey Levine, Yao Lu, Linda Luu, Carolina Parada, Peter Pastor, Jornell Quiambao, Kanishka Rao, Jarek Rettinghouse, Diego Reyes, Pierre Sermanet, Nicolas Sievers, Clayton Tan, Alexander Toshev, Vincent Vanhoucke, Fei Xia, Ted Xiao, Peng Xu, Sichun Xu, Mengyuan Yan, and Andy Zeng. +Do as i can, not as i say: Grounding language in robotic affordances. +arXiv:2204.01691 +, 2022. +Anthony et al. (2017) +T. Anthony, Z. Tian, and D. Barber. +Thinking fast and slow with deep learning and tree search. +In +NIPS +, 2017. +Austin et al. (2021) +Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, et al. +Program synthesis with large language models. +arXiv:2108.07732 +, 2021. +Baker et al. (2022) +Bowen Baker, Ilge Akkaya, Peter Zhokhov, Joost Huizinga, Jie Tang, Adrien Ecoffet, Brandon Houghton, Raul Sampedro, and Jeff Clune. +Video pretraining (vpt): Learning to act by watching unlabeled online videos. +arXiv:2206.11795 +, 2022. +Besta et al. 
(2023) +Maciej Besta, Nils Blach, Ales Kubicek, Robert Gerstenberger, Lukas Gianinazzi, Joanna Gajda, Tomasz Lehmann, Michal Podstawski, Hubert Niewiadomski, Piotr Nyczyk, and Torsten Hoefler. +Graph of thoughts: Solving elaborate problems with large language models. +arXiv:2308.09687 +, 2023. +Bowman et al. (2015) +Samuel R Bowman, Gabor Angeli, Christopher Potts, and Christopher D Manning. +A large annotated corpus for learning natural language inference. +In +EMNLP +, 2015. +Brown et al. (2020) +Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. +Language models are few-shot learners. +In +NeurIPS +, 2020. +Campbell et al. (2002) +Murray Campbell, A Joseph Hoane Jr, and Feng-hsiung Hsu. +Deep blue. +Artificial intelligence +, 2002. +Chen et al. (2022a) +Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen. +Codet: Code generation with generated tests. +arXiv:2207.10397 +, 2022a. +Chen et al. (2021) +Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. +Evaluating large language models trained on code. +arXiv:2107.03374 +, 2021. +Chen et al. (2022b) +Wenhu Chen, Xueguang Ma, Xinyi Wang, and William W Cohen. +Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. +arXiv preprint arXiv:2211.12588 +, 2022b. +Chowdhery et al. 
(2022) +Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. +Palm: Scaling language modeling with pathways. +arXiv:2204.02311 +, 2022. +Cobbe et al. (2021) +Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al. +Training verifiers to solve math word problems. +arXiv:2110.14168 +, 2021. +Deng et al. (2023) +Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Samuel Stevens, Boshi Wang, Huan Sun, and Yu Su. +Mind2web: Towards a generalist agent for the web. +arXiv:2306.06070 +, 2023. +Driess et al. (2023) +Danny Driess, Fei Xia, Mehdi S. M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Pete Florence. +Palm-e: An embodied multimodal language model. +arXiv:2303.03378 +, 2023. +Du et al. (2023) +Yilun Du, Mengjiao Yang, Bo Dai, Hanjun Dai, Ofir Nachum, Joshua B. Tenenbaum, Dale Schuurmans, and Pieter Abbeel. +Learning universal policies via text-guided video generation. +arXiv:2302.00111 +, 2023. +Evans (2010) +Jonathan St BT Evans. +Intuition and reasoning: A dual-process perspective. +Psychological Inquiry +, 2010. +Fan et al. (2022) +Linxi Fan, Guanzhi Wang, Yunfan Jiang, Ajay Mandlekar, Yuncong Yang, Haoyi Zhu, Andrew Tang, De-An Huang, Yuke Zhu, and Anima Anandkumar. +Minedojo: Building open-ended embodied agents with internet-scale knowledge. +In +NeurIPS Datasets and Benchmarks Track +, 2022. +Furuta et al. (2023) +Hiroki Furuta, Ofir Nachum, Kuang-Huei Lee, Yutaka Matsuo, Shixiang Shane Gu, and Izzeddin Gur. +Multimodal web navigation with instruction-finetuned foundation models. 
+arXiv preprint arXiv:2305.11854 +, 2023. +Gao et al. (2022) +Luyu Gao, Aman Madaan, Shuyan Zhou, Uri Alon, Pengfei Liu, Yiming Yang, Jamie Callan, and Graham Neubig. +Pal: Program-aided language models. +arXiv preprint arXiv:2211.10435 +, 2022. +Guo et al. (2018) +Jiaxian Guo, Sidi Lu, Han Cai, Weinan Zhang, Yong Yu, and Jun Wang. +Long text generation via adversarial training with leaked information. +AAAI +, 2018. +Guss et al. (2019) +William H. Guss, Brandon Houghton, Nicholay Topin, Phillip Wang, Cayden Codel, Manuela Veloso, and Ruslan Salakhutdinov. +Minerl: A large-scale dataset of minecraft demonstrations. +In +IJCAI +, 2019. +Hafner et al. (2019) +Danijar Hafner, Timothy Lillicrap, Ian Fischer, Ruben Villegas, David Ha, Honglak Lee, and James Davidson. +Learning latent dynamics for planning from pixels. +In +ICML +, 2019. +Hafner et al. (2023) +Danijar Hafner, Jurgis Pasukonis, Jimmy Ba, and Timothy Lillicrap. +Mastering diverse domains through world models. +arXiv:2301.04104 +, 2023. +Hao et al. (2023) +Shibo Hao, Yi Gu, Haodi Ma, Joshua Jiahua Hong, Zhen Wang, Daisy Zhe Wang, and Zhiting Hu. +Reasoning with language model is planning with world model. +arXiv:2305.14992 +, 2023. +Huang et al. (2023) +Jie Huang, Xinyun Chen, Swaroop Mishra, Huaixiu Steven Zheng, Adams Wei Yu, Xinying Song, and Denny Zhou. +Large language models cannot self-correct reasoning yet. +arXiv:2310.01798 +, 2023. +Huang et al. (2022) +Wenlong Huang, Fei Xia, Ted Xiao, Harris Chan, Jacky Liang, Pete Florence, Andy Zeng, Jonathan Tompson, Igor Mordatch, Yevgen Chebotar, et al. +Inner monologue: Embodied reasoning through planning with language models. +arXiv:2207.05608 +, 2022. +Jiang et al. (2018) +D. Jiang, E. Ekwedike, and H. Liu. +Feedback-based tree search for reinforcement learning. +In +ICML +, 2018. +Kocsis & Szepesvári (2006) +Levente Kocsis and Csaba Szepesvári. +Bandit based monte-carlo planning. +In +ECML +, 2006. +Kojima et al. 
(2022) +Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. +Large language models are zero-shot reasoners. +arXiv:2205.11916 +, 2022. +LaValle et al. (2001) +Steven M LaValle, James J Kuffner, BR Donald, et al. +Rapidly-exploring random trees: Progress and prospects. +Algorithmic and computational robotics: new directions +, 2001. +Liu et al. (2018) +Evan Zheran Liu, Kelvin Guu, Panupong Pasupat, Tianlin Shi, and Percy Liang. +Reinforcement learning on web interfaces using workflow-guided exploration. +In +ICLR +, 2018. +Liu et al. (2023) +Xiao Liu, Hao Yu, Hanchen Zhang, Yifan Xu, Xuanyu Lei, Hanyu Lai, Yu Gu, Hangliang Ding, Kaiwen Men, Kejuan Yang, Shudan Zhang, Xiang Deng, Aohan Zeng, Zhengxiao Du, Chenhui Zhang, Sheng Shen, Tianjun Zhang, Yu Su, Huan Sun, Minlie Huang, Yuxiao Dong, and Jie Tang. +Agentbench: Evaluating llms as agents. +arXiv:2308.03688 +, 2023. +Madaan et al. (2023) +Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, Shashank Gupta, Bodhisattwa Prasad Majumder, Katherine Hermann, Sean Welleck, Amir Yazdanbakhsh, and Peter Clark. +Self-refine: Iterative refinement with self-feedback. +arXiv:2303.17651 +, 2023. +Nallapati et al. (2016) +Ramesh Nallapati, Bowen Zhou, Cicero dos Santos, Caglar Gulcehre, and Bing Xiang. +Abstractive text summarization using sequence-to-sequence rnns and beyond. +In +SIGNLL +, 2016. +Nye et al. (2021) +Maxwell Nye, Anders Johan Andreassen, Guy Gur-Ari, Henryk Michalewski, Jacob Austin, David Bieber, David Dohan, Aitor Lewkowycz, Maarten Bosma, David Luan, et al. +Show your work: Scratchpads for intermediate computation with language models. +arXiv:2112.00114 +, 2021. +OpenAI (2023) +OpenAI. +Gpt-4 technical report. +arXiv:2303.08774 +, 2023. +Saparov & He (2022) +Abulhair Saparov and He He. +Language models are greedy reasoners: A systematic formal analysis of chain-of-thought. 
+arXiv:2210.01240 +, 2022. +Schick et al. (2023) +Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta Raileanu, Maria Lomeli, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. +Toolformer: Language models can teach themselves to use tools. +arXiv:2302.04761 +, 2023. +Shen et al. (2023) +Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, and Yueting Zhuang. +Hugginggpt: Solving ai tasks with chatgpt and its friends in huggingface. +arXiv:2303.17580 +, 2023. +Shinn et al. (2023) +Noah Shinn, Federico Cassano, Beck Labash, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao. +Reflexion: Language agents with verbal reinforcement learning. +arXiv:2303.11366 +, 2023. +Shridhar et al. (2020) +Mohit Shridhar, Xingdi Yuan, Marc-Alexandre Côté, Yonatan Bisk, Adam Trischler, and Matthew Hausknecht. +Alfworld: Aligning text and embodied environments for interactive learning. +arXiv:2010.03768 +, 2020. +Silver et al. (2016) +David Silver, Aja Huang, Chris J Maddison, Arthur Guez, Laurent Sifre, George Van Den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al. +Mastering the game of go with deep neural networks and tree search. +nature +, 2016. +Silver et al. (2017) +David Silver, Julian Schrittwieser, Karen Simonyan, Ioannis Antonoglou, Aja Huang, Arthur Guez, Thomas Hubert, Lucas baker, Matthew Lai, Adrian Bolton, Yutian Chen, Timothy P. Lillicrap, Fan Hui, L. Sifre, George van den Driessche, Thore Graepel, and Demis Hassabis. +Mastering the game of go without human knowledge. +Nature +, 2017. +Sloman (1996) +Steven A. Sloman. +The empirical case for two systems of reasoning. +Psychological Bulletin +, 1996. +Sun et al. (2023) +Haotian Sun, Yuchen Zhuang, Lingkai Kong, Bo Dai, and Chao Zhang. +Adaplanner: Adaptive planning from feedback with language models. +arXiv:2305.16653 +, 2023. +Surís et al. (2023) +Dídac Surís, Sachit Menon, and Carl Vondrick. +Vipergpt: Visual inference via python execution for reasoning. 
+arXiv preprint arXiv:2303.08128 +, 2023. +Świechowski et al. (2023) +Maciej Świechowski, Konrad Godlewski, Bartosz Sawicki, and Jacek Mańdziuk. +Monte carlo tree search: A review of recent modifications and applications. +Artificial Intelligence Review +, 2023. +Touvron et al. (2023) +Hugo Touvron, Louis Martin, Kevin R. Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Daniel M. Bikel, Lukas Blecher, Cristian Cantón Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony S. Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel M. Kloumann, A. V. Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, R. Subramanian, Xia Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zhengxu Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and +Thomas Scialom. +Llama 2: Open foundation and fine-tuned chat models. +arXiv:2307.09288 +, 2023. +Vodopivec et al. (2017) +Tom Vodopivec, Spyridon Samothrakis, and Branko Ster. +On monte carlo tree search and reinforcement learning. +Journal of Artificial Intelligence Research +, 2017. +Wang et al. (2023) +Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, and Anima Anandkumar. +Voyager: An open-ended embodied agent with large language models. +arXiv:2305.16291 +, 2023. +Wang et al. (2022) +Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, and Denny Zhou. +Self-consistency improves chain of thought reasoning in language models. 
+arXiv:2203.11171 +, 2022. +Wei et al. (2022) +Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Chi, Quoc Le, and Denny Zhou. +Chain of thought prompting elicits reasoning in large language models. +arXiv:2201.11903 +, 2022. +Wooldridge & Jennings (1995) +Michael Wooldridge and Nicholas R Jennings. +Intelligent agents: Theory and practice. +The knowledge engineering review +, 1995. +Wu et al. (2023) +Philipp Wu, Alejandro Escontrela, Danijar Hafner, Pieter Abbeel, and Ken Goldberg. +Daydreamer: World models for physical robot learning. +In +CoRL +. PMLR, 2023. +Xie et al. (2023) +Yuxi Xie, Kenji Kawaguchi, Yiran Zhao, Xu Zhao, Min-Yen Kan, Junxian He, and Qizhe Xie. +Decomposition enhances reasoning via self-evaluation guided decoding. +arXiv:2305.00633 +, 2023. +Yang et al. (2018) +Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning. +Hotpotqa: A dataset for diverse, explainable multi-hop question answering. +arXiv:1809.09600 +, 2018. +Yao et al. (2022) +Shunyu Yao, Howard Chen, John Yang, and Karthik R Narasimhan. +Webshop: Towards scalable real-world web interaction with grounded language agents. +In +NeurIPS +, 2022. +Yao et al. (2023a) +Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Thomas L. Griffiths, Yuan Cao, and Karthik Narasimhan. +Tree of thoughts: Deliberate problem solving with large language models. +arXiv:2305.10601 +, 2023a. +Yao et al. (2023b) +Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. +ReAct: Synergizing reasoning and acting in language models. +In +ICLR +, 2023b. +Yao et al. (2023c) +Weiran Yao, Shelby Heinecke, Juan Carlos Niebles, Zhiwei Liu, Yihao Feng, Le Xue, Rithesh Murthy, Zeyuan Chen, Jianguo Zhang, Devansh Arpit, Ran Xu, Phil Mui, Huan Wang, Caiming Xiong, and Silvio Savarese. +Retroformer: Retrospective large language agents with policy gradient optimization. +arXiv preprint arXiv:2308.02151 +, 2023c. 
+Ye et al. (2021) +Weirui Ye, Shaohuai Liu, Thanard Kurutach, Pieter Abbeel, and Yang Gao. +Mastering atari games with limited data. +In NeurIPS, 2021. +Zhou et al. (2022) +Denny Zhou, Nathanael Schärli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Olivier Bousquet, Quoc Le, and Ed Chi. +Least-to-most prompting enables complex reasoning in large language models. +arXiv:2205.10625, 2022. +Zhu et al. (2023) +Xizhou Zhu, Yuntao Chen, Hao Tian, Chenxin Tao, Weijie Su, Chenyu Yang, Gao Huang, Bin Li, Lewei Lu, Xiaogang Wang, Yu Qiao, Zhaoxiang Zhang, and Jifeng Dai. +Ghost in the minecraft: Generally capable agents for open-world environments via large language models with text-based knowledge and memory. +arXiv:2305.17144, 2023. +7 Appendix +The appendix is organized as follows. First in Sec. A, we show the pseudocode of our proposed algorithm, LATS; then in Sec. B, we provide further discussion of our method and its limitations, future directions and broader impact; then in Sec. C we provide additional experimental results; then in Sec. D, we specify the environment details in our experiments; finally, we list our prompts used for the three environments in Sec. E (HotPotQA), Sec. F (Programming) and Sec. G (Webshop) respectively. +Appendix A LATS Pseudocode +Alg. 1 shows the pseudocode of our algorithm LATS. Nodes are stored explicitly in the memory. Unless otherwise specified, in all experiments we use $n=5$ and $w=1$.
+Algorithm 1 $\operatorname{LATS}(S_{0},p_{\theta},p_{V},p_{\text{ref}},d,k,n,w)$ +Require: Initial state $s_{1}$, action generator $p_{\theta}$, value function $p_{V}$, reflection generator $p_{\text{ref}}$, number of generated actions $n$, depth limit $L$, number of roll-outs $K$, context $c$, and exploration weight $w$ +Initialize action space $A$, observation space $O$ +Initialize the state-action value function $p_{V}:S\times A\mapsto\mathbb{R}$ and visit counter $N:S\mapsto\mathbb{N}$ to zero +for $k\leftarrow 0,\dots,K-1$ do +for $t\leftarrow 0,\dots,L-1$ do +if $s_{t}$ not terminal then $\triangleright$ Expansion & Simulation +for $i\leftarrow 1,\dots,n$ do +Sample $a_{t}^{(i)}\sim p_{\theta}(a\mid s_{t})$ +Get $o_{t}^{(i)}$ from environment, $s_{t+1}^{(i)}\leftarrow(c_{t}^{(i)},o_{t}^{(i)},a_{t}^{(i)})$, $c_{t+1}^{(i)}\leftarrow(o_{t}^{(i)},a_{t}^{(i)})$
+Evaluate $V_{t}^{(i)}\sim p_{V}(s_{t}^{(i)})$ $\triangleright$ Evaluation +$V(s_{t})\leftarrow V_{t}^{(i)}$ +Add $s_{t}^{(i)}$ to children +end for +end if +if $s_{t}$ is terminal then $\triangleright$ Reflection +Get $r$ from environment +if $r$ not success then +$\text{reflection}\leftarrow p_{\text{ref}}(c_{t})$ +$c\leftarrow\text{reflection}$ +end if +end if +$a_{t}\leftarrow\arg\max_{a\in e(s_{t})}\left[V(s_{t})+w\sqrt{\frac{\ln N(s_{t-1})}{N(s_{t})}}\right]$ $\triangleright$ Selection +$N(s_{t+1})\leftarrow N(s_{t+1})+1$ +if $a_{t}$ is an output action then break +end for +$T\leftarrow$ the actual number of steps +for $t\leftarrow T-1,\dots,0$ do $\triangleright$ Backpropagation +$V(s_{t})\leftarrow\frac{V(s_{t})(N(s_{t})-1)+r}{N(s_{t})}$ +end for +end for +Appendix B Discussion +Limitations.
+Although LATS can improve reasoning and decision-making, this comes at a higher computational cost relative to simpler prompting methods like ReAct or Reflexion. The search process takes more time than standard prompting or simpler techniques, and requires greater inference costs. While such an issue is mitigated by the fact that the number of nodes $n$ expanded at every step provides a natural trade-off between performance and efficiency (setting $n=1$ makes the method as efficient as ReAct with multiple trials or CoT-SC), in practice we recommend using LATS for difficult tasks like programming or for situations where performance is prioritized over efficiency. We hope that continued advancements in LLMs will reduce costs and increase the practicality of LATS. +Additionally, the benchmarks we use in this paper are relatively simple and focused on decision-making, compared to the complexity of real-world interactive environments. In addition, some environments might not easily support rollbacks to previous states. However, the design of LATS is flexible and can be adjusted to various resource constraints. Using planning-based prompting methods like LATS in environments like Minecraft (Fan et al., 2022) and more reasoning benchmarks would be interesting avenues for future work. +Broader impact. LATS is a framework that enhances LLM performance through interactions with an environment. This improvement in autonomous decision-making may facilitate harmful uses of LLMs. On the other hand, LATS enhances interpretability and the potential for greater alignment, as it generates understandable, high-level linguistic reasoning and actions through several rounds of decision-making and reflection, rather than relying on implicit, low-level token values.
+Appendix C Ablations +Prompt Method +HotpotQA (EM) +LATS (w=0.5) +0.55 +LATS (w=2.0) +0.61 +LATS (d=4) +0.58 +LATS (CoT) +0.60 +LATS (No LM Heuristic) +0.37 +LATS +0.61 +Table 6: Ablation results on LATS and baseline variants in HotPotQA measured by Exact Match (EM). We test different depth $d$, exploration factor $w$, and versions of LATS using CoT and without the LM value function. We sample $n=5$ and $k=50$ trajectories. +Figure 4: Performance over successive iterations on HumanEval with GPT-3.5. +In this section, we ablate various designs of LATS. Experiments are conducted on HotPotQA with a maximum of $k=50$ trajectories and sampling size of $n=5$ and HumanEval with a maximum of $k=8$ trajectories and sampling size of $n=5$. The result for HotPotQA is shown in Tab. 6 and HumanEval in Fig. 4. +Exploration weight. We find that there is lower performance on HotPotQA when the exploration weight $w$ in the selection formula is decreased to $0.5$, suggesting that this reduces the effectiveness of the search. Increasing $w$ to $2.0$ does not lead to a performance improvement, but we tend to observe faster convergence. The optimal setting depends on the particular environment and complexity of the state space. +Depth. In our main experiments we use a maximum depth of $d=7$ on HotPotQA for all methods, following previous work (Yao et al., 2023b). We ablate the effect on LATS after reducing it to $d=4$. This results in only a slight drop in performance. We find that most questions can be answered within four steps, and using a greater number of steps tends to force the agent into local minima and rarely improves success. +LM value function. The LM value function scores states based on expected future reward.
Without this heuristic, the only signal to guide search would be from environment rewards for completed trajectories, which are scarce and often binary. When we remove the evaluation operation, we observe a dramatic $0.24$ drop in performance. +Performance over time. To see the effects of increasing the number of trajectories sampled, we change $k$ to different values. We conduct this experiment on HumanEval, which has a more noticeable difference due to sampling fewer trajectories. The results are shown in Fig. 4, in which LATS scales better with more iterations than Reflexion. +Sample complexity and Token cost. One possible concern of LATS is that the tree-structured search might consume many more tokens than existing methods. To further study the computational cost of LATS compared to prior methods, we examine the sample complexity (i.e., asymptotic token cost) of all methods considered in this paper, and count the average number of nodes expanded by our method and other tree-structured methods (ToT and RAP) upon successful search on HotPotQA. We present the results in Tab. 7; the result shows that our method has the same sample complexity as other tree-based search methods, and has a lower average number of nodes expanded upon success, which indicates lower token cost. The token cost gap will be even larger when taking failed trajectories into account, since our method has a higher success rate and reaches the computational budget limit less often. +Method +Performance ($\uparrow$) +Sample complexity ($\downarrow$) +Avg.
#nodes upon success ($\downarrow$) +ReAct (Best $k=250$) +$0.42$ +$O(k)$ +N/A +CoT-SC ($n=1,k=250$) +$0.40$ +$O(k)$ +N/A +LATS ($n=1,k=50$) +$0.48$ +$O(k)$ +N/A +ToT (ReAct) +$0.49$ +$O(kn)$ +$84.05$ +RAP (ReAct) +$0.54$ +$O(kn)$ +$70.60$ +LATS ($n=5,k=50$) +$0.61$ +$O(kn)$ +$66.65$ +Table 7: +The performance, sample complexity of different methods and average number of nodes expanded upon success by methods with tree-based search. $n$ is the number of children nodes expanded at every step and $k$ is the number of trajectories. Our method has the same sample complexity as other methods with tree-based search and expands fewer nodes upon success, which indicates lower token cost. +Appendix D +Environment Details +D.1 +HotPotQA +Figure 5: +Example trajectories on HotPotQA for ReAct (left) and LATS (right). LATS can sample more actions and avoid failure from previous mistakes by evaluating states with an LM to guide the search toward promising areas of the tree. +HotPotQA (Yang et al., 2018) is a question-answering dataset that requires reasoning over multiple supporting documents to answer questions. It contains 113k Wikipedia-based question-answer pairs crafted by crowdworkers to be diverse, multi-hop, and explainable. Questions cover a range of types like entities, locations, dates, and comparison of shared properties between two entities. Crowdworkers also provide supporting facts from the documents that justify the answer. We use the HotPotQA benchmark setting with all the Wikipedia paragraphs to test retrieval. 
We use a randomly selected subset of 100 questions for our experiments and a maximum depth limit of 6. Fig. +5 +illustrates how ReAct and LATS work on an example task of HotPotQA, and gives a qualitative example on how LATS outperforms ReAct on the task. +Action Space. +We adopt the Wikipedia web API proposed in +Yao et al. ( +2023b +) +, with three types of actions to support interactive information retrieval: +(1) +search +[ +entity +], which returns the first 5 sentences from the corresponding +entity +wiki page if it exists, or else suggests top-5 similar entities from the Wikipedia search engine, +(2) +lookup +[ +string +], which returns the next sentence in the page containing +string +, +(3) +finish +[ +answer +], which finishes the current task with +answer +. +These API calls and free-form thoughts form the action space for this environment. +D.2 +Programming +The HumanEval dataset +(Chen et al., +2021 +) +is a collection of 164 handwritten programming problems introduced to evaluate the functional correctness of models for synthesizing programs from natural language descriptions. Each problem includes a function signature, docstring description, reference implementation, and multiple unit tests, with an average of 7.7 tests per problem. The programming tasks assess comprehension of natural language, reasoning, algorithms, and basic mathematics, at a difficulty level comparable to simple software interview questions. Pass rates are evaluated with the pass@k metric, where k samples are generated per problem and a problem is considered solved if any sample passes all tests. We use all 164 problems for our experiments and a maximum depth limit of 8. +The Mostly Basic Programming Problems (MBPP) +Austin et al. ( +2021 +) +benchmark contains 974 short Python functions designed to evaluate program synthesis techniques. The dataset was constructed by crowdsourcing from workers with basic Python knowledge. 
Each data point consists of a natural language description of a programming task, a reference solution implementation, and three test cases for functional correctness. The natural language prompts are typically short, one-sentence descriptions. Solutions cover common programming constructs including mathematical operations, list processing, string manipulation, and usage of the Python standard library. On average, solutions are 6.8 lines of code. The dataset is also supplemented with an additional set of 426 problems that were manually verified for unambiguous specifications, standard function signatures, and accurate test cases. We use a randomly selected subset of 397 problems for our experiments. +D.3 +WebShop +WebShop +(Yao et al., +2022 +) +is an interactive web-based environment designed to evaluate agents on grounded language understanding and decision-making. It simulates an e-commerce shopping task by providing agents with over 1 million real-world products scraped from Amazon, spanning 5 categories and 113 subcategories. These products contain rich linguistic information, with an average text length of 262 words and a vocabulary size of 224k. In addition, there are over 800k unique product options available for customization. The environment renders webpages in two modes: HTML mode provides pixel-level observations with interactive elements, while simple mode converts the raw HTML into a structured text observation more amenable for training agents. The action space consists of query searches and button clicks, which transition between 4 page types: search, results, item and item-detail. Instructions are crowdsourced natural language specifying product attributes and options, with a total of 12k collected. Automatic rewards are computed by comparing the product purchased by the agent against the attributes and options specified in the instruction, using both lexical matching and semantic similarity metrics. 
+Type +Argument +State $\rightarrow$ Next State +search +[Query] +Search $\rightarrow$ Results +choose +Back to search +$\ast$ $\rightarrow$ Search +choose +Prev/Next page +Results $\rightarrow$ Results +choose +[Product title] +Results $\rightarrow$ Item +choose +[Option] +Item $\rightarrow$ Item +choose +Desc/Overview +Item $\rightarrow$ Item-Detail +choose +Previous +Item-Detail $\rightarrow$ Item +choose +Buy +Item $\rightarrow$ Episode End +Table 8: +Action space of WebShop. +There are two evaluation metrics used in WebShop: (1) Task Score: defined as $(100\times\text{avg. reward})$, which captures the average reward obtained across episodes; and (2) Success Rate (SR), defined as the portion of instructions where $r=1$. The reward is calculated based on the number of attributes satisfied by the selected item. We use 50 environments for our experiments and a maximum depth limit of 15. +Appendix E +HotPotQA Prompts +E.1 +Base Acting Prompt +Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types: +(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. +(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. +(3) Finish[answer], which returns the answer and finishes the task. +After each observation, provide the next Thought and next Action. +Here are some examples: +Question: Which magazine was started first Arthur’s Magazine or First for Women? +Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first. 
+Action 1: Search[Arthur’s Magazine] +Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3] +Thought 2: Arthur’s Magazine was started in 1844. I need to search First for Women next. +Action 2: Search[First for Women] +Observation 2: First for Women is a woman’s magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. It is based in Englewood Cliffs, New Jersey.[2] In 2011 the circulation of the magazine was 1,310,696 copies.[3] +Thought 3: First for Women was started in 1989. 1844 (Arthur’s Magazine) < 1989 (First for Women), so Arthur’s Magazine was started first. +Action 3: Finish[Arthur’s Magazine] +(examples) +You have attempted to answer the following question before and failed. The following reflection(s) give a plan to avoid failing to answer the question in the same way you did previously. Use them to improve your strategy of correctly answering the given question. +(trajectories) +(input) +E.2 +Base Reasoning Prompt +Solve a question answering task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task. You will be given context that you should use to help you answer the question. Start your response with either Action or an indexed Thought +Here are some examples: +Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into? +Let’s think step by step. +Thought 1: The eastern sector of Colorado orogeny extends into the High Plains. +Thought 2: High Plains rise in elevation from around 1,800 to 7,000 ft +Thought 3: The answer is 1,800 to 7,000 ft. 
+Action: Finish[1,800 to 7,000 ft] +(examples) +Previous trial: +(trajectories) +(input) +E.3 +Value Function Prompt +Analyze the trajectories of a solution to a question answering task. The trajectories are labeled by environmental observations about the situation, thoughts that can reason about the current situation and actions that can be three types: +(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. +(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. +(3) Finish[answer], which returns the answer and finishes the task. +Given a question and a trajectory, evaluate its correctness and provide your reasoning and analysis in detail. Focus on the latest thought, action, and observation. Incomplete trajectories can be correct if the thoughts and actions so far are correct, even if the answer is not found yet. Do not generate additional thoughts or actions. Then at the last line conclude “Thus the correctness score is s”, where s is an integer from 1 to 10. +Question: Which magazine was started first Arthur’s Magazine or First for Women? +Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first. +Action 1: Search[Arthur’s Magazine] +Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3] +This trajectory is correct as it is reasonable to search for the first magazine provided in the question. It is also better to have simple searches corresponding to a single entity, making this the best action. 
+Thus the correctness score is 10 +(other examples) +(failed trajectories) +(context) +E.4 +Reflection Prompt +Analyze the trajectories of a solution to a question answering task. The trajectories are labeled by environmental observations about the situation, thoughts that can reason about the current situation and actions that can be three types: +(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search. +(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage. +(3) Finish[answer], which returns the answer and finishes the task. +Given a question and a trajectory, evaluate its correctness and provide your reasoning and analysis in detail. Focus on the latest thought, action, and observation. Incomplete trajectories can be correct if the thoughts and actions so far are correct, even if the answer is not found yet. Do not generate additional thoughts or actions. Then at the last line conclude “Thus the correctness score is s”, where s is an integer from 1 to 10. +Question: Which magazine was started first Arthur’s Magazine or First for Women? +Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first. +Action 1: Search[Arthur’s Magazine] +Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3] +This trajectory is correct as it is reasonable to search for the first magazine provided in the question. It is also better to have simple searches corresponding to a single entity, making this the best action. 
+Thus the correctness score is 10 +(other examples) +(failed trajectories) +(context) +Appendix F +Programming Prompts +F.1 +HumanEval function implementation example +Sample function signature: +⬇ +def +minSubArraySum +( +nums +): +Given +an +array +of +integers +nums +, +find +the +minimum +sum +of +any +non +- +empty +sub +- +array +of +nums +. +Example +minSubArraySum +([2, +3, +4, +1, +2, +4]) +== +1 +minSubArraySum +([-1, +-2, +-3]) +== +-6 +Sample function body implementation: +⬇ +min_sum += +float +(’ +inf +’) +for +i +in +range +( +len +( +nums +)): +current_sum += +0 +for +j +in +range +( +i +, +len +( +nums +)): +current_sum ++= +nums +[ +j +] +if +current_sum +< +min_sum +: +min_sum += +current_sum +return +min_sum +F.2 +Base Acting/Reasoning Prompt +You are an AI Python assistant. You will be given your previous implementation of a function, a series of unit tests results, and your self-reflection on your previous implementation. Write your full implementation (restate the function signature). +Example 1: +[previous impl]: +⬇ +def +add +( +a +: +int +, +b +: +int +) +-> +int +: +””” +Given +integers +a +and +b +, +return +the +total +value +of +a +and +b +. +””” +return +a +- +b +[unit test results from previous impl]: +Tested passed: +Tests failed: +assert add(1, 2) == 3 # output: -1 +assert add(1, 2) == 4 # output: -1 +[reflection on previous impl]: +The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from ‘-‘ to ‘+‘ in the return statement. This will ensure that the function returns the correct output for the given input. +[improved impl]: +⬇ +def +add +( +a +: +int +, +b +: +int +) +-> +int +: +””” +Given +integers +a +and +b +, +return +the +total +value +of +a +and +b +. 
+””” +return +a ++ +b +F.3 +Reflection Prompt +You are a Python programming assistant. You will be given a function implementation and a series of unit test results. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as guidance when you try again later. Only provide the few sentence description in your answer, not the implementation. You will be given a few examples by the user. +Example 1: +[previous impl]: +⬇ +def +add +( +a +: +int +, +b +: +int +) +-> +int +: +””” +Given +integers +a +and +b +, +return +the +total +value +of +a +and +b +. +””” +return +a +- +b +[unit test results from previous impl]: +Tested passed: +Tests failed: +assert add(1, 2) == 3 # output: -1 +assert add(1, 2) == 4 # output: -1 +[reflection on previous impl]: +The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from ‘-‘ to ‘+‘ in the return statement. This will ensure that the function returns the correct output for the given input. +F.4 +Test Case Generation Prompt +You are an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring. 
+(input) +Appendix G +WebShop Prompts +G.1 +Acting Prompt +Webshop +Instruction: +i would like a 3 ounce bottle of bright citrus deodorant for sensitive skin, and price lower than 50.00 dollars +[Search] +Action: search[3 ounce bright citrus deodorant sensitive skin] +Observation: +[Back to Search] +Page 1 (Total results: 50) +[Next +] +[B078GWRC1J] +Bright Citrus Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce +$10.99 +[B078GTKVXY] +Ginger Fresh Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce +$10.99 +[B08KBVJ4XN] +Barrel and Oak - Aluminum-Free Deodorant, Deodorant for Men, Essential Oil-Based Scent, 24-Hour Odor Protection, Cedar & Patchouli Blend, Gentle on Sensitive Skin (Mountain Sage, 2.7 oz, 2-Pack) +$15.95 +Action: think[B078GWRC1J and B078GTKVXY are bright citrus deodorant less then 50 dollars. I can check B078GWRC1J first.] +Observation: OK. +Action: click[B078GWRC1J] +Observation: +[Back to Search] +[Prev] +scent [assorted scents][bright citrus][calming lavender][ginger fresh][simply non-scents] +size [travel set (4-pack)][3 ounce (pack of 1)][3-ounce (2-pack)] +Bright Citrus Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce +Price: $10.99 +Rating: N.A. +[Description] +[Features] +[Reviews] +[Buy Now] +Action: think[For 3 ounce bottle of bright citrus deodorant for sensitive skin, the item has options ’bright citrus’ and ’3 ounce (pack of 1)’ and seems good to buy.] +Observation: OK. +Action: click[bright citrus] +Observation: You have clicked bright citrus. +Action: click[3 ounce (pack of 1)] +Observation: You have clicked 3 ounce (pack of 1). 
+Action: click[Buy Now] +G.2 +Value Function Prompt +Given an item to purchase and a trajectory that aims to buy an item that exactly matches the specification, which corresponds to the ideal score of 1.0, analyze the following trajectory, then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10. +Here are some examples +Webshop +Instruction: +i am looking for dairy free and apple variety pack of chips, and price lower than 30.00 dollars +[Search] +Action: search[dairy free and apple variety pack of chips] +Observation: +[Back to Search] +Page 1 (Total results: 50) +[Next] +[B07HRFJWP8] +Enjoy Life Foods Soft Baked Ovals, Breakfast Bars, Nut Free Bars, Soy Free, Dairy Free, Non GMO, Gluten Free, Vegan, Variety Pack, 4 Boxes (20 Bars Total) +$100.0 +[B01KMHY5PG] +Enjoy Life Soft Baked Chewy Bars, Variety Pack, Nut Free Bars, Soy Free, Dairy Free, Gluten Free, 6 Boxes (30 Total Bars) +$21.49 +[B008D2X8C4] +Enjoy Life Lentil Chips Variety Pack, Dairy Free Chips, Soy Free, Nut Free, Non GMO, Vegan, Gluten Free, 24 Bags (0.8 oz) +$100.0 +Action: think[The first two items are not what I want. The third item is the dairy free and apple variety pack of chips I want.] +Observation: OK. +Action: click[B008D2X8C4] +Observation: +[Back to Search] +[Prev] +flavor name [dill and sour cream][garlic & parmesan][light sea salt][margherita pizza][thai chili lime][variety pack] +size [0.8 ounce (pack of 24)][4 ounce (pack of 12)] +Price: $100.0 +Rating: N.A. +[Description] +[Features] +[Reviews] +[Buy Now] +Action: think[The item has options ’variety pack’ and ’0.8 ounce (pack of 24)’ and seems good to buy.] +Observation: OK. +Action: click[variety pack] +Observation: You have clicked variety pack. +Action: click[0.8 ounce (pack of 24)] +Observation: You have clicked 0.8 ounce (pack of 24). +Action: click[Buy Now] +Reflection: In this attempt, I was unsuccessful. 
I accidentally bought a product that was $100, which is more than my budget of $30. Either way, the initial search results were not good. Next time, I will do search[”variety pack of chips”] and then check if the results meet the dairy free and the $30 budget constraints. I will continue to refine my searches so that I can find more products. +Thus the correctness score is 5 +(input) +G.3 +Reflection Prompt +Given an item to purchase and a trajectory that aims to buy an item that exactly matches the specification, which corresponds to the ideal score of 1.0, analyze the following trajectory, then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10. +Here are some examples +Webshop +Instruction: +i am looking for dairy free and apple variety pack of chips, and price lower than 30.00 dollars +[Search] +Action: search[dairy free and apple variety pack of chips] +Observation: +[Back to Search] +Page 1 (Total results: 50) +[Next] +[B07HRFJWP8] +Enjoy Life Foods Soft Baked Ovals, Breakfast Bars, Nut Free Bars, Soy Free, Dairy Free, Non GMO, Gluten Free, Vegan, Variety Pack, 4 Boxes (20 Bars Total) +$100.0 +[B01KMHY5PG] +Enjoy Life Soft Baked Chewy Bars, Variety Pack, Nut Free Bars, Soy Free, Dairy Free, Gluten Free, 6 Boxes (30 Total Bars) +$21.49 +[B008D2X8C4] +Enjoy Life Lentil Chips Variety Pack, Dairy Free Chips, Soy Free, Nut Free, Non GMO, Vegan, Gluten Free, 24 Bags (0.8 oz) +$100.0 +Action: think[The first two items are not what I want. The third item is the dairy free and apple variety pack of chips I want.] +Observation: OK. +Action: click[B008D2X8C4] +Observation: +[Back to Search] +[Prev] +flavor name [dill and sour cream][garlic & parmesan][light sea salt][margherita pizza][thai chili lime][variety pack] +size [0.8 ounce (pack of 24)][4 ounce (pack of 12)] +Price: $100.0 +Rating: N.A. 
+[Description] +[Features] +[Reviews] +[Buy Now] +Action: think[The item has options ’variety pack’ and ’0.8 ounce (pack of 24)’ and seems good to buy.] +Observation: OK. +Action: click[variety pack] +Observation: You have clicked variety pack. +Action: click[0.8 ounce (pack of 24)] +Observation: You have clicked 0.8 ounce (pack of 24). +Action: click[Buy Now] +Reflection: In this attempt, I was unsuccessful. I accidentally bought a product that was $100, which is more than my budget of $30. Either way, the initial search results were not good. Next time, I will do search[”variety pack of chips”] and then check if the results meet the dairy free and the $30 budget constraints. I will continue to refine my searches so that I can find more products. +(input) +Reflection: +◄ +Feeling +lucky? +Conversion +report +Report +an issue +View original +on arXiv +► \ No newline at end of file diff --git a/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la.md b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la.md new file mode 100644 index 0000000000000000000000000000000000000000..25e1ac5f2aefc5ff394b5636c0cce4552436d1cd --- /dev/null +++ b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la.md @@ -0,0 +1,202 @@ +--- +title: '[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning + in Language Models' +id: 231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la +tags: +- deepread +created: '2026-06-10T00:39:54.848871Z' +source: https://arxiv.org/abs/2310.04406 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:39:54.848723Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models +Computer Science > Artificial Intelligence 
+arXiv:2310.04406 +(cs) +[Submitted on 6 Oct 2023 ( +v1 +), last revised 6 Jun 2024 (this version, v3)] +Title: +Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models +Authors: +Andy Zhou +, +Kai Yan +, +Michal Shlapentokh-Rothman +, +Haohan Wang +, +Yu-Xiong Wang +View a PDF of the paper titled Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models, by Andy Zhou and 4 other authors +View PDF +HTML (experimental) +Abstract: +While language models (LMs) have shown potential across a range of decision-making tasks, their reliance on simple acting processes limits their broad deployment as autonomous agents. In this paper, we introduce Language Agent Tree Search (LATS) -- the first general framework that synergizes the capabilities of LMs in reasoning, acting, and planning. By leveraging the in-context learning ability of LMs, we integrate Monte Carlo Tree Search into LATS to enable LMs as agents, along with LM-powered value functions and self-reflections for proficient exploration and enhanced decision-making. A key feature of our approach is the incorporation of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that surpasses the constraints of existing techniques. Our experimental evaluation across diverse domains, including programming, interactive question-answering (QA), web navigation, and math, validates the effectiveness and generality of LATS in decision-making while maintaining competitive or improved reasoning performance. Notably, LATS achieves state-of-the-art pass@1 accuracy (92.7%) for programming on HumanEval with GPT-4 and demonstrates gradient-free performance (average score of 75.9) comparable to gradient-based fine-tuning for web navigation on WebShop with GPT-3.5. 
Code can be found at +this https URL +Comments: +Code at +this https URL +Subjects: +Artificial Intelligence (cs.AI) +; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG) +Cite as: +arXiv:2310.04406 +[cs.AI] +(or +arXiv:2310.04406v3 +[cs.AI] +for this version) +https://doi.org/10.48550/arXiv.2310.04406 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Andy Zhou [ +view email +] +[v1] +Fri, 6 Oct 2023 17:55:11 UTC (371 KB) +[v2] +Tue, 5 Dec 2023 05:25:55 UTC (465 KB) +[v3] +Thu, 6 Jun 2024 02:51:17 UTC (960 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models, by Andy Zhou and 4 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.AI +< prev +| +next > +new +| +recent +| +2023-10 +Change to browse by: +cs +cs.CL +cs.CV +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? 
+) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/231006770-swe-bench-can-language-models-resolve-real-world-github-issues.md b/research/notes/231006770-swe-bench-can-language-models-resolve-real-world-github-issues.md new file mode 100644 index 0000000000000000000000000000000000000000..00485e0e2b28c9396efd9f7bfd2978c177e464af --- /dev/null +++ b/research/notes/231006770-swe-bench-can-language-models-resolve-real-world-github-issues.md @@ -0,0 +1,203 @@ +--- +title: '[2310.06770] SWE-bench: Can Language Models Resolve Real-World GitHub Issues?' +id: 231006770-swe-bench-can-language-models-resolve-real-world-github-issues +tags: +- deepread +created: '2026-06-10T00:23:35.577828Z' +source: https://arxiv.org/abs/2310.06770 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:23:35.577638Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2310.06770] SWE-bench: Can Language Models Resolve Real-World GitHub Issues? 
+Computer Science > Computation and Language +arXiv:2310.06770 +(cs) +[Submitted on 10 Oct 2023 ( +v1 +), last revised 11 Nov 2024 (this version, v3)] +Title: +SWE-bench: Can Language Models Resolve Real-World GitHub Issues? +Authors: +Carlos E. Jimenez +, +John Yang +, +Alexander Wettig +, +Shunyu Yao +, +Kexin Pei +, +Ofir Press +, +Karthik Narasimhan +View a PDF of the paper titled SWE-bench: Can Language Models Resolve Real-World GitHub Issues?, by Carlos E. Jimenez and 6 other authors +View PDF +Abstract: +Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering problems drawn from real GitHub issues and corresponding pull requests across $12$ popular Python repositories. Given a codebase along with a description of an issue to be resolved, a language model is tasked with editing the codebase to address the issue. Resolving issues in SWE-bench frequently requires understanding and coordinating changes across multiple functions, classes, and even files simultaneously, calling for models to interact with execution environments, process extremely long contexts and perform complex reasoning that goes far beyond traditional code generation tasks. Our evaluations show that both state-of-the-art proprietary models and our fine-tuned model SWE-Llama can resolve only the simplest issues. The best-performing model, Claude 2, is able to solve a mere $1.96$% of the issues. Advances on SWE-bench represent steps towards LMs that are more practical, intelligent, and autonomous. 
+Comments: +Data, code, and leaderboard are available at +this https URL +ICLR 2024, +this https URL +Subjects: +Computation and Language (cs.CL) +; Artificial Intelligence (cs.AI); Software Engineering (cs.SE) +Cite as: +arXiv:2310.06770 +[cs.CL] +(or +arXiv:2310.06770v3 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2310.06770 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Carlos E. Jimenez [ +view email +] +[v1] +Tue, 10 Oct 2023 16:47:29 UTC (2,003 KB) +[v2] +Fri, 5 Apr 2024 18:16:29 UTC (2,258 KB) +[v3] +Mon, 11 Nov 2024 23:05:04 UTC (2,398 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled SWE-bench: Can Language Models Resolve Real-World GitHub Issues?, by Carlos E. Jimenez and 6 other authors +View PDF +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2023-10 +Change to browse by: +cs +cs.AI +cs.SE +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? 
+) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/231108105-diloco-distributed-low-communication-training-of-language-models.md b/research/notes/231108105-diloco-distributed-low-communication-training-of-language-models.md new file mode 100644 index 0000000000000000000000000000000000000000..f19fdb9491233e227c9191c36b05347ed20ca256 --- /dev/null +++ b/research/notes/231108105-diloco-distributed-low-communication-training-of-language-models.md @@ -0,0 +1,208 @@ +--- +title: '[2311.08105] DiLoCo: Distributed Low-Communication Training of Language Models' +id: 231108105-diloco-distributed-low-communication-training-of-language-models +tags: +- deepread +created: '2026-06-10T00:30:20.411067Z' +source: https://arxiv.org/abs/2311.08105 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:30:20.410923Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2311.08105] DiLoCo: Distributed Low-Communication Training of Language Models +Computer Science > Machine Learning +arXiv:2311.08105 +(cs) +[Submitted on 14 
Nov 2023 ( +v1 +), last revised 23 Sep 2024 (this version, v3)] +Title: +DiLoCo: Distributed Low-Communication Training of Language Models +Authors: +Arthur Douillard +, +Qixuan Feng +, +Andrei A. Rusu +, +Rachita Chhaparia +, +Yani Donchev +, +Adhiguna Kuncoro +, +Marc'Aurelio Ranzato +, +Arthur Szlam +, +Jiajun Shen +View a PDF of the paper titled DiLoCo: Distributed Low-Communication Training of Language Models, by Arthur Douillard and 8 other authors +View PDF +HTML (experimental) +Abstract: +Large language models (LLM) have become a critical component in many applications of machine learning. However, standard approaches to training LLM require a large number of tightly interconnected accelerators, with devices exchanging gradients and other intermediate states at each optimization step. While it is difficult to build and maintain a single computing cluster hosting many accelerators, it might be easier to find several computing clusters each hosting a smaller number of devices. In this work, we propose a distributed optimization algorithm, Distributed Low-Communication (DiLoCo), that enables training of language models on islands of devices that are poorly connected. The approach is a variant of federated averaging, where the number of inner steps is large, the inner optimizer is AdamW, and the outer optimizer is Nesterov momentum. On the widely used C4 dataset, we show that DiLoCo on 8 workers performs as well as fully synchronous optimization while communicating 500 times less. DiLoCo exhibits great robustness to the data distribution of each worker. It is also robust to resources becoming unavailable over time, and vice versa, it can seamlessly leverage resources that become available during training. 
+Subjects: +Machine Learning (cs.LG) +; Computation and Language (cs.CL) +Cite as: +arXiv:2311.08105 +[cs.LG] +(or +arXiv:2311.08105v3 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2311.08105 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Arthur Douillard [ +view email +] +[v1] +Tue, 14 Nov 2023 12:05:45 UTC (1,609 KB) +[v2] +Sat, 2 Dec 2023 14:10:14 UTC (1,610 KB) +[v3] +Mon, 23 Sep 2024 10:41:27 UTC (1,610 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled DiLoCo: Distributed Low-Communication Training of Language Models, by Arthur Douillard and 8 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2023-11 +Change to browse by: +cs +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? 
+) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/231108516-llms-cannot-find-reasoning-errors-but-can-correct-them-given-the-error.md b/research/notes/231108516-llms-cannot-find-reasoning-errors-but-can-correct-them-given-the-error.md new file mode 100644 index 0000000000000000000000000000000000000000..4eb50797128480d47d6e8e18f198a2dda0b43986 --- /dev/null +++ b/research/notes/231108516-llms-cannot-find-reasoning-errors-but-can-correct-them-given-the-error.md @@ -0,0 +1,204 @@ +--- +title: '[2311.08516] LLMs cannot find reasoning errors, but can correct them given + the error location' +id: 231108516-llms-cannot-find-reasoning-errors-but-can-correct-them-given-the-error +tags: +- deepread +created: '2026-06-10T00:40:16.980357Z' +source: https://arxiv.org/abs/2311.08516 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:40:16.980220Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2311.08516] LLMs cannot find reasoning errors, but can correct them given the error location 
+Computer Science > Artificial Intelligence +arXiv:2311.08516 +(cs) +[Submitted on 14 Nov 2023 ( +v1 +), last revised 4 Jun 2024 (this version, v3)] +Title: +LLMs cannot find reasoning errors, but can correct them given the error location +Authors: +Gladys Tyen +, +Hassan Mansoor +, +Victor Cărbune +, +Peter Chen +, +Tony Mak +View a PDF of the paper titled LLMs cannot find reasoning errors, but can correct them given the error location, by Gladys Tyen and 4 other authors +View PDF +HTML (experimental) +Abstract: +While self-correction has shown promise in improving LLM outputs in terms of style and quality (e.g. Chen et al., 2023b; Madaan et al., 2023), recent attempts to self-correct logical or reasoning errors often cause correct answers to become incorrect, resulting in worse performances overall (Huang et al., 2023). In this paper, we show that poor self-correction performance stems from LLMs' inability to find logical mistakes, rather than their ability to correct a known mistake. Firstly, we benchmark several state-of-the-art LLMs on their mistake-finding ability and demonstrate that they generally struggle with the task, even in highly objective, unambiguous cases. Secondly, we test the correction abilities of LLMs -- separately from mistake finding -- using a backtracking setup that feeds ground truth mistake location information to the model. We show that this boosts downstream task performance across our 5 reasoning tasks, indicating that LLMs' correction abilities are robust. Finally, we show that it is possible to obtain mistake location information without ground truth labels or in-domain training data. We train a small classifier with out-of-domain data, which exhibits stronger mistake-finding performance than prompting a large model. We release our dataset of LLM-generated logical mistakes, BIG-Bench Mistake, to enable further research into locating LLM reasoning mistakes. 
+Comments: +ACL 2024 Findings +Subjects: +Artificial Intelligence (cs.AI) +; Computation and Language (cs.CL); Machine Learning (cs.LG) +Cite as: +arXiv:2311.08516 +[cs.AI] +(or +arXiv:2311.08516v3 +[cs.AI] +for this version) +https://doi.org/10.48550/arXiv.2311.08516 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Gladys Tyen [ +view email +] +[v1] +Tue, 14 Nov 2023 20:12:38 UTC (7,191 KB) +[v2] +Tue, 9 Jan 2024 03:32:32 UTC (7,191 KB) +[v3] +Tue, 4 Jun 2024 10:25:13 UTC (7,319 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled LLMs cannot find reasoning errors, but can correct them given the error location, by Gladys Tyen and 4 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.AI +< prev +| +next > +new +| +recent +| +2023-11 +Change to browse by: +cs +cs.CL +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? 
+) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/231209152-evaluating-augmented-reality-communication-how-can-we-teach-procedural.md b/research/notes/231209152-evaluating-augmented-reality-communication-how-can-we-teach-procedural.md new file mode 100644 index 0000000000000000000000000000000000000000..d97a39f0a95537904058cff394c144e2e58510f8 --- /dev/null +++ b/research/notes/231209152-evaluating-augmented-reality-communication-how-can-we-teach-procedural.md @@ -0,0 +1,203 @@ +--- +title: '[2312.09152] Evaluating Augmented Reality Communication: How Can We Teach + Procedural Skill in AR?' 
+id: 231209152-evaluating-augmented-reality-communication-how-can-we-teach-procedural +tags: +- deepread +created: '2026-06-10T00:40:10.692238Z' +source: https://arxiv.org/abs/2312.09152 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:40:10.692096Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2312.09152] Evaluating Augmented Reality Communication: How Can We Teach Procedural Skill in AR? +Computer Science > Human-Computer Interaction +arXiv:2312.09152 +(cs) +[Submitted on 14 Dec 2023] +Title: +Evaluating Augmented Reality Communication: How Can We Teach Procedural Skill in AR? +Authors: +Manuel Rebol +, +Krzysztof Pietroszek +, +Neal Sikka +, +Claudia Ranniger +, +Colton Hood +, +Adam Rutenberg +, +Puja Sasankan +, +Christian Gütl +View a PDF of the paper titled Evaluating Augmented Reality Communication: How Can We Teach Procedural Skill in AR?, by Manuel Rebol and 7 other authors +View PDF +HTML (experimental) +Abstract: +Augmented reality (AR) has great potential for use in healthcare applications, especially remote medical training and supervision. In this paper, we analyze the usage of an AR communication system to teach a medical procedure, the placement of a central venous catheter (CVC) under ultrasound guidance. We examine various AR communication and collaboration components, including gestural communication, volumetric information, annotations, augmented objects, and augmented screens. We compare how teaching in AR differs from teaching through videoconferencing-based communication. Our results include a detailed medical training steps analysis in which we compare how verbal and visual communication differs between video and AR training. We identify procedural steps in which medical experts give visual instructions utilizing AR components. We examine the change in AR usage and interaction over time and recognize patterns between users. 
Moreover, AR design recommendations are given based on post-training interviews. +Comments: +this https URL +Subjects: +Human-Computer Interaction (cs.HC) +Cite as: +arXiv:2312.09152 +[cs.HC] +(or +arXiv:2312.09152v1 +[cs.HC] +for this version) +https://doi.org/10.48550/arXiv.2312.09152 +Focus to learn more +arXiv-issued DOI via DataCite +Journal reference: +Proceedings of the 29th ACM Symposium on Virtual Reality Software and Technology (VRST 2023) +Related DOI +: +https://doi.org/10.1145/3611659.3615685 +Focus to learn more +DOI(s) linking to related resources +Submission history +From: Manuel Rebol [ +view email +] +[v1] +Thu, 14 Dec 2023 17:22:22 UTC (2,671 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Evaluating Augmented Reality Communication: How Can We Teach Procedural Skill in AR?, by Manuel Rebol and 7 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.HC +< prev +| +next > +new +| +recent +| +2023-12 +Change to browse by: +cs +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? 
+) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/240201817-llms-cant-plan-but-can-help-planning-in-llm-modulo-frameworks.md b/research/notes/240201817-llms-cant-plan-but-can-help-planning-in-llm-modulo-frameworks.md new file mode 100644 index 0000000000000000000000000000000000000000..ae4cc129135cc1ad5be27eb3cb3363ef79d0d658 --- /dev/null +++ b/research/notes/240201817-llms-cant-plan-but-can-help-planning-in-llm-modulo-frameworks.md @@ -0,0 +1,203 @@ +--- +title: '[2402.01817] LLMs Can''t Plan, But Can Help Planning in LLM-Modulo Frameworks' +id: 240201817-llms-cant-plan-but-can-help-planning-in-llm-modulo-frameworks +tags: +- deepread +created: '2026-06-10T00:40:16.134883Z' +source: https://arxiv.org/abs/2402.01817 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:40:16.134739Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2402.01817] LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks +Computer Science > Artificial Intelligence +arXiv:2402.01817 +(cs) +[Submitted on 2 Feb 2024 ( +v1 +), last revised 12 Jun 2024 (this version, v3)] +Title: +LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks +Authors: +Subbarao Kambhampati +, +Karthik Valmeekam +, +Lin Guan +, +Mudit Verma +, +Kaya Stechly +, +Siddhant Bhambri +, +Lucas Saldyt +, +Anil Murthy +View a PDF of the paper titled LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks, by Subbarao Kambhampati and 7 other authors +View PDF +HTML (experimental) +Abstract: +There is considerable confusion about the role of Large Language Models (LLMs) in planning and reasoning tasks. On one side are over-optimistic claims that LLMs can indeed do these tasks with just the right prompting or self-verification strategies. 
On the other side are perhaps over-pessimistic claims that all that LLMs are good for in planning/reasoning tasks are as mere translators of the problem specification from one syntactic format to another, and ship the problem off to external symbolic solvers. In this position paper, we take the view that both these extremes are misguided. We argue that auto-regressive LLMs cannot, by themselves, do planning or self-verification (which is after all a form of reasoning), and shed some light on the reasons for misunderstandings in the literature. We will also argue that LLMs should be viewed as universal approximate knowledge sources that have much more meaningful roles to play in planning/reasoning tasks beyond simple front-end/back-end format translators. We present a vision of **LLM-Modulo Frameworks** that combine the strengths of LLMs with external model-based verifiers in a tighter bi-directional interaction regime. We will show how the models driving the external verifiers themselves can be acquired with the help of LLMs. We will also argue that rather than simply pipelining LLMs and symbolic components, this LLM-Modulo Framework provides a better neuro-symbolic approach that offers tighter integration between LLMs and symbolic components, and allows extending the scope of model-based planning/reasoning regimes towards more flexible knowledge, problem and preference specifications. +Subjects: +Artificial Intelligence (cs.AI) +; Machine Learning (cs.LG) +Cite as: +arXiv:2402.01817 +[cs.AI] +(or +arXiv:2402.01817v3 +[cs.AI] +for this version) +https://doi.org/10.48550/arXiv.2402.01817 +Focus to learn more +arXiv-issued DOI via DataCite +Journal reference: +Proceedings of the 41st International Conference on Machine Learning, Vienna, Austria.
PMLR 235, 2024 +Submission history +From: Subbarao Kambhampati [ +view email +] +[v1] +Fri, 2 Feb 2024 14:43:18 UTC (4,551 KB) +[v2] +Tue, 6 Feb 2024 01:29:37 UTC (4,552 KB) +[v3] +Wed, 12 Jun 2024 01:13:11 UTC (6,405 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks, by Subbarao Kambhampati and 7 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.AI +< prev +| +next > +new +| +recent +| +2024-02 +Change to browse by: +cs +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? 
+) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/240203300-deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-lang.md b/research/notes/240203300-deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-lang.md new file mode 100644 index 0000000000000000000000000000000000000000..4efc88d1f34d445152e0fc496d8c2e6fbf97158c --- /dev/null +++ b/research/notes/240203300-deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-lang.md @@ -0,0 +1,214 @@ +--- +title: '[2402.03300] DeepSeekMath: Pushing the Limits of Mathematical Reasoning in + Open Language Models' +id: 240203300-deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-lang +tags: +- deepread +created: '2026-06-09T23:28:29.232007Z' +source: https://arxiv.org/abs/2402.03300 +source_domain: arxiv.org +fetched_at: '2026-06-09T23:28:29.231807Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2402.03300] DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models +Computer Science > Computation and Language +arXiv:2402.03300 +(cs) +[Submitted on 5 Feb 2024 ( +v1 +), last revised 27 Apr 2024 (this version, v3)] +Title: +DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models +Authors: 
+Zhihong Shao +, +Peiyi Wang +, +Qihao Zhu +, +Runxin Xu +, +Junxiao Song +, +Xiao Bi +, +Haowei Zhang +, +Mingchuan Zhang +, +Y.K. Li +, +Y. Wu +, +Daya Guo +View a PDF of the paper titled DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models, by Zhihong Shao and 10 other authors +View PDF +HTML (experimental) +Abstract: +Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline. Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO. 
+Subjects: +Computation and Language (cs.CL) +; Artificial Intelligence (cs.AI); Machine Learning (cs.LG) +Cite as: +arXiv:2402.03300 +[cs.CL] +(or +arXiv:2402.03300v3 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2402.03300 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Zhihong Shao [ +view email +] +[v1] +Mon, 5 Feb 2024 18:55:32 UTC (3,417 KB) +[v2] +Tue, 6 Feb 2024 18:39:38 UTC (3,417 KB) +[v3] +Sat, 27 Apr 2024 15:25:53 UTC (3,417 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models, by Zhihong Shao and 10 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2024-02 +Change to browse by: +cs +cs.AI +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? 
+) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/240411018-many-shot-in-context-learning.md b/research/notes/240411018-many-shot-in-context-learning.md new file mode 100644 index 0000000000000000000000000000000000000000..fa692e1af0f5c28a7c33033d651709c8da2795d4 --- /dev/null +++ b/research/notes/240411018-many-shot-in-context-learning.md @@ -0,0 +1,228 @@ +--- +title: '[2404.11018] Many-Shot In-Context Learning' +id: 240411018-many-shot-in-context-learning +tags: +- deepread +created: '2026-06-10T00:40:15.011649Z' +source: https://arxiv.org/abs/2404.11018 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:40:15.011513Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2404.11018] Many-Shot In-Context Learning +Computer Science > Machine Learning +arXiv:2404.11018 +(cs) +[Submitted on 17 Apr 2024 ( +v1 +), last revised 17 Oct 2024 (this version, v3)] +Title: +Many-Shot In-Context Learning +Authors: +Rishabh Agarwal +, +Avi Singh +, +Lei M. 
Zhang +, +Bernd Bohnet +, +Luis Rosias +, +Stephanie Chan +, +Biao Zhang +, +Ankesh Anand +, +Zaheer Abbas +, +Azade Nova +, +John D. Co-Reyes +, +Eric Chu +, +Feryal Behbahani +, +Aleksandra Faust +, +Hugo Larochelle +View a PDF of the paper titled Many-Shot In-Context Learning, by Rishabh Agarwal and 13 other authors +View PDF +HTML (experimental) +Abstract: +Large language models (LLMs) excel at few-shot in-context learning (ICL) -- learning from a few examples provided in context at inference, without any weight updates. Newly expanded context windows allow us to investigate ICL with hundreds or thousands of examples -- the many-shot regime. Going from few-shot to many-shot, we observe significant performance gains across a wide variety of generative and discriminative tasks. While promising, many-shot ICL can be bottlenecked by the available amount of human-generated examples. To mitigate this limitation, we explore two new settings: Reinforced and Unsupervised ICL. Reinforced ICL uses model-generated chain-of-thought rationales in place of human examples. Unsupervised ICL removes rationales from the prompt altogether, and prompts the model only with domain-specific questions. We find that both Reinforced and Unsupervised ICL can be quite effective in the many-shot regime, particularly on complex reasoning tasks. Finally, we demonstrate that, unlike few-shot learning, many-shot learning is effective at overriding pretraining biases, can learn high-dimensional functions with numerical inputs, and performs comparably to fine-tuning. We also find that inference cost increases linearly in the many-shot regime, and frontier LLMs benefit from many-shot ICL to varying degrees. Our analysis also reveals the limitations of next-token prediction loss as an indicator of downstream ICL performance. 
+Comments: +NeurIPS (Spotlight) +Subjects: +Machine Learning (cs.LG) +; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) +Cite as: +arXiv:2404.11018 +[cs.LG] +(or +arXiv:2404.11018v3 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2404.11018 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Rishabh Agarwal [ +view email +] +[v1] +Wed, 17 Apr 2024 02:49:26 UTC (327 KB) +[v2] +Wed, 22 May 2024 17:06:10 UTC (370 KB) +[v3] +Thu, 17 Oct 2024 17:45:09 UTC (414 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Many-Shot In-Context Learning, by Rishabh Agarwal and 13 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2024-04 +Change to browse by: +cs +cs.AI +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? 
+) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/240612543-phase-controlled-heat-modulation-with-aharonov-bohm-interferometers.md b/research/notes/240612543-phase-controlled-heat-modulation-with-aharonov-bohm-interferometers.md new file mode 100644 index 0000000000000000000000000000000000000000..8e3fe7b47c78c3e1400c5c5cc674a318a4575a07 --- /dev/null +++ b/research/notes/240612543-phase-controlled-heat-modulation-with-aharonov-bohm-interferometers.md @@ -0,0 +1,197 @@ +--- +title: '[2406.12543] Phase-controlled heat modulation with Aharonov-Bohm interferometers' +id: 240612543-phase-controlled-heat-modulation-with-aharonov-bohm-interferometers +tags: +- deepread +created: '2026-06-10T00:40:09.876451Z' +source: https://arxiv.org/abs/2406.12543 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:40:09.876309Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2406.12543] Phase-controlled heat modulation with Aharonov-Bohm 
interferometers +Condensed Matter > Mesoscale and Nanoscale Physics +arXiv:2406.12543 +(cond-mat) +[Submitted on 18 Jun 2024] +Title: +Phase-controlled heat modulation with Aharonov-Bohm interferometers +Authors: +Sun-Yong Hwang +, +Björn Sothmann +, +Rosa López +View a PDF of the paper titled Phase-controlled heat modulation with Aharonov-Bohm interferometers, by Sun-Yong Hwang and 2 other authors +View PDF +HTML (experimental) +Abstract: +A heat modulator is proposed based on a voltage-biased Aharonov-Bohm interferometer. Once an electrical bias is applied, Peltier effects give rise to a flow of heat that can be modulated by a magnetic flux. We determine the corresponding temperature changes using a simple thermal model. Our calculations demonstrate that the modulated temperature difference can be as large as 80 mK at base temperature about 600 mK with relative temperature variations reaching 10\%. Our model also predicts, quite generally, the emergence of spin-polarized heat flows without any ferromagnetic contacts, if Rashba spin-orbit interaction is combined with the applied magnetic flux, which potentially paves the way towards caloritronic information processing. +Comments: +8 pages, 4 figures +Subjects: +Mesoscale and Nanoscale Physics (cond-mat.mes-hall) +Cite as: +arXiv:2406.12543 +[cond-mat.mes-hall] +(or +arXiv:2406.12543v1 +[cond-mat.mes-hall] +for this version) +https://doi.org/10.48550/arXiv.2406.12543 +Focus to learn more +arXiv-issued DOI via DataCite +Journal reference: +Phys. Rev. 
Research 6, 013215 (2024) +Related DOI +: +https://doi.org/10.1103/PhysRevResearch.6.013215 +Focus to learn more +DOI(s) linking to related resources +Submission history +From: Sun-Yong Hwang [ +view email +] +[v1] +Tue, 18 Jun 2024 12:22:44 UTC (1,894 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Phase-controlled heat modulation with Aharonov-Bohm interferometers, by Sun-Yong Hwang and 2 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cond-mat.mes-hall +< prev +| +next > +new +| +recent +| +2024-06 +Change to browse by: +cond-mat +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? 
+) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers-2.md b/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers-2.md new file mode 100644 index 0000000000000000000000000000000000000000..f8f39064136b250b4ddafbd13e3aaace2f447b7c --- /dev/null +++ b/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers-2.md @@ -0,0 +1,2384 @@ +--- +title: '[2408.06195] Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers' +id: 240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers-2 +tags: +- deepread +created: '2026-06-10T00:40:45.751799Z' +source: https://ar5iv.labs.arxiv.org/html/2408.06195 +source_domain: ar5iv.labs.arxiv.org +fetched_at: '2026-06-10T00:40:45.751621Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2408.06195] Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers +Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers +Zhenting Qi +∗‡† +Mingyuan Ma +∗‡† +Jiahang Xu +∗‡ +Li Lyna Zhang +‡⋄ +Fan Yang +‡ +Mao Yang +‡ +‡ +Microsoft Research Asia +† +Harvard University +Abstract +This paper introduces rStar, a self-play mutual reasoning approach that significantly 
improves reasoning capabilities of small language models (SLMs) without fine-tuning or superior models. rStar decouples reasoning into a self-play mutual generation-discrimination process. First, a target SLM augments the Monte Carlo Tree Search (MCTS) with +a rich set of human-like reasoning actions +to construct higher quality reasoning trajectories. Next, another SLM, with capabilities similar to the target SLM, acts as a discriminator to verify each trajectory generated by the target SLM. The mutually agreed reasoning trajectories are considered +mutually consistent +, and thus are more likely to be correct. Extensive experiments across five SLMs demonstrate rStar can effectively solve diverse reasoning problems, including GSM8K, GSM-Hard, MATH, SVAMP, and StrategyQA. Remarkably, rStar boosts GSM8K accuracy from 12.51% to 63.91% for LLaMA2-7B, from 36.46% to 81.88% for Mistral-7B, from 74.53% to 91.13% for LLaMA3-8B-Instruct. Code will be available +here +. +$*$ Equal contribution. Zhenting Qi and Mingyuan Ma did the work during an internship at MSRA. +$\diamond$ Corresponding author: lzhani@microsoft.com +1 +Introduction +Despite their success, large language models (LLMs) face significant challenges in complex reasoning +(Valmeekam et al., +2022 +; Weng et al., +2023 +) +. For example, state-of-the-art models like Mistral-7B +(Jiang et al., +2023 +) +can only achieve 36.5% accuracy on the GSM8K dataset, even with techniques like Chain-of-Thought (CoT) +(Wei et al., +2022 +) +. Although fine-tuning is shown to be an effective way to improve reasoning capability, most LLMs rely on fine-tuning data distilled or synthesized by +superior +models like GPT-4 +(Wang et al., +2024a +; Gou et al., +2023 +) +. Meanwhile, the community has been actively working on a complementary and yet more challenging approach: Reasoning improvements +without +a superior teacher LLM.
+Figure 1: +With 32 rounds of inference, rStar makes SLMs highly capable problem-solvers, matching or even surpassing the reasoning performance achieved after domain-specialized SFT. +A promising paradigm to improve reasoning without superior models is to leverage the knowledge within LLMs themselves +(Wang et al., +2023 +; Hao et al., +2023 +; Madaan et al., +2024 +) +. For example, RAP +(Hao et al., +2023 +) +adopts a self-exploration solution to iteratively improve an LLM’s reasoning performance through self-rewarded feedback. Unfortunately, studies suggest that this paradigm often suffers from two fundamental issues. +First, LLMs often struggle to effectively explore the solution space during reasoning. The self-exploration is often trapped in a solution space with low-quality reasoning steps even after many attempts. For example, our experiments reveal that after 32 rounds of self-exploration with RAP +(Hao et al., +2023 +) +, only 24% of the trajectories generated by LLaMA2-7B on GSM8K are correct. +Second, even if the self-exploration can find high-quality reasoning steps, it is difficult for SLMs to tell which reasoning steps are of higher quality or determine which final answers are correct, and thus it is hard to effectively guide the self-exploration. Our study shows that a naïve reward-based self-exploration guidance can lead to results no better than random guesses (see Appendix +A.1 +). +A more troublesome fact is that the above two issues are more pronounced in the smaller versions of LLMs, i.e., SLMs, due to their weaker capabilities. For instance, while GPT-4 can improve by self-refining its output +(Madaan et al., +2024 +; Wu et al., +2024 +; Zhou et al., +2024 +) +, these approaches are less effective in SLMs and may even lead to worse performance +(Forsman, +2024 +) +. This significantly hinders the adoption of neural language models.
+This paper introduces Self-play muTuAl Reasoning (rStar), a novel approach that boosts SLMs’ reasoning capability during inference without fine-tuning or superior models. To address the aforementioned challenges, rStar decouples reasoning into a self-play mutual generation-discrimination process as illustrated in Fig. +2 +. +Specifically, rStar is unique in the following respects. First, although relying on a conventional Monte Carlo Tree Search (MCTS) for SLMs to self-generate reasoning steps, rStar advocates +a richer set of reasoning actions +in the self-exploration. The newly proposed actions simulate human reasoning behaviors given the current reasoning state, such as decomposing and searching for a specific reasoning step, proposing a new sub-question, or rephrasing the given question. This enables SLMs to generate high-quality candidate reasoning trajectories during self-exploration. +Second, to effectively guide the exploration among the generated reasoning trajectories, rStar augments the MCTS process with a new discrimination process called +mutual consistency +. In particular, rStar employs a second SLM with similar capability, acting as a discriminator to provide unsupervised feedback on each candidate reasoning trajectory generated by MCTS. To improve the accuracy of the feedback, rStar hints the second SLM with sampled partial reasoning trajectories, asking it to complete the remaining reasoning steps. rStar then deems the mutually agreed reasoning trajectories to be of higher quality. Mutual consistency mirrors the common human practice in the absence of supervision, where agreement among peers (i.e., two SLMs) on derived answers suggests a higher likelihood of correctness.
+As a result, mutual consistency offers more effective reasoning across diverse tasks than other approaches like self-consistency +(Wang et al., +2023 +) +and avoids the risk of overfitting when training a reward model +(Chen et al., +2024a +; Wang et al., +2024b +) +. +Figure 2: +Our self-play mutual reasoning is a generation-discrimination process: (1) a self-generator augments the target SLM to generate candidate reasoning trajectories using MCTS; (2) the discriminator uses another SLM to provide unsupervised feedback on each trajectory based on partial hints; (3) based on this feedback, the target SLM decides a final reasoning trajectory as the solution. +Extensive experiments across five SLMs and five diverse reasoning tasks demonstrate the effectiveness of rStar. With just 32 rounds of MCTS inference, rStar significantly enhances SLMs’ reasoning capabilities, matching or even surpassing the accuracy achieved after fine-tuning. For example, rStar boosts GSM8K accuracy from 12.51% to 63.91% for LLaMA2-7B, from 36.46% to 81.88% for Mistral, and from 47.23% to 85.52% for LLaMA3-8B. Furthermore, we conduct comprehensive experiments to verify rStar’s superiority over state-of-the-art baselines, including single-round inference techniques like few-shot CoT, multi-round prompting approaches such as self-consistency, and self-improvement techniques such as RAP, ToT, self-evaluation and self-verification. +2 +Related Work +Prompting Language Models to Reason +. +Prompting-based methods, such as Chain-of-Thought +(Wei et al., +2022 +) +, focus on designing instructions and pipelines to enhance LLMs’ reasoning performance during inference. Recent advances include planning +(Hao et al., +2023 +; Ding et al., +2023 +) +, problem decomposition +(Zhou et al., +2022 +; Khot et al., +2022 +; Hao et al., +2023 +) +, abstraction +(Zheng et al., +2023 +) +, programming +(Chen et al., +2022 +; Zhou et al., +2023 +) +. 
+These methods aim to improve single-round inference performance and are orthogonal to ours. +LLM Self-improvement +. Recently, research on the self-improvement of LLMs has rapidly increased. +Fine-tuning based methods +(Chen et al., +2024b +; +2024a +) +leverage the capabilities of a well-pretrained LLM to synthesize data and progressively enhance its performance. Advanced prompting techniques, such as self-verification +(Gero et al., +2023 +; Zhou et al., +2023 +) +, and RAP +(Hao et al., +2023 +) +, improve performance through iterative self-exploring based on self-diagnosed feedback at inference time. +However, as illustrated in the previous section, the achieved performance often depends on the LLM’s inherent capabilities, and for SLMs, their weaker instruction-following ability and unreliable self-rewarding can mislead self-improvement. +Sampling Reasoning Paths +. Recent works +(Brown et al., +2024 +; Li et al., +2024 +; Snell et al., +2024 +) +on mathematical reasoning have shown that sampling diverse reasoning paths can significantly enhance performance compared to greedy one-time decoding. Self-Consistency +(Wang et al., +2023 +) +samples a complete CoT path each time. Tree-search approaches +(Yao et al., +2024 +; Hao et al., +2023 +; Zhang et al., +2024 +) +, like MCTS, further improve the performance by breaking down tasks and sampling simpler, individual intermediate reasoning steps. However, most approaches have limited action spaces. For example, RAP +(Hao et al., +2023 +) +decomposes only subproblems, while AlphaMath +(Chen et al., +2024a +) +searches only for one CoT step, limiting effectiveness in generating better trajectories. +Answer Verification +. To select correct reasoning trajectories, majority voting +(Wang et al., +2023 +) +is a widely-used approach.
To improve accuracy, some works train value or reward models for verification +(Wang et al., +2024b +; Chen et al., +2024a +) +, but these require additional annotations and risk overfitting to specific tasks. Self-verification +(Weng et al., +2023 +) +leverages LLM capabilities for backward self-verification. Nevertheless, its effectiveness hinges on its inherent ability to reason effectively. Recent studies have shown that LLMs struggle to evaluate themselves and rectify their initial responses without any external feedback +(Huang et al., +2023 +; Feng et al., +2023 +) +. +3 +Methodology +3.1 +Overview +Problem Formulation +. To solve a reasoning problem by SLMs, we formulate it as a multi-step reasoning generation task, which breaks the problem into simpler sub-tasks. This is more effective than traditional CoT-based reasoning +(Wei et al., +2022 +; Wang et al., +2023 +) +, as it is much easier for SLMs to correctly generate one step than complete reasoning steps in a single inference. We leverage the Monte-Carlo Tree Search (MCTS) algorithm +(Kocsis & Szepesvári, +2006 +) +to augment the target SLM for self-generating multi-step reasoning solutions. +Formally, for a given problem $x$ and a target SLM $M$, the MCTS augments $M$ to incrementally build a search tree $\mathcal{T}$. As illustrated in Fig. +3 +, the root node represents the question $x$, an edge represents an action $a$, and each child node is an intermediate step $s$ generated by $M$ under the corresponding action. A path from the root node to a leaf node (denoted as $s_{d}$, also called a terminal node) constitutes a candidate solution trajectory $\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{d}$.
From the search tree $\mathcal{T}$, we can extract a set of solution trajectories $\mathbb{T}=\{\mathbf{t}^{1},\mathbf{t}^{2},...,\mathbf{t}^{n}\}(n\geq 1)$. Our goal is to find the trajectories that can achieve the correct answer for the given question. +Challenges in SLM Self-Improvement +. MCTS allows an SLM to explore and evaluate multiple potential solutions. Ideally, by balancing exploration of new possibilities with the exploitation of high-reward actions, the SLM can gradually refine its reasoning steps to generate a final correct reasoning trajectory. However, due to the limited capabilities of SLMs, traditional MCTS yields minimal improvement. First, the vast solution space makes it challenging for SLMs to generate effective solutions. Existing MCTS-based methods +(Hao et al., +2023 +; Kang et al., +2024 +) +that use single actions limit diversity and struggle to generalize across tasks. Approaches like self-consistency +(Wang et al., +2023 +) +use random sampling to ensure diversity, but SLMs often produce poor-quality solutions, requiring many attempts to find a correct solution, thereby increasing inference costs. +Second, it’s challenging to accurately reward each action. Without ground truth labels, it’s difficult to verify the correctness of each intermediate step $s_{i}$ and the final answer in $s_{d}$. Majority voting in self-consistency requires most traces to be correct, which is often not the case for SLMs. Methods like RAP +(Hao et al., +2023 +) +use self-rewarding, but our study shows SLMs perform near-random self-rewarding (Appendix +A.1 +). Training a reward model, as in M$^{*}$ +(Kang et al., +2024 +) +, can address this challenge but faces difficulties in collecting training data and generalizing across various tasks. +Overview +.
+To address these challenges, this section introduces our methodology, rStar, which decomposes reasoning into solution generation and mutual verification in Fig. +2 +. To tackle the first challenge, we introduce a richer set of human-like reasoning actions that allows for thorough space exploration across diverse reasoning tasks. To address the second challenge, we design an SLM-tailored reward function to evaluate intermediate steps, avoiding reliance on their often unreliable self-evaluations. Moreover, we use another SLM as a discriminator to augment the MCTS process, mutually verifying the correctness of each trajectory with the generator SLM. +3.2 +Self-generating Reasoning Trajectory with MCTS Rollout +Figure 3: +An example to illustrate the process of self-generator. Highlighted nodes from top to bottom constitute a complete reasoning trace. +Given a question, MCTS augments the target SLM to explore a rich, human-like reasoning action space and generate the next steps based on the current state. +A Rich Set of Human-like Reasoning Actions +. At the core of MCTS generation lies the action space, which defines the scope of tree exploration. Most MCTS-based methods use a single action type to build the tree. For instance, in RAP, the action is to propose the next sub-question, whereas in AlphaMath +(Chen et al., +2024a +) +and MindStar +(Kang et al., +2024 +) +, the action is to generate the next reasoning step. +However, relying on a single action type can easily lead to ineffective space exploration. +To address this, we revisit how humans approach reasoning. +Different people solve problems using diverse actions: some break into sub-questions, others solve it directly, and some might rephrase the problem to focus on key conditions. Moreover, people adjust their approach based on current states, choosing different actions as needed. 
Inspired by this human reasoning process, we introduce a richer set of 5 actions to maximize the SLM’s potential for correctly solving complex reasoning problems. +$\diamond$ A1: Propose a one-step thought. This action prompts the LLM to generate the next one-step thought for a given question, by considering the existing reasoning steps. Unlike the CoT, which generates complete thoughts, this approach simplifies the reasoning process and allows the LLM to perform better decision making +(Yao et al., +2024 +; Besta et al., +2024 +) +. +$\diamond$ A2: Propose the remaining thought steps. +Instead of generating only a one-step thought per state, this action aligns with standard CoT, enabling “fast thinking” to solve simple questions in fewer steps. Given the already generated reasoning steps, it prompts the LLM to directly produce the remaining steps until reaching the final answer. +$\diamond$ A3: Propose next sub-question along with its answer. +This action is inspired by +least-to-most prompting +(Zhou et al., +2022 +) +, which breaks down a complex problem into a series of simpler sub-questions and solves them sequentially. Following RAP’s implementation, we prompt the LLM to ask and then answer the next sub-question. +$\diamond$ A4: Answer the sub-question again. +Considering that a sub-question might not be answered correctly by A3, we propose this action to re-answer it. To improve accuracy, this action prompts the LLM to use few-shot CoT. Note that the original answer generated by A3 did not use a CoT-like prompt but instead followed the least-to-most problem decomposition prompt +(Zhou et al., +2022 +) +. +$\diamond$ A5: Rephrase the question/sub-question. +When analyzing incorrect cases, we found that many of them are due to the LLM misunderstanding the question. For example, it might miss a specific condition provided in the question. Therefore, we propose a new action to rephrase the question more simply.
Specifically, we prompt the LLM to clearly list all conditions given in the problem statement. +Table 1: +Ablation study on the effectiveness of our rich action space: we evaluate LLaMA3-8B on 200 sampled GSM8K questions. + +| Action Space | Accuracy | +| --- | --- | +| $A_{3}$ (i.e., RAP) | 70.5 | +| $A_{3}$ + $A_{5}$ | 72.5 | +| $A_{3}$ + $A_{4}$ + $A_{5}$ | 73.5 | +| $A_{2}$ + $A_{3}$ + $A_{4}$ + $A_{5}$ | 74.0 | +| All ($A_{1}$ + $A_{2}$ + $A_{3}$ + $A_{4}$ + $A_{5}$) | 75.0 | + +The above 5 actions define a highly diverse action space $\{A_{1},A_{2},A_{3},A_{4},A_{5}\}$. +At each step $i$, MCTS selects an action $a_{i}$ from this space. We then use this action $a_{i}$ to prompt the LLM to generate the next reasoning step $s_{i}$, based on the current state, which is the previously generated trajectory $x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{i-1}$. Note that certain actions are subject to ordering constraints. For example, A4 can only happen after A3, and A5 can only happen after the root question. As shown in Table +1 +, each action plays a crucial role in improving the final reasoning accuracy. +Reward Function. +Another critical component in MCTS is the reward function, which evaluates the value of each action and directs the tree expansion. +We design a simple yet effective reward function for SLMs.
First, we exclude self-rewarding techniques for any intermediate nodes due to the limited capabilities of SLMs. Second, to ensure generalization across different reasoning tasks, we avoid introducing external supervision (e.g., tools or trained value models). Our approach draws inspiration from AlphaGo +(Silver et al., +2017 +) +, where we score each intermediate node based on its contribution to the final correct answer. Consequently, actions that frequently lead to correct answers receive higher rewards, making them more likely to be selected in future MCTS tree expansions. +We define $Q(s,a)$ as the reward value for node $s$ generated under action $a$. +Initially, all unexplored nodes are assigned $Q(s_{i},a_{i})=0$, leading to random tree expansions. Upon reaching the first terminal node $n_{d}$, we compute a reward score $Q(s_{d},a_{d})$ based on whether it reaches the correct answer. +This score is then back-propagated to each intermediate node along the trajectory $\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{d}$. Specifically, for each $s_{i}$ (for $i=1,2,...,d-1$), its $Q$ value is updated as follows: $Q(s_{i},a_{i})=Q(s_{i},a_{i})+Q(s_{d},a_{d})$. To compute $Q(s_{d},a_{d})$ for the terminal node, we use the likelihood (confidence) of self-consistency majority voting as the reward value.
+Figure 4: +The prompt example for mutual reasoning consistency. +Solution Generation with MCTS Rollout +. We now describe how our MCTS generates candidate reasoning trajectories. Starting from the initial root node +$s_{0}$ +, we perform multiple searches consisting of +selection +, +expansion +, +simulations +and +back-propagation +. Specifically, the simulation is performed using the default +rollout +policy, and to achieve more accurate reward estimation, we perform multiple rollouts. To balance the exploration and exploitation, we use the well-known Upper Confidence Bounds applied to Trees (UCT) +(Kocsis & Szepesvári, +2006 +) +to select each node. This selection process is mathematically represented as: +$$\text{UCT}(s,a)=\frac{Q(s,a)}{N(s,a)}+c\sqrt{\frac{\ln N_{parent}(s)}{N(s,a)}}.$$ +where +$N(s,a)$ +is the number of times node +$s$ +has been visited in previous iterations, and +$N_{parent}(s)$ +represents the visiting count of the parent node of +$s$ +. +$Q(s,a)$ +is the estimated reward value and will be updated through back-propagation. +$c$ +is a constant that balances exploitation and exploration. +Once the search reaches a terminal node, either a terminal state or a predetermined maximum tree depth +$d$ +, we obtain a trajectory from the root to terminal node. We collect all trajectories from the rollout iterations as candidate solutions. The next section explains how we verify each of them. 
+3.3 +Reasoning Trajectory Selection with Mutual Consistency +In traditional MCTS, typically only one trajectory is selected as the final solution based on a specific metric, such as choosing the path with the highest reward from the rollout iterations. Unfortunately, after trying various existing methods, we found it challenging to define a single metric that reliably selects the trajectory containing the correct answer. +Therefore, we collect all trajectories and propose mutual reasoning consistency for answer selection. +Mutual Reasoning Consistency by Discriminator SLM +2 +. As shown in Fig. +2 +, in addition to the target SLM +M +𝑀 +M +, we introduce another SLM +M +^ +^ +𝑀 +\hat{M} +to serve as a discriminator, providing external unsupervised feedback for each candidate trajectory. +Specifically, for +𝐭 += +x +⊕ +s +1 +⊕ +s +2 +⊕ +… +⊕ +s +d +𝐭 +direct-sum +𝑥 +subscript +𝑠 +1 +subscript +𝑠 +2 +… +subscript +𝑠 +𝑑 +\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{d} +, we mask the reasoning steps starting from a randomly sampled step +i +𝑖 +i +( +i +< +d +𝑖 +𝑑 +i". When the original question is answerable, please start the subquestion with "Now we can answer the question: ". +Question 1: Four years ago, Kody was only half as old as Mohamed. If Mohamed is currently twice as 30 years old, how old is Kody? +Question 1.1: How old is Mohamed currently? +Answer 1.1: Mohamed is twice as old as 30 years, which means he is 30 * 2 = 60 years old. +Question 1.2: What was Kody’s age four years ago, given that it was half of Mohamed’s age at that time? +Answer 1.2: Four years ago, Mohamed was 60 - 4 = 56 years old, so Kody was half of that, which is 56 / 2 = 28 years old. +Question 1.3: Now we can answer the question: How old is Kody? +Answer 1.3: Kody is currently 28 + 4 = 32 years old. The answer is 32. +Question 2: On a moonless night, three fireflies danced in the evening breeze. 
They were joined by four less than a dozen more fireflies before two of the fireflies flew away. How many fireflies remained? +Question 2.1: How many fireflies joined? +Answer 2.1: The fireflies were joined by four less than a dozen more fireflies, which are 12 - 4 = 8 fireflies. The answer is 8. +Question 2.2: Now we can answer the question: How many fireflies remained? +Answer 2.2: Three fireflies were dancing originally. They were joined by 8 fireflies before two of them flew away. So there were 3 + 8 - 2 = 9 remaining. The answer is 9. +Question 3: Ali has four $10 bills and six $20 bills that he saved after working for Mr. James on his farm. Ali gives her sister half of the total money he has and uses 3/5 of the remaining amount of money to buy dinner. Calculate the amount of money he has after buying the dinner. +Question 3.1: How much money does Ali have after giving half of his total money to his sister? +Answer 3.1: Ali initially has four $10 bills and six $20 bills, totaling 4 * 10 + 6 * 20 = 160 dollars. Giving half of this to his sister leaves him with 160 / 2 = 80 dollars. The answer is 80. +Question 3.2: How much money does Ali spend on dinner? +Answer 3.2: Ali uses 3/5 of his remaining money, which is 80 dollars, to buy dinner. Therefore, he spends 80 * 3/5 = 48 dollars on dinner. The answer is 48. +Question 3.3: Now we can answer the question: How much money does Ali have after buying the dinner? +Answer 3.3: After buying the dinner, Ali has 80 - 48 = 32 dollars left. The answer is 32. +Question 4: A car is driving through a tunnel with many turns. After a while, the car must travel through a ring that requires a total of 4 right-hand turns. After the 1st turn, it travels 5 meters. After the 2nd turn, it travels 8 meters. After the 3rd turn, it travels a little further and at the 4th turn, it immediately exits the tunnel. If the car has driven a total of 23 meters around the ring, how far did it have to travel after the 3rd turn? 
+Question 4.1: How far did the car travel except for the 3rd turn? +Answer 4.1: It travels 5 meters after the 1st, 8 meters after the 2nd, and 0 meters after the 4th turn. It’s a total of 5 + 8 + 0 = 13 meters. The answer is 13. +Question 4.2: Now we can answer the question: How far did the car have to travel after the 3rd turn? +Answer 4.2: The car has driven a total of 23 meters around the ring. It travels 13 meters except for the 3rd turn. So it has to travel 23 - 13 = 10 meters after the 3rd turn. The answer is 10. +Question 5: {user question} +Action 5: Rephrase the question/sub-question. +You are an AI assistant to help me rephrase questions by splitting the question context into conditions. In your rephrased question, remember to fully express the information in the original question. +Original Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left? +Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Olivia starts with $23. Condition 2: She buys five bagels, each costing $3. Question: How much money does Olivia have remaining after her purchase? +Original Question: Michael had 58 golf balls. On Tuesday, he lost 23 golf balls. On Wednesday, he lost 2 more. How many golf balls did he have at the end of Wednesday? +Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Michael initially has 58 golf balls. Condition 2: On Tuesday, he loses 23 golf balls. Condition 3: On Wednesday, he loses 2 additional golf balls. Question: What is the total number of golf balls Michael has left at the end of Wednesday? +Original Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. 
If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day? +Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Angelo and Melanie need to study 2 textbook chapters and 4 worksheets. Condition 2: They allocate 3 hours per textbook chapter and 1.5 hours per worksheet. Condition 3: Their daily study limit is 4 hours, with a 10-minute break every hour, three 10-minute snack breaks, and a 30-minute lunch break each day. Question: Over the next week, for how many days should they plan to study to cover all their materials? +Original Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total? +Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Leah has 32 chocolates. Condition 2: Her sister has 42 chocolates. Condition 3: Together, they consume 35 chocolates. Question: How many chocolates remain between them after they have eaten some? +Original Question: There were nine computers in the server room. Five more computers were installed each day, from Monday to Thursday. How many computers are now in the server room? +Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Initially, there are nine computers in the server room. Condition 2: Each day, from Monday to Thursday, five additional computers are installed. Question: What is the total number of computers in the server room after these installations? +Original Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny? +Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Jason starts with 20 lollipops. 
Condition 2: After giving some lollipops to Denny, Jason has 12 lollipops left. Question: How many lollipops did Jason give to Denny? +Original Question: Sam bought a dozen boxes, each with 30 highlighter pens inside, for $10 each box. He rearranged five of these boxes into packages of six highlighters each and sold them for $3 per package. He sold the rest of the highlighters separately at the rate of three pens for $2. How much profit did he make in total, in dollars? +Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Sam purchases a dozen boxes of highlighters, with each box containing 30 pens, at $10 per box. Condition 2: He repackages five boxes into packages of six highlighters, selling each package for $3. Condition 3: He sells the remaining highlighters at a rate of three for $2. Question: What is Sam’s total profit from these transactions? +Original Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today? +Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Initially, there are 15 trees in the grove. Condition 2: Grove workers will add more trees to the grove today. Condition 3: After planting, the total number of trees in the grove will increase to 21. Question: How many trees did the grove workers plant today? +Original Question: {user question} +Rephrased Question: +◄ +Feeling +lucky? 
+Conversion +report +Report +an issue +View original +on arXiv +► \ No newline at end of file diff --git a/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers.md b/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers.md new file mode 100644 index 0000000000000000000000000000000000000000..b3ac1ad4988e055862a44bb360bd4693cd8112ea --- /dev/null +++ b/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers.md @@ -0,0 +1,191 @@ +--- +title: '[2408.06195] Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers' +id: 240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers +tags: +- deepread +created: '2026-06-10T00:39:56.384617Z' +source: https://arxiv.org/abs/2408.06195 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:39:56.384488Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2408.06195] Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers +Computer Science > Computation and Language +arXiv:2408.06195 +(cs) +[Submitted on 12 Aug 2024] +Title: +Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers +Authors: +Zhenting Qi +, +Mingyuan Ma +, +Jiahang Xu +, +Li Lyna Zhang +, +Fan Yang +, +Mao Yang +View a PDF of the paper titled Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers, by Zhenting Qi and 5 other authors +View PDF +HTML (experimental) +Abstract: +This paper introduces rStar, a self-play mutual reasoning approach that significantly improves reasoning capabilities of small language models (SLMs) without fine-tuning or superior models. rStar decouples reasoning into a self-play mutual generation-discrimination process. First, a target SLM augments the Monte Carlo Tree Search (MCTS) with a rich set of human-like reasoning actions to construct higher quality reasoning trajectories. 
Next, another SLM, with capabilities similar to the target SLM, acts as a discriminator to verify each trajectory generated by the target SLM. The mutually agreed reasoning trajectories are considered mutual consistent, thus are more likely to be correct. Extensive experiments across five SLMs demonstrate rStar can effectively solve diverse reasoning problems, including GSM8K, GSM-Hard, MATH, SVAMP, and StrategyQA. Remarkably, rStar boosts GSM8K accuracy from 12.51% to 63.91% for LLaMA2-7B, from 36.46% to 81.88% for Mistral-7B, from 74.53% to 91.13% for LLaMA3-8B-Instruct. Code will be available at +this https URL +. +Subjects: +Computation and Language (cs.CL) +Cite as: +arXiv:2408.06195 +[cs.CL] +(or +arXiv:2408.06195v1 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2408.06195 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Li Lyna Zhang [ +view email +] +[v1] +Mon, 12 Aug 2024 14:42:13 UTC (1,140 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers, by Zhenting Qi and 5 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2024-08 +Change to browse by: +cs +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? 
+) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/241020285-swe-search-enhancing-software-agents-with-monte-carlo-tree-search-and-2.md b/research/notes/241020285-swe-search-enhancing-software-agents-with-monte-carlo-tree-search-and-2.md new file mode 100644 index 0000000000000000000000000000000000000000..37659dcfa3e528844394e6fccbf0d433cdf4502f --- /dev/null +++ b/research/notes/241020285-swe-search-enhancing-software-agents-with-monte-carlo-tree-search-and-2.md @@ -0,0 +1,2144 @@ +--- +title: '[2410.20285] SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search + and Iterative Refinement' +id: 241020285-swe-search-enhancing-software-agents-with-monte-carlo-tree-search-and-2 +tags: +- deepread +created: '2026-06-10T00:41:20.614811Z' +source: https://ar5iv.labs.arxiv.org/html/2410.20285 +source_domain: ar5iv.labs.arxiv.org +fetched_at: '2026-06-10T00:41:20.614654Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2410.20285] SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement +SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement +Antonis Antoniades +1∗ +, Albert Örwall +2 +, +Kexun Zhang +3 +, +Yuxi Xie +4 +, Anirudh Goyal +5 +, William Wang +1 +1 +University of California, Santa Barbara, +2 +Moatless AI, +3 +Carnegie Mellon University, +4 +National University of Singapore, +5 +Mila +Denotes equal contribution. +Correspondence to: +antonis@ucsb.edu +, +albert@moatless.ai +. +Code: +github.com/aorwall/moatless-tree-search +Abstract +Software engineers operating in complex and dynamic environments must continuously adapt to evolving requirements, learn iteratively from experience, and reconsider their approaches based on new insights. 
However, current large language model (LLM)-based software agents often rely on rigid processes and tend to repeat ineffective actions without the capacity to evaluate their performance or adapt their strategies over time. To address these challenges, we propose SWE-Search, a multi-agent framework that integrates Monte Carlo Tree Search (MCTS) with a self-improvement mechanism to enhance software agents’ performance on repository-level software tasks. SWE-Search extends traditional MCTS by incorporating a hybrid value function that leverages LLMs for both numerical value estimation and qualitative evaluation. This enables self-feedback loops where agents iteratively refine their strategies based on both quantitative numerical evaluations and qualitative natural language assessments of pursued trajectories. The framework includes a SWE-Agent for adaptive exploration, a Value Agent for iterative feedback, and a Discriminator Agent that facilitates multi-agent debate for collaborative decision-making. Applied to the SWE-bench benchmark, our approach demonstrates a 23% relative improvement in performance across five models compared to standard open-source agents without MCTS. Our analysis reveals how performance scales with increased search depth and identifies key factors that facilitate effective self-evaluation in software agents. This work highlights the potential of self-evaluation driven search techniques to enhance agent reasoning and planning in complex, dynamic software engineering environments. +1 +Introduction +Software engineering is a complex and iterative process involving exploration, problem-solving, and decision-making under uncertainty. Tasks such as debugging, feature development, and code refactoring require continuous assessment of different approaches, frequent backtracking, and the incorporation of new information. 
While machine learning has made progress in automating parts of this workflow +(Li et al., +2022 +; OpenAI et al., +2024 +; Ouyang et al., +2022 +; Yang et al., +2024b +) +, replicating the adaptive and strategic behavior of human engineers remains a significant challenge. This is due to the inherently non-linear and iterative nature of software engineering, where engineers dynamically explore various solutions, refine strategies based on feedback, and collaborate to identify the most effective path forward. Current large language model (LLM)-based software agents +(Xia et al., +2024 +; Zhang et al., +2024d +) +, while powerful, often struggle with complex, long-horizon tasks that require adaptive strategies and flexible reassessment over time. These agents can become trapped in repetitive patterns, limiting their effectiveness in tackling more intricate software engineering problems. +To address these challenges, we introduce +SWE-Search +, a multi-agent system that replicates the adaptability, iterative learning, and collaborative decision-making of human engineers. SWE-Search is designed to address three critical needs in software engineering: +Flexible Exploration and Adaptation +: Engineering problems often require exploring multiple approaches and adapting strategies based on evolving information +(Li et al., +2022 +) +. SWE-Search’s SWE-Agent operates in a flexible state space, allowing it to fluidly transition between actions such as planning, searching, and editing. This design mirrors the way engineers backtrack and adjust their approach dynamically, ensuring the agent can revise its course when faced with new challenges or information, and points towards the direction of more general, open-ended systems +(Wang et al., +2023 +; Ma et al., +2024a +; Lu et al., +2024b +; Faldor et al., +2024 +; Hu et al., +2024 +; Lu et al., +2024a +) +. +Iterative Learning through Feedback +: Effective engineering relies heavily on continuous testing and refinement. 
To replicate this, SWE-Search integrates a Monte Carlo Tree Search (MCTS) +(Silver et al., +2016b +) +planning module paired with a Value Agent. The MCTS module balances exploration and exploitation to guide the agent through complex solution spaces. The Value Agent augments this process by providing both utility estimates and qualitative feedback, allowing the agent to iteratively improve its decision-making based on past experiences, similar to how engineers refine their work through feedback and debugging. +Collaborative Decision-Making +: Complex problems often benefit from diverse perspectives +(Khan et al., +2024 +; Amayuelas et al., +2024 +; Du et al., +2023 +; Zhang et al., +2024c +) +. In SWE-Search, once a set of potential solutions is generated, the Discriminator Agent facilitates a multi-agent debate. Each agent advocates for different solutions by presenting arguments, which are critically evaluated by a judge agent. This process mirrors real-world engineering collaboration, where teams deliberate to refine and select the most robust solutions. +The architecture of SWE-Search is designed to automate software engineering tasks through these adaptive, feedback-driven, and collaborative processes. The SWE-Agent serves as the system’s problem solver, operating in a dynamic environment where it can backtrack and adapt its actions as necessary. The MCTS Planning Module efficiently guides exploration and exploitation, ensuring that the agent balances the need for innovation with the need to focus on promising solutions. The Value Agent provides continual feedback, offering both quantitative assessments and qualitative insights, helping the agent refine its strategy iteratively. Finally, the Discriminator Agent ensures that the final decision is rigorously vetted through a multi-agent debate, simulating the collaborative decision-making processes commonly found in engineering teams. 
+We evaluate SWE-Search on the SWE-bench benchmark, a comprehensive dataset from real-world open-source repositories. SWE-bench tests agents’ ability to resolve software issues by generating code patches that fix failing tests. SWE-Search demonstrates a +$23\%$ +relative performance improvement across five models compared to standard open-source agents, highlighting the effectiveness of strategic search and iterative self-evaluation. Through detailed analysis, we explore how performance scales with increased search depth and identify key factors that enhance self-assessment in software agents. Our work demonstrates the potential of MCTS and iterative learning to improve agent reasoning and planning in dynamic, complex domains like software engineering, introducing a new paradigm for autonomous software development. +2 +Related Work +Search methods +Various search approaches have been applied to Large Language Models (LLMs) to facilitate System 2 +(Kahneman, +2011 +; Saha et al., +2024 +; Pan et al., +2023 +; Bounsi et al., +2024 +) +thinking in non-linear reasoning structures. A critical feature of these approaches is their ability to backtrack. Unlike greedy processes +(Black, +2005 +) +, search algorithms explore multiple branches at each step, potentially escaping paths that lead to dead ends. These methods differ in their strategies for exploring and memorizing possible choices, and in their heuristics for switching between them. Breadth-first search +(Moore, +1959 +) +maintains all possible search paths, incurring significant memory and computational costs. Depth-first search +(Cormen et al., +2009 +) +, in contrast, prioritizes the most promising path in a more greedy manner. When applied to LLMs, these methods demonstrate a trade-off between diversity and quality in text generation +(Yao et al., +2023 +) +. 
The A* algorithm +(Hart et al., +1968 +) +combines aspects of breadth-first and greedy search to find optimal solutions using a predetermined evaluation function. In this work, we adopt Monte Carlo Tree Search (MCTS) +(Silver et al., +2016b +) +, an advanced search algorithm that conducts statistical tree search without requiring dedicated evaluation heuristics for each state. MCTS has achieved impressive results in complex strategy games +(Silver et al., +2016a +) +, protein folding +(Jumper et al., +2021 +) +, and algorithm discovery +(Fawzi et al., +2022 +) +. +Software Agents +Software agents are designed to perform autonomous actions within large codebases. Given a repository-level task, these agents typically locate relevant files and code segments before implementing necessary changes. We focus on the SWE-bench task +(Jimenez et al., +2024 +) +, which involves resolving real-world GitHub issues. Among the agents with disclosed technical details on SWE-bench, +Yang et al. ( +2024b +) +introduced the concept of agent-computer interfaces with SWE-agent. OpenDevin +(Wang et al., +2024b +) +presents a collection of community-driven agents, including CodeAct +(Wang et al., +2024a +) +. The Agentless approach demonstrated competitive performance using a simple two-step process of localization and repair. AutoCodeRover +(Zhang et al., +2024d +) +incorporated advanced code tools such as abstract syntax trees and spectrum-based fault localization. The Alibaba Lingma Agent +(Ma et al., +2024b +) +introduced a search-based approach for repository exploration, followed by a structured editing phase. While effective, it constitutes a more hand-designed solution specifically designed to interface with the search functionality of their agent. +3 +Methodology +SWE-Search is a multi-agent system designed to tackle complex software engineering tasks by integrating dynamic planning, value estimation, and deliberative decision-making. 
The core motivation behind this method is to emulate the sophisticated, iterative workflows of human software engineers, where exploration, planning, and collaboration are crucial to solving intricate problems. By leveraging the strengths of Monte Carlo Tree Search (MCTS) for planning, a Value Agent for utility estimation and feedback, and a Discriminator Agent for final decision-making through debate, SWE-Search provides a comprehensive, adaptive framework capable of navigating and solving real-world software engineering challenges. +SWE-Search consists of four primary components that work in synergy: +SWE-Search Framework and Action Agent +: Building on the moatless-tools framework +(Örwall, +2024 +) +, SWE-Search operates in a dynamic code environment with a flexible state-space and a git-like commit tree structure. This design facilitates efficient backtracking to previous states, enabling the Action Agent to explore diverse solution trajectories. The adaptable state-space enhances the system’s ability to exploit the MCTS module effectively. +Search Algorithm +: The core of SWE-Search’s exploration strategy is based on a Monte Carlo Tree Search (MCTS) which uses a heuristic-based selection process similar to AlphaZero +(Silver et al., +2016a +) +, specifically tailored for software engineering tasks. This modified MCTS algorithm effectively balances exploration and exploitation, helping the agent explore a diverse set of solutions and converge quickly on the most promising strategies. +Value (Function) Agent +: To approximate the utility of each observation, we employ an LLM-based value function, which in addition to outputting a value, also generates an explanation in natural language. This explanation can be leveraged to improve subsequent actions from parent nodes, enabling iterative self-improvement of the search process. 
+Discriminator Agent +: In the final stage of SWE-Search, the Discriminator Agent evaluates the solutions generated by the search process. Inspired by multi-agent debate frameworks +Du et al. ( +2023 +); Khan et al. ( +2024 +); Amayuelas et al. ( +2024 +) +, this agent engages in a structured debate, where multiple agents argue for or against the proposed solutions. The debate process not only surfaces diverse perspectives but also leads to a more rigorously justified final decision. +This system architecture combines the strengths of dynamic action selection, strategic planning, and collaborative deliberation, creating a comprehensive tool capable of handling the complexity and iterative nature of software engineering tasks. +3.1 +Problem Formulation +Figure 1: +SWE-Search Overview. +Tree search. +Each state is represented as a node from which the agent can expand from, and each corresponding action is presented as an edge. +Evaluation. +Uses all relevant context including trajectory information, file context, and executed tests, to provide a quantitative value estimation and qualitative explanation in natural language. +Expansion. +Nodes can be expanded using value function feedback from future actions. +The task of the SWE agent can be formalized as a tuple +$\mathcal{M}=(\mathcal{S},\mathcal{C},\mathcal{A},\mathcal{V},\mathcal{P},p_{0},\rho)$ +. Here, +$\mathcal{S}$ +represents the state space, encompassing all possible states such as the current context of the files the agent is working on and the overall status of the codebase. The context space, denoted as +$\mathcal{C}$ +, includes metadata about the repository and the initial problem description. The value function +$\mathcal{V}$ +assigns a utility score to each state-action pair +$O(a,t)$ +, guiding the agent’s decisions. 
+The environment’s dynamics are defined by a context-dependent transition function +$\mathcal{P}:\mathcal{S}\times\mathcal{A}\times\mathcal{C}\rightarrow\Delta(\mathcal{S})$ +, which models the evolution of the repository’s state after each action. The initial state distribution, +$p_{0}:\mathcal{C}\rightarrow\Delta(\mathcal{S})$ +, specifies how the initial state depends on the given context, while +$\rho\in\Delta(\mathcal{C})$ +defines the distribution over contexts. +Given an initial context +$c\sim\rho$ +and an initial state +$s_{0}\sim p_{0}(\cdot\mid c)$ +, the SWE agent executes its policy +$\pi:\mathcal{S}\times\mathcal{C}\rightarrow\Delta(\mathcal{A})$ +, which selects actions based on the current state and context. At each time step +$t$ +, the agent takes an action +$a_{t}\sim\pi(s_{t},c)$ +and receives a corresponding reward +$\mathcal{R}(s_{t},a_{t},c)$ +. The environment then transitions to a new state +$s_{t+1}\sim\mathcal{P}(\cdot\mid s_{t},a_{t},c)$ +, and the agent continues to observe this updated state. Over time, this process generates a trajectory +$\tau:=\{s_{t},a_{t},r_{t}\}_{t=0}^{T}$ +as the agent interacts with the environment. 
+The agent’s objective is to maximize the cumulative reward over the trajectory, which is captured by the value function $v(s_{t},a_{t},\{s_{i}\}_{i=0}^{t-1},\{a_{i}\}_{i=0}^{t-1})$. This value function depends not only on the current state and action but also on the history of previous states and actions, which deviates from the assumptions of a Markovian process. Formally, the agent seeks to maximize the expected cumulative reward, defined as: $$\max_{\pi}V^{T}(\rho)=\max_{\pi}\mathbb{E}_{\tau}\left[\sum_{t=0}^{T}\mathcal{R}(s_{t},a_{t},c)\mid c\sim\rho;\pi\right].$$ +This optimization captures the agent’s (in-context) process, as it adjusts its policy $\pi$ to achieve the highest expected return across multiple trajectories, considering both current and historical information. +3.2 +SWE-Search Framework and Action Agent +The SWE-Search Action Agent builds on the moatless-tools framework (Örwall, 2024). Its action space, $\mathcal{A}$, is organized as a two-tier hierarchy, comprising both action types and their corresponding specific actions.
Formally, this can be expressed as $\mathcal{A}=\{(t,a)\mid t\in\mathcal{T},a\in\mathcal{A}_{t}\}$, where $\mathcal{T}$ represents the set of action types (e.g., Search, Plan, Edit), and $\mathcal{A}_{t}$ is the set of possible actions corresponding to each type $t$. These actions range from tool invocations and code modifications to the generation of structured text. To enhance the agent’s effectiveness in search-driven tasks, we introduced the following modifications: +One key modification we implemented is the expansion of the Plan state, allowing it to transition flexibly to any other state, rather than being limited to transitioning only to Edit. This change is motivated by the need to enable more dynamic and adaptive problem-solving behaviors within the agent. In the context of software engineering, rigid state transitions can be overly restrictive, forcing the agent into predetermined pathways that may not always align with the complexities of real-world scenarios. For instance, during code modification tasks, an agent might recognize mid-process that further planning, additional searches, or different types of analysis are necessary before proceeding with edits. Restricting transitions only to editing would artificially constrain the agent, potentially leading it to suboptimal actions or causing it to become stuck in unproductive loops. By allowing transitions to any state, we empower the agent to adapt to new information as it arises (Fig. 2), exploring a wider variety of trajectories. This enhanced flexibility reflects the iterative and often non-linear nature of real software engineering workflows, where engineers frequently revisit planning, testing, and research phases before committing to edits.
+Second, the agent is empowered to execute any tests within the codebase at its discretion, as well as to create and implement new tests. The results of these tests are incorporated into both the value function and the agent’s subsequent decision-making process. It is crucial to highlight that the tests required to resolve a given instance (i.e., fail-to-pass tests) are not explicitly revealed to the agent. However, the agent can leverage any pre-existing tests within the repository, simulating the behavior of a real-world software engineer. +Footnote 1: This approach aligns with the practices of other SWE agents, and has been validated by the authors of SWE-bench, who confirmed its legitimacy as long as the fail-to-pass tests remain concealed from the model. +3.3 +Value (Function) Agent +The role of the Value Agent extends beyond simply estimating the expected utility of a given state-action pair $O_{n}(s_{n},a_{n})$. In addition to calculating the value $v_{n}$, the Value Agent generates a written explanation, denoted as $\varepsilon$. This explanation serves a dual purpose: it provides transparency into the decision-making process and functions as feedback for the Action Agent, which can leverage this explanation when re-expanding from the parent node of $O_{n}$ (see Figure 1, hindsight feedback). This approach enables the system to iteratively refine its decision-making process, mirroring how a human software engineer continuously re-evaluates their approach based on new information to improve their problem-solving strategy. +The input to the value function consists of all state-action pairs up to and including the current state being evaluated, alongside specific instructions on how to assess the state.
This allows the Value Agent to contextualize the decision within the trajectory, accounting for the sequence of actions and states leading up to the present. The final output of the value function can be formalized as: $$(v_{t},\varepsilon_{t})=V(s_{t},a_{t},\{s_{i}\}_{i=0\ldots t-1},\{a_{i}\}_{i=0\ldots t-1})\tag{1}$$ +Here, $v_{t}$ represents the expected utility of the current state-action pair, while $\varepsilon_{t}$ is the accompanying explanation. +In practice, the Value Agent is tasked with analyzing the entire trajectory leading up to the current state-action pair, providing not only the required utility estimate $v_{t}$, but also a detailed explanation $\varepsilon_{t}$. This explanation is critical for the agent’s overall performance, as it offers insight into the reasoning behind utility estimates, which in turn informs the Action Agent’s future decisions. We have observed that one of the key factors driving the effectiveness of the Value Agent lies in the clarity and specificity of these explanations. A well-articulated explanation can illuminate the strengths and limitations of different state types (e.g., Search, Edit, Plan), helping the Action Agent better understand which types of states are more promising or risky to pursue. +By providing detailed feedback on the potential utility of different actions and contextualizing them within the broader trajectory, the Value Agent enables more informed and strategic decision-making by the Action Agent.
This integration of both quantitative and qualitative feedback leads to improved performance and more adaptive behavior throughout the task (Fig. 4a). +3.4 +Search Algorithm +Our search tree is structured with nodes representing states $\mathcal{S}_{t}$ and edges representing actions $\mathcal{A}_{t}$. The search algorithm employed is a modified Monte Carlo Tree Search (MCTS), specifically adapted for the tasks of the SWE-Agent. Unlike prior approaches for web agents that utilize language models in the selection process (Koh et al., 2024; Zhang et al., 2024b), we deliberately choose not to rely on language models for node selection. Instead, we adopt a more straightforward heuristic-based selection function, similar to the approach used in AlphaZero (Silver et al., 2016a; 2018). This decision is driven by the need for interpretability, efficiency, and the focus on tasks where heuristic-based exploration suffices to guide the agent effectively through complex software engineering environments. +At the core of our algorithm is a modified Upper Confidence Bound for Trees (UCT) selection criterion (Kocsis & Szepesvári, 2006), which determines the next node to expand. This criterion balances exploitation of known high-reward actions with exploration of less-visited states. We introduce additional terms to encourage strategic exploration early in the search process, and to penalize over-exploration at later stages when convergence on the optimal solution is desired.
The modified UCT function is expressed as: $$UCT(s,a)=\text{exploitation}+\text{exploration}+\text{early\_depth\_bonus}-\text{late\_depth\_penalty}\tag{2}$$ +This can be expressed more formally as: $$UCT(s,a)=V(s,a)+C\sqrt{\frac{\ln N(s)}{N(s,a)}}+\alpha e^{-\beta(d-1)}-\gamma\sqrt{d}\tag{3}$$ +Here, $V(s,a)$ is the value estimate of the state-action pair, $N(s,a)$ is the number of times the state-action pair $(s,a)$ has been visited, $N(s)$ is the visit count of state $s$, $d$ is the depth of the node in the search tree, and $C$, $\alpha$, $\beta$, and $\gamma$ are constants that control the balance between exploration, exploitation, and depth-dependent rewards and penalties. +This formulation is inspired by the way software engineers explore potential solutions to a task. In practice, an engineer’s search process can be broken down into the following key phases, which our algorithm mirrors: +Early Exploration +: Initially, an engineer explores a wide variety of potential approaches to fully understand the problem and identify promising strategies.
This is encouraged in our algorithm by the $\text{early\_depth\_bonus}$, represented by the term $\alpha e^{-\beta(d-1)}$, which rewards exploration at shallow depths, simulating the early phases of wide exploration. +Convergence and Exploitation +: As the engineer gains more information and narrows down the options, the focus shifts to exploiting the most effective solution paths. This transition is handled by the standard UCT exploitation term $V(s,a)$ and is further reinforced by the $\text{late\_depth\_penalty}$ ($-\gamma\sqrt{d}$), which discourages over-exploration as the agent delves deeper into the search tree. +Quick Abandonment of Poor Strategies +: Software engineers are also adept at abandoning poor strategies when new information indicates that a particular approach is not viable. We capture this behavior by implementing a simple heuristic rule that abandons nodes associated with consecutive low rewards, ensuring that the agent does not waste resources on unproductive trajectories. +At each step, the node with the highest UCT value is selected for expansion, formalized as: $$s^{*}=\operatorname*{arg\,max}_{(s,a)}UCT(s,a)\tag{4}$$ +This approach effectively mimics the decision-making process of a software engineer, who balances exploration of potential strategies with a focus on converging towards the optimal solution, while remaining flexible enough to backtrack when necessary.
By incorporating heuristic feedback and depth-based adjustments, the algorithm avoids getting stuck in unproductive paths and enhances the agent’s ability to identify high-reward strategies with minimal computational overhead (see Appendix 6). +3.4.1 +Discriminator Agent +The final stage of SWE-Search involves the Discriminator Agent, whose role is to evaluate the candidate solutions generated by the search process and select the one most likely to resolve the issue at hand. This module accepts up to five final solutions produced by the search and engages in a multi-agent debate to determine the most promising option. Drawing inspiration from recent work on persuasive multi-agent debates (Khan et al., 2024; Amayuelas et al., 2024), the Discriminator leverages the collective reasoning of multiple agents to ensure a more robust final selection. Configuration and hyperparameter details can be found in Table 2. +In this stage, agents are presented with the original problem statement and candidate solutions. They engage in a structured debate to determine the most effective solution, supporting their choices with logical reasoning and evidence from the search process. This debate encourages a thorough exploration of trade-offs between solutions, potentially uncovering strengths or weaknesses not evident during individual searches. Finally, a judge agent evaluates the arguments and selects the solution deemed most likely to resolve the issue. This process simulates the collaborative decision-making in software engineering teams, where diverse perspectives lead to a more thorough evaluation of candidate solutions, ultimately increasing the likelihood of identifying the optimal outcome. +The discriminator process not only enhances the robustness of the final solution but also adds transparency, as the reasoning behind the choice is clearly articulated and evaluated.
This ensures that the selected solution is well-reasoned and thoroughly vetted before implementation. +Figure 2: +Hindsight feedback error correction. +Instance sympy__sympy-15678, SWE-Search with Qwen2.5-72B-Instruct. Initially, the Action Agent performs edits and runs tests, which pass. It prematurely concludes the search. Without actually knowing the proposed solution does not resolve the issue, the Value Agent identifies potentially missed tests and assigns a low reward. Upon re-expansion using the Value Agent’s feedback, new tests fail, prompting the Action Agent to make additional edits, which result in a preferred solution which ultimately resolves the issue. +4 +Experiments +Benchmark +For our experiments, we utilize SWE-bench Lite, a curated subset of the official SWE-bench, containing 300 instances. This dataset is specifically designed to be self-contained and focuses primarily on evaluating functional bug fixes, providing a controlled environment to assess the performance of our system. +Evaluation Metrics +We use two metrics: resolve rate ( +Pass@1 +) and +Pass@5 +. Resolve rate is the percentage of issues successfully resolved, measuring overall effectiveness. Pass@5 is the percentage of issues where a correct solution is found within five attempts. This allows us to assess the efficiency of the search in identifying successful bug fixes within a limited number of iterations. +Baselines +Software agents leverage diverse tools, architectures, and models, leading to variability in their performance on subsets of the SWE-bench Lite dataset +(Zhang et al., +2024a +) +. For comparison, we build upon the moatless-tools framework +(Örwall, +2024 +) +, a high-performing open-source agent commonly used in research settings +(Chowdhury et al., +2024 +) +. To isolate the impact of our search approach, we adapt moatless-tools as our baseline, referred to as moatless-adapted. 
This allows us to fairly compare the performance of SWE-Search against moatless-adapted across various models, including two closed-source models (GPT-4o, GPT-4o-mini) and three open-source models (Qwen2.5-72B-Instruct +(Yang et al., +2024a +) +, Llama-3.1-70B-Instruct +(Dubey et al., +2024 +) +, and DeepSeek-V2.5 +(DeepSeek-AI et al., +2024 +) +). We also reference official moatless-tools GPT-4o results on SWE-bench Lite to ensure a fair and consistent comparison. +Implementation Details +For consistency, we use identical prompts across all models. In SWE-Search, we limit each node to a maximum of three expansions and cap the total search iterations at 100. Further details on model hyperparameters can be found in +Appendix, +2 +. +Table 1: +Resolve Rate Comparison, SWE-bench Lite +Model +Moatless-v1 +Moatless-adapted +SWE-Search +% +Δ +Δ +\Delta +GPT-4o +24.3 +25.7 +31.0 ++17 +GPT-4o-mini +– +13.0 +17.0 ++24 +Qwen-2.5-72b-Instruct +– +18.0 +24.7 ++27 +Deepseek-V2.5 +– +16.3 +21.0 ++22 +Llama-3.1-70b-Instruct +– +13.6 +17.7 ++23 +Mean % +Δ +Δ +\Delta ++23 +4.1 +Experimental Results +4.1.1 +SWE-Search Surpasses all Corresponding Base Agents and Enables Smaller, Open Source Models to Approach GPT-4o +On average, SWE-Search outperforms the baseline agent across all five models, achieving a 23% relative improvement +(Table +1 +) +. Notably, SWE-Search with Qwen-2.5-72B-Instruct exceeds the performance of GPT-4o using the original Moatless-v1 framework, and closely matches its performance when compared with the Moatless-adapted agent, with only a slight difference ( +Δ += +− +1 +% +Δ +percent +1 +\Delta=-1\% +). Interestingly, all five models demonstrate significant improvement when utilizing the proposed approach, with consistent gains across different models. 
+4.1.2 +Search Enables Agents to Make Better Use of More Flexibility +To prevent goal divergence, most agents, including moatless-tools, rely on strict transition rules, where state transitions follow predetermined sequences (e.g., Search $\rightarrow$ Identify, Plan $\rightarrow$ Edit). In moatless-adapted, we introduce a more flexible transition logic that allows a Plan state to transition into any other state type. This added flexibility has both advantages and drawbacks. On the positive side, it enables the agent to autonomously correct its trajectory without external feedback, particularly when the necessary adjustments span only a limited portion of the task. However, this increased flexibility also introduces the risk of the agent becoming trapped in infinite loops. Without a high-level control mechanism to detect and mitigate these situations, the agent may fail to recover from such loops. This trade-off is evident in the modest performance difference between Moatless-v1 and moatless-adapted, with a slight performance improvement of only 1.4 percentage points (Table 1). +4.1.3 +Impact of Hindsight Feedback on Agent Performance +One key advantage of utilizing LLMs as general value functions is their dual ability to provide both quantitative value estimates and qualitative assessments in natural language. These qualitative insights can significantly enhance the agent’s action generation and search process by offering detailed feedback on potential errors or overlooked aspects of the task. In practice, feedback was also crucial in eliciting diversity in the actions taken by the agent, as without it, the agent would often take very similar actions when re-expanding from a parent node. +As shown in Figure 2, this mechanism plays a critical role in improving the agent’s performance. During the initial expansion, the agent prematurely concludes that the task is complete.
However, the value function correctly identifies gaps in the test coverage, specifically in addressing potential corner cases, and assigns a low reward. This feedback prompts the agent to re-expand the parent state, leading to the introduction of new tests, which subsequently fail. The agent then performs a series of edits (summarized in the figure for brevity), ultimately resolving the task correctly. Empirically, we observe that the instances unresolved by moatless-adapted but successfully solved by SWE-Search are often attributed to this search-and-feedback loop, where iterative feedback drives the agent toward a correct solution. +4.2 +Importance of Comprehensive State Information for Value Function Performance +Model +Pass@1 +Pass@5 +GPT-4o +31.0 +34.0 +GPT-4o-mini +17.0 +22.3 +Qwen-2.5-72b-Instruct +24.7 +25.7 +Deepseek-V2.5 +21.0 +23.3 +Llama-3.1-70b-Instruct +21.0 +22.3 +Figure 3: +SWE-bench SWE-Search results +The effectiveness of SWE-Search hinges on the value function’s ability to accurately differentiate between desirable and undesirable states, and to provide actionable feedback that drives improvement. However, our experiments revealed that the value function sometimes failed to recognize critical decision points in the search tree. It frequently misinterpreted the purpose of certain actions, leading to the undervaluation of effective strategies by assigning low rewards. As shown in +Figure +4 +a +, before the introduction of state-specific value prompts, the agent consistently assigned low rewards even when the Action Agent correctly identified the need for additional context, such as locating relevant files. This issue persisted despite the agent successfully identifying the files later. By implementing state-specific prompts across core state clusters (Searching, Planning, Editing), the value function became significantly more adept at interpreting the intent behind actions and evaluating their outcomes within each state. 
For further details on experiments distinguishing between effective and ineffective states, refer to Appendix 8. +Figure 4: +(a) Importance of state-specific value prompts. +On the left and right are the respective Value Agents’ outputs with and without state-specific prompts. While the action in both cases is effective in finding the right file, the non-state-specific scenario does not recognize this and assigns a low reward. On the contrary, the state-specific prompt correctly assigns a high reward to this state. +(b) Performance scaling with search depth across different language models. +The graph shows the number of issues resolved as a function of the number of transitions (search iterations) for all models used. +Scaling SWE agents with Inference-time Compute +The success of large language models (LLMs) has traditionally been attributed to the expansion of training data and model size, i.e., training-time compute (Wei et al., 2022; Chung et al., 2022). Recently, researchers have started exploring how different methods scale with inference-time compute (OpenAI, 2024; Snell et al., 2024; Dubey et al., 2024). Here, we study the performance of software engineering agents through increased inference-time compute. As shown in Figure 4b, increasing search iterations leads to a consistent rise in the number of resolved issues. To ensure experimental feasibility across the 300 instances in the SWE-bench Lite dataset, we applied conservative parameters (maximum iterations $=100$, maximum expansions per node $=3$). Approaches like SWE-Search enable the allocation of greater resources to specific challenges, such as addressing critical software vulnerabilities (Rigaki et al., 2024; Fang et al., 2024), offering a scalable solution to complex tasks. +Figure 5: +(a) Value Function vs. Discriminator Comparison. +Comparison of value function vs.
discriminator ability to discern the final solution that resolved the issue when there is one. The discriminator performs better across all models except GPT-4o-mini. DeepSeek-V2.5 had the smallest disparity between the two methods, suggesting an ability to act as a well-calibrated value function. +(b) Model-Specific Issue Resolution. +Venn diagram of resolved issues by model. Each model can solve a handful of unique instances. +Convergence of Value Function and Discriminator to Right Solution +The search process can yield multiple proposed solutions. Ideally, the mean trajectory value of the proposed solution that resolves the issue will always be the highest, which would yield the ideal performance of the agent (Table 3). In practice, the value function successfully converged on the correct solution 73% of the time on average across the five models. The discriminator module performed even better, increasing the proportion of correct solutions selected to 84%. While in typical large action spaces, Monte Carlo Tree Search (MCTS) is run for thousands of iterations (Silver et al., 2016b), the value function’s success rate remains impressive given the computational constraints. However, SWE-Search could further benefit from enhanced methods for identifying the correct solutions more consistently, allowing it to fully reach its potential. +Different Models can Resolve Vastly Different Issue Subsets +When comparing the resolved instances across the five models, we observed significant diversity in the subsets of issues each model successfully solved. As shown in Figure 5, each model managed to resolve at least one unique instance. Notably, a surprising number of issues (33) were solved by other models but not by GPT-4o. This suggests that model diversity could play an important role, at least in the short term, in enhancing the performance of SWE-agents.
+5 +Discussion and Conclusion +In this paper, we introduced SWE-Search, a general framework that integrates Monte Carlo Tree Search (MCTS) and qualitative feedback to enhance the performance of software engineering agents. The proposed approach demonstrated improvements over different baseline models, highlighting the potential of search-based methods in software engineering tasks. +One of the key advantages of search-based approaches, as demonstrated in our work, is their ability to scale performance with increased inference-time compute. This flexibility allows the system to adapt to problems that require higher computational resources, such as discovering software vulnerabilities or even generating large codebases from scratch. Future research should focus on two main directions: (a) investigating how search agents scale with computational resources, and (b) expanding the application of software agent search to a broader range of complex use cases. +Given that search techniques like MCTS closely resemble the problem-solving processes of human software engineers, we expect these methods to become increasingly prevalent in agent-driven systems. As the nature of software engineering tasks evolves, system architectures will need to become more fluid and adaptable, fully leveraging the potential of search-based techniques. This evolution will likely lead to the development of larger, more general agentic systems capable of tackling a wide array of software engineering challenges. +References +Amayuelas et al. (2024) +Alfonso Amayuelas, Xianjun Yang, Antonis Antoniades, Wenyue Hua, Liangming Pan, and William Wang. +Multiagent collaboration attack: Investigating adversarial attacks in large language model collaborations via debate, 2024. +URL +https://arxiv.org/abs/2406.14711 +. +Black (2005) +Paul E. Black. +greedy algorithm, feb 2005. +URL +https://www.nist.gov/dads/HTML/greedyalgo.html +. +Accessed: TODAY. +Bounsi et al. 
(2024) +Wilfried Bounsi, Borja Ibarz, Andrew Dudzik, Jessica B. Hamrick, Larisa Markeeva, Alex Vitvitskyi, Razvan Pascanu, and Petar Veličković. +Transformers meet neural algorithmic reasoners, 2024. +URL +https://arxiv.org/abs/2406.09308 +. +Chowdhury et al. (2024) +Neil Chowdhury, James Aung, Chan Jun Shern, Oliver Jaffe, Dane Sherburn, Giulio Starace, Evan Mays, Rachel Dias, Marwan Aljubeh, Mia Glaese, Carlos E. Jimenez, John Yang, Kevin Liu, and Aleksander Madry. +Introducing SWE-bench verified, August 2024. +URL +https://openai.com/research/introducing-swe-bench-verified +. +OpenAI Blog. +Chung et al. (2022) +Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Alex Castro-Ros, Marie Pellat, Kevin Robinson, Dasha Valter, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei. +Scaling instruction-finetuned language models, 2022. +URL +https://arxiv.org/abs/2210.11416 +. +Cormen et al. (2009) +Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein. +Introduction to Algorithms, Third Edition +. +The MIT Press, 3rd edition, 2009. +ISBN 0262033844. +DeepSeek-AI et al. (2024) +DeepSeek-AI, Qihao Zhu, Daya Guo, Zhihong Shao, Dejian Yang, Peiyi Wang, Runxin Xu, Y. Wu, Yukun Li, Huazuo Gao, Shirong Ma, Wangding Zeng, Xiao Bi, Zihui Gu, Hanwei Xu, Damai Dai, Kai Dong, Liyue Zhang, Yishi Piao, Zhibin Gou, Zhenda Xie, Zhewen Hao, Bingxuan Wang, Junxiao Song, Deli Chen, Xin Xie, Kang Guan, Yuxiang You, Aixin Liu, Qiushi Du, Wenjun Gao, Xuan Lu, Qinyu Chen, Yaohui Wang, Chengqi Deng, Jiashi Li, Chenggang Zhao, Chong Ruan, Fuli Luo, and Wenfeng Liang. 
+Deepseek-coder-v2: Breaking the barrier of closed-source models in code intelligence, 2024. +URL +https://arxiv.org/abs/2406.11931 +. +Du et al. (2023) +Yilun Du, Shuang Li, Antonio Torralba, Joshua B. Tenenbaum, and Igor Mordatch. +Improving factuality and reasoning in language models through multiagent debate, 2023. +URL +https://arxiv.org/abs/2305.14325 +. +Dubey et al. (2024) +Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, Anirudh Goyal, Anthony Hartshorn, Aobo Yang, Archi Mitra, Archie Sravankumar, Artem Korenev, Arthur Hinsvark, Arun Rao, Aston Zhang, Aurelien Rodriguez, Austen Gregerson, Ava Spataru, Baptiste Roziere, Bethany Biron, Binh Tang, Bobbie Chern, Charlotte Caucheteux, Chaya Nayak, Chloe Bi, Chris Marra, Chris McConnell, Christian Keller, Christophe Touret, Chunyang Wu, Corinne Wong, Cristian Canton Ferrer, Cyrus Nikolaidis, Damien Allonsius, Daniel Song, Danielle Pintz, Danny Livshits, David Esiobu, Dhruv Choudhary, Dhruv Mahajan, Diego Garcia-Olano, Diego Perino, Dieuwke Hupkes, Egor Lakomkin, Ehab AlBadawy, Elina Lobanova, Emily Dinan, Eric Michael Smith, Filip Radenovic, Frank Zhang, Gabriel Synnaeve, Gabrielle Lee, Georgia Lewis Anderson, Graeme Nail, Gregoire Mialon, Guan Pang, Guillem Cucurell, Hailey Nguyen, Hannah Korevaar, Hu Xu, Hugo Touvron, Iliyan Zarov, +Imanol Arrieta Ibarra, Isabel Kloumann, Ishan Misra, Ivan Evtimov, Jade Copet, Jaewon Lee, Jan Geffert, Jana Vranes, Jason Park, Jay Mahadeokar, Jeet Shah, Jelmer van der Linde, Jennifer Billock, Jenny Hong, Jenya Lee, Jeremy Fu, Jianfeng Chi, Jianyu Huang, Jiawen Liu, Jie Wang, Jiecao Yu, Joanna Bitton, Joe Spisak, Jongsoo Park, Joseph Rocca, Joshua Johnstun, Joshua Saxe, Junteng Jia, Kalyan Vasuden Alwala, Kartikeya Upasani, Kate Plawiak, Ke Li, Kenneth Heafield, Kevin Stone, Khalid El-Arini, Krithika Iyer, Kshitiz Malik, Kuenley Chiu, Kunal Bhalla, Lauren Rantala-Yeary, Laurens van 
der Maaten, Lawrence Chen, Liang Tan, Liz Jenkins, Louis Martin, Lovish Madaan, Lubo Malo, Lukas Blecher, Lukas Landzaat, Luke de Oliveira, Madeline Muzzi, Mahesh Pasupuleti, Mannat Singh, Manohar Paluri, Marcin Kardas, Mathew Oldham, Mathieu Rita, Maya Pavlova, Melanie Kambadur, Mike Lewis, Min Si, Mitesh Kumar Singh, Mona Hassan, Naman Goyal, Narjes Torabi, Nikolay Bashlykov, Nikolay Bogoychev, Niladri Chatterji, Olivier +Duchenne, Onur Çelebi, Patrick Alrassy, Pengchuan Zhang, Pengwei Li, Petar Vasic, Peter Weng, Prajjwal Bhargava, Pratik Dubal, Praveen Krishnan, Punit Singh Koura, Puxin Xu, Qing He, Qingxiao Dong, Ragavan Srinivasan, Raj Ganapathy, Ramon Calderer, Ricardo Silveira Cabral, Robert Stojnic, Roberta Raileanu, Rohit Girdhar, Rohit Patel, Romain Sauvestre, Ronnie Polidoro, Roshan Sumbaly, Ross Taylor, Ruan Silva, Rui Hou, Rui Wang, Saghar Hosseini, Sahana Chennabasappa, Sanjay Singh, Sean Bell, Seohyun Sonia Kim, Sergey Edunov, Shaoliang Nie, Sharan Narang, Sharath Raparthy, Sheng Shen, Shengye Wan, Shruti Bhosale, Shun Zhang, Simon Vandenhende, Soumya Batra, Spencer Whitman, Sten Sootla, Stephane Collot, Suchin Gururangan, Sydney Borodinsky, Tamar Herman, Tara Fowler, Tarek Sheasha, Thomas Georgiou, Thomas Scialom, Tobias Speckbacher, Todor Mihaylov, Tong Xiao, Ujjwal Karn, Vedanuj Goswami, Vibhor Gupta, Vignesh Ramanathan, Viktor Kerkez, Vincent Gonguet, Virginie Do, Vish Vogeti, Vladan Petrovic, Weiwei Chu, +Wenhan Xiong, Wenyin Fu, Whitney Meers, Xavier Martinet, Xiaodong Wang, Xiaoqing Ellen Tan, Xinfeng Xie, Xuchao Jia, Xuewei Wang, Yaelle Goldschlag, Yashesh Gaur, Yasmine Babaei, Yi Wen, Yiwen Song, Yuchen Zhang, Yue Li, Yuning Mao, Zacharie Delpierre Coudert, Zheng Yan, Zhengxing Chen, Zoe Papakipos, Aaditya Singh, Aaron Grattafiori, Abha Jain, Adam Kelsey, Adam Shajnfeld, Adithya Gangidi, Adolfo Victoria, Ahuva Goldstand, Ajay Menon, Ajay Sharma, Alex Boesenberg, Alex Vaughan, Alexei Baevski, Allie Feinstein, Amanda Kallet, Amit Sangani, 
Anam Yunus, Andrei Lupu, Andres Alvarado, Andrew Caples, Andrew Gu, Andrew Ho, Andrew Poulton, Andrew Ryan, Ankit Ramchandani, Annie Franco, Aparajita Saraf, Arkabandhu Chowdhury, Ashley Gabriel, Ashwin Bharambe, Assaf Eisenman, Azadeh Yazdan, Beau James, Ben Maurer, Benjamin Leonhardi, Bernie Huang, Beth Loyd, Beto De Paola, Bhargavi Paranjape, Bing Liu, Bo Wu, Boyu Ni, Braden Hancock, Bram Wasti, Brandon Spence, Brani Stojkovic, Brian Gamido, Britt Montalvo, Carl +Parker, Carly Burton, Catalina Mejia, Changhan Wang, Changkyu Kim, Chao Zhou, Chester Hu, Ching-Hsiang Chu, Chris Cai, Chris Tindal, Christoph Feichtenhofer, Damon Civin, Dana Beaty, Daniel Kreymer, Daniel Li, Danny Wyatt, David Adkins, David Xu, Davide Testuggine, Delia David, Devi Parikh, Diana Liskovich, Didem Foss, Dingkang Wang, Duc Le, Dustin Holland, Edward Dowling, Eissa Jamil, Elaine Montgomery, Eleonora Presani, Emily Hahn, Emily Wood, Erik Brinkman, Esteban Arcaute, Evan Dunbar, Evan Smothers, Fei Sun, Felix Kreuk, Feng Tian, Firat Ozgenel, Francesco Caggioni, Francisco Guzmán, Frank Kanayet, Frank Seide, Gabriela Medina Florez, Gabriella Schwarz, Gada Badeer, Georgia Swee, Gil Halpern, Govind Thattai, Grant Herman, Grigory Sizov, Guangyi, Zhang, Guna Lakshminarayanan, Hamid Shojanazeri, Han Zou, Hannah Wang, Hanwen Zha, Haroun Habeeb, Harrison Rudolph, Helen Suk, Henry Aspegren, Hunter Goldman, Ibrahim Damlaj, Igor Molybog, Igor Tufanov, Irina-Elena Veliche, Itai Gat, Jake Weissman, James +Geboski, James Kohli, Japhet Asher, Jean-Baptiste Gaya, Jeff Marcus, Jeff Tang, Jennifer Chan, Jenny Zhen, Jeremy Reizenstein, Jeremy Teboul, Jessica Zhong, Jian Jin, Jingyi Yang, Joe Cummings, Jon Carvill, Jon Shepard, Jonathan McPhie, Jonathan Torres, Josh Ginsburg, Junjie Wang, Kai Wu, Kam Hou U, Karan Saxena, Karthik Prasad, Kartikay Khandelwal, Katayoun Zand, Kathy Matosich, Kaushik Veeraraghavan, Kelly Michelena, Keqian Li, Kun Huang, Kunal Chawla, Kushal Lakhotia, Kyle Huang, Lailin Chen, Lakshya 
Garg, Lavender A, Leandro Silva, Lee Bell, Lei Zhang, Liangpeng Guo, Licheng Yu, Liron Moshkovich, Luca Wehrstedt, Madian Khabsa, Manav Avalani, Manish Bhatt, Maria Tsimpoukelli, Martynas Mankus, Matan Hasson, Matthew Lennie, Matthias Reso, Maxim Groshev, Maxim Naumov, Maya Lathi, Meghan Keneally, Michael L. Seltzer, Michal Valko, Michelle Restrepo, Mihir Patel, Mik Vyatskov, Mikayel Samvelyan, Mike Clark, Mike Macey, Mike Wang, Miquel Jubert Hermoso, Mo Metanat, Mohammad Rastegari, Munish Bansal, Nandhini +Santhanam, Natascha Parks, Natasha White, Navyata Bawa, Nayan Singhal, Nick Egebo, Nicolas Usunier, Nikolay Pavlovich Laptev, Ning Dong, Ning Zhang, Norman Cheng, Oleg Chernoguz, Olivia Hart, Omkar Salpekar, Ozlem Kalinli, Parkin Kent, Parth Parekh, Paul Saab, Pavan Balaji, Pedro Rittner, Philip Bontrager, Pierre Roux, Piotr Dollar, Polina Zvyagina, Prashant Ratanchandani, Pritish Yuvraj, Qian Liang, Rachad Alao, Rachel Rodriguez, Rafi Ayub, Raghotham Murthy, Raghu Nayani, Rahul Mitra, Raymond Li, Rebekkah Hogan, Robin Battey, Rocky Wang, Rohan Maheswari, Russ Howes, Ruty Rinott, Sai Jayesh Bondu, Samyak Datta, Sara Chugh, Sara Hunt, Sargun Dhillon, Sasha Sidorov, Satadru Pan, Saurabh Verma, Seiji Yamamoto, Sharadh Ramaswamy, Shaun Lindsay, Shaun Lindsay, Sheng Feng, Shenghao Lin, Shengxin Cindy Zha, Shiva Shankar, Shuqiang Zhang, Shuqiang Zhang, Sinong Wang, Sneha Agarwal, Soji Sajuyigbe, Soumith Chintala, Stephanie Max, Stephen Chen, Steve Kehoe, Steve Satterfield, Sudarshan Govindaprasad, Sumit Gupta, +Sungmin Cho, Sunny Virk, Suraj Subramanian, Sy Choudhury, Sydney Goldman, Tal Remez, Tamar Glaser, Tamara Best, Thilo Kohler, Thomas Robinson, Tianhe Li, Tianjun Zhang, Tim Matthews, Timothy Chou, Tzook Shaked, Varun Vontimitta, Victoria Ajayi, Victoria Montanez, Vijai Mohan, Vinay Satish Kumar, Vishal Mangla, Vítor Albiero, Vlad Ionescu, Vlad Poenaru, Vlad Tiberiu Mihailescu, Vladimir Ivanov, Wei Li, Wenchen Wang, Wenwen Jiang, Wes Bouaziz, Will Constable, 
Xiaocheng Tang, Xiaofang Wang, Xiaojian Wu, Xiaolan Wang, Xide Xia, Xilun Wu, Xinbo Gao, Yanjun Chen, Ye Hu, Ye Jia, Ye Qi, Yenda Li, Yilin Zhang, Ying Zhang, Yossi Adi, Youngjin Nam, Yu, Wang, Yuchen Hao, Yundi Qian, Yuzi He, Zach Rait, Zachary DeVito, Zef Rosnbrick, Zhaoduo Wen, Zhenyu Yang, and Zhiwei Zhao. +The llama 3 herd of models, 2024. +URL +https://arxiv.org/abs/2407.21783 +. +Faldor et al. (2024) +Maxence Faldor, Jenny Zhang, Antoine Cully, and Jeff Clune. +Omni-epic: Open-endedness via models of human notions of interestingness with environments programmed in code, 2024. +URL +https://arxiv.org/abs/2405.15568 +. +Fang et al. (2024) +Richard Fang, Rohan Bindu, Akul Gupta, Qiusi Zhan, and Daniel Kang. +Teams of llm agents can exploit zero-day vulnerabilities, 2024. +URL +https://arxiv.org/abs/2406.01637 +. +Fawzi et al. (2022) +A. Fawzi, M. Balog, A. Huang, T. Hubert, B. Romera-Paredes, M. Barekatain, A. Novikov, F. J. R. Ruiz, J. Schrittwieser, G. Swirszcz, D. Silver, D. Hassabis, and P. Kohli. +Discovering faster matrix multiplication algorithms with reinforcement learning. +Nature +, 610(7930):47–53, 2022. +doi: +10.1038/s41586-022-05172-4 +. +Hart et al. (1968) +Peter E. Hart, Nils J. Nilsson, and Bertram Raphael. +A formal basis for the heuristic determination of minimum cost paths. +IEEE Trans. Syst. Sci. Cybern. +, 4(2):100–107, 1968. +doi: +10.1109/TSSC.1968.300136 +. +URL +https://doi.org/10.1109/TSSC.1968.300136 +. +Hu et al. (2024) +Shengran Hu, Cong Lu, and Jeff Clune. +Automated design of agentic systems, 2024. +URL +https://arxiv.org/abs/2408.08435 +. +Jimenez et al. (2024) +Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan. +Swe-bench: Can language models resolve real-world github issues?, 2024. +URL +https://arxiv.org/abs/2310.06770 +. +Jumper et al. (2021) +J. Jumper, R. Evans, A. Pritzel, T. Green, M. Figurnov, O. Ronneberger, K. Tunyasuvunakool, R. Bates, A. Žídek, A. Potapenko, A. 
Bridgland, C. Meyer, S. A. A. Kohl, A. J. Ballard, A. Cowie, B. Romera-Paredes, S. Nikolov, R. Jain, J. Adler, T. Back, S. Petersen, D. Reiman, E. Clancy, M. Zielinski, M. Steinegger, M. Pacholska, T. Berghammer, S. Bodenstein, D. Silver, O. Vinyals, A. W. Senior, K. Kavukcuoglu, P. Kohli, and D. Hassabis. +Highly accurate protein structure prediction with AlphaFold. +Nature +, 596(7873):583–589, 2021. +doi: +10.1038/s41586-021-03819-2 +. +Kahneman (2011) +Daniel Kahneman. +Thinking, fast and slow +. +Farrar, Straus and Giroux, New York, NY, US, 2011. +ISBN 978-0-374-27563-1. +Khan et al. (2024) +Akbir Khan, John Hughes, Dan Valentine, Laura Ruis, Kshitij Sachan, Ansh Radhakrishnan, Edward Grefenstette, Samuel R. Bowman, Tim Rocktäschel, and Ethan Perez. +Debating with more persuasive llms leads to more truthful answers, 2024. +URL +https://arxiv.org/abs/2402.06782 +. +Kocsis & Szepesvári (2006) +Levente Kocsis and Csaba Szepesvári. +Bandit based monte-carlo planning. +In Johannes Fürnkranz, Tobias Scheffer, and Myra Spiliopoulou (eds.), +Machine Learning: ECML 2006 +, pp.  282–293, Berlin, Heidelberg, 2006. Springer Berlin Heidelberg. +ISBN 978-3-540-46056-5. +Koh et al. (2024) +Jing Yu Koh, Stephen McAleer, Daniel Fried, and Ruslan Salakhutdinov. +Tree search for language model agents, 2024. +URL +https://arxiv.org/abs/2407.01476 +. +Li et al. (2022) +Yujia Li, David Choi, Junyoung Chung, Nate Kushman, Julian Schrittwieser, Rémi Leblond, Tom Eccles, James Keeling, Felix Gimeno, Agustin Dal Lago, Thomas Hubert, Peter Choy, Cyprien de Masson d’Autume, Igor Babuschkin, Xinyun Chen, Po-Sen Huang, Johannes Welbl, Sven Gowal, Alexey Cherepanov, James Molloy, Daniel J. Mankowitz, Esme Sutherland Robson, Pushmeet Kohli, Nando de Freitas, Koray Kavukcuoglu, and Oriol Vinyals. +Competition-level code generation with alphacode. +Science +, 378(6624):1092–1097, 2022. +doi: +10.1126/science.abq1158 +. +URL +https://www.science.org/doi/abs/10.1126/science.abq1158 +. +Lu et al. 
(2024a) +Chris Lu, Cong Lu, Robert Tjarko Lange, Jakob Foerster, Jeff Clune, and David Ha. +The ai scientist: Towards fully automated open-ended scientific discovery, 2024a. +URL +https://arxiv.org/abs/2408.06292 +. +Lu et al. (2024b) +Cong Lu, Shengran Hu, and Jeff Clune. +Intelligent go-explore: Standing on the shoulders of giant foundation models, 2024b. +URL +https://arxiv.org/abs/2405.15143 +. +Ma et al. (2024a) +Yecheng Jason Ma, William Liang, Guanzhi Wang, De-An Huang, Osbert Bastani, Dinesh Jayaraman, Yuke Zhu, Linxi Fan, and Anima Anandkumar. +Eureka: Human-level reward design via coding large language models, 2024a. +URL +https://arxiv.org/abs/2310.12931 +. +Ma et al. (2024b) +Yingwei Ma, Qingping Yang, Rongyu Cao, Binhua Li, Fei Huang, and Yongbin Li. +How to understand whole software repository?, 2024b. +URL +https://arxiv.org/abs/2406.01422 +. +Moore (1959) +E.F. Moore. +The Shortest Path Through a Maze +. +Bell Telephone System. Technical publications. monograph. Bell Telephone System., 1959. +URL +https://books.google.com/books?id=IVZBHAAACAAJ +. +OpenAI (2024) +OpenAI. +OpenAI o1 System Card, September 2024. +URL +https://openai.com/research/o1-system-card +. +Online report. +OpenAI et al. 
(2024) +OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo +Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, 
Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan +Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O’Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, +Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. 
Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia +Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. +Gpt-4 technical report, 2024. +URL +https://arxiv.org/abs/2303.08774 +. +Örwall (2024) +Albert Örwall. +Moatless tools, jun 2024. +URL +https://github.com/aorwall/moatless-tools +. +Accessed: 2024-07-16. +Ouyang et al. (2022) +Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. +Training language models to follow instructions with human feedback, 2022. +URL +https://arxiv.org/abs/2203.02155 +. +Pan et al. (2023) +Liangming Pan, Alon Albalak, Xinyi Wang, and William Yang Wang. +Logic-lm: Empowering large language models with symbolic solvers for faithful logical reasoning, 2023. +URL +https://arxiv.org/abs/2305.12295 +. +Rigaki et al. (2024) +Maria Rigaki, Carlos Catania, and Sebastian Garcia. +Hackphyr: A local fine-tuned llm agent for network security environments, 2024. +URL +https://arxiv.org/abs/2409.11276 +. +Saha et al. (2024) +Swarnadeep Saha, Archiki Prasad, Justin Chih-Yao Chen, Peter Hase, Elias Stengel-Eskin, and Mohit Bansal. +System-1.x: Learning to balance fast and slow planning with language models, 2024. +URL +https://arxiv.org/abs/2407.14414 +. +Silver et al. 
(2016a) +David Silver, Aja Huang, Chris J. Maddison, Arthur Guez, Laurent Sifre, George van den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, Sander Dieleman, Dominik Grewe, John Nham, Nal Kalchbrenner, Ilya Sutskever, Timothy Lillicrap, Madeleine Leach, Koray Kavukcuoglu, Thore Graepel, and Demis Hassabis. +Mastering the game of go with deep neural networks and tree search. +Nature +, 529(7587):484–489, 1 2016a. +ISSN 1476-4687. +doi: +10.1038/nature16961 +. +URL +https://doi.org/10.1038/nature16961 +. +Silver et al. (2016b) +David Silver, Aja Huang, Chris J. Maddison, Arthur Guez, Laurent Sifre, George van den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Vedavyas Panneershelvam, Marc Lanctot, Sander Dieleman, Dominik Grewe, John Nham, Nal Kalchbrenner, Ilya Sutskever, Timothy P. Lillicrap, Madeleine Leach, Koray Kavukcuoglu, Thore Graepel, and Demis Hassabis. +Mastering the game of go with deep neural networks and tree search. +Nat. +, 529(7587):484–489, 2016b. +doi: +10.1038/NATURE16961 +. +URL +https://doi.org/10.1038/nature16961 +. +Silver et al. (2018) +David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore Graepel, Timothy Lillicrap, Karen Simonyan, and Demis Hassabis. +A general reinforcement learning algorithm that masters chess, shogi, and go through self-play. +Science +, 362(6419):1140–1144, 2018. +doi: +10.1126/science.aar6404 +. +URL +https://www.science.org/doi/abs/10.1126/science.aar6404 +. +Snell et al. (2024) +Charlie Snell, Jaehoon Lee, Kelvin Xu, and Aviral Kumar. +Scaling llm test-time compute optimally can be more effective than scaling model parameters, 2024. +URL +https://arxiv.org/abs/2408.03314 +. +Wang et al. (2023) +Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, and Anima Anandkumar. 
+Voyager: An open-ended embodied agent with large language models, 2023. +URL +https://arxiv.org/abs/2305.16291 +. +Wang et al. (2024a) +Xingyao Wang, Yangyi Chen, Lifan Yuan, Yizhe Zhang, Yunzhu Li, Hao Peng, and Heng Ji. +Executable code actions elicit better llm agents, 2024a. +URL +https://arxiv.org/abs/2402.01030 +. +Wang et al. (2024b) +Xingyao Wang, Boxuan Li, Yufan Song, Frank F. Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, Hoang H. Tran, Fuqiang Li, Ren Ma, Mingzhang Zheng, Bill Qian, Yanjun Shao, Niklas Muennighoff, Yizhe Zhang, Binyuan Hui, Junyang Lin, Robert Brennan, Hao Peng, Heng Ji, and Graham Neubig. +Opendevin: An open platform for ai software developers as generalist agents, 2024b. +URL +https://arxiv.org/abs/2407.16741 +. +Wei et al. (2022) +Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, et al. +Emergent abilities of large language models. +arXiv preprint arXiv:2206.07682 +, 2022. +Xia et al. (2024) +Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang. +Agentless: Demystifying llm-based software engineering agents, 2024. +URL +https://arxiv.org/abs/2407.01489 +. +Yang et al. 
(2024a) +An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, Guanting Dong, Haoran Wei, Huan Lin, Jialong Tang, Jialin Wang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Ma, Jianxin Yang, Jin Xu, Jingren Zhou, Jinze Bai, Jinzheng He, Junyang Lin, Kai Dang, Keming Lu, Keqin Chen, Kexin Yang, Mei Li, Mingfeng Xue, Na Ni, Pei Zhang, Peng Wang, Ru Peng, Rui Men, Ruize Gao, Runji Lin, Shijie Wang, Shuai Bai, Sinan Tan, Tianhang Zhu, Tianhao Li, Tianyu Liu, Wenbin Ge, Xiaodong Deng, Xiaohuan Zhou, Xingzhang Ren, Xinyu Zhang, Xipin Wei, Xuancheng Ren, Xuejing Liu, Yang Fan, Yang Yao, Yichang Zhang, Yu Wan, Yunfei Chu, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, Zhifang Guo, and Zhihao Fan. +Qwen2 technical report, 2024a. +URL +https://arxiv.org/abs/2407.10671 +. +Yang et al. (2024b) +John Yang, Carlos E. Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, Karthik Narasimhan, and Ofir Press. +Swe-agent: Agent-computer interfaces enable automated software engineering, 2024b. +URL +https://arxiv.org/abs/2405.15793 +. +Yao et al. (2023) +Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. +Tree of thoughts: Deliberate problem solving with large language models. +In Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz Hardt, and Sergey Levine (eds.), +Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023 +, 2023. +URL +http://papers.nips.cc/paper_files/paper/2023/hash/271db9922b8d1f4dd7aaef84ed5ac703-Abstract-Conference.html +. +Zhang et al. (2024a) +Kexun Zhang, Weiran Yao, Zuxin Liu, Yihao Feng, Zhiwei Liu, Rithesh Murthy, Tian Lan, Lei Li, Renze Lou, Jiacheng Xu, Bo Pang, Yingbo Zhou, Shelby Heinecke, Silvio Savarese, Huan Wang, and Caiming Xiong. 
+Diversity empowers intelligence: Integrating expertise of software engineering agents, 2024a. +URL +https://arxiv.org/abs/2408.07060 +. +Zhang et al. (2024b) +Yao Zhang, Zijian Ma, Yunpu Ma, Zhen Han, Yu Wu, and Volker Tresp. +Webpilot: A versatile and autonomous multi-agent system for web task execution with strategic exploration, 2024b. +URL +https://arxiv.org/abs/2408.15978 +. +Zhang et al. (2024c) +Yiqun Zhang, Xiaocui Yang, Shi Feng, Daling Wang, Yifei Zhang, and Kaisong Song. +Can llms beat humans in debating? a dynamic multi-agent framework for competitive debate, 2024c. +URL +https://arxiv.org/abs/2408.04472 +. +Zhang et al. (2024d) +Yuntong Zhang, Haifeng Ruan, Zhiyu Fan, and Abhik Roychoudhury. +Autocoderover: Autonomous program improvement, 2024d. +URL +https://arxiv.org/abs/2404.05427 +. +Appendix A +Reproducibility +All models and data used in our work are publicly available. We additionally provide hyperparameter details in +Appendix +2 +. The code will be released as a public repository upon publication. +Appendix B +Additional Implementation Details +Moatless-adapted is an extended version of the moatless-tools library with support for a tree structure, the ability to revert to earlier versions of the codebase, and the capability to run tests. +The standard implementation of moatless-tools is based on a finite state machine structure where a state holds information about file context and properties set in the configuration or from previous states. It can then transition to a new state when an action is executed. The request that initiates the action is created by an LLM. This follows a linear structure where one state can transition to another state. In moatless-adapted, this model is extended so that a state can expand by using actions to create more states. The connections between states are then represented in a tree structure with nodes. +Each state has a file context associated with it. 
This file context will be included in the prompt sent to an LLM. To limit the size of the prompt, files are divided into “spans,” where a span could be, for example, a section of code (e.g., imports), a class, or a function. These are identified by span IDs. Thus, the LLM sees a limited part of the code at a time but can request more context by searching for or adding files and spans. The file context therefore changes over time, and a specific state of file context is linked to a specific state. +In the standard implementation of moatless-tools, changes to the codebase are made linearly, and each change is saved directly to the file system. In moatless-adapted, however, there is a need to be able to revert to earlier states and thus return to a previous version of the codebase. To handle this, the code is stored in a git repository where each change is committed, and each state has a reference to a commit as well as the current patch of the diff from the initial commit that existed before starting. This way, one can go back to an earlier state by specifying the state ID, and the commit that was current at that time will be checked out. +The test files present in the file context are run each time the Plan state is initiated, and the test results are provided to the state. The tests are then run in Docker images built via the SWE-bench library. To use this approach in a benchmark where a larger number of instances should be able to run simultaneously, a solution is used where these images are run as pods in a Kubernetes cluster. Moatless-tools communicates with the testbed by applying patches and running commands via an API. When a new instance starts, a pod is created which is then reset at each run, applying the current patch and running tests according to the test command specified in the SWE-bench library. 
It’s important to add here that the agent is not aware of the +PASS_TO_PASS +or +FAIL_TO_PASS +tests in the SWE-bench harness, but only knows how to run the tests. This corresponds to a real engineering environment where each project can have its own test commands. +Appendix C +MCTS Hyperparameters +The Monte Carlo Tree Search (MCTS) algorithm used in this study employs several hyperparameters. +Table 2: +MCTS Hyperparameters +Hyperparameter +Description +Default +c_param +UCT exploration parameter +1.41 +max_expansions +Max children per node +5 +max_iterations +Max MCTS iterations +100 +provide_feedback +Enable feedback +True +best_first +Use best-first strategy +True +value_function_temperature +Value function temperature +0.2 +max_depth +Max tree depth +20 +UCT Score Calculation Parameters +exploration_weight +UCT exploration weight +1.0 +depth_weight +Depth penalty weight +0.8 +depth_bonus_factor +Depth bonus factor +200.0 +high_value_threshold +High-value node threshold +55.0 +low_value_threshold +Low-value node threshold +50.0 +very_high_value_threshold +Very high-value threshold +75.0 +high_value_leaf_bonus_constant +High-value leaf bonus +20.0 +high_value_bad_children_bonus_constant +High-value bad children bonus +20.0 +high_value_child_penalty_constant +High-value child penalty +5.0 +Action Model Parameters +action_model_temperature +Action model temperature +0.2 +Discriminator Parameters +number_of_agents +Number of Discriminator Agents +5 +number_of_round +Number of debate rounds +3 +discriminator_temperature +Discriminator temperature +1.0 +These hyperparameters can be adjusted to fine-tune the MCTS algorithm’s performance for specific problem domains or computational constraints. The values listed here are the defaults as defined in the +TreeSearchSettings +class and the MCTS implementation. +Appendix D +Ability of MCTS to Escape Unproductive Loops vs. Baseline +Figure 6: +Avoiding Repetitive Actions, django__django__10914. 
+We found that the base agent can often get stuck performing repetitive actions +that do not bring it closer to solving the issue, and which commonly lead to unresolvable dead-ends. In this example, the base agent +was stuck implementing wrong tests which continuously returned errors. In contrast, when this happens in +SWE-Search, the Value Agent recognizes this, terminating these trajectories quickly, +as happens in Node 73 (orange). +Appendix E +Model Instance Resolution Uniqueness +To understand the complementary strengths of different models in resolving software issues, we analyzed how unique their resolved issue subsets were. Figure +7 +illustrates the resolution patterns for each model across five of the codebases in SWE-bench-lite. +Figure 7: +Unique Issue Resolution Patterns Across Models and Libraries. +Each column represents a different Python repository, and each row within a column represents a specific issue. Colored blocks indicate successful resolution by the corresponding model (see legend). White spaces denote unresolved issues. This visualization highlights the diverse problem-solving capabilities of different models across various software domains, demonstrating that no single model dominates across all issues and libraries. +Appendix F +Ability of Value Function to Discern Successful Trajectories +Before implementing SWE-Search, we conducted a general study across many models to evaluate the models’ ability to differentiate states which led to resolved vs. unresolved issues. Figure +8 +shows the results of this study. We found that in general, models assigned higher rewards to states which eventually led to resolved issues. Of particular interest was the Deepseek model, which seemed to identify critical errors in trajectories effectively. This was also observed in the final agent (see Fig. +5 +a). +Figure 8: +Average State Reward Comparison Across Models. 
+This graph compares the average state rewards assigned by different language models for resolved (green) and unresolved (red) issues. Error bars indicate standard deviation. Most models consistently assign higher rewards to states leading to resolved issues, with the exception of the. The ’Average’ column represents the mean across all models, demonstrating a clear distinction between resolved and unresolved states. +Appendix G +Value Function Prompts +◄ +Feeling +lucky? +Conversion +report +Report +an issue +View original +on arXiv +► \ No newline at end of file diff --git a/research/notes/241221139-training-software-engineering-agents-and-verifiers-with-swe-gym.md b/research/notes/241221139-training-software-engineering-agents-and-verifiers-with-swe-gym.md new file mode 100644 index 0000000000000000000000000000000000000000..0cd5f107eee0dd474a43b16695912551d6efcfa5 --- /dev/null +++ b/research/notes/241221139-training-software-engineering-agents-and-verifiers-with-swe-gym.md @@ -0,0 +1,200 @@ +--- +title: '[2412.21139] Training Software Engineering Agents and Verifiers with SWE-Gym' +id: 241221139-training-software-engineering-agents-and-verifiers-with-swe-gym +tags: +- deepread +created: '2026-06-10T00:22:57.639430Z' +source: https://arxiv.org/abs/2412.21139 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:22:57.639309Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2412.21139] Training Software Engineering Agents and Verifiers with SWE-Gym +Computer Science > Software Engineering +arXiv:2412.21139 +(cs) +[Submitted on 30 Dec 2024 ( +v1 +), last revised 6 Jun 2025 (this version, v2)] +Title: +Training Software Engineering Agents and Verifiers with SWE-Gym +Authors: +Jiayi Pan +, +Xingyao Wang +, +Graham Neubig +, +Navdeep Jaitly +, +Heng Ji +, +Alane Suhr +, +Yizhe Zhang +View a PDF of the paper titled Training Software Engineering Agents and Verifiers with SWE-Gym, by Jiayi Pan and 
6 other authors +View PDF +HTML (experimental) +Abstract: +We present SWE-Gym, the first environment for training real-world software engineering (SWE) agents. SWE-Gym contains 2,438 real-world Python task instances, each comprising a codebase with an executable runtime environment, unit tests, and a task specified in natural language. We use SWE-Gym to train language model based SWE agents, achieving up to 19% absolute gains in resolve rate on the popular SWE-Bench Verified and Lite test sets. We also experiment with inference-time scaling through verifiers trained on agent trajectories sampled from SWE-Gym. When combined with our fine-tuned SWE agents, we achieve 32.0% and 26.0% on SWE-Bench Verified and Lite, respectively, reflecting a new state-of-the-art for open-weight SWE agents. To facilitate further research, we publicly release SWE-Gym, models, and agent trajectories. +Comments: +Accepted at ICML 2025. Code at +this https URL +Subjects: +Software Engineering (cs.SE) +; Computation and Language (cs.CL) +Cite as: +arXiv:2412.21139 +[cs.SE] +(or +arXiv:2412.21139v2 +[cs.SE] +for this version) +https://doi.org/10.48550/arXiv.2412.21139 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Jiayi Pan [ +view email +] +[v1] +Mon, 30 Dec 2024 18:15:39 UTC (156 KB) +[v2] +Fri, 6 Jun 2025 07:53:20 UTC (295 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Training Software Engineering Agents and Verifiers with SWE-Gym, by Jiayi Pan and 6 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.SE +< prev +| +next > +new +| +recent +| +2024-12 +Change to browse by: +cs +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? 
+) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250104519-rstar-math-small-llms-can-master-math-reasoning-with-self-evolved-deep.md b/research/notes/250104519-rstar-math-small-llms-can-master-math-reasoning-with-self-evolved-deep.md new file mode 100644 index 0000000000000000000000000000000000000000..ca10638d8e6004c747f968d001164579d7058a06 --- /dev/null +++ b/research/notes/250104519-rstar-math-small-llms-can-master-math-reasoning-with-self-evolved-deep.md @@ -0,0 +1,196 @@ +--- +title: '[2501.04519] rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved + Deep Thinking' +id: 250104519-rstar-math-small-llms-can-master-math-reasoning-with-self-evolved-deep +tags: +- deepread +created: '2026-06-10T00:40:01.597011Z' +source: https://arxiv.org/abs/2501.04519 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:40:01.596873Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2501.04519] rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking +Computer Science > Computation and Language +arXiv:2501.04519 +(cs) +[Submitted on 8 Jan 2025] +Title: +rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking +Authors: +Xinyu Guan +, +Li Lyna Zhang +, +Yifei Liu +, +Ning Shang +, +Youran Sun +, +Yi Zhu +, +Fan Yang +, +Mao Yang +View a PDF of the paper titled rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking, by Xinyu Guan and 7 other authors +View PDF +HTML (experimental) +Abstract: +We present rStar-Math to demonstrate that small language models (SLMs) can rival or even surpass the math reasoning capability of OpenAI o1, without distillation from superior models. rStar-Math achieves this by exercising "deep thinking" through Monte Carlo Tree Search (MCTS), where a math policy SLM performs test-time search guided by an SLM-based process reward model. 
rStar-Math introduces three innovations to tackle the challenges in training the two SLMs: (1) a novel code-augmented CoT data synthesis method, which performs extensive MCTS rollouts to generate step-by-step verified reasoning trajectories used to train the policy SLM; (2) a novel process reward model training method that avoids naïve step-level score annotation, yielding a more effective process preference model (PPM); (3) a self-evolution recipe in which the policy SLM and PPM are built from scratch and iteratively evolved to improve reasoning capabilities. Through 4 rounds of self-evolution with millions of synthesized solutions for 747k math problems, rStar-Math boosts SLMs' math reasoning to state-of-the-art levels. On the MATH benchmark, it improves Qwen2.5-Math-7B from 58.8% to 90.0% and Phi3-mini-3.8B from 41.4% to 86.4%, surpassing o1-preview by +4.5% and +0.9%. On the USA Math Olympiad (AIME), rStar-Math solves an average of 53.3% (8/15) of problems, ranking among the top 20% of the brightest high school math students. Code and data will be available at +this https URL +. +Subjects: +Computation and Language (cs.CL) +Cite as: +arXiv:2501.04519 +[cs.CL] +(or +arXiv:2501.04519v1 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2501.04519 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Li Lyna Zhang [ +view email +] +[v1] +Wed, 8 Jan 2025 14:12:57 UTC (632 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking, by Xinyu Guan and 7 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2025-01 +Change to browse by: +cs +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading...
+Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250104519-sysname-small-llms-can-master-math-reasoning-with-self-evolved-deep-th.md b/research/notes/250104519-sysname-small-llms-can-master-math-reasoning-with-self-evolved-deep-th.md new file mode 100644 index 0000000000000000000000000000000000000000..f13686520ee09bfc2ef3e5b0ebfa3bb8870455fa --- /dev/null +++ b/research/notes/250104519-sysname-small-llms-can-master-math-reasoning-with-self-evolved-deep-th.md @@ -0,0 +1,3557 @@ +--- +title: '[2501.04519] \sysname: Small LLMs Can Master Math Reasoning with Self-Evolved + Deep Thinking' +id: 250104519-sysname-small-llms-can-master-math-reasoning-with-self-evolved-deep-th +tags: +- deepread +created: '2026-06-10T00:40:46.873514Z' +source: https://ar5iv.labs.arxiv.org/html/2501.04519 +source_domain: ar5iv.labs.arxiv.org +fetched_at: '2026-06-10T00:40:46.873327Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2501.04519] \sysname: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking +\sysname +: Small LLMs Can Master Math Reasoning +with Self-Evolved Deep Thinking +Xinyu Guan +∗ +Li Lyna Zhang +∗⋄ +Yifei Liu +Ning Shang   Youran Sun    Yi Zhu    Fan Yang    Mao Yang +Microsoft Research Asia +Abstract +We present +\sysname +to demonstrate that small language models (SLMs) can rival or even surpass the math reasoning capability of OpenAI o1, without distillation from superior models. +\sysname +achieves this by exercising “deep thinking” through Monte Carlo Tree Search (MCTS), where a math +policy SLM +performs test-time search guided by an SLM-based +process reward model +. 
+\sysname +introduces three innovations to tackle the challenges in training the two SLMs: +(1) +a novel code-augmented CoT data sythesis method, which performs extensive MCTS rollouts to generate +step-by-step verified reasoning trajectories +used to train the policy SLM; +(2) +a novel process reward model training method that avoids naïve step-level score annotation, yielding a more effective +process preference model (PPM) +; +(3) +a +self-evolution recipe +in which the policy SLM and PPM are built from scratch and iteratively evolved to improve reasoning capabilities. +Through 4 rounds of self-evolution with millions of synthesized solutions for 747k math problems, +\sysname +boosts SLMs’ math reasoning to state-of-the-art levels. On the MATH benchmark, it improves Qwen2.5-Math-7B from 58.8% to 90.0% and Phi3-mini-3.8B from 41.4% to 86.4%, surpassing o1-preview by +4.5% and +0.9%. On the USA Math Olympiad (AIME), +\sysname +solves an average of 53.3% (8/15) of problems, ranking among the top 20% the brightest high school math students. Code and data will be available at +https://github.com/microsoft/rStar +. +Task +(pass@1 Acc) +rStar-Math +(Qwen-7B) +rStar-Math +(Qwen-1.5B) +rStar-Math +(Phi3-mini) +OpenAI +o1-preview +OpenAI +o1-mini +QWQ +32B-preview +GPT-4o +DeepSeek-V3 +MATH +90.0 +88.6 +86.4 +85.5 +90.0 +90.6 +76.6 +90.2 +AIME 2024 +53.3 +46.7 +43.3 +44.6 +56.7 +50.0 +9.3 +39.2 +Olympiad Bench +65.6 +64.6 +60.3 +- +65.3 +61.2 +43.3 +55.4 +College Math +60.5 +59.3 +59.1 +- +57.8 +55.8 +48.5 +58.9 +Omni-Math +50.5 +48.5 +46.0 +52.5 +60.5 +49.6 +30.5 +35.9 +Table 1: +\sysname +enables frontier math reasoning in SLMs via deep thinking over 64 trajectories. +$*$ +$*$ +footnotetext: +Equal contribution. +$\diamond$ +$\diamond$ +footnotetext: +Project leader; correspondence to lzhani@microsoft.com +$\S$ +$\S$ +footnotetext: +Xinyu Guan and Youran Sun did this work during the internship at MSRA. 
Xinyu Guan (2001gxy@gmail.com) is with Peking University, Youran Sun is with Tsinghua University. +1 +Introduction +Recent studies have demonstrated that large language models (LLMs) are capable of tackling mathematical problems +(Team, +2024a +; Yang et al., +2024 +; OpenAI, +2024 +; Liu et al., +2024 +) +. However, the conventional approach of having LLMs generate complete solutions in a single inference – akin to System 1 thinking +(Daniel, +2011 +) +– often yields fast but error-prone results +(Valmeekam et al., +2023 +; OpenAI, +2023 +) +. In response, test-time compute scaling +(Snell et al., +2024 +; Qi et al., +2024 +) +suggests a paradigm shift toward a System 2-style thinking, which emulates human reasoning through a slower and deeper thought process. In this paradigm, an LLM serves as a policy model to generate multiple math reasoning steps, which are then evaluated by another LLM acting as a reward model +(OpenAI, +2024 +) +. The steps and solutions deemed more likely to be correct are selected. The process repeats iteratively and ultimately derives the final answer. +In the test-time compute paradigm, the key is to train a powerful policy model that generates promising solution steps and a reliable reward model that accurately evaluates them, both of which depend on +high-quality +training data. Unfortunately, it is well-known that off-the-shelf high-quality math reasoning data is scarce, and synthesizing high-quality math data faces fundamental challenges. +For the policy model, it is challenging to distinguish erroneous reasoning steps from the correct ones, complicating the elimination of low-quality data. It is worth noting that in math reasoning, a correct final answer does not ensure the correctness of the entire reasoning trace +(Lanham et al., +2023 +) +. Incorrect intermediate steps significantly decrease data quality. 
+As for the reward model, process reward modeling (PRM) shows a great potential by providing fine-grained feedback on intermediate steps +(Lightman et al., +2023 +) +. However, the training data is even scarcer in this regard: accurate step-by-step feedback requires intense human labeling efforts and is impractical to scale, while those automatic annotation attempts show limited gains due to noisy reward scores +(Luo et al., +2024 +; Wang et al., +2024c +; Chen et al., +2024 +) +. +Due to the above challenges, existing distill-based data synthesis approaches to training policy models, e.g., scaling up GPT4-distilled CoT data +(Tang et al., +2024 +; Huang et al., +2024 +) +, have shown diminishing returns and cannot exceed the capability of their teacher model; meanwhile, as of today, training reliable PRMs for math reasoning remains an open question. +Figure 1: +The overview of +\sysname +. +In this work, we introduce +\sysname +, a self-evolvable System 2-style reasoning approach that achieves the state-of-the-art math reasoning, rivaling and sometimes even surpassing OpenAI o1 on challenging math competition benchmarks with a model size as small as 7 billion. Unlike solutions relying on superior LLMs for data synthesis, +\sysname +leverages smaller language models (SLMs) with Monte Carlo Tree Search (MCTS) to establish a self-evolutionary process, iteratively generating higher-quality training data. To achieve self-evolution, +\sysname +introduces three key innovations. +First, a novel code-augmented CoT data synthesis method, which performs +extensive +MCTS rollouts to generate +step-by-step verified reasoning trajectories +with +self-annotated MCTS Q-values +. Specifically, math problem-solving is decomposed into multi-step generation within MCTS. At each step, the SLM serving as the policy model samples candidate nodes, each generating a one-step CoT and the corresponding Python code. 
To verify the generation quality, only nodes with successful Python code execution are retained, thus mitigating errors in intermediate steps. Moreover, extensive MCTS rollouts automatically assign a Q-value to each intermediate step based on its contribution: steps contributing to more trajectories that lead to the correct answer are given higher Q-values and considered higher quality. This ensures that the reasoning trajectories generated by SLMs consist of correct, high-quality intermediate steps. +Second, a novel method that trains an SLM acting as a +process preference model +, i.e., a PPM to implement the desired PRM, that reliably predicts a reward label for each math reasoning step. The PPM leverages the fact that, although Q-values are still not precise enough to score each reasoning step despite using extensive MCTS rollouts, the Q-values can reliably distinguish positive (correct) steps from negative (irrelevant/incorrect) ones. Thus the training method constructs preference pairs for each step based on Q-values and uses a pairwise ranking loss +(Ouyang et al., +2022 +) +to optimize PPM’s score prediction for each reasoning step, achieving reliable labeling. This approach avoids conventional methods that directly use Q-values as reward labels +(Luo et al., +2024 +; Chen et al., +2024 +) +, which are inherently noisy and imprecise in stepwise reward assignment. +Finally, a four-round self-evolution recipe that progressively builds both a frontier policy model and PPM from scratch. We begin by curating a dataset of 747k math word problems from publicly available sources. In each round, we use the latest policy model and PPM to perform MCTS, generating increasingly high-quality training data using the above two methods to train a stronger policy model and PPM for next round. 
Each round achieves progressive refinement: (1) a stronger policy SLM, (2) a more reliable PPM, (3) generating better reasoning trajectories via PPM-augmented MCTS, and (4) improving training data coverage to tackle more challenging and even competition-level math problems. +Extensive experiments across four SLMs (1.5B-7B) and seven math reasoning tasks demonstrate the effectiveness of +\sysname +. Remarkably, +\sysname +improves all four SLMs, matching or even surpassing OpenAI o1 on challenging math benchmarks. On MATH benchmark, with 8 search trajectories, +\sysname +boosts Qwen2.5-Math-7B from 58.8% to 89.4% and Qwen2.5-Math-1.5B from 51.2% to 87.8%. With 64 trajectories, the scores rise to 90% and 88.4%, outperforming o1-preview by 4.5% and 2.6% and matching o1-mini’s 90%. On the Olympiad-level AIME 2024, +\sysname +solves on average 53.3% (8/15) of the problems, exceeding o1-preview by 8.7% and all other open-sourced LLMs. We further conduct comprehensive experiments to verify the superiority of step-by-step verified reasoning trajectories over state-of-the-art data synthesis baselines, as well as the PPM’s effectiveness compared to outcome reward models and Q value-based PRMs. Finally, we present key findings from +\sysname +deep thinking, including the intrinsic self-reflection capability and PPM’s preference for theorem-applications intermediate steps. +2 +Related Works +Math Data Synthesis +. Advancements in LLM math reasoning have largely relied on curating high-quality CoT data, with most leading approaches being GPT-distilled, using frontier models like GPT-4 for synthesis +(Wang et al., +2024b +; Gou et al., +2023 +; Luo et al., +2023 +) +. Notable works include NuminaMath +(Jia LI and Polu, +2024a +) +and +MetaMath +(Yu et al., +2023b +) +. While effective, this limits reasoning to the capabilities of the teacher LLM. +Hard problems that the teacher LLM cannot solve are excluded in the training set. 
+Even solvable problems may contain error-prone intermediate steps, which are hard to detect. Although rejection sampling methods +(Yuan et al., +2023 +; Brown et al., +2024 +) +can improve data quality, +they do not guarantee correct intermediate steps. As a result, scaling up CoT data has diminishing returns, with gains nearing saturation—e.g., OpenMathInstruct-2 +(Toshniwal et al., +2024 +) +only sees a 3.9% boost on MATH despite an 8× increase in dataset size. +Scaling Test-time Compute +has introduced new scaling laws, allowing LLMs to improve performance across by generating multiple samples and using reward models for best-solution selection +(Snell et al., +2024 +; Wu et al., +2024 +; Brown et al., +2024 +) +. Various test-time search methods have been proposed +(Kang et al., +2024 +; Wang et al., +2024a +) +, including random sampling +(Wang et al., +2023 +) +and tree-search methods +(Yao et al., +2024 +; Hao et al., +2023 +; Zhang et al., +2024b +; Qi et al., +2024 +) +like MCTS. However, open-source methods for scaling test-time computation have shown limited gains in math reasoning, often due to policy LLM or reward model limitations. +\sysname +addresses this by iteratively evolving the policy LLM and reward model, achieving System 2 mathematical reasoning performance comparable to OpenAI o1 +(OpenAI, +2024 +) +. +Reward Models +are crucial for effective System 2 reasoning but are challenging to obtain. Recent works include LLM-as-a-Judge for verification +(Zheng et al., +2023 +; Qi et al., +2024 +) +and specialized reward models like Outcome Reward Model +(Yang et al., +2024 +; Yu et al., +2023a +) +and Process Reward Model (PRM) +(Lightman et al., +2024 +) +. While PRMs offer promising dense, step-level reward signals for +complex reasoning +(Luo et al., +2024 +; Wang et al., +2024c +) +, collecting step-level annotations remains an obstacle. While +Kang et al. ( +2024 +); Wang et al. 
( +2024a +) +rely on costly human-annotated datasets like PRM800k +(Lightman et al., +2024 +) +, +recent approaches +(Wang et al., +2024c +; Luo et al., +2024 +) +explore automated annotation via Monte Carlo Sampling or MCTS. However, they struggle to generate precise reward scores, which limits performance gains. +\sysname +introduces a novel process preference model (PPM) that eliminates the need for accurate step-level reward score annotation. +3 +Methodology +3.1 +Design Choices +MCTS for Effective System 2 Reasoning +. +We aim to train a math policy SLM and a process reward model (PRM), and integrate both within Monte Carlo Tree Search (MCTS) for System 2 deep thinking. MCTS is chosen for two key reasons. First, it breaks down complex math problems into simpler single-step generation tasks, reducing the difficulty for the policy SLM compared to other System 2 methods like Best-of-N +(Brown et al., +2024 +) +or self-consistency +(Wang et al., +2023 +) +, which require generating full solutions in one inference. +Second, the step-by-step generation in MCTS naturally yields step-level training data for both models. Standard MCTS rollouts automatically assign a Q-value to each step based on its contribution to the final correct answer, obviating the need for human-generated step-level annotations for process reward model training. +Ideally, advanced LLMs such as GPT-4 could be integrated within MCTS to generate training data. However, this approach faces two key challenges. First, even these powerful models struggle to consistently solve difficult problems, such as Olympiad-level mathematics. Consequently, the resulting training data would primarily consist of simpler solvable problems, limiting its diversity and quality. Second, annotating per-step Q-values demands extensive MCTS rollouts; insufficient tree exploration can lead to spurious Q-value assignments, such as overestimating suboptimal steps.
Given that each rollout involves multiple single-step generations and these models are computationally expensive, increasing rollouts significantly raises inference costs. +Overview +. To this end, we explore using two 7B SLMs (a policy SLM and a PRM) to generate higher-quality training data, with their smaller size allowing for extensive MCTS rollouts on accessible hardware (e.g., 4$\times$40GB A100 GPUs). However, self-generating data presents greater challenges for SLMs, due to their weaker capabilities. +SLMs frequently fail to generate correct solutions, and even when the final answer is correct, the intermediate steps are often flawed or of poor quality. Moreover, SLMs solve fewer challenging problems compared to advanced models like GPT-4. +This section introduces our methodology, as illustrated in Fig. +1 +. To mitigate errors and low-quality intermediate steps, we introduce a code-augmented CoT synthesis method, which performs extensive MCTS rollouts to generate step-by-step verified reasoning trajectories, annotated with Q-values. To further improve SLM performance on challenging problems, we introduce a four-round self-evolution recipe. In each round, both the policy SLM and the reward model are updated to stronger versions, progressively tackling more difficult problems and generating higher-quality training data. Finally, we present a novel process reward model training approach that eliminates the need for precise per-step reward annotations, yielding the more +effective process preference model (PPM). +3.2 +Step-by-Step Verified Reasoning Trajectory +We start by introducing our method for generating step-by-step verified reasoning trajectories with per-step Q-value annotations. Given a problem +$x$ +and a policy model +$M$ +, we run the standard MCTS to incrementally construct a search tree for step-by-step solution exploration. As shown in Fig.
+1 +(a), +the root node represents question +$x$ +, while child nodes correspond to intermediate steps +$s$ +generated by +$M$ +. A root-to-leaf path ending at terminal node +$s_{d}$ +forms a trajectory +$\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus\dots\oplus s_{d}$ +, with each step +$s_{i}$ +assigned a Q-value +$Q(s_{i})$ +. +From the search tree +$\mathcal{T}$ +, we extract solution trajectories +$\mathbb{T}=\{\mathbf{t}^{1},\mathbf{t}^{2},\dots,\mathbf{t}^{n}\}\ (n\geq 1)$ +. Our goal is to select high-quality trajectories from +$\mathcal{T}$ +to construct the training set. For this purpose, we introduce a code-augmented CoT synthesis method to filter out low-quality generations and perform extensive rollouts to improve the reliability of Q-value accuracy. +Code-augmented CoT Generation +. Prior MCTS approaches primarily generate natural language (NL) CoTs +(Qi et al., +2024 +; Zhang et al., +2024a +) +. However, LLMs often suffer from hallucination, producing incorrect or irrelevant steps yet still arriving at the correct answer by chance +(Lanham et al., +2023 +) +. These flawed steps are challenging to detect and eliminate. To address this, we propose a novel code execution augmented CoT. As shown in Fig. +2 +, the policy model generates a one-step NL CoT alongside its corresponding Python code, where the NL CoT is embedded as a Python comment. Only generations with successfully executed Python code are retained as valid candidates. +Figure 2: +An example of Code-augmented CoT. +Specifically, starting from the initial root node +$x$ +, we perform multiple MCTS iterations through +selection +, +expansion +, +rollout +, and +back-propagation +. At step +$i$ +, we collect the latest reasoning trajectory +$x\oplus s_{1}\oplus s_{2}\oplus\dots\oplus s_{i-1}$ +as the current state.
Based on this state, we prompt (see Appendix +A.3 +) the policy model to generate +$n$ +candidates +$s_{i,0},\dots,s_{i,n-1}$ +for step +$i$ +. Python code execution is then employed to filter valid nodes. As shown in Fig. +2 +, each generation +$s_{i,j}$ +is concatenated with the code from all previous steps, forming +$s_{1}\oplus s_{2}\oplus\dots\oplus s_{i-1}\oplus s_{i,j}$ +. Candidates that execute successfully are retained as valid nodes and scored by the PPM, which assigns a Q-value +$q(s_{i})$ +. +Then, we use the well-known Upper Confidence bounds for Trees (UCT) +(Kocsis and Szepesvári, +2006 +) +to select the best node among the +$n$ +candidates. This selection process is mathematically represented as: +$$\text{UCT}(s)=Q(s)+c\sqrt{\frac{\ln N_{\text{parent}}(s)}{N(s)}};\quad\text{where}\quad Q(s)=\frac{q(s)}{N(s)}\tag{1}$$ +where +$N(s)$ +denotes the number of visits to node +$s$ +, and +$N_{\text{parent}}(s)$ +is the visit count of +$s$ +’s parent node. The predicted reward +$q(s)$ +is provided by the PPM and will be updated through back-propagation. +$c$ +is a constant that balances exploitation and exploration. +Extensive Rollouts for Q-value Annotation +. Accurate Q-value +$Q(s)$ +annotation in Eq. +1 +is crucial for guiding MCTS node selection towards correct problem-solving paths and identifying high-quality steps within trajectories. +To improve Q-value reliability, we draw inspiration from Go players, who retrospectively evaluate the reward of each move based on game outcomes. Although initial estimates may be imprecise, repeated gameplay refines these evaluations over time.
Similarly, in each rollout, we update the Q-value of each step based on its contribution to achieving the correct final answer. After extensive MCTS rollouts, steps consistently leading to correct answers achieve higher Q-values, occasional successes yield moderate Q-values, and consistently incorrect steps receive low Q-values. Specifically, we introduce two self-annotation methods to obtain these step-level Q-values. Fig. +1 +(c) shows the detailed setting in the four rounds of self-evolution. +Terminal-guided annotation +. During the first two rounds, when the PPM is unavailable or insufficiently accurate, we use terminal-guided annotation. Formally, let +$q(s_{i})^{k}$ +denote the q value for step +$s_{i}$ +after back-propagation in the +$k^{th}$ +rollout. Following AlphaGo +(Silver et al., +2017 +) +and rStar +(Qi et al., +2024 +) +, we score each intermediate node based on its contribution to the final correct answer: +$$q(s_{i})^{k}=q(s_{i})^{k-1}+q(s_{d})^{k};\tag{2}$$ +where the initial q value +$q(s_{i})^{0}=0$ +in the first rollout. If this step frequently leads to a correct answer, its +$q$ +value will increase; otherwise, it decreases. Terminal nodes are scored as +$q(s_{d})=1$ +for correct answers and +$q(s_{d})=-1$ +otherwise, as shown in Fig. +1 +. +PRM-augmented annotation +. Starting from the third round, we use PPM to score each step for more effective generation. Compared to terminal-guided annotation, which requires multiple rollouts for a meaningful +$q$ +value, PPM directly predicts a non-zero initial +$q$ +value. +PPM-augmented MCTS also helps the policy model to generate higher-quality steps, guiding solutions towards correct paths.
Formally, for step +$s_{i}$ +, PPM predicts an initial +$q(s_{i})^{0}$ +value based on the partial trajectory: +$$q(s_{i})^{0}=\text{PPM}(x\oplus s_{1}\oplus s_{2}\oplus\dots\oplus s_{i-1}\oplus s_{i})\tag{3}$$ +This +$q$ +value will be updated based on terminal node’s +$q(s_{d})$ +value through MCTS +back-propagation +in Eq. +2 +. +For terminal node +$s_{d}$ +, we do not use PRM for scoring during training data generation. Instead, we assign a more accurate score based on ground truth labels as terminal-guided rewarding. +3.3 +Process Preference Model +Process reward models, which provide granular step-level reward signals, are highly desirable for solving challenging math problems. However, obtaining high-quality step-level training data remains an open challenge. Existing methods rely on human annotations +(Lightman et al., +2023 +) +or MCTS-generated scores +(Zhang et al., +2024a +; Chen et al., +2024 +) +to assign a score for each step. These scores then serve as training targets, with methods such as MSE loss +(Chen et al., +2024 +) +or pointwise loss +(Wang et al., +2024c +; Luo et al., +2024 +; Zhang et al., +2024a +) +used to minimize the difference between predicted and labeled scores. +As a result, the precision of these annotated step-level reward scores directly determines the effectiveness of the resulting process reward model. +Unfortunately, precise per-step scoring remains an unsolved challenge. Although our extensive MCTS rollouts improve the reliability of Q-values, precisely evaluating fine-grained step quality presents a major obstacle. For instance, among a set of correct steps, it is difficult to rank them as best, second-best, or average and then assign precise scores. Similarly, among incorrect steps, differentiating the worst from moderately poor steps poses analogous challenges.
Even expert human annotation struggles with consistency, particularly at scale, leading to inherent noise in training labels. +We introduce a novel training method that trains a process preference model (PPM) by constructing step-level positive-negative preference pairs. As shown in Fig. +1 +(b), instead of using Q-values as direct reward labels, we use them to select steps from MCTS tree for preference pair construction. For each step, we select two candidates with the highest Q-values as positive steps and two with the lowest as negative steps. Critically, the selected positive steps must lead to a correct final answer, while negative steps must lead to incorrect answers. For intermediate steps (except the final answer step), the positive and negative pairs share the same preceding steps. For the final answer step, where identical reasoning trajectories rarely yield different final answers, we relax this restriction. +We select two correct trajectories with the highest average Q-values as positive examples and two incorrect trajectories with the lowest average Q-values as negative examples. 
Following +(Ouyang et al., +2022 +) +, we define our loss function using the standard Bradley-Terry model with a pairwise ranking loss: +$$\mathcal{L}_{ppm}(\theta)=-\frac{1}{2\times 2}E_{(x,y_{i}^{pos},y_{i}^{neg})\in\mathbb{D}}\left[\log\left(\sigma\left(r_{\theta}(x,y_{i}^{pos})-r_{\theta}(x,y_{i}^{neg})\right)\right)\right]\tag{4}$$ +$$\text{when $i$ is not final answer step},\quad y_{i}^{pos}=s_{1}\oplus\dots\oplus s_{i-1}\oplus s_{i}^{pos};\quad y_{i}^{neg}=s_{1}\oplus\dots\oplus s_{i-1}\oplus s_{i}^{neg}\tag{5}$$ +Here, +$r_{\theta}(x,y_{i})$ +denotes the output of the PPM, where +$x$ +is the problem and +$y_{i}$ +is the trajectory from the first step to the +$i^{th}$ +step. +3.4 +Self-Evolved Deep Thinking +3.4.1 +Training with Step-by-Step Verified Reasoning Trajectory +Math Problems Collection +. We collect a large dataset of 747k math word problems with final answer ground-truth labels, primarily from NuminaMath +(Jia LI and Polu, +2024a +) +and MetaMath +(Yu et al., +2023b +) +. Notably, only competition-level problems (e.g., Olympiads and AIME/AMC) from NuminaMath are included, as we observe that grade-school-level problems do not significantly improve LLM complex math reasoning. To augment the limited competition-level problems, we follow +(Li et al., +2024 +) +and use GPT-4 to synthesize new problems based on the seed problems in 7.5k MATH train set and 3.6k AMC-AIME training split. However, GPT-4 often generated unsolvable problems or incorrect solutions for challenging seed problems.
To filter these, we prompt GPT-4 to generate 10 solutions per problem, retaining only those with at least 3 consistent solutions. +Reasoning Trajectories Collection +. Instead of using the original solutions in the 747k math dataset, we conduct extensive MCTS rollouts (Sec. +3.2 +) to generate higher-quality step-by-step verified reasoning trajectories. In each self-evolution round, we perform 16 rollouts per math problem, which leads to 16 reasoning trajectories. Problems are then categorized by difficulty based on the correct ratio of the generated trajectories: +easy +(all solutions are correct), +medium +(a mix of correct and incorrect solutions) and +hard +(all solutions are incorrect). For +hard +problems with no correct trajectories, an additional MCTS with 16 rollouts is performed. After that, all step-by-step trajectories and their annotated Q-values are collected and filtered to train the policy SLM and process preference model. +Supervised Fine-tuning the Policy SLM +. Through extensive experiments, we find that selecting high-quality reasoning trajectories is the key for fine-tuning a frontier math LLM. While methods such as GPT-distillation and Best-of-N can include low-quality or erroneous intermediate steps, a more effective approach ensures that every step in the trajectory is of high quality. To achieve this, we use per-step Q-values to select optimal trajectories from MCTS rollouts. Specifically, for each math problem, we select the top-2 trajectories with the highest average Q-values among those leading to correct answers as SFT training data. +Training PPM +. The PPM is initialized from the fine-tuned policy model, with its next-token prediction head replaced by a scalar-value head consisting of a linear layer and a tanh function to constrain outputs to the range [-1, 1]. We filter out math problems where all solution trajectories are fully correct or incorrect. 
For problems with mixed outcomes, we select two positive and two negative examples for each step based on Q-values, which are used as preference pairs for training data. +3.4.2 +Recipe for Self-Evolution +Table 2: +Percentage of the 747k math problems correctly solved in each round. Only problems have correct solutions are included in the training set. The first round uses DeepSeek-Coder-Instruct as the policy LLM, while later rounds use our fine-tuned 7B policy SLM. +# +models in MCTS +GSM-level +MATH-level +Olympiad-level +All +Round 1 +DeepSeek-Coder-V2-Instruct +96.61% +67.36% +20.99% +60.17% +Round 2 +policy SLM-r1 +97.88% +67.40% +56.04% +66.60% +Round 3 +policy SLM-r2, PPM-r2 +98.15% +88.69% +62.16% +77.86% +Round 4 +policy SLM-r3, PPM-r3 +98.15% +94.53% +80.58% +90.25% +Table 3: +Pass@1 accuracy of the resulting policy SLM in each round, showing continuous improvement until surpassing the bootstrap model. +Round# +MATH +AIME 2024 +AMC 2023 +Olympiad Bench +College Math +GSM8K +GaokaoEn 2023 +DeepSeek-Coder-V2-Instruct +(bootstrap model) +75.3 +13.3 +57.5 +37.6 +46.2 +94.9 +64.7 +Base (Qwen2.5-Math-7B) +58.8 +0.0 +22.5 +21.8 +41.6 +91.6 +51.7 +\hdashline +policy SLM-r1 +69.6 +3.3 +30.0 +34.7 +44.5 +88.4 +57.4 +policy SLM-r2 +73.6 +10.0 +35.0 +39.0 +45.7 +89.1 +59.7 +policy SLM-r3 +75.8 +16.7 +45.0 +44.1 +49.6 +89.3 +62.8 +policy SLM-r4 +78.4 +26.7 +47.5 +47.1 +52.5 +89.7 +65.7 +Table 4: +The quality of PPM consistently improves across rounds. The policy model has been fixed with policy SLM-r1 for a fair comparison. 
+Round# +MATH +AIME 2024 +AMC 2023 +Olympiad Bench +College Math +GSM8K +GaokaoEn 2023 +PPM-r1 +75.2 +10.0 +57.5 +35.7 +45.4 +90.9 +60.3 +PPM-r2 +84.1 +26.7 +75.0 +52.7 +54.2 +93.3 +73.0 +PPM-r3 +85.2 +33.3 +77.5 +59.5 +55.6 +93.9 +76.6 +PPM-r4 +87.0 +43.3 +77.5 +61.5 +56.8 +94.2 +77.8 +Due to the weaker capabilities of SLMs, we perform four rounds of MCTS deep thinking to progressively generate higher-quality data and expand the training set with more challenging math problems. Each round uses MCTS to generate step-by-step verified reasoning trajectories, which are then used to train the new policy SLM and PPM. The new models are then applied in next round to generate higher-quality training data. Fig. +1 +(c) and Table +2 +detail the models used for data generation in each round, along with the identifiers of the trained policy model and PPM. Next, we outline the details and specific improvements targeted in each round. +Round 1: Bootstrapping an initial strong policy SLM-r1 +. To enable SLMs to self-generate reasonably good training data, we perform a bootstrap round to fine-tune an initial strong policy model, denoted as SLM-r1. +As shown in Table +2 +, we run MCTS with DeepSeek-Coder-V2-Instruct (236B) to collect the SFT data. With no available reward model in this round, we use terminal-guided annotation for Q-values and limit MCTS to 8 rollouts for efficiency. For correct solutions, the top-2 trajectories with the highest average Q-values are selected as SFT data. We also train PPM-r1, but the limited rollouts yields unreliable Q-values, affecting the effectiveness of PPM-r1 ( Table +4 +). +Round 2: Training a reliable PPM-r2 +. In this round, with the policy model updated to the 7B SLM-r1, we conduct extensive MCTS rollouts for more reliable Q-value annotation and train the first reliable reward model, PPM-r2. Specifically, we perform 16 MCTS rollouts per problem. 
The resulting step-by-step verified reasoning trajectories show significant improvements in both quality and Q-value precision. As shown in Table +4 +, PPM-r2 is notably more effective than in the bootstrap round. Moreover, the policy SLM-r2 also continues to improve as expected (Table +3 +). +Round 3: PPM-augmented MCTS to significantly improve data quality +. With the reliable PPM-r2, we perform PPM-augmented MCTS in this round to generate data, leading to significantly higher-quality trajectories that cover more math and Olympiad-level problems in the training set (Table +2 +). The generated reasoning trajectories and self-annotated Q-values are then used to train the new policy SLM-r3 and PPM-r3, both of which show significant improvements. +Round 4: Solving challenging math problems +. After the third round, while grade school and MATH problems achieve high success rates, only 62.16% of Olympiad-level problems are included in the training set. This is +NOT +solely due to weak reasoning abilities in our SLMs, as many Olympiad problems remain unsolved by GPT-4 or o1. To improve coverage, we adopt a straightforward strategy. For unsolved problems after 16 MCTS rollouts, we perform an additional 64 rollouts, and if needed, increase to 128. We also conduct multiple MCTS tree expansions with different random seeds. This boosts the success rate of Olympiad-level problems to 80.58%. +After four rounds of self-evolution, 90.25% of the 747k math problems are successfully covered into the training set, as shown in Table +2 +. Among the remaining unsolved problems, a significant portion consists of synthetic questions. We manually review a random sample of 20 problems and find that 19 are incorrectly labeled with wrong answers. Based on this, we conclude that the remaining unsolved problems are of low quality and thus terminate the self-evolution at round 4. +4 +Evaluation +4.1 +Setup +Evaluation Datasets +. We evaluate +\sysname +on diverse mathematical benchmarks. 
In addition to the widely-used GSM8K +(Cobbe et al., +2021 +) +, we include challenging benchmarks from multiple domains: +(i) +competition and Olympiad-level benchmarks, such as MATH-500 +(Lightman et al., +2023 +) +, AIME 2024 +(AI-MO, +2024a +) +, AMC 2023 +(AI-MO, +2024b +) +and Olympiad Bench +(He et al., +2024 +) +. Specifically, AIME is an exam designed to challenge the brightest high school math students in America, with the 2024 dataset comprising 30 problems from the AIME I and II exams; +(ii) +college-level math problems from College Math +(Tang et al., +2024 +) +and +(iii) +an out-of-domain math benchmark: GaoKao (Chinese +College Entrance Exam) En 2023 +(Liao et al., +2024 +) +. +Base Models and Setup +. +\sysname +is a general approach applicable to various LLMs. To show its effectiveness and generalizability, we use SLMs of different sizes as the base policy models: +Qwen2.5-Math-1.5B +(Qwen, +2024b +) +, Phi3-mini-Instruct (3.8B) +(Microsoft, +2024 +; Abdin et al., +2024 +) +, Qwen2-Math-7B +(Qwen, +2024a +) +and Qwen2.5-Math-7B +(Qwen, +2024c +) +. Among these, Phi3-mini-Instruct is a general-purpose SLM without specialization in math reasoning. +Due to limited GPU resources, we performed 4 rounds of self-evolution exclusively on Qwen2.5-Math-7B, yielding 4 evolved policy SLMs (Table +3 +) and 4 PPMs (Table +4 +). For the other 3 policy LLMs, we fine-tune them using step-by-step verified trajectories generated from Qwen2.5-Math-7B’s 4th round. The final PPM from this round is then used as the reward model for the 3 policy SLMs. +Baselines +. +\sysname +is a System 2 method. We compare it against three strong baselines representing both System 1 and System 2 approaches: +(i) +Frontier LLMs +, including GPT-4o, the latest Claude, OpenAI o1-preview and o1-mini. +We measure their accuracy on AMC 2023, Olympiad Bench, College Math, Gaokao and GSM8K, with accuracy numbers for other benchmarks taken from public technical reports +(Team, +2024a +) +. 
+(ii) +Open-sourced superior reasoning models +, including DeepSeek-Coder-V2-Instruct, Mathstral +(Team, +2024b +) +, NuminaMath-72B +(Jia LI and Polu, +2024a +) +, and LLaMA3.1 +(Dubey et al., +2024 +) +, which represent the current mainstream System 1 approaches for improving LLM math reasoning. +(iii) +Both System 1 and System 2 performance of the base models as trained by their original model teams +, including Instruct versions (e.g., Qwen2.5-Math-7B-Instruct) and Best-of-N (e.g., Qwen2.5-Math-72B-Instruct+Qwen2.5-Math-RM-72B). Notably, the reward model used for the three Qwen base models is a 72B ORM, significantly larger than our 7B PPM. +Evaluation Metric +. We report Pass@1 accuracy for all baselines. For System 2 baselines, we use default evaluation settings, such as default thinking time for o1-mini and o1-preview. For Qwen models with Best-of-N, we re-evaluate MATH-500, AIME/AMC accuracy; other benchmark results are from their technical reports. For a fair comparison, +\sysname +runs MCTS to generate the same number of solutions as Qwen. Specifically, we generate 16 trajectories for AIME/AMC and 8 for other benchmarks, using the PPM to select the best solution. We also report performance with increased test-time computation using 64 trajectories, denoted as +\sysname +64 +. +Table 5: +The results of +\sysname +and other frontier LLMs on the most challenging math benchmarks. +\sysname +64 +shows the Pass@1 accuracy achieved when sampling 64 trajectories. 
+Competition and College Level +OOD +Model +Method +MATH +AIME +2024 +AMC +2023 +Olympiad +Bench +College +Math +GSM8K +Gaokao +En 2023 +Frontier LLMs +GPT-4o +System 1 +76.6 +9.3 +47.5 +43.3 +48.5 +92.9 +67.5 +Claude3.5-Sonnet +System 1 +78.3 +16.0 +- +- +- +96.4 +- +GPT-o1-preview +- +85.5 +44.6 +90.0 +- +- +- +- +GPT-o1-mini +- +90.0 +56.7 +95.0 +65.3 +57.8 +94.8 +78.4 +Open-Sourced Reasoning LLMs +DeepSeek-Coder-V2-Instruct +System 1 +75.3 +13.3 +57.5 +37.6 +46.2 +94.9 +64.7 +Mathstral-7B-v0.1 +System 1 +57.8 +0.0 +37.5 +21.5 +33.7 +84.9 +46.0 +NuminaMath-72B-CoT +System 1 +64.0 +3.3 +70.0 +32.6 +39.7 +90.8 +58.4 +LLaMA3.1-8B-Instruct +System 1 +51.4 +6.7 +25.0 +15.4 +33.8 +76.6 +38.4 +LLaMA3.1-70B-Instruct +System 1 +65.4 +23.3 +50.0 +27.7 +42.5 +94.1 +54.0 +Qwen2.5-Math-72B-Instruct +System 1 +85.6 +30.0 +70.0 +49.0 +49.5 +95.9 +71.9 +Qwen2.5-Math-72B-Instruct+72B ORM +System 2 +85.8 +36.7 +72.5 +54.5 +50.6 +96.4 +76.9 +General Base Model: Phi3-mini-Instruct (3.8B) +Phi3-mini-Instruct (base model) +System 1 +41.4 +3.33 +7.5 +12.3 +33.1 +85.7 +37.1 +\sysname +(3.8B SLM+7B PPM) +System 2 +85.4 +40.0 +77.5 +59.3 +58.0 +94.5 +77.1 +\sysname +64 +(3.8B SLM+7B PPM) +System 2 +86.4 +43.3 +80.0 +60.3 +59.1 +94.7 +77.7 +Math-Specialized Base Model: Qwen2.5-Math-1.5B +Qwen2.5-Math-1.5B (base model) +System 1 +51.2 +0.0 +22.5 +16.7 +38.4 +74.6 +46.5 +Qwen2.5-Math-1.5B-Instruct +System 1 +60.0 +10.0 +60.0 +38.1 +47.7 +84.8 +65.5 +Qwen2.5-Math-1.5B-Instruct+72B ORM +System 2 +83.4 +20.0 +72.5 +47.3 +50.2 +94.1 +73.0 +\sysname +(1.5B SLM+7B PPM) +System 2 +87.8 +46.7 +80.0 +63.5 +59.0 +94.3 +77.7 +\sysname +64 +(1.5B SLM+7B PPM) +System 2 +88.6 +46.7 +85.0 +64.6 +59.3 +94.8 +79.5 +Math-Specialized Base Model: Qwen2-Math-7B +Qwen2-Math-7B (base model) +System 1 +53.4 +3.3 +25.0 +17.3 +39.4 +80.4 +47.3 +Qwen2-Math-7B-Instruct +System 1 +73.2 +13.3 +62.5 +38.2 +45.9 +89.9 +62.1 +Qwen2-Math-7B-Instruct+72B ORM +System 2 +83.4 +23.3 +62.5 +47.6 +47.9 +95.1 +71.9 +\sysname +(7B 
SLM+7B PPM) +System 2 +88.2 +43.3 +80.0 +63.1 +58.4 +94.6 +78.2 +\sysname +64 +(7B SLM+7B PPM) +System 2 +88.6 +46.7 +85.0 +63.4 +59.3 +94.8 +79.2 +Math-Specialized Base Model: Qwen2.5-Math-7B +Qwen2.5-Math-7B (base model) +System 1 +58.8 +0.0 +22.5 +21.8 +41.6 +91.6 +51.7 +Qwen2.5-Math-7B-Instruct +System 1 +82.6 +6.0 +62.5 +41.6 +46.8 +95.2 +66.8 +Qwen2.5-Math-7B-Instruct+72B ORM +System 2 +88.4 +26.7 +75.0 +49.9 +49.6 +97.9 +75.1 +\sysname +(7B SLM+7B PPM) +System 2 +89.4 +50.0 +87.5 +65.3 +59.0 +95.0 +80.5 +\sysname +64 +(7B SLM+7B PPM) +System 2 +90.0 +53.3 +87.5 +65.6 +60.5 +95.2 +81.3 +4.2 +Main Results +Results on diverse challenging math benchmarks +. Table +5 +shows the results of +\sysname +with comparing to state-of-the-art reasoning models. We highlight three key observations: +(1) +\sysname +significantly improves SLMs math reasoning capabilities, achieving performance comparable to or surpassing OpenAI o1 with substantially smaller model size (1.5B-7B). For example, Qwen2.5-Math-7B, originally at 58.8% accuracy on MATH, improved dramatically to 90.0% with +\sysname +, outperforming o1-preview and Claude 3.5 Sonnet while matching o1-mini. On the College Math benchmark, +\sysname +exceeds o1-mini by 2.7%. On AIME 2024, +\sysname +scored 53.3%, ranking just below o1-mini, with the 7B model solving 8/15 problems in both AIME I and II, placing in the top 20% of the brightest high school math students. +Notably, 8 of the unsolved problems were geometry-based, requiring visual understanding, a capability +\sysname +currently does not support. +(2) +Despite using smaller policy models (1.5B-7B) and reward models (7B), +\sysname +significantly outperforms state-of-the-art System 2 baselines. 
Compared to Qwen Best-of-N baselines, which use the same base models (Qwen2-Math-7B, Qwen2.5-Math-1.5B/7B) but a 10$\times$ larger reward model (Qwen2.5-Math-RM-72B), +\sysname +consistently improves the reasoning accuracy of all base models to state-of-the-art levels. Even against Best-of-N with a 10$\times$ larger Qwen2.5-Math-72B-Instruct policy model, +\sysname +surpasses it on all benchmarks except GSM8K, using the same number of sampled solutions. +(3) +Beyond well-known benchmarks like MATH, GSM8K, and AIME, which may risk over-optimization, +\sysname +shows strong generalizability on other challenging math benchmarks, including Olympiad Bench, College Math, and the Chinese College Entrance Math Exam (Gaokao), setting new state-of-the-art scores. As discussed in Sec. +3.4 +, our training set is primarily sourced from public datasets, with no specific optimizations for these benchmarks. +Figure 3: +Reasoning performance when scaling up test-time compute. +Scaling up test-time computation +. +\sysname +uses MCTS to augment the policy model, searching solutions guided by the PPM. By increasing test-time computation, it explores more trajectories, potentially improving performance. +In Fig. +3 +, we show the impact of test-time compute scaling by comparing the accuracy of +\sysname +with that of the official Qwen Best-of-N across different numbers of sampled trajectories on four challenging math benchmarks. Sampling only one trajectory corresponds to the policy LLM’s Pass@1 accuracy, indicating a fallback to System 1 reasoning. We highlight two key observations: +(1) +With only 4 trajectories, +\sysname +significantly outperforms Best-of-N baselines, exceeding o1-preview and approaching o1-mini, demonstrating its effectiveness. +(2) +Scaling test-time compute improves reasoning accuracy across all benchmarks, though with varying trends. 
On Math, AIME, and Olympiad Bench, +\sysname +shows saturation or slow improvement at 64 trajectories, while on College Math, performance continues to improve steadily. +4.3 +Ablation Study and Analysis +We ablate the effectiveness of our three innovations. For System 2-style inference, Pass@1 accuracy is measured with 16 trajectories for AIME and AMC, and 8 for other benchmarks. +Table 6: +The continuously improved math reasoning capabilities through +\sysname +self-evolved deep thinking. Starting from round 2, the 7B base model powered by +\sysname +surpasses GPT-4o. +Round# +MATH +AIME 2024 +AMC 2023 +Olympiad Bench +College Math +GSM8K +GaokaoEn 2023 +GPT-4o +76.6 +9.3 +47.5 +43.3 +48.5 +92.9 +67.5 +Base 7B model +58.8 +0.0 +22.5 +21.8 +41.6 +91.6 +51.7 +\sysname +Round 1 +75.2 +10.0 +57.5 +35.7 +45.4 +90.9 +60.3 +\sysname +Round 2 +86.6 +43.3 +75.0 +59.4 +55.6 +94.0 +76.4 +\sysname +Round 3 +87.0 +46.7 +80.0 +61.6 +56.5 +94.2 +77.1 +\sysname +Round 4 +89.4 +50.0 +87.5 +65.3 +59.0 +95.0 +80.5 +The effectiveness of self-evolution +. The impressive results in Table +5 +are achieved after 4 rounds of +\sysname +self-evolved deep thinking. Table +6 +shows the math reasoning performance in each round, demonstrating a continuous improvement in accuracy. +In round 1, the main improvement comes from applying SFT to the base model. Round 2 brings a significant boost with the application of a stronger PPM in MCTS, which unlocks the full potential of System 2 deep reasoning. Notably, starting from round 2, +\sysname +outperforms GPT-4o. Rounds 3 and 4 show further improvements, driven by stronger System 2 reasoning through better policy SLMs and PPMs. +The effectiveness of step-by-step verified reasoning trajectory +. +\sysname +generates step-by-step verified reasoning trajectories, which eliminate error intermediate steps and further expand training set with more challenging problems. 
To evaluate its effectiveness, we use the data generated from round 4 as SFT training data and compare it against +three strong baselines: +(i) +GPT-distillation, which includes open-sourced CoT solutions synthesized using GPT-4, such as MetaMath +(Yu et al., +2023b +) +, NuminaMath-CoT +(Jia LI and Polu, +2024b +) +; +(ii) +Random sampling from self-generation, +which use the same policy model (i.e., policy SLM-r3) to randomly generate trajectories; +(iii) +Rejection sampling, where 32 trajectories are randomly sampled from the policy model, with high-quality solutions ranked by our trained ORM (appendix +A.1 +). For fairness, we select two correct trajectories for each math problem in baseline (ii) and (iii). All SFT experiments use the same training recipe. +Table 7: +Ablation study on the effectiveness of our step-by-step verified reasoning trajectories as the SFT dataset. We report the SFT accuracy of Qwen2.5-Math-7B fine-tuned with different datasets. +Dataset +MATH +AIME +AMC +Olympiad Bench +College Math +GSM8K +GaokaoEn 2023 +GPT-4o +- +76.6 +9.3 +47.5 +43.3 +48.5 +92.9 +67.5 +GPT4-distillation +(Open-sourced) +MetaMath +55.2 +3.33 +32.5 +19.1 +39.2 +85.1 +43.6 +NuminaMath-CoT +69.6 +10.0 +50.0 +37.2 +43.4 +89.8 +59.5 +Self-generation +by policy SLM-r3 +Random sample +72.4 +10.0 +45.0 +41.0 +48.0 +87.5 +57.1 +Rejection sampling +73.4 +13.3 +47.5 +44.7 +50.8 +89.3 +61.7 +Step-by-step verified (ours) +78.4 +26.7 +47.5 +47.1 +52.5 +89.7 +65.7 +Table +7 +shows the math reasoning accuracy of Qwen2.5-Math-7B fine-tuned on different datasets. We highlight two observations: +(i) +Fine-tuning with our step-by-step verified trajectories significantly outperforms all other baselines. This is primarily due to our PPM-augmented MCTS for code-augmented CoT synthesis, which provides denser verification during math solution generation. 
It proves more effective than both random sampling, which lacks verification, and rejection sampling, where ORM provides only sparse verification. +(ii) +Even randomly sampled code-augmented CoT solutions from our SLM yield comparable or better performance than GPT-4 synthesized NuminaMath and MetaMath datasets. +This indicates that our policy SLMs, after rounds of self-evolution, can generate high-quality math solutions. These results demonstrate the huge potential of our method to self-generate higher-quality reasoning data without relying on advanced LLM distillation. +The effectiveness of PPM +. We train both a strong ORM and a Q-value score-based PRM (PQM) for comparison. To ensure a fair evaluation, we use the highest-quality training data: the step-by-step verified trajectories generated in round 4, with selected math problems matching those used for PPM training. Similar to PPM, we use step-level Q-values to select positive and negative trajectories for each math problem. +The ORM is trained using a pairwise ranking loss +(Ouyang et al., +2022 +) +, while the PQM follows +(Chen et al., +2024 +; Zhang et al., +2024a +) +to use Q-values as reward labels and optimize with MSE loss. Detailed training settings are provided in Appendix +A.1 +. +Table 8: +Ablation study on the reward model. Process reward models (PQM and PPM) outperform ORM, with PPM pushing the frontier of math reasoning capabilities. +RM +Inference +MATH +AIME +AMC +Olympiad Bench +College Math +GSM8K +GaokaoEn +o1-mini +- +90.0 +56.7 +95.0 +65.3 +55.6 +94.8 +78.6 +ORM +Best-of-N +82.6 +26.7 +65.0 +55.1 +55.5 +92.3 +72.5 +PQM +MCTS +88.2 +46.7 +85.0 +62.9 +57.6 +94.6 +79.5 +PPM +MCTS +89.4 +50.0 +87.5 +65.3 +59.0 +95.0 +80.5 +Table +8 +compares the performance of ORM, PQM, and PPM for System 2 reasoning using our final round policy model. ORM provides reward signals only at the end of problem solving, so we use the Best-of-N method, while PQM and PPM leverage MCTS-driven search. 
As shown in Table +8 +, both PQM and PPM outperform ORM by providing denser step-level reward signals, leading to higher accuracy on complex math reasoning tasks. However, PQM struggles on more challenging benchmarks, such as MATH and Olympiad Bench, due to the inherent imprecision of Q-values. +In contrast, PPM constructs step-level preference data for training, enabling our 7B policy model to achieve comparable or superior performance to o1-mini across all benchmarks. +5 +Findings and Discussions +Figure 4: +An example of intrinsic self-reflection during +\sysname +deep thinking. +The emergence of intrinsic self-reflection capability +. A key breakthrough in OpenAI o1 is its intrinsic self-reflection capability. When the model makes an error, it recognizes the mistake and can self-correct with a correct answer +(Noam Brown and Lightman, +2024 +) +. Yet it has consistently +been found to be largely ineffective in open-sourced LLMs. The community has actively explored various approaches, including self-correction +(Huang et al., +2023 +; Kumar et al., +2024 +) +, self-reflection +(Renze and Guven, +2024 +; Shinn et al., +2024 +) +, to explicitly train or prompt LLMs to develop such capability. +In our experiments, we unexpectedly observe that our MCTS-driven deep thinking exhibits self-reflection during problem-solving. As shown in Fig. +4 +, the model initially formalizes an equation using +SymPy +in the first three steps, which would lead to an incorrect answer (left branch). Interestingly, in the fourth step (right branch), the policy model recognizes the low quality of its earlier steps and refrains from continuing along the initial problem-solving path. Instead, it backtracks and resolves the problem using a new, simpler approach, ultimately arriving at the correct answer. An additional example of self-correction is provided in Appendix +A.2 +. 
Notably, no self-reflection training data or prompt was included, suggesting that advanced System 2 reasoning can foster intrinsic self-reflection. +Figure 5: +Pass@1 accuracy of policy models and their accuracy after applying System 2 reasoning with various reward models, shows that reward models primarily determine the final performance. +PPM shapes the reasoning boundary in System 2 deep thinking +. Both the policy and reward models are crucial for System 2 deep reasoning. Our experiments show that once the policy model attains a reasonably strong capability level, +(see Appendix +A.1 +), the PPM becomes the key determinant of the upper performance limit. +Fig. +5 +summarizes the accuracy of policy models of different sizes, as well as the improvements achieved with reward models. Despite variations in Pass@1 accuracy due to differences in training strategies, datasets, and model scales, the reward model proves to be the dominant factor in System 2 reasoning. For instance, although the SFT accuracy of +\sysname +-7B is lower than Qwen2.5-Math-72B-Instruct, pairing it with our 7B PPM allows +\sysname +to outperform the 72B policy model with Qwen 72B ORM. Moreover, despite varying Pass@1 accuracy across our three policy SLM sizes, the final reasoning accuracy converges after applying the PPM. +PPM spots theorem-application steps +. When solving challenging math problems, identifying and applying relevant theorems or key conclusions often form the cornerstone of successful problem-solving +(Xin et al., +2024 +) +. In our experiments, we find that during +\sysname +problem-solving, our PPM effectively identifies critical theorem-application intermediate steps within policy model’s deep thinking process. These steps are predicted with high reward scores, guiding the policy model to generate the correct solution. 
Appendix +A.2 +provides examples where the PPM successfully identifies key theorems such as Fermat’s little theorem +(Weisstein, +a +) +, Vieta’s formulas +(Weisstein, +b +) +, the AM-GM inequality +( +amg, +) +, the Pythagorean theorem +( +pyt, +) +, and the Shoelace Theorem +( +sho, +) +, etc. +Generalization discussions +. +\sysname +offers a general methodology for improving LLM reasoning applicable to various domains. First, +\sysname +can generalize to more challenging math tasks, such as theorem proving, though its current focus is on word problems due to dataset limitations. Nonetheless, +\sysname +demonstrates the potential to prove mathematical statements. As shown in Appendix +A.2 +, it successfully proves an Olympiad-level problem involving Fermat’s Little Theorem, providing a step-by-step correct proof through its deep reasoning process. Second, +\sysname +can generalize to other domains, such as code and commonsense reasoning. Notably, synthesizing step-by-step verified training trajectories for general reasoning requires a mechanism to provide feedback on whether a given trajectory reaches the desired output at the end of MCTS rollout. For instance, in code reasoning, this could involve designing extensive test cases; in general reasoning, feedback could be obtained through human labeling or mutual verification with another LLM +(Qi et al., +2024 +) +. +6 +Conclusion +In this work, we present +\sysname +, a self-evolved System 2 deep thinking approach that significantly boosts the math reasoning capabilities of small LLMs, achieving state-of-the-art OpenAI o1-level performance. Our approach demonstrates that SLMs can self-generate high-quality training data for frontier-level math reasoning. Extensive experiments across four different-sized SLMs and challenging math benchmarks demonstrate the superiority of +\sysname +, with achieving leading results while outperforming existing math reasoning LLMs and Best-of-N baselines. 
We also reveal key findings, including the emergence of self-reflection and the effectiveness of the PPM in identifying critical intermediate steps, such as theorem-application steps. Finally, +\sysname +can achieve further improvements by collecting more challenging math problems, we leave this as future work. +Acknowledgement +In the early stages of this work, we faced significant challenges due to limited GPU resources and restricted access to the GPT-4 API. We are deeply grateful to Qiufeng Yin and Chengmin Chi for their assistance in collecting math problems and providing GPT-4 resources for new math problem synthesis. Special thanks go to my colleagues, Lingxiao Ma, Ying Cao, Baotong Lu, Jing Liu, Jiahang Xu, Chengruidong Zhang, Siyuan Wang, Gaokai Zhang, Yujian Li, and Yang Wang, for generously sharing their GPU quotas with us. +References +[1] +Inequality of arithmetic and geometric means. +URL +https://artofproblemsolving.com/wiki/index.php/AM-GM_Inequality +. +[2] +Pythagorean theorem. +URL +https://en.wikipedia.org/wiki/Pythagorean_theorem +. +[3] +Shoelace theorem. +URL +https://artofproblemsolving.com/wiki/index.php/Shoelace_Theorem +. +Abdin et al. [2024] +Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, +Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, +et al. +Phi-3 technical report: A highly capable language model locally on +your phone. +arXiv preprint arXiv:2404.14219 +, 2024. +AI-MO [2024a] +AI-MO. +Aime 2024, 2024a. +URL +https://huggingface.co/datasets/AI-MO/aimo-validation-aime +. +AI-MO [2024b] +AI-MO. +Amc 2023, 2024b. +URL +https://huggingface.co/datasets/AI-MO/aimo-validation-amc +. +Brown et al. [2024] +Bradley Brown, Jordan Juravsky, Ryan Ehrlich, Ronald Clark, Quoc V Le, +Christopher Ré, and Azalia Mirhoseini. +Large language monkeys: Scaling inference compute with repeated +sampling. +arXiv preprint arXiv:2407.21787 +, 2024. +Chen et al. 
[2024] +Guoxin Chen, Minpeng Liao, Chengxi Li, and Kai Fan. +Alphamath almost zero: process supervision without process, 2024. +Cobbe et al. [2021] +Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz +Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, +et al. +Training verifiers to solve math word problems. +arXiv preprint arXiv:2110.14168 +, 2021. +Daniel [2011] +Kahneman Daniel. +Thinking, fast and slow. +Macmillan +, 2011. +Dubey et al. [2024] +Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad +Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, +et al. +The llama 3 herd of models. +arXiv preprint arXiv:2407.21783 +, 2024. +Gou et al. [2023] +Zhibin Gou, Zhihong Shao, Yeyun Gong, Yujiu Yang, Minlie Huang, Nan Duan, +Weizhu Chen, et al. +Tora: A tool-integrated reasoning agent for mathematical problem +solving. +arXiv preprint arXiv:2309.17452 +, 2023. +Hao et al. [2023] +Shibo Hao, Yi Gu, Haodi Ma, Joshua Jiahua Hong, Zhen Wang, Daisy Zhe Wang, and +Zhiting Hu. +Reasoning with language model is planning with world model. +arXiv preprint arXiv:2305.14992 +, 2023. +He et al. [2024] +Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, +Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, et al. +Olympiadbench: A challenging benchmark for promoting agi with +olympiad-level bilingual multimodal scientific problems. +arXiv preprint arXiv:2402.14008 +, 2024. +Huang et al. [2023] +Jie Huang, Xinyun Chen, Swaroop Mishra, Huaixiu Steven Zheng, Adams Wei Yu, +Xinying Song, and Denny Zhou. +Large language models cannot self-correct reasoning yet. +arXiv preprint arXiv:2310.01798 +, 2023. +Huang et al. [2024] +Zhen Huang, Haoyang Zou, Xuefeng Li, Yixiu Liu, Yuxiang Zheng, Ethan Chern, +Shijie Xia, Yiwei Qin, Weizhe Yuan, and Pengfei Liu. +O1 replication journey – part 2: Surpassing o1-preview through +simple distillation big progress or bitter lesson? 
+Github +, 2024. +URL +https://github.com/GAIR-NLP/O1-Journey +. +Jia LI and Polu [2024a] +Lewis Tunstall Ben Lipkin Roman Soletskyi Shengyi Costa Huang Kashif Rasul +Longhui Yu Albert Jiang Ziju Shen Zihan Qin Bin Dong Li Zhou Yann Fleureau +Guillaume Lample Jia LI, Edward Beeching and Stanislas Polu. +Numinamath. +[https://github.com/project-numina/aimo-progress-prize](https://github.com/project-numina/aimo-progress-prize/blob/main/report/numina_dataset.pdf) +, +2024a. +Jia LI and Polu [2024b] +Lewis Tunstall Ben Lipkin Roman Soletskyi Shengyi Costa Huang Kashif Rasul +Longhui Yu Albert Jiang Ziju Shen Zihan Qin Bin Dong Li Zhou Yann Fleureau +Guillaume Lample Jia LI, Edward Beeching and Stanislas Polu. +Numinamath cot, 2024b. +URL +https://huggingface.co/datasets/AI-MO/NuminaMath-CoT +. +Kang et al. [2024] +Jikun Kang, Xin Zhe Li, Xi Chen, Amirreza Kazemi, and Boxing Chen. +Mindstar: Enhancing math reasoning in pre-trained llms at inference +time. +arXiv preprint arXiv:2405.16265 +, 2024. +Kocsis and Szepesvári [2006] +Levente Kocsis and Csaba Szepesvári. +Bandit based monte-carlo planning. +volume 2006, pages 282–293, 09 2006. +ISBN 978-3-540-45375-8. +doi: +10.1007/11871842_29 +. +Kumar et al. [2024] +Aviral Kumar, Vincent Zhuang, Rishabh Agarwal, Yi Su, John D Co-Reyes, Avi +Singh, Kate Baumli, Shariq Iqbal, Colton Bishop, Rebecca Roelofs, et al. +Training language models to self-correct via reinforcement learning. +arXiv preprint arXiv:2409.12917 +, 2024. +Lanham et al. [2023] +Tamera Lanham, Anna Chen, Ansh Radhakrishnan, Benoit Steiner, Carson Denison, +Danny Hernandez, Dustin Li, Esin Durmus, Evan Hubinger, Jackson Kernion, +et al. +Measuring faithfulness in chain-of-thought reasoning. +arXiv preprint arXiv:2307.13702 +, 2023. +Li et al. [2024] +Chen Li, Weiqi Wang, Jingcheng Hu, Yixuan Wei, Nanning Zheng, Han Hu, Zheng +Zhang, and Houwen Peng. +Common 7b language models already possess strong math capabilities. +arXiv preprint arXiv:2403.04706 +, 2024. 
+Liao et al. [2024] +Minpeng Liao, Wei Luo, Chengxi Li, Jing Wu, and Kai Fan. +Mario: Math reasoning with code interpreter output–a reproducible +pipeline. +arXiv preprint arXiv:2401.08190 +, 2024. +Lightman et al. [2023] +Hunter Lightman, Vineet Kosaraju, Yura Burda, Harri Edwards, Bowen Baker, Teddy +Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. +Let’s verify step by step. +arXiv preprint arXiv:2305.20050 +, 2023. +Lightman et al. [2024] +Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, +Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. +Let’s verify step by step. +In +The Twelfth International Conference on Learning +Representations +, 2024. +URL +https://openreview.net/forum?id=v8L0pN6EOi +. +Liu et al. [2024] +Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang +Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, et al. +Deepseek-v3 technical report. +arXiv preprint arXiv:2412.19437 +, 2024. +Luo et al. [2023] +Haipeng Luo, Qingfeng Sun, Can Xu, Pu Zhao, Jianguang Lou, Chongyang Tao, Xiubo +Geng, Qingwei Lin, Shifeng Chen, and Dongmei Zhang. +Wizardmath: Empowering mathematical reasoning for large language +models via reinforced evol-instruct. +arXiv preprint arXiv:2308.09583 +, 2023. +Luo et al. [2024] +Liangchen Luo, Yinxiao Liu, Rosanne Liu, Samrat Phatale, Harsh Lara, Yunxuan +Li, Lei Shu, Yun Zhu, Lei Meng, Jiao Sun, et al. +Improve mathematical reasoning in language models by automated +process supervision. +arXiv preprint arXiv:2406.06592 +, 2024. +Microsoft [2024] +Microsoft. +Phi-3-mini-4k-instruct, 2024. +URL +https://huggingface.co/microsoft/Phi-3-mini-4k-instruct +. +Noam Brown and Lightman [2024] +Ilge Akkaya Noam Brown and Hunter Lightman. +Openai’s noam brown, ilge akkaya and hunter lightman on o1 and +teaching llms to reason better, 2024. +URL +https://www.youtube.com/watch?v=jPluSXJpdrA +. +OpenAI [2023] +OpenAI. +Gpt-4 technical report. +2023. 
+OpenAI [2024] +OpenAI. +Openai o1 system card. +preprint +, 2024. +Ouyang et al. [2022] +Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela +Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. +Training language models to follow instructions with human feedback. +Advances in Neural Information Processing Systems +, +35:27730–27744, 2022. +Qi et al. [2024] +Zhenting Qi, Mingyuan Ma, Jiahang Xu, Li Lyna Zhang, Fan Yang, and Mao Yang. +Mutual reasoning makes smaller llms stronger problem-solvers. +arXiv preprint arXiv:2408.06195 +, 2024. +Qwen [2024a] +Qwen. +Qwen2-math-7b, 2024a. +URL +https://huggingface.co/Qwen/Qwen2-Math-7B +. +Qwen [2024b] +Qwen. +Qwen2.5-math-1.5b, 2024b. +URL +https://huggingface.co/Qwen/Qwen2.5-Math-1.5B +. +Qwen [2024c] +Qwen. +Qwen2.5-math-7b, 2024c. +URL +https://huggingface.co/Qwen/Qwen2.5-Math-7B +. +Renze and Guven [2024] +Matthew Renze and Erhan Guven. +Self-reflection in llm agents: Effects on problem-solving +performance. +arXiv preprint arXiv:2405.06682 +, 2024. +Shinn et al. [2024] +Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu +Yao. +Reflexion: Language agents with verbal reinforcement learning. +Advances in Neural Information Processing Systems +, 36, 2024. +Silver et al. [2017] +David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew +Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore +Graepel, et al. +Mastering chess and shogi by self-play with a general reinforcement +learning algorithm. +arXiv preprint arXiv:1712.01815 +, 2017. +Snell et al. [2024] +Charlie Snell, Jaehoon Lee, Kelvin Xu, and Aviral Kumar. +Scaling llm test-time compute optimally can be more effective than +scaling model parameters. +arXiv preprint arXiv:2408.03314 +, 2024. +Tang et al. [2024] +Zhengyang Tang, Xingxing Zhang, Benyou Wan, and Furu Wei. +Mathscale: Scaling instruction tuning for mathematical reasoning. 
+arXiv preprint arXiv:2403.02884 +, 2024. +Team [2024a] +Qwen Team. +Qwq: Reflect deeply on the boundaries of the unknown, November +2024a. +URL +https://qwenlm.github.io/blog/qwq-32b-preview/ +. +Team [2024b] +The Mistral AI Team. +Mathstral-7b-v0.1, 2024b. +URL +https://huggingface.co/mistralai/Mathstral-7B-v0.1 +. +Toshniwal et al. [2024] +Shubham Toshniwal, Wei Du, Ivan Moshkov, Branislav Kisacanin, Alexan +Ayrapetyan, and Igor Gitman. +Openmathinstruct-2: Accelerating ai for math with massive open-source +instruction data. +arXiv preprint arXiv:2410.01560 +, 2024. +Valmeekam et al. [2023] +Karthik Valmeekam, Sarath Sreedharan, Matthew Marquez, Alberto Olmo, and +Subbarao Kambhampati. +On the planning abilities of large language models (a critical +investigation with a proposed benchmark). +arXiv preprint arXiv:2302.06706 +, 2023. +Wang et al. [2024a] +Chaojie Wang, Yanchen Deng, Zhiyi Lv, Shuicheng Yan, and An Bo. +Q*: Improving multi-step reasoning for llms with deliberative +planning, 2024a. +Wang et al. [2024b] +Ke Wang, Houxing Ren, Aojun Zhou, Zimu Lu, Sichun Luo, Weikang Shi, Renrui +Zhang, Linqi Song, Mingjie Zhan, and Hongsheng Li. +Mathcoder: Seamless code integration in LLMs for enhanced +mathematical reasoning. +In +The Twelfth International Conference on Learning +Representations +, 2024b. +URL +https://openreview.net/forum?id=z8TW0ttBPp +. +Wang et al. [2024c] +Peiyi Wang, Lei Li, Zhihong Shao, R. X. Xu, Damai Dai, Yifei Li, Deli Chen, +Y. Wu, and Zhifang Sui. +Math-shepherd: Verify and reinforce llms step-by-step without human +annotations, 2024c. +Wang et al. [2023] +Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc V Le, Ed H. Chi, Sharan Narang, +Aakanksha Chowdhery, and Denny Zhou. +Self-consistency improves chain of thought reasoning in language +models. +In +The Eleventh International Conference on Learning +Representations +, 2023. +URL +https://openreview.net/forum?id=1PL1NIMMrw +. +Weisstein [a] +Eric W. Weisstein. 
+Fermat’s little theorem, a. +URL +https://mathworld.wolfram.com/FermatsLittleTheorem.html +. +Weisstein [b] +Eric W. Weisstein. +Vieta’s formulas, from mathworld—a wolfram web resource, +b. +URL +http://mathworld.wolfram.com/Tree.html +. +Wu et al. [2024] +Yangzhen Wu, Zhiqing Sun, Shanda Li, Sean Welleck, and Yiming Yang. +An empirical analysis of compute-optimal inference for +problem-solving with language models. +arXiv preprint arXiv:2408.00724 +, 2024. +Xin et al. [2024] +Huajian Xin, Daya Guo, Zhihong Shao, Zhizhou Ren, Qihao Zhu, Bo Liu, Chong +Ruan, Wenda Li, and Xiaodan Liang. +Deepseek-prover: Advancing theorem proving in llms through +large-scale synthetic data. +arXiv preprint arXiv:2405.14333 +, 2024. +Yang et al. [2024] +An Yang, Beichen Zhang, Binyuan Hui, Bofei Gao, Bowen Yu, Chengpeng Li, +Dayiheng Liu, Jianhong Tu, Jingren Zhou, Junyang Lin, et al. +Qwen2. 5-math technical report: Toward mathematical expert model via +self-improvement. +arXiv preprint arXiv:2409.12122 +, 2024. +Yao et al. [2024] +Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and +Karthik Narasimhan. +Tree of thoughts: Deliberate problem solving with large language +models. +Advances in Neural Information Processing Systems +, 36, 2024. +Yu et al. [2023a] +Fei Yu, Anningzhe Gao, and Benyou Wang. +Outcome-supervised verifiers for planning in mathematical reasoning. +arXiv preprint arXiv:2311.09724 +, 2023a. +Yu et al. [2023b] +Longhui Yu, Weisen Jiang, Han Shi, Jincheng Yu, Zhengying Liu, Yu Zhang, +James T Kwok, Zhenguo Li, Adrian Weller, and Weiyang Liu. +Metamath: Bootstrap your own mathematical questions for large +language models. +arXiv preprint arXiv:2309.12284 +, 2023b. +Yuan et al. [2023] +Zheng Yuan, Hongyi Yuan, Chengpeng Li, Guanting Dong, Keming Lu, Chuanqi Tan, +Chang Zhou, and Jingren Zhou. +Scaling relationship on learning mathematical reasoning with large +language models. +arXiv preprint arXiv:2308.01825 +, 2023. +Zhang et al. 
[2024a] +Dan Zhang, Sining Zhoubian, Ziniu Hu, Yisong Yue, Yuxiao Dong, and Jie Tang. +Rest-mcts*: Llm self-training via process reward guided tree search. +arXiv preprint arXiv:2406.03816 +, 2024a. +Zhang et al. [2024b] +Di Zhang, Jiatong Li, Xiaoshui Huang, Dongzhan Zhou, Yuqiang Li, and Wanli +Ouyang. +Accessing gpt-4 level mathematical olympiad solutions via monte carlo +tree self-refine with llama-3 8b. +arXiv preprint arXiv:2406.07394 +, 2024b. +Zheng et al. [2023] +Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao +Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, Hao Zhang, Joseph E. +Gonzalez, and Ion Stoica. +Judging LLM-as-a-judge with MT-bench and chatbot arena. +In +Thirty-seventh Conference on Neural Information Processing +Systems Datasets and Benchmarks Track +, 2023. +Appendix A +Appendix +A.1 +Additional Experiments and Details +Data Generation Details +. As detailed in Sec. +3.4 +, each round starts by self-generating step-by-step verified trajectories for 747k math word problems. The maximum tree depth +d +d +is set to 16, with 16 MCTS rollouts conducted per problem by default. At each step, we allow to explore 8 candidate nodes, and the constant +c +c +in Eq. +1 +is set to 2 to promote greater exploration. In the bootstrap round, due to the large size of the initial policy model (236B), we used smaller parameters: 8 rollouts and 5 candidate nodes per step. To improve the accuracy of solving challenging problems in round 4, we increase the number of candidate nodes to 16 and conduct 2 MCTS tree expansions per problem using different random seeds. Detailed prompts are available in Appendix +A.3 +. +Training Details +. In each round, we collect step-by-step verified trajectories to fine-tune the policy LLM and train the PPM. To reduce noise +in synthetic math problems (e.g., incorrect ground-truth answers labeled by GPT-4), we remove synthetic problems with trajectories achieving less than 50% accuracy. 
Based on our extensive experiments, the policy LLM is fine-tuned from the initial base model in each round, rather than training incrementally on the model from the previous round. +All policy SLMs are trained for 2 epochs with a sequence length of 4096 tokens and a batch size of 128. We use the AdamW optimizer with a linear learning rate scheduler, setting the initial learning rate to 7e-6 for Qwen models, and a cosine scheduler with an initial learning rate of 5e-6 for Phi3-mini-Instruct. +The PPM is trained for 1 epoch with a batch size of 512 and an initial learning rate of 7e-6. +Training the ORM and PQM +. The Outcome Reward Model (ORM) and the Q-value-based Process Reward Model (PQM) share the same model architecture and training parameters with our PPM. To train the ORM, we collect trajectories from math problems containing both correct and incorrect solutions. Specifically, the two trajectories with the highest average Q-values are selected as positive examples, while the two with the lowest are chosen as negative examples. Following Qwen2.5-Math +(Yang et al., +2024 +) +, we adopt the pairwise ranking loss +(Ouyang et al., +2022 +) +to optimize the ORM. To train the PQM, we follow +Chen et al. ( +2024 +) +to use step-level Q-values as reward labels. Let +$\mathbf{x}=x\oplus s_{1}\oplus s_{2}\oplus\dots\oplus s_{d}$ +be the trajectory, with annotated Q-values +$\mathbf{Q}=(Q(s_{1}),Q(s_{2}),\dots,Q(s_{d}))$ +and predicted Q-values +$\mathbf{Q}^{\prime}=(Q^{\prime}(s_{1}),Q^{\prime}(s_{2}),\dots,Q^{\prime}(s_{d}))$ +for each step. To stabilize PQM training, we treat each trajectory as a single training sample and predict Q-values for all steps simultaneously, rather than splitting it into individual per-step samples.
Specifically, to predict the Q-value +$Q^{\prime}(s_{i})$ +for step +$s_{i}$ +, PQM takes the trajectory from the question up to step +$s_{i}$ +(i.e., +$x\oplus s_{1}\oplus s_{2}\oplus\dots\oplus s_{i}$ +) as input and outputs a value between -1 and 1. We use a mean squared error (MSE) loss for PQM training: +$$\mathcal{L}_{prm}(\mathbf{x})=\|\mathbf{Q}-\mathbf{Q}^{\prime}\|^{2} \tag{6}$$ +Self-evolution Inference Costs. +In the initial bootstrap round, we use DeepSeek-Coder-v2-Instruct (236B) as the policy model, using 10 nodes of 8×80GB H100 GPUs with 8 MCTS rollouts. This required approximately two weeks to finish the data generation. For rounds 2–4, using our fine-tuned 7B SLM as the policy model, data generation was performed on 15 nodes of 4×40GB A100 GPUs, +with each round completed in three days. In the final round, to include more challenging problems, we increased the number of MCTS rollouts to 64, extending the data generation time to one week. +Table 9: +Inference costs of +rStar-Math +. We show the average number of generated tokens required to generate a trajectory for a given question. +MATH +AIME 2024 +AMC 2023 +Olympiad Bench +College Math +GSM8K +GaokaoEn 2023 +5453 +15693 +14544 +7889 +4503 +3299 +6375 +Inference Setting +. In our evaluation, we run multiple MCTS to generate candidate solution trajectories. For each problem, we generate 32 candidate nodes at each step and use the PPM to score each node. Since the PPM effectively provides step-level quality evaluations, we limit MCTS to just 4 rollouts per step to update the Q-values. After completing MCTS, the trajectory with the highest PPM score is selected as the final answer. Table +9 +presents the average number of tokens generated to produce a trajectory in MCTS. +Table 10: +Pass@1 (greedy) accuracy of our fine-tuned policy models for Phi3-mini, Qwen2.5-Math-1.5B, Qwen2-Math-7B and Qwen2.5-Math-7B.
+Model +MATH +AIME 2024 +AMC 2023 +Olympiad Bench +College Math +GSM8K +GaokaoEn 2023 +General Base Model: Phi3-mini-Instruct (3.8B) +Phi3-mini-Instruct +41.4 +3.33 +7.5 +12.3 +33.1 +85.7 +37.1 +Our policy model +68.0 +10.0 +37.5 +36.6 +48.7 +87.9 +53.2 +Math-Specialized Base Model: Qwen2.5-Math-1.5B +Qwen2.5-Math-1.5B +51.2 +0.0 +22.5 +16.7 +38.4 +74.6 +46.5 +Qwen2.5-Math-1.5B-Instruct +60.0 +10.0 +60.0 +38.1 +47.7 +84.8 +65.5 +Our policy model +74.8 +13.3 +47.5 +42.5 +50.1 +83.1 +58.7 +Math-Specialized Base Model: Qwen2-Math-7B +Qwen2-Math-7B +53.4 +3.3 +25.0 +17.3 +39.4 +80.4 +47.3 +Qwen2-Math-7B-Instruct +73.2 +13.3 +62.5 +38.2 +45.9 +89.9 +62.1 +Our policy model +73.8 +16.7 +45.0 +43.9 +52.0 +88.3 +65.2 +Math-Specialized Base Model: Qwen2.5-Math-7B +Qwen2.5-Math-7B +58.8 +0.0 +22.5 +21.8 +41.6 +91.6 +51.7 +Qwen2.5-Math-7B-Instruct +82.6 +6.0 +62.5 +41.6 +46.8 +95.2 +66.8 +Our policy model +78.4 +26.7 +47.5 +47.1 +52.5 +89.7 +65.7 +Figure 6: +Pass@N accuracy with random sampling from different policy models. Compared to the official Qwen instruct version, our policy model exhibits a stronger ability to sample correct solutions. +Figure 7: +Pass@N accuracy with PPM-augmented MCTS. Under the same PPM guidance, the four policy models of varying sizes demonstrate convergent capabilities in sampling correct solutions. +Pass@N. +Table +10 +compares the math reasoning performance of our policy models with the instruct versions developed by the original model team. Our policy models do not consistently outperform the instruct versions. For example, on the Qwen2.5-Math-7B base model, Qwen2.5-Math-7B-Instruct achieves 4.2% higher accuracy on the MATH benchmark. However, in System 2 deep thinking paradigm, the pass@1 accuracy alone does not fully reflect the reasoning capabilities for the policy model. To provide a more comprehensive evaluation, Fig. +6 +and Fig. +7 +present the pass@N accuracy. 
In this metric, the policy model generates +N +N +solutions under two settings: random sampling (Fig. +6 +) and PPM-augmented MCTS deep thinking (Fig. +7 +). A problem is considered solved if one solution is correct. +As shown in Figure +6 +, our policy model achieves comparable Pass@64 accuracy with Qwen2.5-Math-7B-Instruct on MATH and significantly outperforms it on other benchmarks. +Figure +7 +further compares the pass@N accuracy of our four policy models (different sizes) after MCTS deep thinking under the same PPM guidance. We can observe that after generating 64 trajectories, the pass@N accuracy of different policy models becomes comparable. +A.2 +Examples of Reasoning Trajectories +Self-correction Example +Question: The coordinates of a parallelogram are (5, 3), (6, 8), (7, 4) and +( +x +, +y +) +(x,y) +and +x +> +7 +x>7 +. What is the value of +x ++ +y +x+y +Our trajectory +: + +# Import necessary libraries +⬇ +import +numpy +as +np + +# Define the coordinates of the parallelogram +⬇ +A += +np +. +array +([5, +3]) +B += +np +. +array +([6, +8]) +C += +np +. 
+array +([7, +4]) + +# Calculate the vector differences to find the missing coordinate D +# In a parallelogram, the vector difference between two adjacent sides is equal to the vector difference between the other two adjacent sides +# We can use the formula: D = A + (C - B) +⬇ +D += +A ++ +( +C +- +B +) + +# Since the coordinates provided are (5, 3), (6, 8), (7, 4) and (x, y), we can infer that the vector from B to C should be the same as the vector from A to the missing point (x, y) +# This means: (x - 5, y - 3) = (1, -4) because the vector from B to C is (7 - 6, 4 - 8) = (1, -4) +# Solve for x and y +⬇ +x += +5 ++ +1 +y += +3 ++ +(-4) + +# Verify that the point (x, y) exists and satisfies the condition x > 7 +# Since x = 6 is not greater than 7, we need to use the correct formula that maintains the parallelogram property and x > 7 +# Correcting the formula to find the other possible D [ +Self-correction! +] +⬇ +D += +C ++ +( +B +- +A +) + +# Solve for x and y again +⬇ +x += +7 ++ +(6 +- +5) +y += +4 ++ +(8 +- +3) + +# Verify that the point (x, y) exists and satisfies the condition x > 7 +# Since x = 8 is greater than 7, which satisfies the condition given, we can now find the sum of x and y +⬇ +x_plus_y += +x ++ +y + +# Now print the final answer +⬇ +print +( +x_plus_y +) +# This should print the value of x + y + +17 +The value of x + y is +17 + +Math statement proof using Fermat’s little theorem +Question: If ( pt+a ) is a perfect square for some value of ( t ), then ( a +p +− +1 +2 +{}^{\frac{p-1}{2}} +- 1 ) is divisible by ( p ) (where ( p ) is a prime number). 
+Our trajectory for proving +: + +# Step 1: Define the variables +⬇ +from +sympy +import +symbols +, +isprime +, +sqrt +, +mod_inverse +# Define the variables +⬇ +p +, +t +, +a += +symbols +(’ +p +t +a +’)\\ + +# Step 2: Assume p*t + a is a perfect square +# This means there exists an integer k such that p*t + a = k +2 +# Rearrange to find a in terms of p, t, and k +# a = k +2 +- p*t + +# +Step 3: Use Fermat’s Little Theorem +# Fermat’s Little Theorem states that if p is a prime and a is an integer not divisible by p, +then a +(p-1) +≡ +\equiv +1 (mod p) +# We need to show that a +((p-1)/2) +- 1 is divisible by p + +# Step 4: Substitute a = k +2 +- p*t into Fermat’s Little Theorem +# a +((p-1)/2) +≡ +\equiv +(k +2 +- p*t) +((p-1)/2) +(mod p) +# Since p*t is divisible by p, (k +2 +- p*t) +((p-1)/2) +≡ +\equiv +k +(p-1) +(mod p) +# By Fermat’s Little Theorem, k +(p-1) +≡ +\equiv +1 (mod p) + +# Step 5: Therefore, a +((p-1)/2) +≡ +\equiv +1 (mod p) +# This implies a +((p-1)/2) +- 1 +≡ +\equiv +0 (mod p) +# Hence, a +((p-1)/2) +- 1 is divisible by p + +# Now print the final answer +print("The expression a +((p-1)/2) +- 1 is divisible by p.") + + +The expression +a +( +( +p +− +1 +) +/ +2 +) +a^{((p-1)/2)} +- 1 is divisible by p. +The statement is true. + +Problem-solving with application of the Vieta’s formulas and the AM-GM inequality +Question: Let +a +a +be a positive real number such that all the roots of +x +3 ++ +a +​ +x +2 ++ +a +​ +x ++ +1 += +0 +x^{3}+ax^{2}+ax+1=0 +are real. Find the smallest possible value of +a +. +a. 
+Our trajectory for proving +: + +# Step 1: Define the polynomial equation +from sympy import symbols, solve, Eq +# Define the variable and the parameter +x, a = symbols(’x a’) + +[PPM Score: -0.46430] +# Step 2: Define the polynomial equation +polynomial = x**3 + a*x**2 + a*x + 1 + +[PPM Score: 0.53937] +# +Step 3: Use Vieta’s formulas to relate the coefficients to the roots +# Since the polynomial is monic (leading coefficient is 1), the sum of the roots (taken one at a time) is -a +# The sum of the products of the roots taken two at a time is a +# The product of the roots is -1 +# Since all roots are real, we can denote them as r1, r2, r3 +# From Vieta’s formulas: +# r1 + r2 + r3 = -a +# r1*r2 + r2*r3 + r3*r1 = a +# r1*r2*r3 = -1 + +[PPM Score: +0.9989] +# +Step 4: Use the AM-GM inequality to find the minimum value of a +# By AM-GM, (r1*r2 + r2*r3 + r3*r1) / 3 >= (r1*r2*r3) +(2/3) +# Substituting the known values: +# a / 3 >= (-1) +(2/3) +# a / 3 >= 1 +# a >= 3 + +[PPM Score: +0.9999] +# Step 5: Verify if a = 3 is achievable +# If a = 3, then r1 + r2 + r3 = -3 and r1*r2 + r2*r3 + r3*r1 = 3 +# We need to check if there exist real roots that satisfy these conditions +# Consider the case where r1 = r2 = r3 = -1 +# Then r1 + r2 + r3 = -3 and r1*r2 + r2*r3 + r3*r1 = 3, and r1*r2*r3 = -1 +# Thus, a = 3 is achievable with real roots + +[PPM Score: 1.0] +# Step 6: Print the final answer +print(3) + +3 +The smallest possible value of a is +3 +. + +Problem-solving with application the Shoelace Theorem +Question: In the diagram, +D +D +and +E +E +are the midpoints of +A +​ +B +¯ +\overline{AB} +and +B +​ +C +¯ +\overline{BC} +respectively. Determine the area of quadrilateral +D +​ +B +​ +E +​ +F +DBEF +. 
[asy] +size(180); defaultpen(linewidth(.7pt)+fontsize(10pt)); +pair A, B, C, D, E, F; +A=(0,6); +B=(0,0); +C=(8,0); +D=(0,3); +E=(4,0); +F=(8/3,2); +draw(E–A–C–D); +draw((-1,0)–(10,0), EndArrow); +draw((0,-1)–(0,8), EndArrow); +label(" +A +​ +( +0 +, +6 +) +A(0,6) +", A, W); +label(" +B +​ +( +0 +, +0 +) +B(0,0) +", B, SW); +label(" +C +​ +( +8 +, +0 +) +C(8,0) +", C, S); +label(" +D +D +", D, W); +label(" +E +E +", E, S); +label(" +F +F +", F, SW); +label(" +x +x +", (10,0), dir(0)); +label(" +y +y +", (0,8), dir(90)); +[/asy] +Our trajectory for proving +: + +# Step 1: Define the coordinates of points A, B, C, D, and E +⬇ +A += +(0, +6) +B += +(0, +0) +C += +(8, +0) +D += +(0, +3) +# +Midpoint +of +AB +E += +(4, +0) +# +Midpoint +of +BC + +[PPM Score: -0.76639] +# Step 2: Define the coordinates of point F using the section formula (known to be (8/3, 2)) +⬇ +F += +(8/3, +2) + +[PPM Score: 0.8429] +# +Step 3: Use the Shoelace Theorem to find the area of quadrilateral DBEF +# Shoelace Theorem formula: Area = 0.5 * |x1y2 + x2y3 + x3y4 + x4y1 - (y1x2 + y2x3 + y3x4 + y4x1)| +# Coordinates of D, B, E, F are (0, 3), (0, 0), (4, 0), (8/3, 2) respectively +⬇ +x1 +, +y1 += +D +x2 +, +y2 += +B +x3 +, +y3 += +E +x4 +, +y4 += +F +area += +0.5 +* +abs +( +x1 +* +y2 ++ +x2 +* +y3 ++ +x3 +* +y4 ++ +x4 +* +y1 +- +( +y1 +* +x2 ++ +y2 +* +x3 ++ +y3 +* +x4 ++ +y4 +* +x1 +)) + +[PPM Score: +0.9972] +# Step 4: Print the area of quadrilateral DBEF +⬇ +print +( +area +) + +8.0 + +8 + +A.3 +Prompt Examples +MCTS Prompt for DeepSeek-V2-Coder-Instruct (bootstrap round) +You are a powerful agent with broad math knowledge and great python programming skills. You need to use python interpreter to do accurate calculation on math equations. +!!! Remember: +1. Use code solve the problem step by step. The solution should include three parts: , , and . +2. All calculations should be done in python code. Provide concise reasoning and thinking in the comments of the code. +3. 
The most related python packages include ‘math‘, ‘sympy‘, ‘scipy‘, and ‘numpy‘. +4. Please use the following template: +Question: the input question +Construct the code step by step. Use to indicate the end of each step. Ensure your code can execute correctly(excluding ) and print the answer. Avoid undefined variables (NameError), unimported packages, or formatting errors (SyntaxError, TypeError). In the last step of the code, print the final answer and add a comment: Now print the final answer. +Execute the code in using the Python interpreter and display the printed results. +The concise answer without verbose context, put your final answer’s numerical part (without unit, only focus on the numerical part if it’s a choice question) in +boxed. Now! It’s your turn. +Question: +{input} +The following are 2 demonstration examples: +Question: Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? + +# Step 1: Calculate the total weight lifted with two 20-pound weights +total_weight_20 = 2 * 20 * 12 + +# Step 2: Calculate the weight lifted per repetition with two 15-pound weights +weight_per_rep_15 = 2 * 15 + +# Step 3: Calculate the number of repetitions needed to lift the same total weight with two 15-pound weights +reps_needed = total_weight_20 / weight_per_rep_15 + +# Now print the final answer +print(reps_needed) + +16.0 From the result, we can see that Terrell must lift the 15-pound weights +boxed16 times to lift the same total weight. +, +Question: Find the value of +x +x +that satisfies +3 +​ +x ++ +5 +6 +​ +x ++ +5 += +5 +3 +\frac{\sqrt{3x+5}}{\sqrt{6x+5}}=\frac{\sqrt{5}}{3} +. Express your answer as a common fraction. 
+ +from sympy import symbols, Eq, solve, sqrt +# Define the variable x +x = symbols(’x’) + +# Define the equation +equation = Eq(sqrt(3*x + 5) / sqrt(6*x + 5), sqrt(5) / 3) + +# Solve the equation for x +solution = solve(equation, x) + +# Now print the final answer +print(solution) + +[20/3] + From the result, we can see that the value of x is +20 +3 + +MCTS Prompt for our policy SLM +MCTS prompt for next step generation +: +<|user|>: +{input question} +<|assistant|>: Let’s think step by step and solve the problem with code. +{trajectory of previous steps} +SFT Prompt Example +: +<|user|>: +How many numbers can you get by multiplying two or more distinct members of the set +1 +, +2 +, +3 +, +5 +, +11 +\\ +{1,2,3,5,11\\ +} +together? +<|assistant|>: Let’s think step by step and solve the problem with code. + +# Step 1: Create the set of numbers +⬇ +numbers += +{1, +2, +3, +5, +11} + +# Step 2: Define a function to generate all possible products of two or more distinct members of the set +⬇ +from +itertools +import +combinations +def +generate_products +( +numbers +): +products += +set +() +for +r +in +range +(2, +len +( +numbers +) ++ +1): +# +For +combinations +of +2, +3, +4, +and +5 +elements +for +combo +in +combinations +( +numbers +, +r +): +product += +1 +for +num +in +combo +: +product +*= +num +products +. +add +( +product +) +return +products + +# Step 3: Calculate the products +⬇ +unique_products += +generate_products +( +numbers +) + +# Step 4: Calculate the number of unique products +⬇ +num_unique_products += +len +( +unique_products +) + +# Now print the final answer +⬇ +print +( +num_unique_products +) + +15 +From the result, we can see that there are +15 +unique products. + +◄ +Feeling +lucky? 
+Conversion +report +Report +an issue +View original +on arXiv +► \ No newline at end of file diff --git a/research/notes/250109136-agentic-retrieval-augmented-generation-a-survey-on-agentic-rag.md b/research/notes/250109136-agentic-retrieval-augmented-generation-a-survey-on-agentic-rag.md new file mode 100644 index 0000000000000000000000000000000000000000..fa9fcabb47acd0ab4c4b2421dfea5563cbe1bab6 --- /dev/null +++ b/research/notes/250109136-agentic-retrieval-augmented-generation-a-survey-on-agentic-rag.md @@ -0,0 +1,203 @@ +--- +title: '[2501.09136] Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG' +id: 250109136-agentic-retrieval-augmented-generation-a-survey-on-agentic-rag +tags: +- deepread +created: '2026-06-10T00:24:47.557837Z' +source: https://arxiv.org/abs/2501.09136 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:24:47.557707Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2501.09136] Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG +Computer Science > Artificial Intelligence +arXiv:2501.09136 +(cs) +[Submitted on 15 Jan 2025 ( +v1 +), last revised 1 Apr 2026 (this version, v4)] +Title: +Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG +Authors: +Aditi Singh +, +Abul Ehtesham +, +Saket Kumar +, +Tala Talaei Khoei +, +Athanasios V. Vasilakos +View a PDF of the paper titled Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG, by Aditi Singh and 4 other authors +View PDF +HTML (experimental) +Abstract: +Large Language Models (LLMs) have advanced artificial intelligence by enabling human-like text generation and natural language understanding. However, their reliance on static training data limits their ability to respond to dynamic, real-time queries, resulting in outdated or inaccurate outputs. 
Retrieval-Augmented Generation (RAG) has emerged as a solution, enhancing LLMs by integrating real-time data retrieval to provide contextually relevant and up-to-date responses. Despite its promise, traditional RAG systems are constrained by static workflows and lack the adaptability required for multi-step reasoning and complex task management. Agentic Retrieval-Augmented Generation (Agentic RAG) transcends these limitations by embedding autonomous AI agents into the RAG pipeline. These agents leverage agentic design patterns—reflection, planning, tool use, and multi-agent collaboration—to dynamically manage retrieval strategies, iteratively refine contextual understanding, and adapt workflows through operational structures ranging from sequential steps to adaptive collaboration. This integration enables Agentic RAG systems to deliver flexibility, scalability, and context-awareness across diverse applications. This paper presents an analytical survey of Agentic RAG systems. It traces the evolution of RAG paradigms, introduces a principled taxonomy of Agentic RAG architectures based on agent cardinality, control structure, autonomy, and knowledge representation, and provides a comparative analysis of design trade-offs across existing frameworks. The survey examines applications in healthcare, finance, education, and enterprise document processing, and distills practical lessons for system designers and practitioners. Finally, it identifies key open research challenges related to evaluation, coordination, memory management, efficiency, and governance, outlining directions for future research.
+Subjects: +Artificial Intelligence (cs.AI) +; Computation and Language (cs.CL); Information Retrieval (cs.IR) +Cite as: +arXiv:2501.09136 +[cs.AI] +(or +arXiv:2501.09136v4 +[cs.AI] +for this version) +https://doi.org/10.48550/arXiv.2501.09136 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Abul Ehtesham [ +view email +] +[v1] +Wed, 15 Jan 2025 20:40:25 UTC (20,962 KB) +[v2] +Mon, 3 Feb 2025 04:01:36 UTC (22,453 KB) +[v3] +Tue, 4 Feb 2025 04:48:00 UTC (22,430 KB) +[v4] +Wed, 1 Apr 2026 15:51:06 UTC (13,996 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG, by Aditi Singh and 4 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.AI +< prev +| +next > +new +| +recent +| +2025-01 +Change to browse by: +cs +cs.CL +cs.IR +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? 
+) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/250109891-evolving-deeper-llm-thinking.md b/research/notes/250109891-evolving-deeper-llm-thinking.md new file mode 100644 index 0000000000000000000000000000000000000000..7de7575973d75805c55cb6a083e71e8857aceda5 --- /dev/null +++ b/research/notes/250109891-evolving-deeper-llm-thinking.md @@ -0,0 +1,196 @@ +--- +title: '[2501.09891] Evolving Deeper LLM Thinking' +id: 250109891-evolving-deeper-llm-thinking +tags: +- deepread +created: '2026-06-10T00:24:58.674469Z' +source: https://arxiv.org/abs/2501.09891 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:24:58.674344Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2501.09891] Evolving Deeper LLM Thinking +Computer Science > Artificial Intelligence +arXiv:2501.09891 +(cs) +[Submitted on 17 Jan 2025] +Title: +Evolving Deeper LLM Thinking +Authors: +Kuang-Huei Lee +, +Ian Fischer +, +Yueh-Hua Wu +, +Dave Marwood +, +Shumeet Baluja +, +Dale 
Schuurmans +, +Xinyun Chen +View a PDF of the paper titled Evolving Deeper LLM Thinking, by Kuang-Huei Lee and 6 other authors +View PDF +HTML (experimental) +Abstract: +We explore an evolutionary search strategy for scaling inference time compute in Large Language Models. The proposed approach, Mind Evolution, uses a language model to generate, recombine and refine candidate responses. The proposed approach avoids the need to formalize the underlying inference problem whenever a solution evaluator is available. Controlling for inference cost, we find that Mind Evolution significantly outperforms other inference strategies such as Best-of-N and Sequential Revision in natural language planning tasks. In the TravelPlanner and Natural Plan benchmarks, Mind Evolution solves more than 98% of the problem instances using Gemini 1.5 Pro without the use of a formal solver. +Subjects: +Artificial Intelligence (cs.AI) +Cite as: +arXiv:2501.09891 +[cs.AI] +(or +arXiv:2501.09891v1 +[cs.AI] +for this version) +https://doi.org/10.48550/arXiv.2501.09891 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Dale Schuurmans [ +view email +] +[v1] +Fri, 17 Jan 2025 00:41:44 UTC (3,183 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Evolving Deeper LLM Thinking, by Kuang-Huei Lee and 6 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.AI +< prev +| +next > +new +| +recent +| +2025-01 +Change to browse by: +cs +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? 
+) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250112599-kimi-k15-scaling-reinforcement-learning-with-llms.md b/research/notes/250112599-kimi-k15-scaling-reinforcement-learning-with-llms.md new file mode 100644 index 0000000000000000000000000000000000000000..ed24746bf2c90b5eecf0bc53b02df7665e84ad5f --- /dev/null +++ b/research/notes/250112599-kimi-k15-scaling-reinforcement-learning-with-llms.md @@ -0,0 +1,386 @@ +--- +title: '[2501.12599] Kimi k1.5: Scaling Reinforcement Learning with LLMs' +id: 250112599-kimi-k15-scaling-reinforcement-learning-with-llms +tags: +- deepread +created: '2026-06-10T00:24:53.655467Z' +source: https://arxiv.org/abs/2501.12599 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:24:53.655188Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2501.12599] Kimi k1.5: Scaling Reinforcement Learning with LLMs +Computer Science > Artificial Intelligence +arXiv:2501.12599 +(cs) +[Submitted on 22 Jan 2025 ( +v1 +), last revised 3 Jun 2025 (this version, v4)] +Title: +Kimi k1.5: Scaling Reinforcement Learning with LLMs +Authors: +Kimi Team +, +Angang Du +, +Bofei Gao +, +Bowei Xing +, +Changjiu Jiang +, +Cheng Chen +, +Cheng Li +, +Chenjun Xiao +, +Chenzhuang Du +, +Chonghua Liao +, +Chuning Tang +, +Congcong Wang +, +Dehao Zhang +, +Enming Yuan +, +Enzhe Lu +, +Fengxiang Tang +, +Flood Sung +, +Guangda Wei +, +Guokun Lai +, +Haiqing Guo +, +Han Zhu +, +Hao Ding +, +Hao Hu +, +Hao Yang +, +Hao Zhang +, +Haotian Yao +, +Haotian Zhao +, +Haoyu Lu +, +Haoze Li +, +Haozhen Yu +, +Hongcheng Gao +, +Huabin Zheng +, +Huan Yuan +, +Jia Chen +, +Jianhang Guo +, +Jianlin Su +, +Jianzhou Wang +, +Jie Zhao +, +Jin Zhang +, +Jingyuan Liu +, +Junjie Yan +, +Junyan Wu +, +Lidong Shi +, +Ling Ye +, +Longhui Yu +, +Mengnan Dong +, +Neo Zhang +, +Ningchen Ma +, +Qiwei Pan +, +Qucheng Gong +, +Shaowei Liu +, +Shengling Ma +, +Shupeng Wei +, +Sihan Cao +, +Siying Huang +, +Tao Jiang 
+, +Weihao Gao +, +Weimin Xiong +, +Weiran He +, +Weixiao Huang +, +Weixin Xu +, +Wenhao Wu +, +Wenyang He +, +Xianghui Wei +, +Xianqing Jia +, +Xingzhe Wu +, +Xinran Xu +, +Xinxing Zu +, +Xinyu Zhou +, +Xuehai Pan +, +Y. Charles +, +Yang Li +, +Yangyang Hu +, +Yangyang Liu +, +Yanru Chen +, +Yejie Wang +, +Yibo Liu +, +Yidao Qin +, +Yifeng Liu +, +Ying Yang +, +Yiping Bao +, +Yulun Du +, +Yuxin Wu +, +Yuzhi Wang +, +Zaida Zhou +, +Zhaoji Wang +, +Zhaowei Li +, +Zhen Zhu +, +Zheng Zhang +, +Zhexu Wang +, +Zhilin Yang +, +Zhiqi Huang +, +Zihao Huang +, +Ziyao Xu +, +Zonghan Yang +, +Zongyu Lin +View a PDF of the paper titled Kimi k1.5: Scaling Reinforcement Learning with LLMs, by Kimi Team and 95 other authors +View PDF +HTML (experimental) +Abstract: +Language model pretraining with next token prediction has proved effective for scaling compute but is limited to the amount of available training data. Scaling reinforcement learning (RL) unlocks a new axis for the continued improvement of artificial intelligence, with the promise that large language models (LLMs) can scale their training data by learning to explore with rewards. However, prior published work has not produced competitive results. In light of this, we report on the training practice of Kimi k1.5, our latest multi-modal LLM trained with RL, including its RL training techniques, multi-modal data recipes, and infrastructure optimization. Long context scaling and improved policy optimization methods are key ingredients of our approach, which establishes a simplistic, effective RL framework without relying on more complex techniques such as Monte Carlo tree search, value functions, and process reward models. Notably, our system achieves state-of-the-art reasoning performance across multiple benchmarks and modalities -- e.g., 77.5 on AIME, 96.2 on MATH 500, 94-th percentile on Codeforces, 74.9 on MathVista -- matching OpenAI's o1. 
Moreover, we present effective long2short methods that use long-CoT techniques to improve short-CoT models, yielding state-of-the-art short-CoT reasoning results -- e.g., 60.8 on AIME, 94.6 on MATH500, 47.3 on LiveCodeBench -- outperforming existing short-CoT models such as GPT-4o and Claude Sonnet 3.5 by a large margin (up to +550%). +Comments: +25 pages +Subjects: +Artificial Intelligence (cs.AI) +; Machine Learning (cs.LG) +Cite as: +arXiv:2501.12599 +[cs.AI] +(or +arXiv:2501.12599v4 +[cs.AI] +for this version) +https://doi.org/10.48550/arXiv.2501.12599 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Flood Sung [ +view email +] +[v1] +Wed, 22 Jan 2025 02:48:14 UTC (614 KB) +[v2] +Wed, 5 Mar 2025 02:16:32 UTC (614 KB) +[v3] +Wed, 28 May 2025 03:57:30 UTC (614 KB) +[v4] +Tue, 3 Jun 2025 02:14:54 UTC (603 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Kimi k1.5: Scaling Reinforcement Learning with LLMs, by Kimi Team and 95 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.AI +< prev +| +next > +new +| +recent +| +2025-01 +Change to browse by: +cs +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? 
+) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250118512-streaming-diloco-with-overlapping-communication-towards-a-distributed.md b/research/notes/250118512-streaming-diloco-with-overlapping-communication-towards-a-distributed.md new file mode 100644 index 0000000000000000000000000000000000000000..026758bd2ae72d030117a4ff8ea83ff8ca218f65 --- /dev/null +++ b/research/notes/250118512-streaming-diloco-with-overlapping-communication-towards-a-distributed.md @@ -0,0 +1,206 @@ +--- +title: '[2501.18512] Streaming DiLoCo with overlapping communication: Towards a Distributed + Free Lunch' +id: 250118512-streaming-diloco-with-overlapping-communication-towards-a-distributed +tags: +- deepread +created: '2026-06-10T00:30:21.211856Z' +source: https://arxiv.org/abs/2501.18512 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:30:21.211709Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2501.18512] Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch +Computer Science > Computation and Language +arXiv:2501.18512 +(cs) +[Submitted on 30 Jan 2025] +Title: +Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch +Authors: +Arthur Douillard +, +Yanislav Donchev +, +Keith Rush +, +Satyen Kale +, +Zachary Charles +, +Zachary Garrett +, +Gabriel Teston +, +Dave Lacey +, +Ross McIlroy +, +Jiajun Shen +, +Alexandre Ramé +, +Arthur Szlam +, +Marc'Aurelio Ranzato +, +Paul Barham +View a PDF of the paper titled Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch, by Arthur Douillard and Yanislav Donchev and Keith Rush and Satyen Kale and Zachary Charles and Zachary Garrett and Gabriel Teston and Dave Lacey and Ross McIlroy and Jiajun Shen and Alexandre Ram\'e and Arthur Szlam and Marc'Aurelio Ranzato and Paul Barham +View PDF +HTML (experimental) +Abstract: +Training of large language models (LLMs) is typically 
distributed across a large number of accelerators to reduce training time. Since internal states and parameter gradients need to be exchanged at each and every single gradient step, all devices need to be co-located using low-latency high-bandwidth communication links to support the required high volume of exchanged bits. Recently, distributed algorithms like DiLoCo have relaxed this co-location constraint: accelerators can be grouped into "workers", where synchronizations between workers only occur infrequently. This in turn means that workers can afford being connected by lower bandwidth communication links without affecting learning quality. However, in these methods, communication across workers still requires the same peak bandwidth as before, as the synchronizations require all parameters to be exchanged across all workers. In this paper, we improve DiLoCo in three ways. First, we synchronize only subsets of parameters in sequence, rather than all at once, which greatly reduces peak bandwidth. Second, we allow workers to continue training while synchronizing, which decreases wall clock time. Third, we quantize the data exchanged by workers, which further reduces bandwidth across workers. By properly combining these modifications, we show experimentally that we can distribute training of billion-scale parameters and reach similar quality as before, but reducing required bandwidth by two orders of magnitude. 
+Subjects: +Computation and Language (cs.CL) +Cite as: +arXiv:2501.18512 +[cs.CL] +(or +arXiv:2501.18512v1 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2501.18512 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Arthur Douillard [ +view email +] +[v1] +Thu, 30 Jan 2025 17:23:50 UTC (3,278 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch, by Arthur Douillard and Yanislav Donchev and Keith Rush and Satyen Kale and Zachary Charles and Zachary Garrett and Gabriel Teston and Dave Lacey and Ross McIlroy and Jiajun Shen and Alexandre Ram\'e and Arthur Szlam and Marc'Aurelio Ranzato and Paul Barham +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2025-01 +Change to browse by: +cs +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? 
+) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/250118639-a-comprehensive-survey-of-the-lean-4-theorem-prover-architecture-appli.md b/research/notes/250118639-a-comprehensive-survey-of-the-lean-4-theorem-prover-architecture-appli.md new file mode 100644 index 0000000000000000000000000000000000000000..e14838a352d8126b4f6d4560de606bc94f63d325 --- /dev/null +++ b/research/notes/250118639-a-comprehensive-survey-of-the-lean-4-theorem-prover-architecture-appli.md @@ -0,0 +1,179 @@ +--- +title: '[2501.18639] A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, + Applications, and Advances' +id: 250118639-a-comprehensive-survey-of-the-lean-4-theorem-prover-architecture-appli +tags: +- deepread +created: '2026-06-10T00:25:14.929249Z' +source: https://arxiv.org/abs/2501.18639 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:25:14.929112Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2501.18639] A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, Applications, and 
Advances +Computer Science > Logic in Computer Science +arXiv:2501.18639 +(cs) +[Submitted on 28 Jan 2025] +Title: +A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, Applications, and Advances +Authors: +Xichen Tang +View a PDF of the paper titled A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, Applications, and Advances, by Xichen Tang +View PDF +Abstract: +This comprehensive survey examines Lean 4, a state-of-the-art interactive theorem prover and functional programming language. We analyze its architectural design, type system, metaprogramming capabilities, and practical applications in formal verification and mathematics. Through detailed comparisons with other proof assistants and extensive case studies, we demonstrate Lean 4's unique advantages in proof automation, performance, and usability. The paper also explores recent developments in its ecosystem, including libraries, tools, and educational applications, providing insights into its growing impact on formal methods and mathematical formalization. +Subjects: +Logic in Computer Science (cs.LO) +; Programming Languages (cs.PL) +Cite as: +arXiv:2501.18639 +[cs.LO] +(or +arXiv:2501.18639v1 +[cs.LO] +for this version) +https://doi.org/10.48550/arXiv.2501.18639 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Xichen Tang [ +view email +] +[v1] +Tue, 28 Jan 2025 17:15:54 UTC (2,729 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, Applications, and Advances, by Xichen Tang +View PDF +view license +Current browse context: +cs.LO +< prev +| +next > +new +| +recent +| +2025-01 +Change to browse by: +cs +cs.PL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... 
+Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250202047-amasquad-a-benchmark-for-amharic-extractive-question-answering.md b/research/notes/250202047-amasquad-a-benchmark-for-amharic-extractive-question-answering.md new file mode 100644 index 0000000000000000000000000000000000000000..7d8254e71f933483236ebbec2430872f47677dbc --- /dev/null +++ b/research/notes/250202047-amasquad-a-benchmark-for-amharic-extractive-question-answering.md @@ -0,0 +1,183 @@ +--- +title: '[2502.02047] AmaSQuAD: A Benchmark for Amharic Extractive Question Answering' +id: 250202047-amasquad-a-benchmark-for-amharic-extractive-question-answering +tags: +- deepread +created: '2026-06-10T00:24:12.341628Z' +source: https://arxiv.org/abs/2502.02047 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:24:12.341471Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2502.02047] AmaSQuAD: A Benchmark for Amharic Extractive Question Answering +Computer Science > Computation and Language +arXiv:2502.02047 +(cs) +[Submitted on 4 Feb 2025] +Title: +AmaSQuAD: A Benchmark for Amharic Extractive Question Answering +Authors: +Nebiyou Daniel Hailemariam +, +Blessed Guda +, +Tsegazeab Tefferi +View a PDF of the paper titled AmaSQuAD: A Benchmark for Amharic Extractive Question Answering, by Nebiyou Daniel Hailemariam and 2 other authors +View PDF +HTML (experimental) +Abstract: +This research presents a novel framework for translating extractive question-answering datasets into low-resource languages, as demonstrated by the creation of the AmaSQuAD dataset, a translation of SQuAD 2.0 into Amharic. The methodology addresses challenges related to misalignment between translated questions and answers, as well as the presence of multiple answer instances in the translated context. For this purpose, we used cosine similarity utilizing embeddings from a fine-tuned BERT-based model for Amharic and Longest Common Subsequence (LCS). 
Additionally, we fine-tune the XLM-R model on the AmaSQuAD synthetic dataset for Amharic Question-Answering. The results show an improvement in baseline performance, with the fine-tuned model achieving an increase in the F1 score from 36.55% to 44.41% and 50.01% to 57.5% on the AmaSQuAD development dataset. Moreover, the model demonstrates improvement on the human-curated AmQA dataset, increasing the F1 score from 67.80% to 68.80% and the exact match score from 52.50% to 52.66%. The AmaSQuAD dataset is publicly available Datasets +Subjects: +Computation and Language (cs.CL) +Cite as: +arXiv:2502.02047 +[cs.CL] +(or +arXiv:2502.02047v1 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2502.02047 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Blessed Guda [ +view email +] +[v1] +Tue, 4 Feb 2025 06:27:39 UTC (778 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled AmaSQuAD: A Benchmark for Amharic Extractive Question Answering, by Nebiyou Daniel Hailemariam and 2 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2025-02 +Change to browse by: +cs +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? 
+) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250210915-first-passage-times-with-fast-immigration.md b/research/notes/250210915-first-passage-times-with-fast-immigration.md new file mode 100644 index 0000000000000000000000000000000000000000..84fa2f8790935e24d7af9191ce27d6f1809eb9ca --- /dev/null +++ b/research/notes/250210915-first-passage-times-with-fast-immigration.md @@ -0,0 +1,186 @@ +--- +title: '[2502.10915] First passage times with fast immigration' +id: 250210915-first-passage-times-with-fast-immigration +tags: +- deepread +created: '2026-06-10T00:41:13.957308Z' +source: https://arxiv.org/abs/2502.10915 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:41:13.957158Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2502.10915] First passage times with fast immigration +Mathematics > Probability +arXiv:2502.10915 +(math) +[Submitted on 15 Feb 2025] +Title: +First passage times with fast immigration +Authors: +Hwai-Ray Tung +, +Sean D Lawley +View a PDF of the paper titled First passage times with fast immigration, by Hwai-Ray Tung and Sean D Lawley +View PDF +HTML (experimental) +Abstract: +Many scientific questions can be framed as asking for a first passage time (FPT), which generically describes the time it takes a random "searcher" to find a "target." The important timescale in a variety of biophysical systems is the time it takes the fastest searcher(s) to find a target out of many searchers. Previous work on such fastest FPTs assumes that all searchers are initially present in the domain, which makes the problem amenable to extreme value theory. In this paper, we consider an alternative model in which searchers progressively enter the domain at a constant "immigration" rate. In the fast immigration rate limit, we determine the probability distribution and moments of the $k$-th fastest FPT. 
Our rigorous theory applies to many models of stochastic motion, including random walks on discrete networks and diffusion on continuous state spaces. Mathematically, our analysis involves studying the extrema of an infinite sequence of random variables which are both not independent and not identically distributed. Our results constitute a rare instance in which extreme value statistics can be determined exactly for strongly correlated random variables. +Comments: +25 pages, 1 figure +Subjects: +Probability (math.PR) +MSC +classes: +60G70, 92B99, 60J60 +Cite as: +arXiv:2502.10915 +[math.PR] +(or +arXiv:2502.10915v1 +[math.PR] +for this version) +https://doi.org/10.48550/arXiv.2502.10915 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Sean Lawley [ +view email +] +[v1] +Sat, 15 Feb 2025 21:52:42 UTC (6,228 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled First passage times with fast immigration, by Hwai-Ray Tung and Sean D Lawley +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +math.PR +< prev +| +next > +new +| +recent +| +2025-02 +Change to browse by: +math +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? 
+) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250212996-eager-updates-for-overlapped-communication-and-computation-in-diloco.md b/research/notes/250212996-eager-updates-for-overlapped-communication-and-computation-in-diloco.md new file mode 100644 index 0000000000000000000000000000000000000000..4a2bd5dac4cb648310ff364af65fe303b8cf985f --- /dev/null +++ b/research/notes/250212996-eager-updates-for-overlapped-communication-and-computation-in-diloco.md @@ -0,0 +1,187 @@ +--- +title: '[2502.12996] Eager Updates For Overlapped Communication and Computation in + DiLoCo' +id: 250212996-eager-updates-for-overlapped-communication-and-computation-in-diloco +tags: +- deepread +created: '2026-06-10T00:35:23.460079Z' +source: https://arxiv.org/abs/2502.12996 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:35:23.459912Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2502.12996] Eager Updates For Overlapped Communication and Computation in DiLoCo +Computer Science > Computation and Language +arXiv:2502.12996 +(cs) +[Submitted on 18 Feb 2025] +Title: +Eager Updates For Overlapped Communication and Computation in DiLoCo +Authors: +Satyen Kale +, +Arthur Douillard +, +Yanislav Donchev +View a PDF of the paper titled Eager Updates For Overlapped Communication and Computation in DiLoCo, by Satyen Kale and Arthur Douillard and Yanislav Donchev +View PDF +HTML (experimental) +Abstract: +Distributed optimization methods such as DiLoCo have been shown to be effective in training very large models across multiple distributed workers, such as datacenters. These methods split updates into two parts: an inner optimization phase, where the workers independently execute multiple optimization steps on their own local data, and an outer optimization step, where the inner updates are synchronized. 
While such approaches require orders of magnitude less communication than standard data-parallel training, in settings where the workers are datacenters, even the limited communication requirements of these approaches can still cause significant slow downs due to the blocking necessary at each outer optimization step. In this paper, we investigate techniques to mitigate this issue by overlapping communication with computation in a manner that allows the outer optimization step to fully overlap with the inner optimization phase. We show that a particular variant, dubbed eager updates, provides competitive performance with standard DiLoCo in settings with low bandwidth between workers. +Comments: +arXiv admin note: text overlap with +arXiv:2501.18512 +Subjects: +Computation and Language (cs.CL) +Cite as: +arXiv:2502.12996 +[cs.CL] +(or +arXiv:2502.12996v1 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2502.12996 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Arthur Douillard [ +view email +] +[v1] +Tue, 18 Feb 2025 16:16:14 UTC (1,257 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Eager Updates For Overlapped Communication and Computation in DiLoCo, by Satyen Kale and Arthur Douillard and Yanislav Donchev +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2025-02 +Change to browse by: +cs +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? 
+) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250314476-dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md b/research/notes/250314476-dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md new file mode 100644 index 0000000000000000000000000000000000000000..71e5619fd9e5debae2cb95c5088d3a09958fc334 --- /dev/null +++ b/research/notes/250314476-dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md @@ -0,0 +1,261 @@ +--- +title: '[2503.14476] DAPO: An Open-Source LLM Reinforcement Learning System at Scale' +id: 250314476-dapo-an-open-source-llm-reinforcement-learning-system-at-scale +tags: +- deepread +created: '2026-06-09T23:20:59.609829Z' +source: https://arxiv.org/abs/2503.14476 +source_domain: arxiv.org +fetched_at: '2026-06-09T23:20:59.609711Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2503.14476] DAPO: An Open-Source LLM Reinforcement Learning System at Scale +Computer Science > Machine Learning +arXiv:2503.14476 +(cs) +[Submitted on 18 Mar 2025 ( +v1 +), last revised 20 May 2025 (this version, v2)] +Title: +DAPO: An Open-Source LLM Reinforcement Learning System at Scale +Authors: +Qiying Yu +, +Zheng Zhang +, +Ruofei Zhu +, +Yufeng Yuan +, +Xiaochen Zuo +, +Yu Yue +, +Weinan Dai +, +Tiantian Fan +, +Gaohong Liu +, +Lingjun Liu +, +Xin Liu +, +Haibin Lin +, +Zhiqi Lin +, +Bole Ma +, +Guangming Sheng +, +Yuxuan Tong +, +Chi Zhang +, +Mofan Zhang +, +Wang Zhang +, +Hang Zhu +, +Jinhua Zhu +, +Jiaze Chen +, +Jiangjie Chen +, +Chengyi Wang +, +Hongli Yu +, +Yuxuan Song +, +Xiangpeng Wei +, +Hao Zhou +, +Jingjing Liu +, +Wei-Ying Ma +, +Ya-Qin Zhang +, +Lin Yan +, +Mu Qiao +, +Yonghui Wu +, +Mingxuan Wang +View a PDF of the paper titled DAPO: An Open-Source LLM Reinforcement Learning System at Scale, by Qiying Yu and 34 other authors +View PDF +HTML (experimental) +Abstract: +Inference scaling empowers LLMs with unprecedented reasoning 
ability, with reinforcement learning as the core technique to elicit complex reasoning. However, key technical details of state-of-the-art reasoning LLMs are concealed (such as in OpenAI o1 blog and DeepSeek R1 technical report), thus the community still struggles to reproduce their RL training results. We propose the $\textbf{D}$ecoupled Clip and $\textbf{D}$ynamic s$\textbf{A}$mpling $\textbf{P}$olicy $\textbf{O}$ptimization ($\textbf{DAPO}$) algorithm, and fully open-source a state-of-the-art large-scale RL system that achieves 50 points on AIME 2024 using Qwen2.5-32B base model. Unlike previous works that withhold training details, we introduce four key techniques of our algorithm that make large-scale LLM RL a success. In addition, we open-source our training code, which is built on the verl framework, along with a carefully curated and processed dataset. These components of our open-source system enhance reproducibility and support future research in large-scale LLM RL. +Comments: +Project Page: +this https URL +Subjects: +Machine Learning (cs.LG) +; Computation and Language (cs.CL) +Cite as: +arXiv:2503.14476 +[cs.LG] +(or +arXiv:2503.14476v2 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2503.14476 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Qiying Yu [ +view email +] +[v1] +Tue, 18 Mar 2025 17:49:06 UTC (4,369 KB) +[v2] +Tue, 20 May 2025 01:37:34 UTC (4,369 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled DAPO: An Open-Source LLM Reinforcement Learning System at Scale, by Qiying Yu and 34 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2025-03 +Change to browse by: +cs +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... 
+Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250320783-understanding-r1-zero-like-training-a-critical-perspective.md b/research/notes/250320783-understanding-r1-zero-like-training-a-critical-perspective.md new file mode 100644 index 0000000000000000000000000000000000000000..ad82ac91352c5cad8ba49239dc61530b1f423c78 --- /dev/null +++ b/research/notes/250320783-understanding-r1-zero-like-training-a-critical-perspective.md @@ -0,0 +1,212 @@ +--- +title: '[2503.20783] Understanding R1-Zero-Like Training: A Critical Perspective' +id: 250320783-understanding-r1-zero-like-training-a-critical-perspective +tags: +- deepread +created: '2026-06-09T23:20:56.927541Z' +source: https://arxiv.org/abs/2503.20783 +source_domain: arxiv.org +fetched_at: '2026-06-09T23:20:56.927395Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2503.20783] Understanding R1-Zero-Like Training: A Critical Perspective +Computer Science > Machine Learning +arXiv:2503.20783 +(cs) +[Submitted on 26 Mar 2025 ( +v1 +), last revised 6 Oct 2025 (this version, v2)] +Title: +Understanding R1-Zero-Like Training: A Critical Perspective +Authors: +Zichen Liu +, +Changyu Chen +, +Wenjun Li +, +Penghui Qi +, +Tianyu Pang +, +Chao Du +, +Wee Sun Lee +, +Min Lin +View a PDF of the paper titled Understanding R1-Zero-Like Training: A Critical Perspective, by Zichen Liu and 7 other authors +View PDF +HTML (experimental) +Abstract: +DeepSeek-R1-Zero has shown that reinforcement learning (RL) at scale can directly enhance the reasoning capabilities of LLMs without supervised fine-tuning. In this work, we critically examine R1-Zero-like training by analyzing its two core components: base models and RL. We investigate a wide range of base models, including DeepSeek-V3-Base, to understand how pretraining characteristics influence RL performance. 
Our analysis reveals that DeepSeek-V3-Base already exhibit "Aha moment", while Qwen2.5 base models demonstrate strong reasoning capabilities even without prompt templates, suggesting potential pretraining biases. Additionally, we identify an optimization bias in Group Relative Policy Optimization (GRPO), which artificially increases response length (especially for incorrect outputs) during training. To address this, we introduce Dr. GRPO, an unbiased optimization method that improves token efficiency while maintaining reasoning performance. Leveraging these insights, we present a minimalist R1-Zero recipe that achieves 43.3% accuracy on AIME 2024 with a 7B base model, establishing a new state-of-the-art. Our code is available at +this https URL +. +Subjects: +Machine Learning (cs.LG) +; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) +Cite as: +arXiv:2503.20783 +[cs.LG] +(or +arXiv:2503.20783v2 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2503.20783 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Zichen Liu [ +view email +] +[v1] +Wed, 26 Mar 2025 17:59:14 UTC (2,551 KB) +[v2] +Mon, 6 Oct 2025 09:30:03 UTC (1,366 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Understanding R1-Zero-Like Training: A Critical Perspective, by Zichen Liu and 7 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2025-03 +Change to browse by: +cs +cs.AI +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? 
+) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250407164-r2e-gym-procedural-environments-and-hybrid-verifiers-for-scaling-open.md b/research/notes/250407164-r2e-gym-procedural-environments-and-hybrid-verifiers-for-scaling-open.md new file mode 100644 index 0000000000000000000000000000000000000000..817c39210d5046c9fdffb0d0883e87a171f7acc8 --- /dev/null +++ b/research/notes/250407164-r2e-gym-procedural-environments-and-hybrid-verifiers-for-scaling-open.md @@ -0,0 +1,196 @@ +--- +title: '[2504.07164] R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling + Open-Weights SWE Agents' +id: 250407164-r2e-gym-procedural-environments-and-hybrid-verifiers-for-scaling-open +tags: +- deepread +created: '2026-06-10T00:23:07.985532Z' +source: https://arxiv.org/abs/2504.07164 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:23:07.985299Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2504.07164] R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents +Computer Science > Software Engineering +arXiv:2504.07164 +(cs) +[Submitted on 9 Apr 2025] +Title: +R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents +Authors: +Naman Jain +, +Jaskirat Singh +, +Manish Shetty +, +Liang Zheng +, +Koushik Sen +, +Ion Stoica +View a PDF of the paper titled R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents, by Naman Jain and 5 other authors +View PDF +HTML (experimental) +Abstract: +Improving open-source models on real-world SWE tasks (solving GITHUB issues) faces two key challenges: 1) scalable curation of execution environments to train these models, and, 2) optimal scaling of test-time compute. We introduce AgentGym, the largest procedurally-curated executable gym environment for training real-world SWE-agents, consisting of more than 8.7K tasks. 
AgentGym is powered by two main contributions: 1) SYNGEN: a synthetic data curation recipe that enables scalable curation of executable environments using test-generation and back-translation directly from commits, thereby reducing reliance on human-written issues or unit tests. We show that this enables more scalable training leading to pass@1 performance of 34.4% on SWE-Bench Verified benchmark with our 32B model. 2) Hybrid Test-time Scaling: we provide an in-depth analysis of two test-time scaling axes; execution-based and execution-free verifiers, demonstrating that they exhibit complementary strengths and limitations. Test-based verifiers suffer from low distinguishability, while execution-free verifiers are biased and often rely on stylistic features. Surprisingly, we find that while each approach individually saturates around 42-43%, significantly higher gains can be obtained by leveraging their complementary strengths. Overall, our approach achieves 51% on the SWE-Bench Verified benchmark, reflecting a new state-of-the-art for open-weight SWE-agents and for the first time showing competitive performance with proprietary models such as o1, o1-preview and sonnet-3.5-v2 (with tools). We will open-source our environments, models, and agent trajectories. 
+Comments: +Website: +this https URL +Subjects: +Software Engineering (cs.SE) +; Computation and Language (cs.CL); Machine Learning (cs.LG) +Cite as: +arXiv:2504.07164 +[cs.SE] +(or +arXiv:2504.07164v1 +[cs.SE] +for this version) +https://doi.org/10.48550/arXiv.2504.07164 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Naman Jain [ +view email +] +[v1] +Wed, 9 Apr 2025 17:55:19 UTC (14,303 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents, by Naman Jain and 5 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.SE +< prev +| +next > +new +| +recent +| +2025-04 +Change to browse by: +cs +cs.CL +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? 
+) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/250421798-swe-smith-scaling-data-for-software-engineering-agents.md b/research/notes/250421798-swe-smith-scaling-data-for-software-engineering-agents.md new file mode 100644 index 0000000000000000000000000000000000000000..2fe3df9c277804fcdbddf281eafad0802d1854d0 --- /dev/null +++ b/research/notes/250421798-swe-smith-scaling-data-for-software-engineering-agents.md @@ -0,0 +1,209 @@ +--- +title: '[2504.21798] SWE-smith: Scaling Data for Software Engineering Agents' +id: 250421798-swe-smith-scaling-data-for-software-engineering-agents +tags: +- deepread +created: '2026-06-10T00:22:56.761688Z' +source: https://arxiv.org/abs/2504.21798 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:22:56.761560Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2504.21798] SWE-smith: Scaling Data for Software Engineering Agents +Computer Science > Software Engineering +arXiv:2504.21798 +(cs) +[Submitted on 30 Apr 2025 ( +v1 +), last revised 21 May 2025 (this version, v2)] +Title: +SWE-smith: Scaling Data for 
Software Engineering Agents +Authors: +John Yang +, +Kilian Lieret +, +Carlos E. Jimenez +, +Alexander Wettig +, +Kabir Khandpur +, +Yanzhe Zhang +, +Binyuan Hui +, +Ofir Press +, +Ludwig Schmidt +, +Diyi Yang +View a PDF of the paper titled SWE-smith: Scaling Data for Software Engineering Agents, by John Yang and 9 other authors +View PDF +HTML (experimental) +Abstract: +Despite recent progress in Language Models (LMs) for software engineering, collecting training data remains a significant pain point. Existing datasets are small, with at most 1,000s of training instances from 11 or fewer GitHub repositories. The procedures to curate such datasets are often complex, necessitating hundreds of hours of human labor; companion execution environments also take up several terabytes of storage, severely limiting their scalability and usability. To address this pain point, we introduce SWE-smith, a novel pipeline for generating software engineering training data at scale. Given any Python codebase, SWE-smith constructs a corresponding execution environment, then automatically synthesizes 100s to 1,000s of task instances that break existing test(s) in the codebase. Using SWE-smith, we create a dataset of 50k instances sourced from 128 GitHub repositories, an order of magnitude larger than all previous works. We train SWE-agent-LM-32B, achieving 40.2% Pass@1 resolve rate on the SWE-bench Verified benchmark, state of the art among open source models. We open source SWE-smith (collection procedure, task instances, trajectories, models) to lower the barrier of entry for research in LM systems for automated software engineering. All assets available at +this https URL +. 
+Comments: +All assets available at +this https URL +Subjects: +Software Engineering (cs.SE) +; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) +Cite as: +arXiv:2504.21798 +[cs.SE] +(or +arXiv:2504.21798v2 +[cs.SE] +for this version) +https://doi.org/10.48550/arXiv.2504.21798 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: John Yang B [ +view email +] +[v1] +Wed, 30 Apr 2025 16:56:06 UTC (2,670 KB) +[v2] +Wed, 21 May 2025 17:21:45 UTC (2,691 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled SWE-smith: Scaling Data for Software Engineering Agents, by John Yang and 9 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.SE +< prev +| +next > +new +| +recent +| +2025-04 +Change to browse by: +cs +cs.AI +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? 
+) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/250613585-minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attent.md b/research/notes/250613585-minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attent.md new file mode 100644 index 0000000000000000000000000000000000000000..a8e713af37963f190b5ef4771167db4b4cc4758c --- /dev/null +++ b/research/notes/250613585-minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attent.md @@ -0,0 +1,446 @@ +--- +title: '[2506.13585] MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning + Attention' +id: 250613585-minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attent +tags: +- deepread +created: '2026-06-09T23:28:26.873770Z' +source: https://arxiv.org/abs/2506.13585 +source_domain: arxiv.org +fetched_at: '2026-06-09T23:28:26.873612Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2506.13585] MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention +Computer Science > Computation and Language +arXiv:2506.13585 +(cs) +[Submitted on 
16 Jun 2025] +Title: +MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention +Authors: +MiniMax +: +Aili Chen +, +Aonian Li +, +Bangwei Gong +, +Binyang Jiang +, +Bo Fei +, +Bo Yang +, +Boji Shan +, +Changqing Yu +, +Chao Wang +, +Cheng Zhu +, +Chengjun Xiao +, +Chengyu Du +, +Chi Zhang +, +Chu Qiao +, +Chunhao Zhang +, +Chunhui Du +, +Congchao Guo +, +Da Chen +, +Deming Ding +, +Dianjun Sun +, +Dong Li +, +Enwei Jiao +, +Haigang Zhou +, +Haimo Zhang +, +Han Ding +, +Haohai Sun +, +Haoyu Feng +, +Huaiguang Cai +, +Haichao Zhu +, +Jian Sun +, +Jiaqi Zhuang +, +Jiaren Cai +, +Jiayuan Song +, +Jin Zhu +, +Jingyang Li +, +Jinhao Tian +, +Jinli Liu +, +Junhao Xu +, +Junjie Yan +, +Junteng Liu +, +Junxian He +, +Kaiyi Feng +, +Ke Yang +, +Kecheng Xiao +, +Le Han +, +Leyang Wang +, +Lianfei Yu +, +Liheng Feng +, +Lin Li +, +Lin Zheng +, +Linge Du +, +Lingyu Yang +, +Lunbin Zeng +, +Minghui Yu +, +Mingliang Tao +, +Mingyuan Chi +, +Mozhi Zhang +, +Mujie Lin +, +Nan Hu +, +Nongyu Di +, +Peng Gao +, +Pengfei Li +, +Pengyu Zhao +, +Qibing Ren +, +Qidi Xu +, +Qile Li +, +Qin Wang +, +Rong Tian +, +Ruitao Leng +, +Shaoxiang Chen +, +Shaoyu Chen +, +Shengmin Shi +, +Shitong Weng +, +Shuchang Guan +, +Shuqi Yu +, +Sichen Li +, +Songquan Zhu +, +Tengfei Li +, +Tianchi Cai +, +Tianrun Liang +, +Weiyu Cheng +, +Weize Kong +, +Wenkai Li +, +Xiancai Chen +, +Xiangjun Song +, +Xiao Luo +, +Xiao Su +, +Xiaobo Li +, +Xiaodong Han +, +Xinzhu Hou +, +Xuan Lu +, +Xun Zou +, +Xuyang Shen +, +Yan Gong +, +Yan Ma +, +Yang Wang +, +Yiqi Shi +, +Yiran Zhong +, +Yonghong Duan +, +Yongxiang Fu +, +Yongyi Hu +, +Yu Gao +, +Yuanxiang Fan +, +Yufeng Yang +, +Yuhao Li +, +Yulin Hu +, +Yunan Huang +, +Yunji Li +, +Yunzhi Xu +, +Yuxin Mao +, +Yuxuan Shi +, +Yuze Wenren +, +Zehan Li +, +Zelin Li +, +Zhanxu Tian +, +Zhengmao Zhu +, +Zhenhua Fan +, +Zhenzhen Wu +, +Zhichao Xu +, +Zhihang Yu +, +Zhiheng Lyu +, +Zhuo Jiang +, +Zibo Gao +, +Zijia Wu +, +Zijian Song +, +Zijun Sun +et al. 
(27 additional authors not shown) +You must enable JavaScript to view entire author list. +View a PDF of the paper titled MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention, by MiniMax: Aili Chen and 125 other authors +View PDF +HTML (experimental) +Abstract: +We introduce MiniMax-M1, the world's first open-weight, large-scale hybrid-attention reasoning model. MiniMax-M1 is powered by a hybrid Mixture-of-Experts (MoE) architecture combined with a lightning attention mechanism. The model is developed based on our previous MiniMax-Text-01 model, which contains a total of 456 billion parameters with 45.9 billion parameters activated per token. The M1 model natively supports a context length of 1 million tokens, 8x the context size of DeepSeek R1. Furthermore, the lightning attention mechanism in MiniMax-M1 enables efficient scaling of test-time compute. These properties make M1 particularly suitable for complex tasks that require processing long inputs and thinking extensively. MiniMax-M1 is trained using large-scale reinforcement learning (RL) on diverse problems including sandbox-based, real-world software engineering environments. In addition to M1's inherent efficiency advantage for RL training, we propose CISPO, a novel RL algorithm to further enhance RL efficiency. CISPO clips importance sampling weights rather than token updates, outperforming other competitive RL variants. Combining hybrid-attention and CISPO enables MiniMax-M1's full RL training on 512 H800 GPUs to complete in only three weeks, with a rental cost of just $534,700. We release two versions of MiniMax-M1 models with 40K and 80K thinking budgets respectively, where the 40K model represents an intermediate phase of the 80K training. 
Experiments on standard benchmarks show that our models are comparable or superior to strong open-weight models such as the original DeepSeek-R1 and Qwen3-235B, with particular strengths in complex software engineering, tool utilization, and long-context tasks. We publicly release MiniMax-M1 at +this https URL +. +Comments: +A technical report from MiniMax. The authors are listed in alphabetical order. We open-source our MiniMax-M1 at +this https URL +Subjects: +Computation and Language (cs.CL) +; Machine Learning (cs.LG) +Cite as: +arXiv:2506.13585 +[cs.CL] +(or +arXiv:2506.13585v1 +[cs.CL] +for this version) +https://doi.org/10.48550/arXiv.2506.13585 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Junteng Liu [ +view email +] +[v1] +Mon, 16 Jun 2025 15:08:02 UTC (1,207 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention, by MiniMax: Aili Chen and 125 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.CL +< prev +| +next > +new +| +recent +| +2025-06 +Change to browse by: +cs +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? 
+) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250621263-dilocox-a-low-communication-large-scale-training-framework-for-decentr.md b/research/notes/250621263-dilocox-a-low-communication-large-scale-training-framework-for-decentr.md new file mode 100644 index 0000000000000000000000000000000000000000..a7109dbe08d66cfe05e501b2a36a5fcf19e254ae --- /dev/null +++ b/research/notes/250621263-dilocox-a-low-communication-large-scale-training-framework-for-decentr.md @@ -0,0 +1,204 @@ +--- +title: '[2506.21263] DiLoCoX: A Low-Communication Large-Scale Training Framework for + Decentralized Cluster' +id: 250621263-dilocox-a-low-communication-large-scale-training-framework-for-decentr +tags: +- deepread +created: '2026-06-10T00:35:45.127464Z' +source: https://arxiv.org/abs/2506.21263 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:35:45.127344Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2506.21263] DiLoCoX: A Low-Communication Large-Scale Training Framework for Decentralized Cluster +Computer Science > Machine Learning +arXiv:2506.21263 +(cs) +[Submitted on 26 Jun 2025] +Title: +DiLoCoX: A Low-Communication Large-Scale Training Framework for Decentralized Cluster +Authors: +Ji Qi +, +WenPeng Zhu +, +Li Li +, +Ming Wu +, +YingJun Wu +, +Wu He +, +Xun Gao +, +Jason Zeng +, +Michael Heinrich +View a PDF of the paper titled DiLoCoX: A Low-Communication Large-Scale Training Framework for Decentralized Cluster, by Ji Qi and 8 other authors +View PDF +HTML (experimental) +Abstract: +The distributed training of foundation models, particularly large language models (LLMs), demands a high level of communication. Consequently, it is highly dependent on a centralized cluster with fast and reliable interconnects. Can we conduct training on slow networks and thereby unleash the power of decentralized clusters when dealing with models exceeding 100 billion parameters? 
In this paper, we propose DiLoCoX, a low-communication large-scale decentralized cluster training framework. It combines Pipeline Parallelism with Dual Optimizer Policy, One-Step-Delay Overlap of Communication and Local Training, and an Adaptive Gradient Compression Scheme. This combination significantly improves the scale of parameters and the speed of model pre-training. We justify the benefits of one-step-delay overlap of communication and local training, as well as the adaptive gradient compression scheme, through a theoretical analysis of convergence. Empirically, we demonstrate that DiLoCoX is capable of pre-training a 107B foundation model over a 1Gbps network. Compared to vanilla AllReduce, DiLoCoX can achieve a 357x speedup in distributed training while maintaining negligible degradation in model convergence. To the best of our knowledge, this is the first decentralized training framework successfully applied to models with over 100 billion parameters. +Subjects: +Machine Learning (cs.LG) +; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) +Cite as: +arXiv:2506.21263 +[cs.LG] +(or +arXiv:2506.21263v1 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2506.21263 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Wu He [ +view email +] +[v1] +Thu, 26 Jun 2025 13:45:04 UTC (320 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled DiLoCoX: A Low-Communication Large-Scale Training Framework for Decentralized Cluster, by Ji Qi and 8 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2025-06 +Change to browse by: +cs +cs.AI +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... 
+Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/250718071-group-sequence-policy-optimization.md b/research/notes/250718071-group-sequence-policy-optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..dfe10a119fc94aff88a182cb8a3c4a9145494635 --- /dev/null +++ b/research/notes/250718071-group-sequence-policy-optimization.md @@ -0,0 +1,213 @@ +--- +title: '[2507.18071] Group Sequence Policy Optimization' +id: 250718071-group-sequence-policy-optimization +tags: +- deepread +created: '2026-06-09T23:21:02.328678Z' +source: https://arxiv.org/abs/2507.18071 +source_domain: arxiv.org +fetched_at: '2026-06-09T23:21:02.328533Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2507.18071] Group Sequence Policy Optimization +Computer Science > Machine Learning +arXiv:2507.18071 +(cs) +[Submitted on 24 Jul 2025 ( +v1 +), last revised 28 Jul 2025 (this version, v2)] +Title: +Group Sequence Policy Optimization +Authors: +Chujie Zheng +, +Shixuan Liu +, +Mingze Li +, +Xiong-Hui Chen +, +Bowen Yu +, +Chang Gao +, +Kai Dang +, +Yuqiong Liu +, +Rui Men +, +An Yang +, +Jingren Zhou +, +Junyang Lin +View a PDF of the paper titled Group Sequence Policy Optimization, by Chujie Zheng and 11 other authors +View PDF +HTML (experimental) +Abstract: +This paper introduces Group Sequence Policy Optimization (GSPO), our stable, efficient, and performant reinforcement learning algorithm for training large language models. Unlike previous algorithms that adopt token-level importance ratios, GSPO defines the importance ratio based on sequence likelihood and performs sequence-level clipping, rewarding, and optimization. We demonstrate that GSPO achieves superior training efficiency and performance compared to the GRPO algorithm, notably stabilizes Mixture-of-Experts (MoE) RL training, and has the potential for simplifying the design of RL infrastructure. 
These merits of GSPO have contributed to the remarkable improvements in the latest Qwen3 models. +Subjects: +Machine Learning (cs.LG) +; Artificial Intelligence (cs.AI); Computation and Language (cs.CL) +Cite as: +arXiv:2507.18071 +[cs.LG] +(or +arXiv:2507.18071v2 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2507.18071 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Chujie Zheng [ +view email +] +[v1] +Thu, 24 Jul 2025 03:50:32 UTC (259 KB) +[v2] +Mon, 28 Jul 2025 11:11:33 UTC (259 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Group Sequence Policy Optimization, by Chujie Zheng and 11 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2025-07 +Change to browse by: +cs +cs.AI +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? 
+) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/251221852-a-comedy-of-estimators-on-kl-regularization-in-rl-training-of-llms.md b/research/notes/251221852-a-comedy-of-estimators-on-kl-regularization-in-rl-training-of-llms.md new file mode 100644 index 0000000000000000000000000000000000000000..43b0acf1f2acb1c0baf7b9ae884163cd87fafd3f --- /dev/null +++ b/research/notes/251221852-a-comedy-of-estimators-on-kl-regularization-in-rl-training-of-llms.md @@ -0,0 +1,215 @@ +--- +title: '[2512.21852] A Comedy of Estimators: On KL Regularization in RL Training of + LLMs' +id: 251221852-a-comedy-of-estimators-on-kl-regularization-in-rl-training-of-llms +tags: +- deepread +created: '2026-06-09T23:21:27.833089Z' +source: https://arxiv.org/abs/2512.21852 +source_domain: arxiv.org +fetched_at: '2026-06-09T23:21:27.832936Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2512.21852] A Comedy of Estimators: On KL Regularization in RL Training of LLMs +Computer Science > Machine Learning 
+arXiv:2512.21852 +(cs) +[Submitted on 26 Dec 2025 ( +v1 +), last revised 18 Mar 2026 (this version, v3)] +Title: +A Comedy of Estimators: On KL Regularization in RL Training of LLMs +Authors: +Vedant Shah +, +Johan Obando-Ceron +, +Vineet Jain +, +Brian Bartoldson +, +Bhavya Kailkhura +, +Sarthak Mittal +, +Glen Berseth +, +Pablo Samuel Castro +, +Yoshua Bengio +, +Nikolay Malkin +, +Moksh Jain +, +Siddarth Venkatraman +, +Aaron Courville +View a PDF of the paper titled A Comedy of Estimators: On KL Regularization in RL Training of LLMs, by Vedant Shah and 12 other authors +View PDF +Abstract: +The reasoning performance of large language models (LLMs) can be substantially improved by training them with reinforcement learning (RL). The RL objective for LLM training involves a regularization term, which is the reverse Kullback-Leibler (KL) divergence between the trained policy and the reference policy. Since computing the KL divergence exactly is intractable, various estimators are used in practice to estimate it from on-policy samples. Despite its wide adoption, including in several open-source libraries, there is no systematic study analyzing the numerous ways of incorporating KL estimators in the objective and their effect on the downstream performance of RL-trained models. Recent works show that prevailing practices for incorporating KL regularization do not provide correct gradients for stated objectives, creating a discrepancy between the objective and its implementation. In this paper, we further analyze these practices and study the gradients of several estimators configurations, revealing how design choices shape gradient bias. We substantiate these findings with empirical observations by RL fine-tuning `Qwen2.5-7B`, `Llama-3.1-8B-Instruct` and `Qwen3-4B-Instruct-2507` with different configurations and evaluating their performance on both in- and out-of-distribution tasks.
Through our analysis, we observe that, in on-policy settings: (1) estimator configurations with biased gradients can result in training instabilities; and (2) using estimator configurations resulting in unbiased gradients leads to better performance on in-domain as well as out-of-domain tasks. We also investigate the performance resulting from different KL configurations in off-policy settings and observe that KL regularization can help stabilize off-policy RL training resulting from asynchronous setups. +Subjects: +Machine Learning (cs.LG) +; Artificial Intelligence (cs.AI) +Cite as: +arXiv:2512.21852 +[cs.LG] +(or +arXiv:2512.21852v3 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2512.21852 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Johan Obando-Ceron [ +view email +] +[v1] +Fri, 26 Dec 2025 04:20:58 UTC (209 KB) +[v2] +Tue, 6 Jan 2026 15:07:53 UTC (209 KB) +[v3] +Wed, 18 Mar 2026 00:41:09 UTC (210 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled A Comedy of Estimators: On KL Regularization in RL Training of LLMs, by Vedant Shah and 12 other authors +View PDF +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2025-12 +Change to browse by: +cs +cs.AI +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? 
+) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/260118734-self-distilled-reasoner-on-policy-self-distillation-for-large-language.md b/research/notes/260118734-self-distilled-reasoner-on-policy-self-distillation-for-large-language.md new file mode 100644 index 0000000000000000000000000000000000000000..8a17774bb0c9f7af60645329fcd662f8d7b0325c --- /dev/null +++ b/research/notes/260118734-self-distilled-reasoner-on-policy-self-distillation-for-large-language.md @@ -0,0 +1,210 @@ +--- +title: '[2601.18734] Self-Distilled Reasoner: On-Policy Self-Distillation for Large + Language Models' +id: 260118734-self-distilled-reasoner-on-policy-self-distillation-for-large-language +tags: +- deepread +created: '2026-06-09T23:55:21.280271Z' +source: https://arxiv.org/abs/2601.18734 +source_domain: arxiv.org +fetched_at: '2026-06-09T23:55:21.280139Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2601.18734] Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models +Computer Science > Machine Learning +arXiv:2601.18734 +(cs) +[Submitted on 26 Jan 2026 ( +v1 +), last revised 20 Mar 2026 (this version, v3)] +Title: +Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models +Authors: +Siyan Zhao +, +Zhihui Xie +, +Mengchen Liu +, +Jing Huang +, +Guan Pang +, +Feiyu Chen +, +Aditya Grover +View a PDF of the paper titled Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models, by Siyan Zhao and 6 other authors +View PDF +HTML (experimental) +Abstract: +Knowledge distillation improves large language model (LLM) reasoning by compressing the knowledge of a teacher LLM to train smaller LLMs. 
On-policy distillation advances this approach by having the student sample its own trajectories while a teacher LLM provides dense token-level supervision, addressing the distribution mismatch between training and inference in off-policy distillation methods. However, on-policy distillation typically requires a separate, often larger, teacher LLM and does not explicitly leverage ground-truth solutions available in reasoning datasets. Inspired by the intuition that a sufficiently capable LLM can rationalize external privileged reasoning traces and teach its weaker self, we introduce On-Policy Self-Distillation (OPSD), a learning algorithm where a single LLM acts as both teacher and student with different contexts. The teacher policy conditions on privileged information (e.g., verified reasoning traces) while the student policy sees only the question; training minimizes the per-token divergence between these distributions over the student's own rollouts. We demonstrate the efficacy of our method on multiple mathematical reasoning benchmarks, achieving superior token efficiency compared to reinforcement learning methods and better performance over off-policy distillation methods. Code repo: +this https URL +. 
+Comments: +code is released here: +this https URL +Subjects: +Machine Learning (cs.LG) +; Computation and Language (cs.CL) +Cite as: +arXiv:2601.18734 +[cs.LG] +(or +arXiv:2601.18734v3 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2601.18734 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Siyan Zhao [ +view email +] +[v1] +Mon, 26 Jan 2026 17:56:50 UTC (265 KB) +[v2] +Thu, 5 Mar 2026 18:19:57 UTC (282 KB) +[v3] +Fri, 20 Mar 2026 15:40:19 UTC (268 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models, by Siyan Zhao and 6 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2026-01 +Change to browse by: +cs +cs.CL +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? 
+) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/260120802-reinforcement-learning-via-self-distillation.md b/research/notes/260120802-reinforcement-learning-via-self-distillation.md new file mode 100644 index 0000000000000000000000000000000000000000..47ce2afec19a82bb4e8ab691d304b9160d0a21c9 --- /dev/null +++ b/research/notes/260120802-reinforcement-learning-via-self-distillation.md @@ -0,0 +1,215 @@ +--- +title: '[2601.20802] Reinforcement Learning via Self-Distillation' +id: 260120802-reinforcement-learning-via-self-distillation +tags: +- deepread +created: '2026-06-09T23:54:49.976930Z' +source: https://arxiv.org/abs/2601.20802 +source_domain: arxiv.org +fetched_at: '2026-06-09T23:54:49.976791Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2601.20802] Reinforcement Learning via Self-Distillation +Computer Science > Machine Learning +arXiv:2601.20802 +(cs) +[Submitted on 28 Jan 2026 ( +v1 +), last revised 16 Feb 2026 (this version, v2)] +Title: +Reinforcement Learning via 
Self-Distillation +Authors: +Jonas Hübotter +, +Frederike Lübeck +, +Lejs Behric +, +Anton Baumann +, +Marco Bagatella +, +Daniel Marta +, +Ido Hakimi +, +Idan Shenfeld +, +Thomas Kleine Buening +, +Carlos Guestrin +, +Andreas Krause +View a PDF of the paper titled Reinforcement Learning via Self-Distillation, by Jonas Hübotter and 10 other authors +View PDF +HTML (experimental) +Abstract: +Large language models are increasingly post-trained with reinforcement learning in verifiable domains such as code and math. Yet, current methods for reinforcement learning with verifiable rewards (RLVR) learn only from a scalar outcome reward per attempt, creating a severe credit-assignment bottleneck. Many verifiable environments actually provide rich textual feedback, such as runtime errors or judge evaluations, that explain why an attempt failed. We formalize this setting as reinforcement learning with rich feedback and introduce Self-Distillation Policy Optimization (SDPO), which converts tokenized feedback into a dense learning signal without any external teacher or explicit reward model. SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy. In this way, SDPO leverages the model's ability to retrospectively identify its own mistakes in-context. Across scientific reasoning, tool use, and competitive programming on LiveCodeBench v6, SDPO improves sample efficiency and final accuracy over strong RLVR baselines. Notably, SDPO also outperforms baselines in standard RLVR environments that only return scalar feedback by using successful rollouts as implicit feedback for failed attempts. Finally, applying SDPO to individual questions at test time accelerates discovery on difficult binary-reward tasks, achieving the same discovery probability as best-of-k sampling or multi-turn conversations with 3x fewer attempts.
+Subjects: +Machine Learning (cs.LG) +; Artificial Intelligence (cs.AI) +Cite as: +arXiv:2601.20802 +[cs.LG] +(or +arXiv:2601.20802v2 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2601.20802 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Jonas Hübotter [ +view email +] +[v1] +Wed, 28 Jan 2026 17:45:12 UTC (1,009 KB) +[v2] +Mon, 16 Feb 2026 14:49:34 UTC (2,122 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Reinforcement Learning via Self-Distillation, by Jonas Hübotter and 10 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2026-01 +Change to browse by: +cs +cs.AI +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +Links to Code Toggle +Papers with Code +( +What is Papers with Code? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI?
+) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/260324477-composer-2-technical-report.md b/research/notes/260324477-composer-2-technical-report.md new file mode 100644 index 0000000000000000000000000000000000000000..bcdcb9dc360b5126c19fa99c24e5c178e06b08ab --- /dev/null +++ b/research/notes/260324477-composer-2-technical-report.md @@ -0,0 +1,293 @@ +--- +title: '[2603.24477] Composer 2 Technical Report' +id: 260324477-composer-2-technical-report +tags: +- deepread +created: '2026-06-10T00:23:01.085397Z' +source: https://arxiv.org/abs/2603.24477 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:23:01.085279Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2603.24477] Composer 2 Technical Report +Computer Science > Software Engineering +arXiv:2603.24477 +(cs) +[Submitted on 25 Mar 2026 ( +v1 +), last revised 26 Mar 2026 (this version, v2)] +Title: +Composer 2 Technical Report +Authors: +Cursor Research +: +Aaron Chan +, +Ahmed Shalaby +, +Alexander Wettig +, +Aman Sanger 
+, +Andrew Zhai +, +Anurag Ajay +, +Ashvin Nair +, +Charlie Snell +, +Chen Lu +, +Chen Shen +, +Emily Jia +, +Federico Cassano +, +Hanpeng Liu +, +Haoyu Chen +, +Henry Wildermuth +, +Jacob Jackson +, +Janet Li +, +Jediah Katz +, +Jiajun Yao +, +Joey Hejna +, +Josh Warner +, +Julius Vering +, +Kevin Frans +, +Lee Danilek +, +Less Wright +, +Lujing Cen +, +Luke Melas-Kyriazi +, +Michael Truell +, +Michiel de Jong +, +Naman Jain +, +Nate Schmidt +, +Nathan Wang +, +Niklas Muennighoff +, +Oleg Rybkin +, +Paul Loh +, +Phillip Kravtsov +, +Rishabh Yadav +, +Sahil Shah +, +Sam Kottler +, +Alexander M Rush +, +Shengtong Zhang +, +Shomil Jain +, +Sriram Sankar +, +Stefan Heule +, +Stuart H. Sul +, +Sualeh Asif +, +Victor Rong +, +Wanqi Zhu +, +William Lin +, +Yuchen Wu +, +Yuri Volkov +, +Yury Zemlyanskiy +, +Zack Holbrook +, +Zhiyuan Zhang +View a PDF of the paper titled Composer 2 Technical Report, by Cursor Research: Aaron Chan and 53 other authors +View PDF +HTML (experimental) +Abstract: +Composer 2 is a specialized model designed for agentic software engineering. The model demonstrates strong long-term planning and coding intelligence while maintaining the ability to efficiently solve problems for interactive use. The model is trained in two phases: first, continued pretraining to improve the model's knowledge and latent coding ability, followed by large-scale reinforcement learning to improve end-to-end coding performance through stronger reasoning, accurate multi-step execution, and coherence on long-horizon realistic coding problems. We develop infrastructure to support training in the same Cursor harness that is used by the deployed model, with equivalent tools and structure, and use environments that match real problems closely. To measure the ability of the model on increasingly difficult tasks, we introduce a benchmark derived from real software engineering problems in large codebases including our own. 
Composer 2 is a frontier-level coding model and demonstrates a process for training strong domain-specialized models. On our CursorBench evaluations the model achieves a major improvement in accuracy compared to previous Composer models (61.3). On public benchmarks the model scores 61.7 on Terminal-Bench and 73.7 on SWE-bench Multilingual in our harness, comparable to state-of-the-art systems. +Subjects: +Software Engineering (cs.SE) +; Machine Learning (cs.LG) +Cite as: +arXiv:2603.24477 +[cs.SE] +(or +arXiv:2603.24477v2 +[cs.SE] +for this version) +https://doi.org/10.48550/arXiv.2603.24477 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Alexander M. Rush [ +view email +] +[v1] +Wed, 25 Mar 2026 16:18:37 UTC (1,616 KB) +[v2] +Thu, 26 Mar 2026 01:57:05 UTC (1,605 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Composer 2 Technical Report, by Cursor Research: Aaron Chan and 53 other authors +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.SE +< prev +| +next > +new +| +recent +| +2026-03 +Change to browse by: +cs +cs.LG +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? 
+) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? 
+) \ No newline at end of file diff --git a/research/notes/260505029-the-predictive-causal-gap-an-impossibility-theorem-and-large-scale-neu.md b/research/notes/260505029-the-predictive-causal-gap-an-impossibility-theorem-and-large-scale-neu.md new file mode 100644 index 0000000000000000000000000000000000000000..d8f12f6c4b436fde9a6712c939a8807b3bd135a2 --- /dev/null +++ b/research/notes/260505029-the-predictive-causal-gap-an-impossibility-theorem-and-large-scale-neu.md @@ -0,0 +1,190 @@ +--- +title: '[2605.05029] The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale + Neural Evidence' +id: 260505029-the-predictive-causal-gap-an-impossibility-theorem-and-large-scale-neu +tags: +- deepread +created: '2026-06-10T00:31:38.772752Z' +source: https://arxiv.org/abs/2605.05029 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:31:38.772451Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2605.05029] The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale Neural Evidence +Computer Science > Machine Learning +arXiv:2605.05029 +(cs) +[Submitted on 6 May 2026] +Title: +The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale Neural Evidence +Authors: +Kejun Liu +View a PDF of the paper titled The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale Neural Evidence, by Kejun Liu +View PDF +HTML (experimental) +Abstract: +We report a systematic failure mode in predictive representation learning. Across 2695 neural network configurations trained to predict linear-Gaussian dynamics, the optimal encoder tracks the environment rather than the system it is meant to model. The mean causal fidelity -- the fraction of encoder sensitivity allocated to system degrees of freedom -- is 0.49, and only 2.5% of configurations exceed 0.70. 
The failure intensifies with dimension: at N=100, the optimal encoder becomes causally blind (fidelity ~10^{-8}) while achieving 92% lower prediction error than the causal representation. We prove this is not an optimization artifact but a structural property of the predictive objective: when environment modes are slower or less noisy than system modes, every minimizer of the population risk encodes the former. The set of dynamics exhibiting this predictive-causal gap is open and of positive measure in parameter space. In a nonlinear Duffing-GRU sweep, unconstrained predictors learn environment-dominant representations in 55% of tasks (95% CI 41--68%) versus 24% under operational grounding (p=2.3e-3); the median out-of-distribution MSE inflation under environment shift is 1.82x versus 1.00x. Operational grounding -- restricting the loss to system observables -- partially suppresses the gap, but causal fidelity is never recovered without an explicit system-environment boundary. The results identify the predictive-causal gap as a structural limit of learning, with implications for self-supervised representation learning, world models, and the scaling paradigm. +Comments: +15 pages, 5 figures, 3 tables. 
Supplemental Material included (Sections S1-S10) +Subjects: +Machine Learning (cs.LG) +MSC classes: 68T07, 62M45, 37M10, 81S22 +Cite as: +arXiv:2605.05029 +[cs.LG] +(or +arXiv:2605.05029v1 +[cs.LG] +for this version) +https://doi.org/10.48550/arXiv.2605.05029 +Focus to learn more +arXiv-issued DOI via DataCite +Submission history +From: Kejun Liu [ +view email +] +[v1] +Wed, 6 May 2026 15:25:37 UTC (138 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale Neural Evidence, by Kejun Liu +View PDF +HTML (experimental) +TeX Source +view license +Current browse context: +cs.LG +< prev +| +next > +new +| +recent +| +2026-05 +Change to browse by: +cs +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? 
+) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? +) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +IArxiv recommender toggle +IArxiv Recommender +( +What is IArxiv? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/260607412-socratic-swe-self-evolving-coding-agents-via-trace-derived-agent-skill-2.md b/research/notes/260607412-socratic-swe-self-evolving-coding-agents-via-trace-derived-agent-skill-2.md new file mode 100644 index 0000000000000000000000000000000000000000..1df86bea99a7f8d79f3b8d2cb5e0a7fa94333288 --- /dev/null +++ b/research/notes/260607412-socratic-swe-self-evolving-coding-agents-via-trace-derived-agent-skill-2.md @@ -0,0 +1,196 @@ +--- +title: '[2606.07412] Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent + Skills' +id: 260607412-socratic-swe-self-evolving-coding-agents-via-trace-derived-agent-skill-2 +tags: +- deepread +created: '2026-06-10T00:41:23.035686Z' +source: https://ar5iv.labs.arxiv.org/html/2606.07412 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:41:23.035515Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +[2606.07412] Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent 
Skills +Computer Science > Software Engineering +arXiv:2606.07412 +(cs) +[Submitted on 5 Jun 2026] +Title: +Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent Skills +Authors: +Chuan Xiao +, +Zhengbo Jiao +, +Shaobo Wang +, +Wei Wang +, +Bing Zhao +, +Hu Wei +, +Linfeng Zhang +, +Lin Qu +View a PDF of the paper titled Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent Skills, by Chuan Xiao and 7 other authors +View PDF +Abstract: +LLM-driven software engineering agents have become a central testbed for real-world language-model capability, yet their training remains limited by the availability of high-quality SWE tasks. Existing synthetic data methods typically create tasks through fixed mutation or bug-injection procedures, making the resulting distributions largely independent of the agent's own weaknesses and training progress. We introduce Socratic-SWE, a closed-loop self-evolution framework that reuses the agent's historical solving traces as a source of training signal. Rather than treating traces only as evidence for reward computation, Socratic-SWE distills them into structured agent skills that summarize recurring failures and effective repair patterns. These skills then guide the generation of targeted repair tasks in real repositories. Candidate tasks are checked through execution-based validation and scored with a solver-gradient alignment reward, so that the retained tasks are both verifiable and useful for improving the Solver. The updated Solver produces new traces, enabling the task curriculum to adapt over successive rounds. Across SWE-bench Verified, SWE-bench Lite, SWE-bench Pro, and Terminal-Bench 2.0, Socratic-SWE consistently improves over self-evolving baselines under the same compute budget, reaching 50.40% on SWE-bench Verified after three iterations. These results suggest that solving traces can serve as a scalable substrate for self-evolving SWE agents. +Comments: +21 pages, 5 figures. 
Under review +Subjects: +Software Engineering (cs.SE) +; Artificial Intelligence (cs.AI) +Cite as: +arXiv:2606.07412 +[cs.SE] +(or +arXiv:2606.07412v1 +[cs.SE] +for this version) +https://doi.org/10.48550/arXiv.2606.07412 +Focus to learn more +arXiv-issued DOI via DataCite (pending registration) +Submission history +From: Zhengbo Jiao [ +view email +] +[v1] +Fri, 5 Jun 2026 16:00:17 UTC (755 KB) +Full-text links: +Access Paper: +View a PDF of the paper titled Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent Skills, by Chuan Xiao and 7 other authors +View PDF +TeX Source +view license +Current browse context: +cs.SE +< prev +| +next > +new +| +recent +| +2026-06 +Change to browse by: +cs +cs.AI +References & Citations +NASA ADS +Google Scholar +Semantic Scholar +export BibTeX citation +Loading... +BibTeX formatted citation +× +loading... +Data provided by: +Bookmark +Bibliographic Tools +Bibliographic and Citation Tools +Bibliographic Explorer Toggle +Bibliographic Explorer +( +What is the Explorer? +) +Connected Papers Toggle +Connected Papers +( +What is Connected Papers? +) +Litmaps Toggle +Litmaps +( +What is Litmaps? +) +scite.ai Toggle +scite Smart Citations +( +What are Smart Citations? +) +Code, Data, Media +Code, Data and Media Associated with this Article +alphaXiv Toggle +alphaXiv +( +What is alphaXiv? +) +Links to Code Toggle +CatalyzeX Code Finder for Papers +( +What is CatalyzeX? +) +DagsHub Toggle +DagsHub +( +What is DagsHub? +) +GotitPub Toggle +Gotit.pub +( +What is GotitPub? +) +Huggingface Toggle +Hugging Face +( +What is Huggingface? +) +ScienceCast Toggle +ScienceCast +( +What is ScienceCast? +) +Demos +Demos +Replicate Toggle +Replicate +( +What is Replicate? +) +Spaces Toggle +Hugging Face Spaces +( +What is Spaces? +) +Spaces Toggle +TXYZ.AI +( +What is TXYZ.AI? +) +Related Papers +Recommenders and Search Tools +Link to Influence Flower +Influence Flower +( +What are Influence Flowers? 
+) +Core recommender toggle +CORE Recommender +( +What is CORE? +) +Author +Venue +Institution +Topic +About arXivLabs +arXivLabs: experimental projects with community collaborators +arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website. +Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them. +Have an idea for a project that will add value for arXiv's community? +Learn more about arXivLabs +. +Which authors of this paper are endorsers? +| +Disable MathJax +( +What is MathJax? +) \ No newline at end of file diff --git a/research/notes/bugs-scaling-data-for-software-engineering-agents.md b/research/notes/bugs-scaling-data-for-software-engineering-agents.md new file mode 100644 index 0000000000000000000000000000000000000000..d2a2e43ad1199bbf7b28c64633fc8ce209677183 --- /dev/null +++ b/research/notes/bugs-scaling-data-for-software-engineering-agents.md @@ -0,0 +1,8283 @@ +--- +title: 'SWE-smith: Scaling Data for Software Engineering Agents' +id: bugs-scaling-data-for-software-engineering-agents +tags: +- deepread +created: '2026-06-10T00:23:06.342374Z' +source: https://arxiv.org/html/2504.21798 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:23:06.342106Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +SWE-smith: Scaling Data for Software Engineering Agents +SWE-smith: Scaling Data for Software Engineering Agents +John Yang +1 +, +Kilian Lieret +2 +, +Carlos E. 
Jimenez +2 +, +Alexander Wettig +2 +, +Kabir Khandpur +3 +, +\And +Yanzhe Zhang +1 +, +Binyuan Hui +4 +, +Ofir Press +2 +, +Ludwig Schmidt +1 +, +Diyi Yang +1 +\And +1 +Stanford University +2 +Princeton University +3 +Independent +4 +Alibaba Qwen +Abstract +Despite recent progress in Language Models (LMs) for software engineering, collecting training data remains a significant pain point. +Existing datasets are small, with at most 1,000s of training instances from 11 or fewer GitHub repositories. +The procedures to curate such datasets are often complex, necessitating hundreds of hours of human labor; companion execution environments also take up several terabytes of storage, severely limiting their scalability and usability. +To address this pain point, we introduce SWE-smith, a novel pipeline for generating software engineering training data at scale. +Given any Python codebase, SWE-smith constructs a corresponding execution environment, then automatically synthesizes 100s to 1,000s of task instances that break existing test(s) in the codebase. +Using SWE-smith, we create a dataset of 50k instances sourced from 128 GitHub repositories, an order of magnitude larger than all previous works. +We train SWE-agent-LM-32B, achieving 40.2% Pass@1 resolve rate on the SWE-bench Verified benchmark, state of the art among open source models. +We open source SWE-smith (collection procedure, task instances, trajectories, models) to lower the barrier of entry for research in LM systems for automated software engineering. +All assets available at https://swesmith.com. +1 +Introduction +Figure 1: Scaling task instances (left) and performance (right) for SWE-agents with SWE-smith. 
+Using SWE-smith, we can create 100s to 1000s of instances for any Python codebase, enabling us to train SWE-agent-LM-32B which achieves 40.2% on SWE-bench Verified. +Language Model (LM) agents, such as SWE-agent (Yang et al., 2024a) or OpenHands (Wang et al., 2024), have made remarkable progress towards automating software engineering (SE) tasks, as tracked by benchmarks such as SWE-bench (Jimenez et al., 2024b). +However, the most effective agents still rely on proprietary LMs, as building open source LMs for SE remains bottlenecked by the lack of large-scale, high-quality training data. +To ensure that open research remains relevant in this field, it is critical to develop infrastructure for collecting software engineering training data at scale. +The current open-source ecosystem offers two kinds of data sources to train LMs on SE tasks. +One simple approach is to crawl pull requests (PRs) and issues from GitHub repositories. +However, without execution environments or tests, these instances offer no reliable way of validating generated solutions, and LMs are limited to learning from the surface form of code (Xie et al., 2025a) or via rewards based on superficial string similarity (Wei et al., 2025). +In contrast, SWE-bench provides reliable validation by running unit tests against proposed solutions. +Another line of work has simply extended the SWE-bench collection strategy to a new set of repositories for training purposes (Pan et al., 2024). +This produces flexible environments for training and distilling LM agents, since we can generate agent trajectories and filter them based on the unit test results. +However, the scalability of this approach is severely limited by the challenges associated with SWE-bench’s collection strategy. 
+SWE-bench’s filtering process leaves only a small number of PRs that not only resolve a GitHub issue, but also make meaningful changes to unit tests. +Also, setting up execution environments for each instance requires a substantial amount of human intervention. +In this paper, we introduce the SWE-smith toolkit, which marries the flexible execution environments of SWE-bench with scalable instance collection (Figure 1). +SWE-smith features several techniques to automatically synthesize bugs in existing GitHub repositories, such as (1) generating errant rewrites of functions with an LM, (2) procedurally modifying the abstract syntax tree (AST) of functions, (3) undoing PRs, and (4) combining bugs. +Our key insight is that execution-based validation can not only validate proposed solutions, but also identify bug candidates which cause substantial software regression (i.e., break tests). +Figure 2: SWE-smith creates training data for software engineering agents by crafting bugs into real codebases. +Given a codebase, we employ several strategies to create task instances that break existing tests. +Using SWE-smith, we create 50k+ task instances with execution environments from 128 real world repositories. +In a nutshell, SWE-smith puts forth the following task creation workflow, as shown in Figure 2. +Given a codebase, we automatically set up a corresponding environment using SWE-agent (Yang et al., 2024a). +Within this environment, we then use the aforementioned techniques to synthesize 100s to 1,000s of task instances. +Finally, we craft realistic issue descriptions automatically with LMs. +SWE-smith’s design significantly reduces the amount of human labor and storage required for constructing execution environments. +Using SWE-smith, we create a dataset of 50k task instances across 128 real-world GitHub repositories. 
+Using the SWE-smith dataset, we achieve a new open-weight state of the art result on SWE-bench Verified. +Using the SWE-smith task instances, we generate 5,016 expert trajectories with Claude 3.7 Sonnet and fine-tune Qwen 2.5 Coder Instruct 32B. +The resulting LM, SWE-agent-LM-32B, achieves 40.2% (+33.4%) on SWE-bench Verified in a single attempt, without inference-time scaling. +This sets a new state of the art for open-weight models. +The scale and diversity of the SWE-smith dataset enables us to begin establishing truths and investigate interesting phenomena about developing SWE-agents. +Training on more instances, bug types, and repositories helps. +LM generated issue text approximates real ones effectively. +Using SWE-smith, we find that it’s possible to optimize LMs to perform well for specific repositories while only suffering minor generalization loss. +We release SWE-smith as an open-source toolkit — including instances, environments, and trajectories — to catalyze the development of stronger open-source LM agents. +2 +SWE-smith: Software Task Generation at Scale +The core principle of SWE-smith’s collection strategy is to define an execution environment first, and then synthesize task instances within the environment. +Conceptually, this is a simple inversion of SWE-bench’s approach, which instead prioritizes identifying task instances, and then attempts to build an environment for each. +In this section, we describe the procedure in detail and show how, in practice, SWE-smith scales significantly better in terms of repositories, task instances, and storage. +2.1 +Collection +Building execution environments for repositories with passing tests. +Given a repository, we run SWE-agent (Yang et al., 2024a) on the latest commit for at most 100 steps, instructing it to install the codebase and run the test suite. 
+We then manually verify the installation and testing instructions, check if more than 80% of existing tests pass, and finally create a Docker image for the repository. +We target repositories for the 5,000 most downloaded packages listed in the Python Package Index (PyPI) as of November 18, 2024, sort the PyPI packages by GitHub stars, and then remove any PyPI package with less than 1,000 stars, as well as all 12 SWE-bench test repositories from consideration. +More in §A.2. +Creating task instance candidates. +Per repository, we employ four different strategies to create candidates. +As shown in Figure 2, each strategy takes in a repository as input, then produces task instance candidates represented as .diff files. +Extensive details in §B. +• LM Generation: Per repository, we identify all programmatic entities (functions, classes), then take two approaches: (1) provide an LM with the function and prompt it to introduce errant modifications (henceforth referred to as “LM Modify”), and (2) given only the function header and docstring, ask the LM to rewrite it (“LM Rewrite”). +More in §B.1. +• Procedural Modification: Per function, we acquire an abstract syntax tree (AST) representation of the code, then randomly perform one or more transformations (e.g., remove a conditional/loop, change an operator, plus 11 more. See Table 8). +More in §B.2. +• Combine Bugs: LM generation and Procedural Modification task instances exclusively edit one function or class. +To create more complex tasks that require editing multiple portions of the codebase, we devise a “Patch Combination” strategy that creates a task instance by aggregating candidates from the same file(s) or module(s). +More in §B.3. +• Invert PRs (or “PR Mirror”): Per repository, we collect all PRs that modify Python files. 
+Per PR, we attempt to undo its revisions in the current version of the repository. +To achieve this, we provide an LM with the PR’s code changes (a .diff plaintext) and prompt it to rewrite each affected file such that the PR edits are reverted. +Unlike SWE-bench, we do not check out the PR’s base commit, as the install specifications determined in the previous step may not be compatible with older versions of the repo. +More in §B.4. +Execution-based validation of candidates. +We apply each candidate patch to the corresponding repository, run the test suite, and only keep patches that break one or more existing, passing tests (referred to as Fail-to-Pass or F2P test(s)). +For efficiency purposes, we also limit testing runtime to two minutes; bug candidates that cause test runtimes in excess of this time limit are discarded. +Minor additional details in §A.3. +Generating problem statements. +The issue text associated with a bug can significantly alter the difficulty and feasibility of the task instance. +Detailed descriptions of “expected” vs. “observed” behavior or bug-reproduction code in issue text heavily affect an agent’s capacity to localize bugs or iterate on proposed solutions. +We explore several techniques covered fully in §D, and ultimately settle on a simple strategy. +Per task instance, we provide an LM with the .diff patch, source code of a random F2P test, and execution output from running the repository’s test suite with the bug patch applied. +We prompt the LM for GitHub issue-style text that includes reproduction code based on the F2P test. +What human labor remains? +The steps requiring manual effort are (1) parsing the correct installation setup procedures from the agent trajectory (∼7 min per repository), and (2) implementing the parser for test outputs (∼1 min per repository). 
+Step two requires very little time because parsers can be reused for repositories with the same testing infrastructure (e.g., pytest). +SWE-smith removes the need for manual efforts to determine installation specifications for multiple versions of a codebase across time, the most costly step of SWE-bench collection. +Creating SWE-smith took one author ∼20 h of human labor. +2.2 +Features +We apply SWE-smith to 128 Python repositories, generating a total of 50k instances. +Table 1 captures the key statistics. +On average, we generate 381 task instances per repository, with as many as 2277 for pandas-dev/pandas. +We summarize the distribution of task instances per repository in Figure 1, where repositories are grouped into one of six general categories. +SWE-smith took $1360 to create ($1000 to generate bugs, $160 for automatic repository installation with SWE-agent, $200 to generate issues for 10K bugs). +Generating an issue costs 2.54¢ on average. +More dataset analyses in §C. +Figure: Distribution of instances per repo for 128 repos grouped into 6 categories. 
+Bug Type +Yield % +# Insts +Cost +F2P +Lines +Combine +96.9 +96.9 +96.9 +96.9 +% +10 +, +092 +10 +092 +10,092 +10 , 092 +0.00 +0.00 +0.00 +0.00 +¢ +15 +15 +15 +15 +11 +11 +11 +11 +LM Modify +56.0 +56.0 +56.0 +56.0 +% +17 +, +887 +17 +887 +17,887 +17 , 887 +0.38 +0.38 +0.38 +0.38 +¢ +4 +4 +4 +4 +3 +3 +3 +3 +LM Rewrite +35.0 +35.0 +35.0 +35.0 +% +4 +, +173 +4 +173 +4,173 +4 , 173 +3.93 +3.93 +3.93 +3.93 +¢ +4 +4 +4 +4 +24 +24 +24 +24 +PR Mirror +33.8 +33.8 +33.8 +33.8 +% +2 +, +344 +2 +344 +2,344 +2 , 344 +5.53 +5.53 +5.53 +5.53 +¢ +3 +3 +3 +3 +14 +14 +14 +14 +Procedural +40.2 +40.2 +40.2 +40.2 +% +15 +, +641 +15 +641 +15,641 +15 , 641 +0.00 +0.00 +0.00 +0.00 +¢ +7 +7 +7 +7 +5 +5 +5 +5 +Total +50.1 +50,137 +2.32¢ +6 +5 +Table 1: +Summary of +\bugs +statistics. +“Yield %” is the % of candidates generated by a strategy that break +1 ++ +limit-from +1 +1+ +1 + +tests. +“Cost” is the average cost to generate one candidate. +“F2P” (Fail to Pass tests), “Lines [Edited]” are median values. +Bug generation strategies vary in cost and yield rate. +Of methods relying on LMs, PR Mirrors are more expensive because the task entails rewriting entire files, as opposed to individual functions for LM Modify and LM Rewrite. +Yield rates are limited by either lack of test coverage for the change or because the bug candidate did not actually introduce relevant issues. +For example, for LM Rewrite, the LM is asked to re-implement the function; it is +not +explicitly asked for bugs. +When requested outright (LM Modify), the yield is higher. +Dataset +# Tasks +# Repos +Exec? +Source +Env. 
Size +R2E +(Jain et al., +2024 +) +0.25 +0.25 +0.25 +0.25 +k +137 +137 +137 +137 +Synth +270 +270 +270 +270 +GBs +R2E-gym (Subset) +(Jain et al., +2025 +) +4.6 +4.6 +4.6 +4.6 +k +10 +10 +10 +10 +Synth +4 +4 +4 +4 +TBs +SWE-bench-extra +(Badertdinov et al., +2024 +) +6.38 +6.38 +6.38 +6.38 +k +2 +2 +2 +2 +k +Real +- +SWE-bench-train +(Jimenez et al., +2024b +) +19 +19 +19 +19 +k +37 +37 +37 +37 +Real +- +SWE-fixer +(Xie et al., +2025a +) +115 +115 +115 +115 +k +856 +856 +856 +856 +Real +- +SWE-gym +(Pan et al., +2024 +) +2.4 +2.4 +2.4 +2.4 +k +11 +11 +11 +11 +Real +6 +6 +6 +6 +TBs +\bugs +50 +50 +50 +50 +k +128 +128 +128 +128 +Both +295 +295 +295 +295 +GBs +Table 2: +Comparison of open source training datasets for software engineering tasks. +Relative to existing datasets, +\bugs +has multiple times the number of task instances, repositories, and environments at a fraction of prior storage costs. +SWE-fixer and SWE-bench-train task instances do not have execution environments, so “Env. Size” is blank. +How difficult are +\bugs +task instances? +To determine whether task instances produced by +\bugs +are realistic and challenging, we train a Qwen +2.5 +2.5 +2.5 +2.5 +32 +32 +32 +32 +B model on +1 +, +699 +1 +699 +1{,}699 +1 , 699 +human-annotated (task, label) pairs from +Chowdhury et al. ( +2024 +) +to rate tasks as ( +easy +, +medium +, +hard +) by training. +To quantify difficulty, each difficulty label corresponds to values of +1 +1 +1 +1 +/ +5 +5 +5 +5 +/ +9 +9 +9 +9 +. +The model achieves +75.3 +75.3 +75.3 +75.3 +% test accuracy. +We then rate difficulty of task instances from both +\bugs +and prior SWE-bench style datasets +(Chowdhury et al., +2024 +; Jimenez et al., +2024b +; Pan et al., +2024 +; Yang et al., +2024b +) +. +\bugs +task instances span a broad range of difficulties, similar to SWE-bench and SWE-gym. 
+The average difficulty score for SWE-smith (5.27–5.72 across bug generation strategies) is comparable to SWE-bench (5.01) and SWE-gym (5.62). This suggests SWE-smith enables realistic and appropriately challenging evaluation.
+We discuss why bug strategies yield different levels of difficulty and visualize difficulty per dataset in §E.
+Scaling execution environments.
+Unlike SWE-bench, which creates a Docker image per task instance, SWE-smith leverages a simpler design where tasks from the same repository share the same environment, reducing storage overhead significantly, as shown in Table 2.
+This approach not only makes scaling task instances more affordable, but also renders SWE-smith more accessible and maintainable than existing datasets.
+We estimate that creating a similar quantity of task instances (50k) using SWE-bench would require 50 to 150 TBs of storage for environments, a 500x difference.
+Extended discussion in §C.1.
+3 Experiments
+To explore the utility of SWE-smith for training software engineering agents, we use rejection sampling fine-tuning (Yuan et al., 2023) as the primary procedure for improving a base LM with SWE-smith.
+Our experiment workflow is as follows.
+First, we curate a subset of SWE-smith task instances.
+Next, we run an agent system with an expert model on this subset.
+At this step, the trajectory corresponding to each run is recorded.
+Then, we fine-tune the base (or “student”) model on the trajectories corresponding to resolved instances.
+Finally, we evaluate the agent system run with the student model on a separate test split.
+Models.
+For expert models, we use claude-3-7-sonnet-20250219 (Anthropic, 2025).
+For fair comparisons with prior works (Pan et al., 2024), we also use claude-3-5-sonnet-20240620 and gpt-4o-2024-08-06.
+We use the Qwen-2.5-Coder-Instruct (Hui et al., 2024) 7B and 32B series as the base models.
+Training and hyperparameter details are in §F.1.
+Agent system.
+We use SWE-agent (Yang et al., 2024a), an agent system for solving GitHub issues.
+SWE-agent provides a base LM with an Agent Computer Interface (ACI) that enables more effective interactions with a codebase.
+At each turn, SWE-agent prompts an LM to generate a ReAct-style (Yao et al., 2023b) (thought, action) pair, where the action either edits a file or executes a shell command.
+We choose SWE-agent because, at the time of writing, SWE-agent with Claude 3.7 Sonnet is the top open source solution on SWE-bench.
+When generating trajectories with expert models, we run SWE-agent with a limit of at most 75 steps and a cost limit of $2.00.
+For inference of student models, we impose the same 75-step maximum and fix temperature at 0.0.
+Full configuration details are in §F.1.
+Evaluation metrics.
+We evaluate on the SWE-bench Lite and Verified (Chowdhury et al., 2024) subsets.
+SWE-bench evaluates AI systems on their ability to solve software issues from 12 real-world GitHub repositories.
+The Lite split is a subset of 300 instances, curated to be an easier evaluation set that’s less costly to run.
+The Verified split is a human-curated subset of 500 instances, selected for clearer problem statements and more reliable evaluation.
+To assess generalization beyond Python, we also evaluate on SWE-bench Multilingual, a new dataset introduced in this paper.
+SWE-bench Multilingual consists of 300 task instances that cover 9 additional programming languages.
+See §F.2 for more details.
+We report the % resolved metric, the proportion of successfully resolved instances.
+4 +Results +Table +3 +compares the performance of Qwen 2.5 Coder Instruct models (7B and 32B), fine-tuned on +5 +, +016 +5 +016 +5{,}016 +5 , 016 +\bugs +trajectories. +We refer to them as +SWE-agent-LM-7B +and +SWE-agent-LM-32B +; the latter achieves state-of-the-art performance. +Model +System +Train Size +Lite +Verified +Closed Weight Models +GPT-4o +(OpenAI, +2024a +) +Agentless +- +32.0 +32.0 +32.0 +32.0 +38.8 +38.8 +38.8 +38.8 +OpenHands +- +22.0 +22.0 +22.0 +22.0 +- +SWE-agent +- +18.3 +18.3 +18.3 +18.3 +23.0 +23.0 +23.0 +23.0 +Claude 3.5 Sonnet +(Anthropic, +2024 +) +Agentless +- +40.7 +40.7 +40.7 +40.7 +50.8 +50.8 +50.8 +50.8 +AutoCodeRover +- +- +46.2 +46.2 +46.2 +46.2 +OpenHands +- +41.7 +41.7 +41.7 +41.7 +53.0 +53.0 +53.0 +53.0 +SWE-agent +- +23.0 +23.0 +23.0 +23.0 +33.6 +33.6 +33.6 +33.6 +Claude 3.7 Sonnet +(Anthropic, +2025 +) +SWE-agent +- +48.0 +58.2 +Llama3-SWE-RL-70B +(Wei et al., +2025 +) +Agentless +11M +- +41.0 +41.0 +41.0 +41.0 +Open Weight Models +Lingma-SWE-GPT-72B +(Ma et al., +2024 +) +SWE-SynInfer +- +- +28.8 +28.8 +28.8 +28.8 +Qwen3-235B-A22B +(Qwen et al., +2025 +) +OpenHands +- +- +34.4 +34.4 +34.4 +34.4 +R2E-Gym-32B +(Jain et al., +2025 +) +OpenHands +3.3 +3.3 +3.3 +3.3 +k +- +34.4 +34.4 +34.4 +34.4 +SWE-fixer-72B +(Xie et al., +2025a +) +SWE-Fixer +110 +110 +110 +110 +k +24.7 +24.7 +24.7 +24.7 +32.8 +32.8 +32.8 +32.8 +SWE-gym-32B +(Pan et al., +2024 +) +OpenHands +491 +491 +491 +491 +15.3 +15.3 +15.3 +15.3 +20.6 +20.6 +20.6 +20.6 +SWE-agent-LM-7B +SWE-agent +2 +2 +2 +2 +k +11.7 +11.7 +11.7 +11.7 +15.2 +15.2 +15.2 +15.2 +SWE-agent-LM-32B +SWE-agent +5 +5 +5 +5 +k +30.7 +40.2 +Table 3: +Resolve rates for existing solutions on SWE-bench Lite and Verified, collected from +Jimenez et al. ( +2024a +) +, compared to models fine-tuned on +\bugs +. +All performance numbers are pass@ +1 +1 +1 +1 +. +We do +not +compare against systems that use verifiers or multiple attempts at test time. 
+The final dataset of 5,016 training points was curated as follows.
+We start by collecting a large pool of expert trajectories.
+First, we carried out each of the ablations in Section 4.1, giving us an initial set of 5,105 trajectories.
+Next, based on our observation that PR Mirror and LM Rewrite task instances yield the most effective expert trajectories (discussed below), we run the expert model on all task instances of these types, bumping up the total number to 6,457 trajectories.
+Ultimately, we attempt to generate expert trajectories for 8,686 unique task instances, or 17.3% of the SWE-smith dataset.
+Reinforcing the difficulty rating findings from Section 2.2, we observe that SWE-smith task instances are non-trivial for the top agent systems today.
+The final pool of 6,457 represents a 36% resolve rate of all 17,906 attempts to solve one of the 8,686 task instances.
+Next, we perform minor filtering of this collection.
+As reported in Pan et al. (2024), we also observe that “easier” trajectories — task instances that are repeatedly solved across multiple runs — degrade model performance.
+Therefore, we limit the number of times any SWE-smith task instance is represented in the training set to 3 trajectories.
+This leads to the final training set of 5,016 trajectories.
+More details in §F.3.
+Performance improves with more data points.
+Extending similar graphs from Jain et al. (2025) and Pan et al. (2024), Figure 1 shows increasing performance with more trajectories.
+Comparison at the same training set size.
+To compare with prior works (Jain et al., 2025; Pan et al., 2024), we run expert trajectory generation on 1000 random SWE-smith task instances with SWE-agent + Claude 3.5 Sonnet (800) or GPT-4o (200).
+We then fine-tune the 32B model on 500 successful trajectories, a training set size both works report on.
+Our model achieves a 28.2% resolve rate on SWE-bench Verified, a relative difference of +8.2% with Pan et al. (2024) and +0.7% with Jain et al. (2025).
+4.1 Ablations of SWE-smith
+We perform several ablations of how SWE-smith’s bug and problem statement generation strategies impact the quality of training data.
+We use Claude 3.7 Sonnet as the expert for fine-tuning Qwen 2.5 7B Coder Instruct, and report the performance on SWE-bench Verified.
+LM Rewrite and Procedural bugs are comparable to PR mirrors.
+We randomly sample 1000 instances per bug generation strategy (LM Modify, LM Rewrite, Procedural Modifications, PR Mirrors).
+Per instance, we generate issue text with an LM and run expert trajectory generation.
+We then fine-tune a student model per strategy, capping training points to the minimum number of successful trajectories from any strategy (507) for fair comparison.
+Table 4 summarizes the results.
+Trajectories generated from PR mirrors are empirically the most effective training data — this is expected, since they are most reflective of SWE-bench.
+What’s noteworthy is that trajectories from Procedural Modification and LM Rewrite instances lead to competitive models.
+There is a steep drop-off with LM Modify bugs.
+LM generated issues are comparable to real issues.
+We randomly sample 600 PR Mirror task instances.
+We compare LM generated issues with three alternatives — fixed issue templates, the source code + test logs of a random Fail-to-Pass test, and the original issue text associated with the PR.
+We again cap training points to the minimum number of successful trajectories (259) for fairness.
+As shown in Table 5, training on task instances with LM generated issues is empirically comparable to using the original issue text.
+Using fixed issue templates not only leads to the fewest successful trajectories, but also results in relatively homogeneous problem solving sequences.
+The expert trajectories from fixed issue templates have 31% fewer unique actions compared to LM generated text (379 vs. 550).
+While providing a Fail-to-Pass test case leads to more successful expert trajectories, leaking the evaluation criteria causes the model to skip over writing a reproduction script, which accounts for the performance drop.
+Of 500 SWE-bench Verified instances, the student model trained on LM-generated issues attempts to reproduce the bug for 379 of the runs.
+The model trained on test-based issues only does so for 127 cases, a 66% decrease.
+
+| Strategy   | # Trajs. | % Resolved |
+|------------|----------|------------|
+| LM Modify  | 802      | 5.7 (±1.5) |
+| LM Rewrite | 507      | 8.8 (±1.7) |
+| Procedural | 745      | 8.6 (±1.8) |
+| PR Mirror  | 557      | 9.2 (±1.7) |
+
+Table 4: Comparison of training on 1000 SWE-smith instances created with different strategies.
+
+| Issue    | # Trajs. | % Resolved |
+|----------|----------|------------|
+| Fixed    | 259      | 6.4 (±1.5) |
+| F2P Test | 390      | 7.3 (±1.9) |
+| LM       | 328      | 7.7 (±1.5) |
+| Original | 319      | 7.8 (±1.8) |
+
+Table 5: Comparing training on 600 PR Mirror instances with varied issue text.
+
+Task difficulty correlates with solvability but not with effectiveness as training data.
+First, we run our difficulty rating model on 10k randomly selected SWE-smith task instances.
+From this pool, we curate subsets of 1000 instances corresponding to the three difficulty levels, then run expert trajectory generation per subset 3 times.
+For the easy / medium / hard subsets, the resolve rates of the expert model are 58.6%, 41.0%, and 17.0% respectively.
+Next, from all successful trajectories, we create four fine-tuning datasets of 500 trajectories each, corresponding to difficulty scores of 2, 4, 6, and 8.
+As mentioned in Section 2.2, the corresponding scores for easy / medium / hard are 1 / 5 / 9.
+Therefore, the SFT dataset for score 2 is made up of trajectories corresponding to 375 easy and 125 medium instances, and so on.
+Somewhat surprisingly, we do not observe strong correlation between increased difficulty and downstream performance.
+For the student models trained on the 2 / 4 / 6 / 8 difficulty SFT datasets, we get pass@1 scores of 12.4%, 10.8%, 13.6%, and 12.2% on SWE-bench Verified.
+Figure 3: We fine-tune a 7B base and our 32B models on 700 trajectories for SymPy. Specialization boosts performance with minor generalization loss.
+Figure 4: At 700 training samples, we observe performance increases logarithmically with repositories.
+Training on more repositories improves general performance.
+We train models in four settings by sampling 700 expert trajectories on Procedural Modification tasks from pools of 4, 25, 50, and 100 repositories.
+Echoing similar findings for code generation tasks (Xie et al., 2025b), we find that increasing the number of repositories represented in the training set improves performance, as shown in Figure 4, with an approximately logarithmic relation between model performance and number of repositories.
+Repository-specialized models excel on the target repository with minor generalization loss.
+We experiment with training models to be specialists on one particular repository.
+To assess performance, we evaluate models on a subset of SWE-bench Verified tasks that are (1) from SymPy, and (2) created after January 1st, 2022, a total of 22 instances.
+To create SymPy-specific training data, we first select a base commit of SymPy just before the cutoff date.
+Next, we create 1276 Procedural Modification task instances, then generate 700 expert trajectories.
+We evaluate specialization in two settings: (1) single-repository fine-tuning, and (2) specialist stage fine-tuning, both shown in Figure 3.
+For single-repository tuning, we compare a model initialized with Qwen-2.5-Coder-Instruct 7B and trained on 700 instances sampled from 100 repositories, to the same Qwen base model but fine-tuned on the 700 SymPy instances only.
+For specialist stage fine-tuning, we simply compare SWE-agent-LM-32B to the same model further fine-tuned on the 700 SymPy instances.
+Specialization significantly boosts performance for the target repository with only slight drops in general performance in both the single-repository fine-tuning (21.2% vs. 13.6%) and specialist stage fine-tuning (42.4% vs. 33.3%) settings.
+4.2 Analysis of Agent Behavior
+Figure 5: SWE-agent-LM-32B takes fewer steps to submit compared to Claude 3.7 Sonnet for instances resolved by both models.
+Figure 6: For unsuccessfully resolved tasks, a frequent failure mode is that SWE-agent-LM-32B will repeat actions.
+This section analyzes the behavior, failure modes, and efficiency of SWE-agent when run with SWE-agent-LM-32B or Claude 3.7 Sonnet on SWE-bench Verified.
+SWE-agent-LM-32B can solve tasks efficiently.
+SWE-agent-LM-32B resolves tasks in fewer steps on average (24.9) than Claude 3.7 Sonnet (29.1), though the difference becomes marginal when accounting for different average difficulties of the resolved tasks: on the overlap of tasks that are resolved by both LMs, SWE-agent-LM-32B uses 24.8 steps compared to 25.6 used by Claude 3.7 Sonnet (see Fig. 5).
+While shorter trajectories are not always preferred (additional actions can be used for additional validation purposes, for example), this shows that SWE-agent-LM-32B solves tasks very efficiently.
+At the same time, SWE-agent-LM-32B also demonstrates that it can remain focused throughout long trajectories, with 31 instances being resolved after 40 steps or more.
+We further highlight that naturally terminating [Footnote 1: i.e., excluding agent runs that are terminated due to errors or cost/step count limits. Note that SWE-agent still extracts and submits any changes performed by the agent in these cases and some of them can be successful (for example if the agent is terminated due to cost while testing already performed edits).] agent submissions with SWE-agent-LM-32B achieve an accuracy nearly matching that of Claude 3.7 Sonnet (60% vs 63%), showing that SWE-agent-LM-32B is adept at determining whether an instance has been resolved.
+As the overall cost and turn count averages scale strongly with the cost and turn limits, we reserve a more thorough analysis for §F.5.1.
+Repetitive actions are a key problem.
+We observe a tendency for SWE-agent-LM-32B to get stuck in long sequences of repetitive actions, in particular long sequences of calls that display different portions of a file instead of using search commands. [Footnote 2: In fact, these str_replace_editor view commands make up 73% of the longest repetitive sequences. For this analysis, we look at repetitions of the base command, i.e., without any arguments. See §F.5 for more.]
+More than 25% of SWE-agent-LM-32B trajectories have a repetitive sequence of at least length 10, compared to less than 4% for Claude 3.7 Sonnet (see Figure 6).
+The occurrence of long repetitive sequences is strongly negatively correlated with the agent’s ability to solve the corresponding task instance, largely because the LM continues issuing similar commands until either the agent cost or turn limit is reached, at which point the run is terminated.
+For example, repetitive sequences of length 10 correspond to an 89% failure probability.
+Simple interventions from the agent scaffold can mitigate repetitive actions, but do not seem to improve resolve rates (see §F.5).
+Figure 7: More than half of the unresolved instances of SWE-agent-LM-32B correspond to runs terminated by cost/step limits, and these limits are frequently reached before source code has been modified. See §F.5 for more.
+Localization is the dominant failure mode.
+Guided by a short plan in the system prompt, SWE-agent typically starts by localizing (search and read actions) and reproducing (test file creation and execution), before modifying source files and validating the fixes.
+If the agent gets stuck at any of these stages or keeps on iterating, the agent loop is eventually interrupted by runtime limits (cost, number of LM calls, runtime).
+While this rarely happens with Claude 3.7 Sonnet, 53% of SWE-agent-LM-32B’s failures are associated with such limits (Figure 7).
+The agent often already gets stuck during localization or initial efforts to reproduce a bug, with endlessly repeated actions being a persistent issue.
+More on failure modes in §F.5.
+5 Related Work
+LMs for Software Engineering.
+As contemporary LMs have saturated traditional code generation tasks (Austin et al., 2021; Chen et al., 2021), software engineering benchmarks (Jain et al., 2024; Jimenez et al., 2024b; Yang et al., 2024b; Zhao et al., 2024; Zan et al., 2025), notably SWE-bench, have become a new de facto evaluation setting due to their diverse, complex, real-world programming challenges.
+The most significant source of open source progress on SWE-bench has been the development of LM-based workflows (Orwall, 2024; Xia et al., 2024; Zhang et al., 2024b) and agents (Antoniades et al., 2024; Wang et al., 2024; Yang et al., 2024a; Zhang et al., 2024a).
+Workflow-based systems are typically human-engineered decompositions of a task into a sequence of sub-goals.
+Yang et al. (2024b) suggest such pipelines may not generalize effectively to non-Python repositories, requiring additional human intervention to re-adapt.
+We therefore elect to focus on generating trajectories with and for LM agent systems (Sumers et al., 2024; Yang et al., 2023; Yao et al., 2023b).
+Because no workflow is imposed, agent systems inherently rely more on the LM to plan and refine its actions, putting more focus on an LM’s capabilities, not inference scaffolds. +Training Datasets for Coding. +Prior work around training data has focused on instruction following +(Luo et al., +2023 +; Muennighoff et al., +2024 +; Shypula et al., +2024 +; Wei et al., +2024a +; +b +; Yu et al., +2024 +) +and preference learning +(Liu et al., +2024a +; +b +) +for code completion tasks. +Several recent works introduce training sets for retrieval augmented generation +(Jimenez et al., +2024b +; Xie et al., +2025a +) +, workflows +(Wei et al., +2025 +) +, and agent +(Badertdinov et al., +2024 +; Ma et al., +2024 +; Pan et al., +2024 +; Jain et al., +2025 +) +approaches to SWE-bench. +Our work applies +Haluptzok et al. ( +2023 +) +at a repository level: by having an LM break a codebase, we drastically reduce the human effort needed to define a task and build its environment. +Concurrent to our work, +Xie et al. ( +2025b +) +(RePOST) also constructs execution environments for repository functions, but differs significantly in methodology and evaluation. +RePOST sandboxes a function and its dependencies to a separate script, then generates tests with an LM, removing the original codebase as context. +The tasks’ source is repository-level; the environments and tasks are not. +RePOST evaluates solely on code generation (e.g., HumanEval +(Chen et al., +2021 +) +). +Jain et al. ( +2025 +) +(R2E-Gym) improves open source LMs’ performance on SWE-bench with inference time scaling and verifiers. +R2E-gym’s +51 +51 +51 +51 +% resolve rate is not comparable to Table +3 +results, as each instance is attempted +26 +26 +26 +26 +times. +R2E-gym’s +4.6 +4.6 +4.6 +4.6 +k training instances are collected using SWE-bench’s pipeline, with some augmentations around using LMs to synthesize issue text and tests. 
+To our knowledge, we are the first to address the limited scalability of previous approaches.
+6 Discussion
+Limitations and future directions.
+First, SWE-smith’s collection pipeline is Python-centric.
+The mechanisms to identify programmatic objects (e.g., functions, classes) and perform transformations rely heavily on the Python-specific ast library.
+That said, SWE-smith’s collection strategy is transferable to other languages.
+Second, due to both compute/budget constraints and our work’s primary stance as a dataset contribution, we only include fine-tuning as a demonstration of SWE-smith’s effectiveness.
+We do not explore other training techniques, such as eliciting reasoning capabilities via reinforcement learning.
+Conclusion.
+We introduce SWE-smith, a dataset of 50k software engineering task instances from across 128 real-world GitHub repositories.
+SWE-smith’s collection pipeline allows us to scale up task instances, environments, and trajectories at a fraction of prior costs without sacrificing faithfulness to open source software development practices.
+Using SWE-smith, we train SWE-agent-LM-32B, achieving a state-of-the-art 40.2% on SWE-bench Verified.
+Our experiments show how SWE-smith can be used to identify fundamental trends about developing SWE-agents.
+We believe SWE-smith provides the foundational data and infrastructure needed to train software engineering agents in a truly scalable manner.
+Acknowledgments
+We thank Princeton Language & Intelligence (PLI) for providing credits for running closed-source API models.
+Thanks to Samuel Ainsworth for his constant support of bitbop.io (https://bitbop.io/), the compute service on which the majority of the project was carried out.
+We’d also like to thank Akshat Bubna, Howard Halim, Andrew Liu, Peyton Walters, and the great team at Modal ( +https://modal.com/ +) for providing credits that made fine-tuning and model serving efforts extremely easy for this project. +This work is partially supported by ONR grant N000142412532 and NSF grant IIS-2247357. +We also thank Open Philanthropy and Andreessen Horowitz for providing funding for this work. +Finally, thanks to Tianyu Gao, William Held, Niklas Muennighoff, Rafael Rafailov, Yijia Shao, Chenglei Si, Anikait Singh, Tianyi Zhang, Kexin Pei, and Karthik Narasimhan for constructive discussions and support throughout this project. +References +Anthropic (2024) +Anthropic. +Introducing claude 3.5 sonnet, 2024. +URL +https://www.anthropic.com/news/claude-3-5-sonnet +. +Anthropic (2025) +Anthropic. +Introducing claude 3.7 sonnet, 2025. +URL +https://www.anthropic.com/news/claude-3-7-sonnet +. +Antoniades et al. (2024) +Antonis Antoniades, Albert Örwall, Kexun Zhang, Yuxi Xie, Anirudh Goyal, and William Wang. +SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement, December 2024. +URL +http://arxiv.org/abs/2410.20285 +. +arXiv:2410.20285 [cs]. +Austin et al. (2021) +Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, and Charles Sutton. +Program Synthesis with Large Language Models, August 2021. +URL +http://arxiv.org/abs/2108.07732 +. +arXiv:2108.07732 [cs]. +Badertdinov et al. (2024) +Ibragim Badertdinov, Maria Trofimova, Yuri Anapolskiy, Sergey Abramov, Karina Zainullina, Alexander Golubev, Sergey Polezhaev, Daria Litvintseva, Simon Karasik, Filipp Fisin, Sergey Skvortsov, Maxim Nekrashevich, Anton Shevtsov, and Boris Yangel. +Scaling data collection for training software engineering agents. +Nebius blog +, 2024. +Bogin et al. 
(2024) +Ben Bogin, Kejuan Yang, Shashank Gupta, Kyle Richardson, Erin Bransom, Peter Clark, Ashish Sabharwal, and Tushar Khot. +Super: Evaluating agents on setting up and executing tasks from research repositories, 2024. +URL +https://arxiv.org/abs/2409.07440 +. +Chen et al. (2023) +Baian Chen, Chang Shu, Ehsan Shareghi, Nigel Collier, Karthik Narasimhan, and Shunyu Yao. +FireAct: Toward Language Agent Fine-tuning, October 2023. +URL +http://arxiv.org/abs/2310.05915 +. +arXiv:2310.05915 [cs]. +Chen et al. (2021) +Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, and Wojciech Zaremba. +Evaluating Large Language Models Trained on Code, July 2021. +URL +http://arxiv.org/abs/2107.03374 +. +arXiv:2107.03374 [cs]. +Chowdhury et al. (2024) +Neil Chowdhury, James Aung, Chan Jun Shern, Oliver Jaffe, Dane Sherburn, Giulio Starace, Evan Mays, Rachel Dias, Marwan Aljubeh, Mia Glaese, et al. +Introducing swe-bench verified, 2024. +URL https://openai.com/index/introducing-swe-bench-verified +, 2024. +Daniel Han & team (2023) +Michael Han Daniel Han and Unsloth team. +Unsloth, 2023. +URL +http://github.com/unslothai/unsloth +. +Eliseeva et al. 
(2025) +Aleksandra Eliseeva, Alexander Kovrigin, Ilia Kholkin, Egor Bogomolov, and Yaroslav Zharov. +Envbench: A benchmark for automated environment setup, 2025. +URL +https://arxiv.org/abs/2503.14443 +. +Haluptzok et al. (2023) +Patrick Haluptzok, Matthew Bowers, and Adam Tauman Kalai. +Language models can teach themselves to program better, 2023. +URL +https://arxiv.org/abs/2207.14502 +. +Hu et al. (2021) +Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. +Lora: Low-rank adaptation of large language models, 2021. +URL +https://arxiv.org/abs/2106.09685 +. +Hui et al. (2024) +Binyuan Hui, Jian Yang, Zeyu Cui, Jiaxi Yang, Dayiheng Liu, Lei Zhang, Tianyu Liu, Jiajun Zhang, Bowen Yu, Keming Lu, Kai Dang, Yang Fan, Yichang Zhang, An Yang, Rui Men, Fei Huang, Bo Zheng, Yibo Miao, Shanghaoran Quan, Yunlong Feng, Xingzhang Ren, Xuancheng Ren, Jingren Zhou, and Junyang Lin. +Qwen2.5-coder technical report, 2024. +URL +https://arxiv.org/abs/2409.12186 +. +Jain et al. (2024) +Naman Jain, Manish Shetty, Tianjun Zhang, King Han, Koushik Sen, and Ion Stoica. +R2e: Turning any github repository into a programming agent environment. +In +ICML 2024 +, 2024. +Jain et al. (2025) +Naman Jain, Jaskirat Singh, Manish Shetty, Liang Zheng, Koushik Sen, and Ion Stoica. +R2e-gym: Procedural environments and hybrid verifiers for scaling open-weights swe agents, 2025. +URL +https://arxiv.org/abs/2504.07164 +. +Jimenez et al. (2024a) +Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan. +Swe-bench leaderboard, 2024a. +URL +https://swe-bench.github.io/ +. +Jimenez et al. (2024b) +Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan. +SWE-bench: Can Language Models Resolve Real-World GitHub Issues?, November 2024b. +URL +http://arxiv.org/abs/2310.06770 +. +arXiv:2310.06770 [cs]. +Liu et al. 
(2024a) +Jiawei Liu, Thanh Nguyen, Mingyue Shang, Hantian Ding, Xiaopeng Li, Yu Yu, Varun Kumar, and Zijian Wang. +Learning code preference via synthetic evolution, 2024a. +URL +https://arxiv.org/abs/2410.03837 +. +Liu et al. (2024b) +Zhihan Liu, Shenao Zhang, Yongfei Liu, Boyi Liu, Yingxiang Yang, and Zhaoran Wang. +DSTC: Direct Preference Learning with Only Self-Generated Tests and Code to Improve Code LMs, December 2024b. +URL +http://arxiv.org/abs/2411.13611 +. +arXiv:2411.13611 [cs]. +Luo et al. (2023) +Ziyang Luo, Can Xu, Pu Zhao, Qingfeng Sun, Xiubo Geng, Wenxiang Hu, Chongyang Tao, Jing Ma, Qingwei Lin, and Daxin Jiang. +Wizardcoder: Empowering code large language models with evol-instruct, 2023. +URL +https://arxiv.org/abs/2306.08568 +. +Ma et al. (2024) +Yingwei Ma, Rongyu Cao, Yongchang Cao, Yue Zhang, Jue Chen, Yibo Liu, Yuchen Liu, Binhua Li, Fei Huang, and Yongbin Li. +Lingma swe-gpt: An open development-process-centric language model for automated software improvement, 2024. +URL +https://arxiv.org/abs/2411.00622 +. +Modal (2025) +Modal. +Modal: High-performance ai infrastructure, 2025. +URL +https://modal.com/ +. +Muennighoff et al. (2024) +Niklas Muennighoff, Qian Liu, Armel Zebaze, Qinkai Zheng, Binyuan Hui, Terry Yue Zhuo, Swayam Singh, Xiangru Tang, Leandro von Werra, and Shayne Longpre. +OctoPack: Instruction Tuning Code Large Language Models, February 2024. +URL +http://arxiv.org/abs/2308.07124 +. +arXiv:2308.07124 [cs]. +Murty et al. (2024) +Shikhar Murty, Dzmitry Bahdanau, and Christopher D. Manning. +NNetscape Navigator: Complex Demonstrations for Web Agents Without a Demonstrator, October 2024. +URL +http://arxiv.org/abs/2410.02907 +. +arXiv:2410.02907 [cs]. +Mündler et al. (2025) +Niels Mündler, Mark Niklas Müller, Jingxuan He, and Martin Vechev. +Swt-bench: Testing and validating real-world bug-fixes with code agents, 2025. +URL +https://arxiv.org/abs/2406.12952 +. +OpenAI (2024a) +OpenAI. +Gpt-4o system card, 2024a. 
+URL +https://arxiv.org/abs/2410.21276 +. +OpenAI (2024b) +OpenAI. +OpenAI o3-mini system card, 2024b. +URL +https://cdn.openai.com/o3-mini-system-card-feb10.pdf +. +Orwall (2024) +Albert Orwall. +Moatless tools, 2024. +URL +https://github.com/aorwall/moatless-tools +. +Ou et al. (2024) +Tianyue Ou, Frank F. Xu, Aman Madaan, Jiarui Liu, Robert Lo, Abishek Sridhar, Sudipta Sengupta, Dan Roth, Graham Neubig, and Shuyan Zhou. +Synatra: Turning Indirect Knowledge into Direct Demonstrations for Digital Agents at Scale, September 2024. +URL +http://arxiv.org/abs/2409.15637 +. +arXiv:2409.15637 [cs]. +Pan et al. (2024) +Jiayi Pan, Xingyao Wang, Graham Neubig, Navdeep Jaitly, Heng Ji, Alane Suhr, and Yizhe Zhang. +Training Software Engineering Agents and Verifiers with SWE-Gym, December 2024. +URL +http://arxiv.org/abs/2412.21139 +. +arXiv:2412.21139 [cs]. +PyTorch (2024) +PyTorch. +torchtune: PyTorch’s finetuning library, April 2024. +URL +https://github.com/pytorch/torchtune +. +Qwen et al. (2025) +Qwen, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tianyi Tang, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. +Qwen2.5 Technical Report, January 2025. +URL +http://arxiv.org/abs/2412.15115 +. +arXiv:2412.15115 [cs]. +Shen et al. (2024) +Junhong Shen, Atishay Jain, Zedian Xiao, Ishan Amlekar, Mouad Hadji, Aaron Podolny, and Ameet Talwalkar. +Scribeagent: Towards specialized web agents using production-scale workflow data, 2024. +URL +https://arxiv.org/abs/2411.15004 +. +Shypula et al. 
(2024) +Alexander Shypula, Aman Madaan, Yimeng Zeng, Uri Alon, Jacob Gardner, Milad Hashemi, Graham Neubig, Parthasarathy Ranganathan, Osbert Bastani, and Amir Yazdanbakhsh. +Learning performance-improving code edits, 2024. +URL +https://arxiv.org/abs/2302.07867 +. +Sumers et al. (2024) +Theodore R. Sumers, Shunyu Yao, Karthik Narasimhan, and Thomas L. Griffiths. +Cognitive architectures for language agents, 2024. +URL +https://arxiv.org/abs/2309.02427 +. +Vergopoulos et al. (2025) +Konstantinos Vergopoulos, Mark Niklas Müller, and Martin Vechev. +Automated benchmark generation for repository-level coding tasks, 2025. +URL +https://arxiv.org/abs/2503.07701 +. +Wang et al. (2024) +Xingyao Wang, Boxuan Li, Yufan Song, Frank F. Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, Hoang H. Tran, Fuqiang Li, Ren Ma, Mingzhang Zheng, Bill Qian, Yanjun Shao, Niklas Muennighoff, Yizhe Zhang, Binyuan Hui, Junyang Lin, Robert Brennan, Hao Peng, Heng Ji, and Graham Neubig. +OpenHands: An Open Platform for AI Software Developers as Generalist Agents, October 2024. +URL +http://arxiv.org/abs/2407.16741 +. +arXiv:2407.16741 [cs]. +Wei et al. (2024a) +Yuxiang Wei, Federico Cassano, Jiawei Liu, Yifeng Ding, Naman Jain, Zachary Mueller, Harm de Vries, Leandro von Werra, Arjun Guha, and Lingming Zhang. +SelfCodeAlign: Self-Alignment for Code Generation, November 2024a. +URL +http://arxiv.org/abs/2410.24198 +. +arXiv:2410.24198 [cs]. +Wei et al. (2024b) +Yuxiang Wei, Zhe Wang, Jiawei Liu, Yifeng Ding, and Lingming Zhang. +Magicoder: Empowering code generation with oss-instruct, 2024b. +URL +https://arxiv.org/abs/2312.02120 +. +Wei et al. (2025) +Yuxiang Wei, Olivier Duchenne, Jade Copet, Quentin Carbonneaux, Lingming Zhang, Daniel Fried, Gabriel Synnaeve, Rishabh Singh, and Sida I. Wang. +SWE-RL: Advancing llm reasoning via reinforcement learning on open software evolution, 2025. +URL +https://arxiv.org/abs/2502.18449 +. +Xia et al. 
(2024) +Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang. +Agentless: Demystifying LLM-based Software Engineering Agents, October 2024. +URL +http://arxiv.org/abs/2407.01489 +. +arXiv:2407.01489 [cs]. +Xiang et al. (2023) +Jiannan Xiang, Tianhua Tao, Yi Gu, Tianmin Shu, Zirui Wang, Zichao Yang, and Zhiting Hu. +Language Models Meet World Models: Embodied Experiences Enhance Language Models, October 2023. +URL +http://arxiv.org/abs/2305.10626 +. +arXiv:2305.10626 [cs]. +Xie et al. (2025a) +Chengxing Xie, Bowen Li, Chang Gao, He Du, Wai Lam, Difan Zou, and Kai Chen. +Swe-fixer: Training open-source llms for effective and efficient github issue resolution, 2025a. +URL +https://arxiv.org/abs/2501.05040 +. +Xie et al. (2024) +Tianbao Xie, Danyang Zhang, Jixuan Chen, Xiaochuan Li, Siheng Zhao, Ruisheng Cao, Toh Jing Hua, Zhoujun Cheng, Dongchan Shin, Fangyu Lei, Yitao Liu, Yiheng Xu, Shuyan Zhou, Silvio Savarese, Caiming Xiong, Victor Zhong, and Tao Yu. +OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments, May 2024. +URL +http://arxiv.org/abs/2404.07972 +. +arXiv:2404.07972 [cs]. +Xie et al. (2025b) +Yiqing Xie, Alex Xie, Divyanshu Sheth, Pengfei Liu, Daniel Fried, and Carolyn Rose. +Repost: Scalable repository-level coding environment construction with sandbox testing, 2025b. +URL +https://arxiv.org/abs/2503.07358 +. +Xu et al. (2024) +Yiheng Xu, Dunjie Lu, Zhennan Shen, Junli Wang, Zekun Wang, Yuchen Mao, Caiming Xiong, and Tao Yu. +AgentTrek: Agent Trajectory Synthesis via Guiding Replay with Web Tutorials, December 2024. +URL +http://arxiv.org/abs/2412.09605 +. +arXiv:2412.09605 [cs]. +Yang et al. (2023) +John Yang, Akshara Prabhakar, Karthik Narasimhan, and Shunyu Yao. +InterCode: Standardizing and Benchmarking Interactive Coding with Execution Feedback, October 2023. +URL +http://arxiv.org/abs/2306.14898 +. +arXiv:2306.14898 [cs]. +Yang et al. (2024a) +John Yang, Carlos E. 
Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, Karthik Narasimhan, and Ofir Press. +SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering, November 2024a. +URL +http://arxiv.org/abs/2405.15793 +. +arXiv:2405.15793 [cs]. +Yang et al. (2024b) +John Yang, Carlos E. Jimenez, Alex L. Zhang, Kilian Lieret, Joyce Yang, Xindi Wu, Ori Press, Niklas Muennighoff, Gabriel Synnaeve, Karthik R. Narasimhan, Diyi Yang, Sida I. Wang, and Ofir Press. +SWE-bench Multimodal: Do AI Systems Generalize to Visual Software Domains?, October 2024b. +URL +http://arxiv.org/abs/2410.03859 +. +arXiv:2410.03859 [cs]. +Yao et al. (2023a) +Shunyu Yao, Howard Chen, John Yang, and Karthik Narasimhan. +WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents, February 2023a. +URL +http://arxiv.org/abs/2207.01206 +. +arXiv:2207.01206 [cs]. +Yao et al. (2023b) +Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. +ReAct: Synergizing Reasoning and Acting in Language Models, March 2023b. +URL +http://arxiv.org/abs/2210.03629 +. +arXiv:2210.03629 [cs]. +Yu et al. (2024) +Zhaojian Yu, Xin Zhang, Ning Shang, Yangyu Huang, Can Xu, Yishujie Zhao, Wenxiang Hu, and Qiufeng Yin. +Wavecoder: Widespread and versatile enhancement for code large language models by instruction tuning, 2024. +URL +https://arxiv.org/abs/2312.14187 +. +Yuan et al. (2023) +Zheng Yuan, Hongyi Yuan, Chengpeng Li, Guanting Dong, Keming Lu, Chuanqi Tan, Chang Zhou, and Jingren Zhou. +Scaling relationship on learning mathematical reasoning with large language models, 2023. +URL +https://arxiv.org/abs/2308.01825 +. +Zan et al. (2025) +Daoguang Zan, Zhirong Huang, Wei Liu, Hanwu Chen, Linhao Zhang, Shulin Xin, Lu Chen, Qi Liu, Xiaojian Zhong, Aoyan Li, Siyao Liu, Yongsheng Xiao, Liangqiang Chen, Yuyu Zhang, Jing Su, Tianyu Liu, Rui Long, Kai Shen, and Liang Xiang. +Multi-swe-bench: A multilingual benchmark for issue resolving, 2025. 
+URL +https://arxiv.org/abs/2504.02605 +. +Zhang et al. (2024a) +Kexun Zhang, Weiran Yao, Zuxin Liu, Yihao Feng, Zhiwei Liu, Rithesh Murthy, Tian Lan, Lei Li, Renze Lou, Jiacheng Xu, Bo Pang, Yingbo Zhou, Shelby Heinecke, Silvio Savarese, Huan Wang, and Caiming Xiong. +Diversity Empowers Intelligence: Integrating Expertise of Software Engineering Agents, August 2024a. +URL +http://arxiv.org/abs/2408.07060 +. +arXiv:2408.07060 [cs]. +Zhang et al. (2024b) +Yuntong Zhang, Haifeng Ruan, Zhiyu Fan, and Abhik Roychoudhury. +AutoCodeRover: Autonomous Program Improvement, July 2024b. +URL +http://arxiv.org/abs/2404.05427 +. +arXiv:2404.05427 [cs]. +Zhao et al. (2024) +Wenting Zhao, Nan Jiang, Celine Lee, Justin T. Chiu, Claire Cardie, Matthias Gallé, and Alexander M. Rush. +Commit0: Library Generation from Scratch, December 2024. +URL +http://arxiv.org/abs/2412.01769 +. +arXiv:2412.01769 [cs]. +Zhou et al. (2024) +Shuyan Zhou, Frank F. Xu, Hao Zhu, Xuhui Zhou, Robert Lo, Abishek Sridhar, Xianyi Cheng, Tianyue Ou, Yonatan Bisk, Daniel Fried, Uri Alon, and Graham Neubig. +WebArena: A Realistic Web Environment for Building Autonomous Agents, April 2024. +URL +http://arxiv.org/abs/2307.13854 +. +arXiv:2307.13854 [cs]. +Appendix +The appendix is generally structured as follows. +In Sections +A +to +D +, we review details about SWE-smith’s infrastructure and collection strategies for curating the SWE-smith task instances and execution environments, providing comparisons to existing datasets such as SWE-bench and SWE-Gym along the way. +In Sections +E +and onward, we discuss more about how we created the trajectories dataset, then provide additional ablations and results showcasing the effectiveness of SWE-smith as a dataset. +Figure 8: +An overview of pipelines in SWE-smith. +Scripts/functions and manual steps are highlighted in +blue +. +Artifacts that are also the inputs and outputs of these scripts are in +orange +. 
+SWE-smith fits in seamlessly with the SWE-bench and SWE-agent ecosystem. +Use SWE-smith to construct execution environments and generate task instances. +Use SWE-agent to generate expert trajectories on SWE-smith task instances and run inference with models trained on these trajectories. +Use SWE-bench to evaluate how good your models are at resolving GitHub issues and performing software engineering tasks. +Appendix A +Infrastructure +We cover additional details about how SWE-smith works, specifically +• +The form factor of a SWE-smith task instance. +• +How we identify repositories and the SWE-agent configuration we use to automatically install them. +• +How the task validation and evaluation harnesses work. +A.1 +SWE-smith Task Instance +We briefly review the format of a SWE-smith task instance, highlight how it is different from a SWE-bench task instance, and discuss why SWE-smith’s relatively simple infrastructure compared to SWE-bench allows us to scale task collection much more efficiently. +A SWE-smith task instance is very similar in form factor to a SWE-bench task instance, with several minor differences. +A SWE-smith task instance includes the following fields: +• +repo +: The repository the task instance is from. +• +instance_id +: A unique identifier (usually +(repo).(bug_type).(hash) +) +• +base_commit +: Hash of the GitHub branch that points to the repository with the bug +patch +applied. +• +patch +: The +diff +that causes the bug. It is applied to the original codebase to create the bug. Reverting this patch is effectively the solution. +• +problem_statement +: The generated issue text that conveys the bug. +It is provided to a model or system before it begins attempting a fix. +• +created_at +: A timestamp matching when the bug was successfully validated and pushed to the mirror repository as a branch. +• +FAIL_TO_PASS +: The unit tests that break when the test suite is run with the bug +patch +applied. +• +PASS_TO_PASS +: The unit tests that do not break. 
+These correspond to the set of all tests minus the +FAIL_TO_PASS +tests. +We summarize the key distinctions between a SWE-smith and a SWE-bench task instance: +• +SWE-smith task instances do not include the +version +or +environment_setup_commit +fields, which SWE-bench requires as additional identifiers for specifying repository-specific installation instructions across time. +In SWE-smith, unique installation instructions are specified for each (repository, commit). +• +The +hints_text +field is not included. In SWE-bench, this refers to the issue and PR thread comments written after the first commit of the corresponding PR. +• +The +created_at +field is assigned the timestamp reflecting when the bug was successfully validated. +In SWE-bench, +created_at +refers to when a PR was created. +• +There is no +test_patch +field, as the SWE-smith collection pipeline does not create or synthesize any hidden tests. +All +FAIL_TO_PASS +tests are visible and runnable in the repository at inference time. +A.2 +Repository Selection +In addition to the criteria discussed in Section +2.1 +, we also ensure that a repository has a license that allows non-proprietary use. +The majority of software licenses are permissive (BSD, MIT, Apache), while the remainder are largely protective licenses (GPL) that still allow for non-commercial use. +We inspected the repositories with custom licenses and confirmed they allowed for the use cases exercised in our work. +The licenses for each repository are fully listed in Table +6 +. 
+Apache License 2.0 +Project-MONAI/MONAI; alanjds/drf-nested-routers; arrow-py/arrow; buriy/python-readability; facebookresearch/fvcore; getmoto/moto; google/textfsm; iterative/dvc; jax-ml/jax; jd/tenacity; kayak/pypika; modin-project/modin; pyca/pyopenssl; spulec/freezegun; tkrajina/gpxpy; tornadoweb/tornado; weaveworks/grafanalib +BSD 2-Clause ”Simplified” License +madzak/python-json-logger; pyasn1/pyasn1; pygments/pygments; sunpy/sunpy +BSD 3-Clause ”New” or ”Revised” License +Suor/funcy; alecthomas/voluptuous; andialbrecht/sqlparse; cookiecutter/cookiecutter; dask/dask; django/channels; django/daphne; encode/starlette; gawel/pyquery; gweis/isodate; john-kurkowski/tldextract; lepture/mistune; oauthlib/oauthlib; pallets/click; pallets/flask; pallets/jinja; pallets/markupsafe; pandas-dev/pandas; scrapy/scrapy; theskumar/python-dotenv +GNU General Public License v3.0 +Cog-Creators/Red-DiscordBot; adrienverge/yamllint +GNU Lesser General Public License v2.1 +chardet/chardet; paramiko/paramiko; pylint-dev/astroid +GNU Lesser General Public License v3.0 +Knio/dominate +ISC License +kennethreitz/records +MIT License +amueller/word_cloud; borntyping/python-colorlog; bottlepy/bottle; cantools/cantools; cdgriffith/Box; cknd/stackprinter; conan-io/conan; cool-RR/PySnooper; datamade/usaddress; dbader/schedule; erikrose/parsimonious; facebookresearch/hydra; facelessuser/soupsieve; getnikola/nikola; graphql-python/graphene; hukkin/tomli; jaraco/inflect; jawah/charset_normalizer; joke2k/faker; keleshev/schema; life4/textdistance; luozhouyang/python-string-similarity; marshmallow-code/apispec; marshmallow-code/marshmallow; marshmallow-code/webargs; martinblech/xmltodict; matthewwithanm/python-markdownify; mewwts/addict; mido/mido; mozillazg/python-pinyin; msiemens/tinydb; pdfminer/pdfminer; pndurette/gTTS; pudo/dataset; pydantic/pydantic; pyparsing/pyparsing; pytest-dev/iniconfig; python-hyper/h11; python-jsonschema/jsonschema; python-openxml/python-docx; pyupio/safety; 
pyvista/pyvista; r1chardj0n3s/parse; rsalmei/alive-progress; rubik/radon; rustedpy/result; scanny/python-pptx; seatgeek/thefuzz; sloria/environs; sqlfluff/sqlfluff; termcolor/termcolor; tobymao/sqlglot; tox-dev/pipdeptree; tweepy/tweepy; un33k/python-slugify; vi3k6i5/flashtext +Other +Mimino666/langdetect; PyCQA/flake8; agronholm/exceptiongroup; agronholm/typeguard; aio-libs/async-timeout; benoitc/gunicorn; cloudpipe/cloudpickle; davidhalter/parso; django-money/django-money; gruns/furl; kurtmckee/feedparser; lincolnloop/python-qrcode; mahmoud/boltons; mahmoud/glom; mozilla/bleach; pexpect/ptyprocess; prettytable/prettytable; pwaller/pyfiglet; pydata/patsy; pydicom/pydicom; python-trio/trio; python/mypy; pyutils/line_profiler; seperman/deepdiff +Table 6: +License associated with each repository as of April 8, 2025. All licenses are permissive and allow for public, nonprofit use. +We deliberately limit the search scope for repositories to those predominantly written in Python. +Following precedents, focusing on Python repositories allowed us to form assumptions about installation and testing procedures (e.g. repository is organized as a PyPI package, +pytest +is the testing framework) that made scaling up automatic repository setup with SWE-agent more tractable. +A worthwhile direction to consider for future work is expanding the coverage of repositories to be more comprehensive of codebases written in different programming languages, as +Yang et al. ( +2024b +) +does, extending SWE-bench style evaluation to JavaScript repositories with multimodal inputs. +Automated repository installation. +The goal of this step is to first, get the installation and testing instructions for a repository, and second, create a Docker image containing the repository with the development environment set up. +We provide the system prompt given to SWE-agent that asks it to install a repository in Figure +6 +. 
+Each repository installation task is initialized with a clone of the original repository. +No additional steps (e.g. +pypi +package downloads, +conda +environment setup) are performed. +We run SWE-agent with +claude-3-5-sonnet-20241022 +with a maximum cost limit of $2 and a maximum call limit of 150. +The installation run terminates whenever one of these conditions is met. +For every run, we record the interactions. +We then manually review the trajectory, identifying the appropriate installation and testing specifications. +Each run incurs an average cost of $0.72 and an average of 17 steps before SWE-agent issues the +submit +command. +The runs typically finish within two minutes. +The majority of Python repositories require fewer steps — typically, SWE-agent will view the +CONTRIBUTING.md +, run the installation command provided verbatim in the text, and then run +pytest +, showing all tests passing. +A minority of repositories will require several steps because additional dependencies must be installed with +apt-get +. +The manual review process following this requires 3 to 20 minutes. +One author carried out this effort for 128 repositories, taking an estimated 18 human hours to accomplish. +In the process of reaching 128 repositories, the author gave up on 17 repositories at the manual review stage. +System prompt for generating bugs with an LM +<uploaded_files> +{{working_dir}} +</uploaded_files> +I’ve uploaded a python code repository in the directory +{{working_dir}} +. +Can you please install this repository? +Your goal should be to configure the repository’s development environment such that existing tests pass. +You are currently in the root directory of the repository, and nothing has been installed yet. +You in an Ubuntu 22.04 environment. 
+The repository is predominantly written in Python. Here are several tips for installing it: +1. A good place to start is to look for a +CONTRIBUTING.[md|rst] +file, which will often contain instructions on how to install the repository and any dependencies it may have. Occasionally, the +README.md +file may also contain installation instructions. +2. Usually, a repository may have +setup.py +or +pyproject.toml +files which can be used to install the package. +pip install -e . +is commonly used, although many packages will also require an additional specifier that installs development packages as well (e.g. +pip install -e .[dev] +). +3. To check whether the repository was installed successfully, run tests and see if they pass. You can usually find tests in a +tests/ +or +test/ +directory. You can run tests using +pytest +or +unittest +, depending on the framework used by the repository. +4. Sometimes, you will need to install additional packages, often listed in a +requirements.txt +or +environment.yml +file. Also, be mindful of Ubuntu system dependencies that may need to be installed via +apt-get +(e.g. +sudo apt-get install <package> +). +Once you are finished with installing the repository, run the +submit +command to submit your changes for review. +Figure: +Prompt provided to SWE-agent + an LM asking it to install a repository. +A.3 +Validation, Evaluation Harnesses +We adapt SWE-bench’s validation script to convert each bug patch into a SWE-bench style task instance. +This step ensures SWE-smith task instances can be run by existing SWE-bench solutions. +The conversion involves two steps. +First, the bug patch is applied and pushed as a branch to a mirror clone of the repository. +Second, we create a SWE-bench style task instance from the bug patch, populating important fields such as Fail-to-Pass and Pass-to-Pass tests with information from the validation logs. 
+Appendix B +Bug Generation Strategies +In this section, we review each of the bug generation strategies we employ in depth. +While we experimented with several bug generation strategies, the ones we elect to include are those we found to satisfy several desirable properties. +1. +The approach works in a codebase-agnostic manner. +2. +The approach reliably yields usable task instances (meaning $1+$ passing tests break). +3. +The approach is controllable; via each strategy’s parameters, we can affect the quantity and quality of the generated bugs. +System prompt for generating bugs with an LM +You are a software developer doing chaos monkey testing. +Your job is to rewrite a function such that it introduces a logical bug that will break existing unit test(s) in a codebase. +To this end, some kinds of bugs you might introduce include: +(Per inference call, only 3 of the following tips are randomly selected and shown) +- Alter calculation order for incorrect results: Rearrange the sequence of operations in a calculation to subtly change the output (e.g., change (a + b) * c to a + (b * c)). +- Introduce subtle data transformation errors: Modify data processing logic, such as flipping a sign, truncating a value, or applying the wrong transformation function. +- Change variable assignments to alter computation state: Assign a wrong or outdated value to a variable that affects subsequent logic. +- Mishandle edge cases for specific inputs: Change handling logic to ignore or improperly handle boundary cases, like an empty array or a null input. +- Modify logic in conditionals or loops: Adjust conditions or loop boundaries (e.g., replace <= with <) to change the control flow. +- Introduce off-by-one errors in indices or loop boundaries: Shift an index or iteration boundary by one, such as starting a loop at 1 instead of 0. 
+- Adjust default values or constants to affect behavior: Change a hardcoded value or default parameter that alters how the function behaves under normal use. +- Reorder operations while maintaining syntax: Rearrange steps in a process so the function produces incorrect intermediate results without breaking the code. +- Swallow exceptions or return defaults silently: Introduce logic that catches an error but doesn’t log or handle it properly, leading to silent failures. +Tips about the bug-introducing task: +(At inference time, tips are randomly shuffled) +- It should not cause compilation errors. +- It should not be a syntax error. +- It should be subtle and challenging to detect. +- It should not modify the function signature. +- It should not modify the documentation significantly. +- For longer functions, if there is an opportunity to introduce multiple bugs, please do!” +- Please DO NOT INCLUDE COMMENTS IN THE CODE indicating the bug location or the bug itself. +Your answer should be formatted as follows: +Explanation: +<explanation> +Bugged Code: +``` +<bugged_code> +``` +Figure: +System prompt provided to an LM to generate bugs by modifying an existing, working function. +Text in +red +is not included in the actual prompt. +B.1 +Generating with an LM +We describe our workflows for generating bugs with an LM. +For each function or class in a codebase, we prompt an LM to generate either a rewrite that introduces bugs or a complete re-implementation from scratch. +This strategy is illustrated in Figure +9 +. +Figure 9: +Workflow to generate bugs for a function or class with an LM. +We first extract all functions or classes from a codebase, then enumerate across all candidates and prompt the LM to generate either a bug-laced rewrite or a re-implementation. +Modify existing functions. 
+Given a Python codebase, we use the +ast +library to identify all unique functions, excluding any functions found under a testing related directory (e.g. +tests +, +testing +). +Next, given a function, the LM is asked to write a new version that introduces logical, runtime bugs. +Within the prompt, shown in Figure +B +, several suggestions of types of bugs along with a demonstration of a rewrite are provided. +Prompts for reimplementing bugs with an LM +System Prompt +You are a software developer and you have been asked to implement a function. +You will be given the contents of an entire file, with one or more functions defined in it. +Please implement the function(s) that are missing. +Do NOT modify the function signature, including the function name, parameters, return types, or docstring if provided. +Do NOT change any other code in the file. +You should not use any external libraries. +Task Instance Prompt +Please implement the function func_signature in the following code: +{file_src_code} +Remember, you should not modify the function signature, including the function name, parameters, return types, or docstring if provided. +Do NOT change any other code in the file. +Format your output as: +[explanation] +{func_to_write} +Figure: +System prompt provided to an LM to generate bugs by re-implementing an existing target function. +file_src_code +refers to the original source file minus the target function’s original implementation. +func_to_write +refers to the signature and docstring of the target function. +In our experiments, we use OpenAI’s o3-mini model +(OpenAI, +2024b +) +( +o3-mini-2025-01-31 +) as the main base model for bug generation. +Based on our empirical observations of an LM’s tendencies, we include several explicit guidelines in the prompt about what the rewrite should not do. 
+Notably, it is important to ask the LM to not generate any inline comments denoting the location of a bug; we observe that without explicitly specifying this, model generation outputs tend to have inline comments pointing out the bug. +We also want to avoid the complexities of identifying and removing such comments from a file diff representation. +Second, we state that rewrites causing compilation or syntax errors (e.g. undeclared variables, function definition modifications) should be avoided because such bugs are relatively trivial to solve. +We do not experiment extensively with different prompts or generating multiple buggy rewrites per function. +Modify existing classes. +This method involves a simple amendment to the function rewriting approach. +Instead of identifying unique functions ( +ast.FunctionDef +), the codebase traversal logic instead looks for classes ( +ast.ClassDef +). +Otherwise, all other aspects of the implementation are near identical to function rewriting, with minor changes to the prompt to make bug suggestions and the demonstration more class oriented. +Rewrite existing functions. +Instead of providing an LM with the original function, we explore an alternative strategy of asking an LM to re-implement a function from scratch. +Similar to above, we again use the +ast +library to identify all unique functions. +However, instead of directly asking for a bug, we remove the function’s implementation, then prompt the LM with the entire file containing the function (minus the original implementation). +In the task description, we then explicitly ask for the LM to implement the function without changing the function signature. +B.2 +Procedural Modification +We explore a zero-cost approach to create bugs by performing random modifications to the +ast +representation of a function or class. 
+A “procedural modification” refers to a function that takes in an +ast +and applies a fixed transformation to it, such as removing a loop or swapping the blocks of an if/else clause. +This strategy is illustrated in Figure +10 +. +Figure 10: +Workflow to generate bugs via procedural modifications. +Per function/class, the source code is first converted into an +ast +. +The modification then mutates the +ast +(e.g. removes an assignment statement). +The +ast +is then converted back into source code with the specific modification introduced. +Similar to the workflow for generating bugs with an LM, we first identify all functions or classes in a repository. +Per procedural modification, we first impose a set of criteria that filters out any candidates for which the modification would be impossible. +For instance, if the procedural modification removes a random conditional from a function, the modification’s criteria will filter out any candidates that are not functions or do not have a conditional. +For the remaining candidates, the procedural modification is applied with controlled +likelihood +, where +likelihood +is a fraction indicating how often the procedural modification is applied within a candidate. +For example, if the procedural modification removes a random conditional with a +likelihood +of +$0.5$ +, then for every conditional declared within the function, there is a +$50$ +% chance it gets removed. +We introduce +likelihood +so procedural modifications do not lead to changes that are too difficult. +Finally, the modified +ast +is converted back into source code. +Table +7 +is a complete list of filtering criteria that are used for any procedural modification. +For the +filter_min_complexity +and +filter_max_complexity +criteria, we define “complexity” as the sum of the number of conditional blocks, loops, boolean operators, exception handling blocks, and comparison operators in a function. 
+The purpose of +filter_min_complexity +is to remove simple, uninteresting functions (e.g. getter, setter methods) from consideration. +filter_max_complexity +is occasionally used to avoid changing long, monolithic functions. +Index +Criteria +Description +1 +filter_functions +Is the +ast +a function definition? +2 +filter_classes +Is the +ast +a class definition? +3 +filter_classes_has_base +Is the +ast +a class definition with parents? +4 +filter_loops +Does the +ast +contain a +For +or +While +loop? +5 +filter_conditionals +Does the +ast +contain a conditional block? +6 +filter_assignments +Is the +ast +a function def. with assignments? +7 +filter_wrappers +Does the +ast +contain +try +or +with +blocks? +8 +filter_if_else +Does the +ast +contain an +if-else +block? +9 +filter_operators +Does the +ast +contain binary, boolean operators? +10 +filter_min_complexity +Is the +ast +$\geq$ +a complexity score? +11 +filter_max_complexity +Is the +ast +$\leq$ +a complexity score? +Table 7: +Pool of criteria used to filter for functions or classes with specific properties. +Per procedural modification, a subset of these criteria is first used to filter functions and/or classes from a codebase. +The modification is then run on the remainder. +Table +8 +is an exhaustive list of all procedural modifications used to create bugs in a codebase. +Procedural Modification +Criteria +Description +Class +Remove Functions +2, 10 +Removes method(s) + reference(s). +Remove Parent +3, 10 +Removes base class from class header. +Shuffle Methods +2, 10 +Shuffles method definitions in a class. +Control Flow +Invert If/Else +8 +Inverts the if-else bodies of a condition. +Shuffle Lines +11, 12 +Shuffles the lines of a function. +Expressions +Change Constants +1, 9, 10 +Applies +$\pm 1$ +to a constant numeric value. +Break Chains +1, 9, 10 +Removes operator(s), operand(s). +Swap Operands +1, 9, 10 +Mixes order of operands. 
+Change Operator +1, 9, 10 +Changes operator(s) (e.g. +$+$ +to +$-$ +). +Removal +Loops +1, 4, 10 +Remove loops (e.g. +for +, +while +). +Conditionals +1, 5, 10 +Remove conditionals ( +if +). +Assignments +1, 6, 10 +Remove assignment statements. +Wrappers +1, 7, 10 +Remove exception ( +try +), context ( +with +). +Table 8: +The +$13$ +procedural modification techniques we use to create bugs in a codebase. +The “Criteria” column contains indices referencing the corresponding filter defined in Table +7 +. +There are four informal categories — Class, Control Flow, Expressions, Removal — which indicate the general type of modification being made. +B.3 +Combine Bug Patches +We discuss the two strategies we use to combine bug patches from the same file or the same module. +In practice, we combine LM and procedurally generated bugs that have been validated successfully as usable task instances. +Figure 11: +Workflow to generate bugs by combining bug patches. +We take +$n$ +patches (generated using an LM or procedural modification), then sequentially apply each bug patch to the codebase. +If all individual patches apply successfully, we save the resulting single patch which now represents all +$n$ +bugs combined. +From the same file. +If two or more functions are defined within a single file, this strategy merges the function-level bug patches together. +Given +$n$ +function-level bugs and +$k$ +as the number of bugs to combine, there are +$\binom{n}{k}$ +unique file-level candidate bug patches, which can be a large search space to cover. +To make the search space tractable, ensure no single function-level bug is repeatedly used, and generate instances that reliably have +$1+$ +Fail to Pass tests, we implement the following approach described in Algorithm +1 +. 
+$\mathit{codebase}$, $\mathit{bugs}$; $\mathit{num\_bugs}$, $\mathit{limit\_per\_file}$; $\mathit{max\_combos}$ +$\mathit{min\_bugs} \geq 2$; $\mathit{max\_bugs} \geq \mathit{min\_bugs}$; +procedure CombineFileBugs +for each $\mathit{file}$ in $\mathit{codebase}$ do +$\mathit{file\_bugs} \leftarrow$ bugs that apply to $\mathit{file}$ +$\mathit{combinations} \leftarrow$ get_combos($\mathit{file\_bugs}$, $\mathit{num\_bugs}$, $\mathit{max\_combos}$) +for each $\mathit{combo}$ in $\mathit{combinations}$ do +Apply $\mathit{combo}$ to $\mathit{codebase}$ +if success then +Save $\mathit{combo}$ to disk +if $\mathit{limit\_per\_file}$ reached then +break +end if +$\mathit{combinations} \leftarrow$ [c for c in $\mathit{combinations}$ if c $\cap$ $\mathit{combo}$ $= \emptyset$] +end if +end for +end for +end procedure 
+Algorithm 1 +Combine multiple patches from the same file. +For each file in a codebase, we first identify the function-level bugs (or bug patches) that edit that file. +The pool of bugs we draw from has been +validated +, meaning we have already ensured there is +$1+$ +Fail to Pass test(s) associated with the bug. +From this pool of +file_bugs +, the +get_combos +function then generates up to +max_combos +sets of bugs, where the size of each set is +num_bugs +. +For each +combo +, or set of bugs, the bugs are applied to the codebase one by one. +If all patches are successfully combined, this means they were successfully merged, and the merged patch, which consists of multiple function-level bugs, is saved and re-validated as a single bug. +Merging patches occasionally fails if there is an overlapping conflict between two files, akin to a merge conflict with +git +; this usually happens when a function is declared within another. +To ensure a function-level bug is only used once, any remaining bug sets in +combinations +using any patch in +combo +are removed. +The +limit_per_file +and +max_combos +parameters prevent any one file from being over-represented and constrain an otherwise combinatorially large search space. +We run this algorithm across all codebase files, typically setting +num_bugs +$= [2, 4]$ +, +limit_per_file +$= 3$ +, +max_combos +$= 40$ +. +Decreasing +num_bugs +or increasing the other three parameters improves the yield. +From the same module. +There are several ways one could imagine composing function-level bugs from multiple bugs, such as combining those that break the same test or have a programmatic relationship (e.g. function +a +calls function +b +). 
+We found a relatively straightforward and effective approach to be combining files that edit the same “module”. +By “module” we are referring to a subdirectory within the source code (e.g. +sklearn/feature_extraction +, +astropy/convolution +). +Out of all SWE-bench instances that edit +$2+$ +files, +$75$ +% modify files within the same submodule, suggesting a high degree of intra-module code changes. +The implementation for our approach is described in Algorithm +2 +. 
+$\mathit{bugs}$; $\mathit{num\_bugs}$; $\mathit{limit\_per\_module}$; $\mathit{max\_combos}$; $\mathit{depth}$ +$\mathit{num\_bugs} \geq 2$; +procedure CombineModuleBugs +$\mathit{map\_path\_to\_bugs} \leftarrow \{\}$ +for each $\mathit{bug}$ in $\mathit{bugs}$ do +$\mathit{path} \leftarrow$ get_path_from(bug) +$\mathit{map\_path\_to\_patches}[\mathit{path}] \leftarrow [\mathit{bug}]$ +end for +Collapse nested paths based on $\mathit{depth}$ +for all $(\mathit{path}, \mathit{patches})$ in $\mathit{map\_path\_to\_patches}$ do +$\mathit{combinations} \leftarrow$ get_combos(patches, $\mathit{num\_bugs}$, $\mathit{max\_combos}$) +for each $\mathit{combo}$ in $\mathit{combinations}$ do +Apply $\mathit{combo}$ to $\mathit{codebase}$ +if success and num_files_changed(combo) $\geq 2$ then +Save $\mathit{combo}$ to disk +if $\mathit{limit\_per\_module}$ reached then +break +end if +$\mathit{combinations} \leftarrow$ [c for c in $\mathit{combinations}$ if c $\cap$ $\mathit{combo}$ $= \emptyset$] +end if +end for +end for +end procedure 
+Algorithm 2 +Combine multiple patches from the same module. +The implementation for this approach is similar to Algorithm +1 +with two key changes. +First, we do not do file-by-file or folder-by-folder traversal. 
+Instead, using the diff patches, we create a dictionary +map_path_to_bugs +that mimics the file structure of a codebase. +For example, if +bug +modifies path +a/b/c/d.py +, it is represented as +map_path_to_bugs[a][b][c][d.py] +$=$ +[bug] +. +Additional bugs that modify the same path are appended to the list. +Since every bug is a function-level bug, there will never be a bug registered in multiple lists. +We then “collapse” up to +depth +indices. +So for instance, at +depth +$= 3$ +, the above data structure is collapsed into +map_path_to_bugs[a/b/c][d.py] +$=$ +[bug] +. +Finally, any nested dictionaries are collapsed into a single list of patches (e.g. +map_path_to_bugs[a/b/c] +$=$ +[bug] +). +Mirroring the procedure in Algorithm +1 +, we then iterate across this dictionary’s values (lists of bugs). +Second, we only save patches that modify +$2+$ +files; aggregate bugs (represented by +combo +) modifying a single file are not considered. +Again, we run this strategy across all +$100$ +repositories, with parameters +num_bugs +$= [2, 5]$ +, +limit_per_module +$= 10$ +, +max_combos +$= 100$ +, and +depth +$= 2$ +. +Reducing +num_bugs +and +depth +, and increasing the other parameters, yields more bugs. +We choose a +depth +of +$2$ +because empirically, we find that meaningful modules are usually declared as immediate sub-folders of the main source code folder (e.g. in +sklearn/feature_extraction +, +sklearn +is the source code folder while +feature_extraction +is the module). +A shallower depth leads to less meaningful groupings, while yield decreases significantly for every increased level of depth, particularly for smaller repositories. +B.4 +Pull Request Mirroring +We finally discuss the fourth and last strategy for generating bugs - mirroring real world pull requests (PR). 
+We visualize this process in Figure +12 +. +Figure 12: +Workflow to generate bugs by reverting changes made in the diff patch corresponding to a real GitHub pull request (PR). +Given the patch and the files modified by the patch, we prompt the LM to generate a complete rewrite of each file that +reverses +the changes made in the PR. +The changes are applied to the codebase, and we extract the patch, which now captures the reversal of the PR changes. +Why use an LM? +When we initially implemented this approach, we attempted to directly perform a +git apply --reverse [patch] +on the codebase. +However, for the large majority of patches, this fails. +We performed troubleshooting by inspecting +$100$ +PR patches on the +sqlfluff/sqlfluff +repository, leading us to two observations. +1. +The majority of these PRs reflect changes that remain present in the codebase today (making the bug creation promising). +2. +However, many patches cannot be reversed because the exact location (e.g. lines, file) of the relevant code changed because of other changes. +Therefore, we employ LMs to perform patch reversal, and find that reasoning models (e.g. +o3-mini +(OpenAI, +2024b +) +) are particularly effective. +Description of method. +We follow SWE-bench’s methodology for crawling PRs created January 1st, 2023 and onwards, with minor and arbitrary exceptions for some repositories where we crawl older PRs as well. +Per PR, we iterate across the file(s) changed by the patch. +Per file, we prompt an LM with the file-specific changes from the patch along with the file’s source code in the current state of the repository ( +not +the repository’s state corresponding to when the PR was applied, referred to as the +base_commit +in SWE-bench). +The LM is asked to generate a rewrite of the file that reverts the changes reflected in the PR. +We aggregate the changes across all file(s) into a single patch. 
+Because we are interested in problems that our expert trajectory generation method (SWE-agent + Claude 3.7 Sonnet) has a chance of solving, we do not attempt to reproduce PRs that change more than +$8$ +files. +This constraint is imposed because no SWE-bench instance that edits more than +$6$ +files has ever been solved +(Jimenez et al., +2024a +) +. +How well does PR mirroring work? +We scrape the PRs corresponding to +$100$ +randomly selected SWE-bench task instances from the +django/django +GitHub repository and attempt to recreate these task instances with +SWE-smith +’s collection process. +We successfully recovered +$92$ +of +$100$ +task instances. +Of these, +$84$ +break identical F2P test(s), with the remaining +$8$ +breaking a subset because some tests were removed over time. +This sanity check gives us confidence that the PR mirroring strategy lives up to its name. +Comparison to SWE-bench. +This approach has several benefits and drawbacks compared to SWE-bench’s collection pipeline. +First, it removes the need to create instance-specific Docker images — all PRs are mirrored against the same version of a repository. +This also implies that there is no need to write installation specifications for past versions of a repository, which is typically the most laborious step in task construction with SWE-bench. +Finally, this strategy also allows us to loosen the requirements on what PRs we attempt to convert into a task instance. +In SWE-bench, the core requirements for what PRs to attempt to convert into a task instance include: +1. +It must edit +$1+$ +code files (e.g. not just +.md +, +.rst +files). +2. +It must reference +$1+$ +GitHub issues, which serve as the problem statement. +3. +It must edit +$1+$ +testing related files ( +$1+$ +files with a +test +-adjacent keyword in it). 
+With this collection strategy and +SWE-smith +’s focus on training data, the second and third requirements are no longer necessary. +If there is no associated issue, issue text can simply be generated. +If the patch does not contain any testing related changes, this is tolerable, as the validation stage will determine whether the PR breaks any tests. +With these considerations, we purport that +SWE-smith +’s PR mirroring strategy can re-purpose a higher percentage of real world code changes for training purposes. +The main downside is that the rest of the repository is out of sync with the state of the codebase when the PR was applied. +As a result, it’s possible that changes in the behavior of the rest of the codebase may affect the issue’s reproducibility or the accuracy of the issue description (e.g. line numbers referenced in the issue text are likely somewhat off with respect to the codebase). +However, a simple mitigation for this is to create a Docker image for a repository at an earlier commit that’s closer to the original creation date of the issue. +While we do not carry out a targeted experiment, we hypothesize that using +SWE-smith +, we would be able to reproduce SWE-bench entirely with +$10\times$ +fewer human hours with an estimated +$2294 \times \$0.055 = \$126.17$ +in costs. +Appendix C +Dataset Statistics +We present additional breakdowns and analyses of the +SWE-smith +dataset, focusing on the kinds of repositories and bugs that are represented. +Repository categorization. +We present an exhaustive list of repositories used in +SWE-smith +in Table +9 +. 
+We categorize the repositories into seven general buckets: Data Parsing and Transformation ( +$39$ +), Web & API Development ( +$11$ +), Code Quality & Testing ( +$12$ +), Visualization & Presentation ( +$8$ +), System Tools & Protocols ( +$17$ +), Natural Language Processing ( +$7$ +), and Miscellaneous ( +$6$ +). +The categorizations were performed by first determining an appropriate set of categories based on manual inspection supported by the descriptions and GitHub topics associated with each repository. +After settling upon the buckets, we asked GPT-4o to provide a label based on the repository’s metadata and +README +dump. +SWE-smith +represents a wider and more variegated coverage of software tools and applications compared to any prior works. +Repository +Description +Code Quality and Testing +PyCQA/flake8 +flake8 is a python tool that glues together pycodestyle, pyflakes, mccabe, and third-party plugins to check the style and quality of some python code. +Suor/funcy +A fancy and practical functional tools +adrienverge/yamllint +A linter for YAML files. +agronholm/typeguard +Run-time type checker for Python +cknd/stackprinter +Debugging-friendly exceptions for Python +cool-RR/PySnooper +Never use print for debugging again +getmoto/moto +A library that allows you to easily mock out tests based on AWS infrastructure. +pylint-dev/astroid +A common base representation of python source code for pylint and other projects +pytest-dev/iniconfig +None +python/mypy +Optional static typing for Python +pyupio/safety +Safety checks Python dependencies for known security vulnerabilities and suggests the proper remediations for vulnerabilities detected. 
+pyutils/line_profiler +Line-by-line profiling for Python +rubik/radon +Various code metrics for Python code +spulec/freezegun +Let your Python tests travel through time +sqlfluff/sqlfluff +A modular SQL linter and auto-formatter with support for multiple dialects and templated code. +Data Parsing and Transformation +alecthomas/voluptuous +CONTRIBUTIONS ONLY: Voluptuous, despite the name, is a Python data validation library. +andialbrecht/sqlparse +A non-validating SQL parser module for Python +buriy/python-readability +fast python port of arc90’s readability tool, updated to match latest readability.js! +burnash/gspread +Google Sheets Python API +chardet/chardet +Python character encoding detector +cloudpipe/cloudpickle +Extended pickling support for Python objects +dask/dask +Parallel computing with task scheduling +datamade/usaddress +:us: a python library for parsing unstructured United States address strings into address components +davidhalter/parso +A Python Parser +erikrose/parsimonious +The fastest pure-Python PEG parser I can muster +facelessuser/soupsieve +A modern CSS selector implementation for BeautifulSoup +gawel/pyquery +A jquery-like library for python +google/textfsm +Python module for parsing semi-structured text into python tables. +gruns/furl +URL parsing and manipulation made easy. +gweis/isodate +ISO 8601 date/time parser +hukkin/tomli +A lil’ TOML parser +jawah/charset_normalizer +Truly universal encoding detector in pure Python +john-kurkowski/tldextract +Accurately separates a URL’s subdomain, domain, and public suffix, using the Public Suffix List (PSL). +joke2k/faker +Faker is a Python package that generates fake data for you. +jsvine/pdfplumber +Plumb a PDF for detailed information about each char, rectangle, line, et cetera — and easily extract text and tables. +kayak/pypika +PyPika is a python SQL query builder that exposes the full richness of the SQL language using a syntax that reflects the resulting query. 
PyPika excels at all sorts of SQL queries but is especially useful for data analysis. +keleshev/schema +Schema validation just got Pythonic +kennethreitz/records +SQL for Humans™ +kurtmckee/feedparser +Parse feeds in Python +lepture/mistune +A fast yet powerful Python Markdown parser with renderers and plugins. +madzak/python-json-logger +Json Formatter for the standard python logger +mahmoud/glom +☄️ Python’s nested data operator (and CLI), for all your declarative restructuring needs. Got data? Glom it! ☄️ +marshmallow-code/marshmallow +A lightweight library for converting complex objects to and from simple Python datatypes. +martinblech/xmltodict +Python module that makes working with XML feel like you are working with JSON +matthewwithanm/python-markdownify +Convert HTML to Markdown +mewwts/addict +The Python Dict that’s better than heroin. +mido/mido +MIDI Objects for Python +modin-project/modin +Modin: Scale your Pandas workflows by changing a single line of code +mozilla/bleach +Bleach is an allowed-list-based HTML sanitizing library that escapes or strips markup and attributes +msiemens/tinydb +TinyDB is a lightweight document oriented database optimized for your happiness :) +pandas-dev/pandas +Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more +pdfminer/pdfminer.six +Community maintained fork of pdfminer - we fathom PDF +pudo/dataset +Easy-to-use data handling for SQL data stores with support for implicit table creation, bulk loading, and transactions. 
+pydantic/pydantic +Data validation using Python type hints +pydata/patsy +Describing statistical models in Python using symbolic formulas +pydicom/pydicom +Read, modify and write DICOM files with python code +pygments/pygments +Pygments is a generic syntax highlighter written in Python +pyparsing/pyparsing +Python library for creating PEG parsers +python-jsonschema/jsonschema +An implementation of the JSON Schema specification for Python +python-openxml/python-docx +Create and modify Word documents with Python +r1chardj0n3s/parse +Parse strings using a specification based on the Python format() syntax. +scanny/python-pptx +Create Open XML PowerPoint documents in Python +scrapy/scrapy +Scrapy, a fast high-level web crawling & scraping framework for Python. +seperman/deepdiff +DeepDiff: Deep Difference and search of any Python object/data. DeepHash: Hash of any object based on its contents. Delta: Use deltas to reconstruct objects by adding deltas together. +sloria/environs +simplified environment variable parsing +sunpy/sunpy +SunPy - Python for Solar Physics +tkrajina/gpxpy +gpx-py is a python GPX parser. GPX (GPS eXchange Format) is an XML based file format for GPS tracks. +tobymao/sqlglot +Python SQL Parser and Transpiler +un33k/python-slugify +Returns unicode slugs +Machine Learning and AI +facebookresearch/fvcore +Collection of common code that’s shared among different research projects in FAIR computer vision team. +facebookresearch/hydra +Hydra is a framework for elegantly configuring complex applications +HIPS/autograd +Efficiently computes derivatives of NumPy code. +iterative/dvc +Data Versioning and ML Experiments +jaraco/inflect +Correctly generate plurals, ordinals, indefinite articles; convert numbers to words +life4/textdistance +Compute distance between sequences. 30+ algorithms, pure python implementation, common interface, optional external libs usage. 
+luozhouyang/python-string-similarity +A library implementing different string similarity and distance measures using Python. +Mimino666/langdetect +Port of Google’s language-detection library to Python. +mozillazg/python-pinyin +汉字转拼音(pypinyin) +pndurette/gTTS +Python library and CLI tool to interface with Google Translate’s text-to-speech API +Project-MONAI/MONAI +AI Toolkit for Healthcare Imaging +seatgeek/thefuzz +Fuzzy String Matching in Python +vi3k6i5/flashtext +Extract Keywords from sentence or Replace keywords in sentences. +System Tools and Protocols +agronholm/exceptiongroup +Backport of PEP 654 (exception groups) +aio-libs/async-timeout +asyncio-compatible timeout class +arrow-py/arrow +Better dates & times for Python +borntyping/python-colorlog +A colored formatter for the python logging module +cantools/cantools +CAN bus tools. +conan-io/conan +Conan - The open-source C and C++ package manager +cookiecutter/cookiecutter +A cross-platform command-line utility that creates projects from cookiecutters (project templates), e.g. Python package projects, C projects. +dbader/schedule +Python job scheduling for humans. +gruns/icecream +Never use print() to debug again. +jd/tenacity +Retrying library for Python +mahmoud/boltons +Like builtins, but boltons. 250+ constructs, recipes, and snippets which extend (and rely on nothing but) the Python standard library. Nothing like Michael Bolton. +oauthlib/oauthlib +A generic, spec-compliant, thorough implementation of the OAuth request-signing logic +pallets/click +Python composable command line interface toolkit +paramiko/paramiko +The leading native Python SSHv2 protocol library. 
+pexpect/ptyprocess +Run a subprocess in a pseudo terminal +pyasn1/pyasn1 +Generic ASN.1 library for Python +pyca/pyopenssl +A Python wrapper around the OpenSSL library +python-hyper/h11 +A pure-Python, bring-your-own-I/O implementation of HTTP/1.1 +python-trio/trio +Trio – a friendly Python library for async concurrency and I/O +rustedpy/result +NOT MAINTAINED - A simple Rust like Result type for Python 3. Fully type annotated. +termcolor/termcolor +ANSI color formatting for output in terminal +theskumar/python-dotenv +Reads key-value pairs from a .env file and can set them as environment variables. It helps in developing applications following the 12-factor principles. +tox-dev/pipdeptree +A command line utility to display dependency tree of the installed Python packages +Visualization and Presentation +amueller/word_cloud +A little word cloud generator in Python +lincolnloop/python-qrcode +Python QR Code image generator +prettytable/prettytable +Display tabular data in a visually appealing ASCII table format +pwaller/pyfiglet +An implementation of figlet written in Python +rsalmei/alive-progress +A new kind of Progress Bar, with real-time throughput, ETA, and very cool animations! +weaveworks/grafanalib +Python library for building Grafana dashboards +Web and API Development +Cog-Creators/Red-DiscordBot +A multi-function Discord bot +Knio/dominate +Dominate is a Python library for creating and manipulating HTML documents using an elegant DOM API. It allows you to write HTML pages in pure Python very concisely, which eliminate the need to learn another template language, and to take advantage of the more powerful features of Python. +alanjds/drf-nested-routers +Nested Routers for Django Rest Framework +benoitc/gunicorn +gunicorn ’Green Unicorn’ is a WSGI HTTP Server for UNIX, fast clients and sleepy applications. +bottlepy/bottle +bottle.py is a fast and simple micro-framework for python web-applications. 
+django-money/django-money +Money fields for Django forms and models. +django/channels +Developer-friendly asynchrony for Django +django/daphne +Django Channels HTTP/WebSocket server +encode/starlette +The little ASGI framework that shines. +getnikola/nikola +A static website and blog generator +graphql-python/graphene +GraphQL framework for Python +marshmallow-code/apispec +A pluggable API specification generator. Currently supports the OpenAPI Specification (f.k.a. the Swagger specification).. +marshmallow-code/webargs +A friendly library for parsing HTTP request arguments, with built-in support for popular web frameworks, including Flask, Django, Bottle, Tornado, Pyramid, webapp2, Falcon, and aiohttp. +pallets/jinja +A very fast and expressive template engine. +pallets/markupsafe +Safely add untrusted strings to HTML/XML markup. +tornadoweb/tornado +Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed. +tweepy/tweepy +Twitter for Python! +C.1 +Bug Generation Statistics +We provide extensive details about different aspects of each of the bug generation strategies, including the yield rates, labor/monetary costs, and dataset characterizations. +Yield rates. +In Table +10 +, we provide the yield rates for each bug generation method across all repositories in +\bugs +. +In general, we find that the PR Mirroring has the lowest yield rate at +13.18 +13.18 +13.18 +13.18 +% (although this rate is somewhat higher than SWE-bench’s yield rate of +2294 +/ +93139 += +2.46 +2294 +93139 +2.46 +2294/93139=2.46 +2294 / 93139 = 2.46 +%). +For using LMs to generate bugs, modifying functions to introduce bugs intentionally has a higher yield than asking LMs to perform a best-effort rewrite. +The efficacy of Procedural Modifications varies by strategy. 
+For instance, shuffling the functions declared in a class only breaks existing test(s) +1.93 +1.93 +1.93 +1.93 +% of the time, but inverting a conditional will lead to a task instance for +47.04 +47.04 +47.04 +47.04 +% of modifications. +Finally, combining bug patches has an extremely high yield rate - this is to be expected because we only attempt to combine bug patches that have been validated as usable task instances breaking +1 ++ +limit-from +1 +1+ +1 + +tests. +Strategy +# Repos +# Candidates +# Instances +Yield Rate +Combine (file) +124 +6020 +5865 +97.43% +Combine (module) +65 +4396 +4227 +96.16% +LM (Modify) +108 +31950 +17887 +55.98% +LM (Rewrite) +128 +11908 +4173 +35.04% +PR Mirroring +108 +6934 +2344 +33.8% +Procedural (Class Rm Base) +103 +1401 +463 +33.05% +Procedural (Class Rm Funcs) +103 +2506 +1180 +47.09% +Procedural (Class Shuffle Funcs) +103 +2504 +47 +1.88% +Procedural (Ctrl Invert If) +105 +4695 +2321 +49.44% +Procedural (Ctrl Shuffle) +104 +9055 +4015 +44.34% +Procedural (Op Break Chains) +71 +747 +225 +30.12% +Procedural (Op Change Const) +77 +723 +257 +35.55% +Procedural (Op Change) +81 +1507 +450 +29.86% +Procedural (Op Swap) +87 +2141 +483 +22.56% +Procedural (Remove Assign) +121 +5470 +2661 +48.65% +Procedural (Remove Cond) +120 +5288 +2311 +43.7% +Procedural (Remove Loop) +110 +1945 +860 +44.22% +Procedural (Remove Wrapper) +80 +884 +368 +41.63% +All +129 +100074 +50137 +50.1% +Table 10: +Yield rates for different bug generation strategies covered in Section +B +. +We show the number of repositories that each strategy was run on, the number of bug candidates generated by each strategy, and the number of instances, or the number of candidates that were validated to have +1 ++ +limit-from +1 +1+ +1 + +Fail to Pass test. +The yield rate for +The number of repositories captured by each bug generation technique varies due to each strategy’s specific preconditions, which at times may not be effective for some repositories. 
+For instance, the +Procedural (Class *) +set of methods only mutates Python classes. +This strategy is fruitless for the minority of +\bugs +repositories that do not define any classes. +The +Procedural (Op Break Chains) +method randomly removes operations and operands from expressions with two or more operations (e.g. +a ++ +b ++ +c +→ +a ++ +b +→ +𝑎 +𝑏 +𝑐 +𝑎 +𝑏 +a+b+c\rightarrow a+b +italic_a + italic_b + italic_c → italic_a + italic_b +) — such expressions are not always present in +\bugs +repositories. +The collective yield rate across +\bugs +’s bug generation strategies is significantly higher than SWE-bench’s collection strategy. +Yield Rate +# of Repositories +0 +0 +- +25 +25 +25 +25 +% +10 +10 +10 +10 +25 +25 +25 +25 +- +50 +50 +50 +50 +% +31 +31 +31 +31 +50 +50 +50 +50 +- +75 +75 +75 +75 +% +60 +60 +60 +60 +75 +75 +75 +75 +- +100 +100 +100 +100 +% +27 +27 +27 +27 +Table 11: +Yield rates for different repositories represented in +\bugs +. +The yield rate also varies with respect to the repository it is being applied to. +We provide a summary of yield rates by repository in Table +11 +. +We generally observe that lower test coverage correlates with a lower yield rate. +Dataset characterizations. +In Table +12 +, we provide statistics about the validated task instances produced by different bug generation strategies. +Our work’s LM-based strategies rewrite one function in one file. +Procedural modifications will also only change one file, but depending on the strategy, +1 ++ +limit-from +1 +1+ +1 + +functions or classes may be changed. +Combining multiple patches from the same file always produces a patch with +2 ++ +limit-from +2 +2+ +2 + +functions edited. +Combining across modules produces a patch with +2 ++ +limit-from +2 +2+ +2 + +files edited. +The targeted nature of each of the bug creation strategies is reflected in the typical number of functions and files that the bugs produced by each strategy edits. 
+Strategy +# Instances +# F2P +$\Delta$ Lines +$\Delta$ Functions +$\Delta$ Files +Combine +10092 +15 (5-48) +19 (12-36) +2 (2-3) +1 (1-2) +LM +22060 +4 (1-17) +6 (3-15) +1 (1-1) +1 (1-1) +PR Mirroring +2344 +3 (1-14) +20 (8-55) +2 (2-4) +1 (1-2) +Procedural +15641 +7 (2-32) +7 (5-15) +1 (1-1) +1 (1-1) +Table 12: +Statistics for attributes of a +\bugs +task instance across different bug generation strategies, reported as +median (IQR) +, where IQR is the inter-quartile range (25th–75th percentile). +In Figure +13 +, we show the distributions for different attributes of +\bugs +compared to other SWE-bench style datasets. +Compared to prior works, there is a much higher proportion of task instances with more than one Fail-to-Pass test. +For any one repository, we find that +\bugs +task instances collectively cause failures for a much higher percentage of the test suite than other datasets; a potential benefit of this is that training on +\bugs +based trajectories may expose models to a much broader set of functionalities in a codebase. +The number of lines and files edited by +\bugs +task instances is highly similar to the trend lines for SWE-bench Verified. +Figure 13: +Comparison of cumulative distributions for Fail-to-Pass tests along with the lines and files edited by the gold patch across +\bugs +and four SWE-bench style datasets. +We note that unlike other datasets, the trend line of +\bugs +task instances is “adjustable”. +In other words, the Figure +13 +distributions are a snapshot of the task instances provided in this release of +\bugs +. +However, because of +\bugs +’s flexible bug creation techniques, the distribution can be “shaped” if needed. +For instance, generating more task instances using the bug patch combination method would shift all three curves in Figure +13 +. 
+We make this point to highlight the fact that the attributes of SWE-bench task instances are, in a sense, constrained by real world software development behavior. +On the other hand, +\bugs +can be used to break tests and code that may not be reflected at all in any existing pull request. +In this sense, we argue that LMs trained on +\bugs +have better “exposure” to a codebase compared to exclusively training on pull requests. +Continuation of scaling execution environments. +The validation and evaluation procedures for +\bugs +deviate slightly from SWE-bench’s harnesses. +The main reasons for these differences can largely be attributed to the granularity of installation specifications. +In SWE-bench, each task instance corresponds to a unique base commit, with additional +version +and +environment_setup_commit +keys needed as indirection for mapping an instance to the correct set of installation and testing instructions. +Across time, the continuous evolution of a repository and its dependencies make for an incredibly high degree of variability in how a repository should be installed correctly. +To solve this variability, the community has resorted to creating an image per task instance, as done in +Chowdhury et al. ( +2024 +) +. +Therefore, for +2294 +2294 +2294 +2294 +SWE-bench task instances, there are +2294 +2294 +2294 +2294 +unique Docker images, each at a size of at least several gigabytes ( +∼ +5 +similar-to +absent +5 +\sim 5 +∼ 5 +- +6 +6 +6 +6 +GBs). +On the other hand, the simplicity and scalability of +\bugs +’s design allows one to support many task instances with comparatively much fewer Docker images. +As mentioned above, installation and testing procedures are (repository, commit) specific. +Therefore, when bugs are generated from each (repository, commit), all bugs can be reproduced and tested successfully from the same Docker image. 
+In other words, if I generate 100 bugs for a repository at some commit, instead of 100 Docker images, only a single Docker image is required to run inference on any of the 100 task instances. +This design is what enables +\bugs +to be significantly more space-efficient than SWE-bench. +Based on the publicly released images, for SWE-bench’s 2294 task instances, 1.2 TBs of storage are required to download all Docker images locally. +For SWE-bench Multimodal’s 517 task instances, 1.2 TBs are required. +The higher per-instance Docker image size for SWE-bench Multimodal is due to how JavaScript dependency management tools (e.g. +npm +) require more storage compared to equivalent Python infrastructure (e.g. +pypi +). +Pan et al. ( +2024 +) +state that the images for the 2438 instances average 2.6 GB each, totaling 6 TB of storage. +Such a storage requirement can be a significant barrier for academic practitioners. +On the other hand, with more than 20x the number of bugs, +\bugs +requires only 125 Docker images total, corresponding to the number of unique (repository, commit) pairs (in this work, for each repository, we only determine installation and test specifications for one commit). +The 125 images require a total of 290.54 GBs. +In summary, compared to SWE-bench’s task collection strategy, +\bugs +’s design makes it easier to not only create task instances, but also train on them as well. +C.2 +Case Study: SWE-bench & +\bugs +To better understand the differences between the SWE-bench and +\bugs +collection strategies, we perform +\bugs +collection on the +pallets/flask +GitHub repository, one of the 12 test split repositories from the original SWE-bench benchmark. 
+We review the steps covered in Section +2.1 +applied to +pallets/flask +in detail. +First, we define the installation and testing specifications for the +pallets/flask +repository at commit +bc09840 +. +Next, we apply the LM modification bug generation strategy to this version of the repository, generating 267 unique bugs. +We observe several differences. +First, +the +\bugs +collection strategy yields a much higher number of bugs outright. +From SWE-bench, 11 task instances are from the +pallets/flask +repository. +The task instances were originally filtered from 2434 pull requests (PRs), with 107 satisfying SWE-bench’s filtering criteria of (1) being linked to one or more issues and (2) featuring 1+ new tests. +Out of these 107, the 11 (0.45% of 2434) task instances represent the proportion of PRs that execution environments could be successfully constructed for. +On the other hand, running the function-level rewriting strategy for bug generation originally yielded 402 candidates, of which 267 were determined to be valid task instances. +Second, +\bugs +requires significantly less human effort while only incurring minor costs +. +Collecting the 11 +pallets/flask +task instances (steps include scraping PRs, determining repository versions across time, defining version-specific installation/test specifications, running execution-based validation multiple times) took an estimated 38 hours’ worth of human labor. +In contrast, defining installation and testing specifications for the latest commit of +pallets/flask +took 10 minutes. 
+The subsequent function-level rewriting strategy for bugs took 23 minutes to run, incurring a total cost of just \$2.47 (approximately \$0.00613 per instance). +The final execution-based validation step that filters out $402 - 267 = 135$ unqualified bug candidates ran in 14 minutes. +Since both the bug and problem statement generation strategies are repository agnostic, no additional human intervention is necessary for these steps. +Head to head, per instance for the +pallets/flask +repository, SWE-bench style collection requires $38 \times 60 / 11 = 207.27$ minutes compared to 0.176 minutes ($\sim 10.6$ seconds) and \$0.00613 in API costs using +\bugs +. +Third, +collectively, +\bugs +task instances break a significantly larger proportion of existing tests in a codebase +. +We define “bug coverage” as the proportion of tests broken by 1+ instance across all task instances. +For the SWE-bench split of +pallets/flask +, there are 207 unique tests across all 11 instances. +Of these 207 tests, 15 are broken by 1+ instance, corresponding to a bug coverage rate of 7.25%. +For the +\bugs +split of +pallets/flask +, there are 474 unique tests across 267 instances. +The larger number of tests is due to increased test coverage in the +pallets/flask +repository as of Nov. 28, 2024 (when +\bugs +was collected) compared to June 2023 (when SWE-bench was collected). +Of these 474 tests, 422 are broken by 1+ instance, a bug coverage rate of 89.03%. 
+We attribute the significant difference to a consistent tendency in real world open source software development workflows, that is, the +minority +of tests are introduced to capture existing, errant behavior in the repository. +The significant majority of tests are committed alongside working code, ensuring that already correct behavior is upheld. +Well-maintained repositories will typically not merge commits that cause such tests to fail. +This results in a large number of tests where few to no commits correspond to those tests’ failures. +Finally, +\bugs +does not yield instances appropriate for evaluation +. +The +\bugs +pipeline as presented does not produce hidden tests, a crucial difference that makes SWE-bench more suitable for evaluation. +Consequently, when expert trajectories are generated, the Fail-to-Pass tests are present in the repository at inference time. +Furthermore, our issue generation strategy does not include checks for known problems such as underspecified text descriptions or solution leakage +(Chowdhury et al., +2024 +) +. +Simple amendments could make +\bugs +task instances suitable for evaluation, such as deleting Fail-to-Pass test functions or files along with a validation procedure around the ambiguity and leakage of the issue text. +Finally, thorough analyses of how faithful +\bugs +task instances are to real world issues and PRs would be necessary to justify synthetic bugs for evaluation. +Appendix D +Issue Generation +We cover the four issue generation strategies we experiment with to determine issue text’s effect on how solvable a +\bugs +instance is along with the trajectory’s value as a training data point. +Generated with LM. +We prompt an LM with a randomly selected SWE-bench Verified problem statement, the bug patch, list of Fail-to-Pass tests, source code for one Fail-to-Pass test, and the execution logs of running all the Fail-to-Pass tests. 
+We ask the LM to generate an issue that describes the bug conveyed in the patch in the style of the SWE-bench Verified demonstration. +Figure +13 +shows the system prompt for this strategy. +Fixed issue templates. +We create a set of +7 +7 +7 +7 +pre-defined issue templates, listed in Table +13 +. +Each template uses information from the bug patch or Fail-to-Pass tests associated with every task instance. +Given a dataset of task instances, we randomly select one of the templates to use as the problem statement according to the probabilities listed in Table +13 +. +The reason we assign the highest likelihood for the prompt that provides all four categories of information (bug type, files changed, functions changed, Fail-to-Pass tests) is to ensure that a higher proportion of task instances are well-specified. +Template +Prob. +Information Provided +Basic +0.05 +0.05 +0.05 +0.05 +None +Files +0.1 +0.1 +0.1 +0.1 +States which file(s) have bug(s). +Funcs +0.15 +0.15 +0.15 +0.15 +States which file(s) and func(s) have bug(s). +Tests +0.1 +0.1 +0.1 +0.1 +States that some tests are failing. +F2P Tests +0.1 +0.1 +0.1 +0.1 +States which tests are failing. +Bug Type +0.05 +0.05 +0.05 +0.05 +States failure type. +Bug Type + Files +0.15 +0.15 +0.15 +0.15 +States failure type and which file(s) have bug(s) +Bug Type + Files +0.15 +0.15 +0.15 +0.15 +States failure type, which file(s) have bug(s), ++ Test +and a random F2P test. +Bug Type + Files +0.15 +0.15 +0.15 +0.15 +States failure type, which file(s) and func(s) ++ Funcs + Test +have bug(s), and a random F2P test. +Table 13: +List of issue text templates we use to generate problem statements. +Across all templates, four types of information are included — the files with bugs, functions with bugs, Fail-to-Pass test(s), and the type of bug. +Templates that offer less information are generally assigned a lower probability. +Fail-to-Pass test code and execution logs. 
+Another approach is showing the source code and test execution logs for a randomly selected Fail-to-Pass test. +This approach is motivated by the lack of reproduction code or expected/actual behavior of code communicated with fixed issue templates. +We show code and execution logs only for a single Fail-to-Pass test; if a task instance has more than one Fail-to-Pass test, we do not disclose remaining tests. +Original issue text. +This strategy works exclusively for some task instances generated using PR Mirroring. +If a PR is successfully mirrored, we use the text from the associated issues as the problem statement, exactly as done in SWE-bench. +Of the +2345 +2345 +2345 +2345 +task instances represented in +\bugs +mirrored from real-world PRs, +708 +708 +708 +708 +or +30.19 +30.19 +30.19 +30.19 +% of these have one or more associated GitHub issue(s) to create a SWE-bench style problem statement. +System prompt for generating issues with an LM +You are a software engineer helping to create a realistic dataset of synthetic GitHub issues. +You will be given the following input: +1. Demonstration: A realistic GitHub issue to mimic (included in the +< +< +< +demonstration +> +> +> +tag). +2. Patch: A git diff output/PR changes that introduces a bug (included in the +< +< +< +patch +> +> +> +tag). +3. Test output: The output of running the tests after the patch is applied (included in the +< +< +< +test_output +> +> +> +tag). +4. Test source code: Source code for one or more tests that failed (included in the +< +< +< +test_source_code +> +> +> +tag). +Output: A realistic GitHub issue for the patch. +Guidelines: +- Mimic the style and structure of the demonstration issues. +If the demonstration issues are not well structured, your output should also be not well structured. +If the demonstrations use improper or no markdown, your output should also use improper or no markdown. +If the demonstrations are short/long, your output should also be short/long (if possible). 
+If the demonstrations include human ”flavor text” or ”fluff”, your output should also include human ”flavor text” or ”fluff”. +Do this even if it conflicts with your default behavior of trying to be extremely concise and helpful. +- DO NOT explain the fix/what caused the bug itself, focus on how to reproduce the issue it introduces +- Do not mention pytest or what exact test failed. Instead, generate a realistic issue. +- If possible, include information about how to reproduce the issue. An ideal reproduction script should raise an error +or print an unexpected output together with the expected output. +However, still include this information in a style very similar to the demonstration issues. +\captionof +figure +System prompt provided to an LM to generate an issue based off the bug patch and testing information of a task instance along with a demonstration problem statement randomly selected from SWE-bench Verified. +Appendix E +Difficulty Rating +We train a model that labels a task with one of three difficulty labels: +< +15 +absent +15 +<15 +< 15 +minutes (easy), +15 +15 +15 +15 +minutes - +1 +1 +1 +1 +hour (medium), and +1 +1 +1 +1 ++ hour (hard). +This model allows us to quantify the difficulty of individual task instances and, in aggregate, the difficulty of entire datasets. +To train this model, we use +1699 +1699 +1699 +1699 +annotations from +Chowdhury et al. ( +2024 +) +. +In their work towards curating SWE-bench Verified, a subset of +1699 +1699 +1699 +1699 +SWE-bench task instances were labeled with four difficulty levels: +< +15 +absent +15 +<15 +< 15 +min, +15 +15 +15 +15 +min - +1 +1 +1 +1 +hr, +1 +1 +1 +1 +- +4 +4 +4 +4 +hrs, and +4 +4 +4 +4 ++ hrs. +Generally, three annotators were assigned to each instance, and the difficulty annotations were ensembled by taking the majority choice for a sample, or the median if there is no majority. 
+The distribution of annotated difficulties, from easiest to hardest, is 24.5%, 53.5%, 19.4%, and 2.8%. +Because there are very few samples in the 4+ hr category, we reclassify the 1-4 hr and 4+ hr instances into a single 1+ hr category. +Next, we create corresponding train and test datasets at an 80/20% split, randomly shuffling the instances while ensuring the train and test distributions do not deviate significantly from the original. +An instance’s problem statement and solution patch are provided as input, and one of the three difficulty labels serves as the target output. +We perform LoRA fine-tuning +(Hu et al., +2021 +) +on a Qwen 2.5 32B Instruct model using the Unsloth +(Daniel Han & team, +2023 +) +library. +The model achieves an accuracy of 75.3% on the test set. +All errant predictions are off by one; in other words, the model never predicted $<15$ min when the label was 1+ hr, and vice versa. +Using this model, we can grade the difficulty of a +\bugs +instance once the bug patch and corresponding issue text have been created. +To provide a succinct summary of difficulty for a dataset of SWE-bench style task instances, we propose a “difficulty score” metric. +Each label corresponds to a numeric difficulty score of 1, 5, and 9, from easiest to hardest. +The difficulty score is therefore the average difficulty score across all task instances. +Figure 14: +Distribution of task instance difficulty ( +easy +/ +medium +/ +hard +) for existing SWE-bench style datasets (left 5 bars) and +\bugs +(right 5 bars), assessed by our difficulty rating model. +The average difficulty score for each dataset is listed above each bar. 
+For +\bugs +, per bug strategy, we sample +1000 +1000 +1000 +1000 +task instances with LM generated issue text. +Dataset +# Instances +Score +easy +med +hard +SWE-bench +2294 +2294 +2294 +2294 +5.014 +5.014 +5.014 +5.014 +438 +438 +438 +438 +1408 +1408 +1408 +1408 +446 +446 +446 +446 +Lite +300 +300 +300 +300 +3.893 +3.893 +3.893 +3.893 +93 +93 +93 +93 +197 +197 +197 +197 +10 +10 +10 +10 +Verified +500 +500 +500 +500 +3.960 +3.960 +3.960 +3.960 +173 +173 +173 +173 +284 +284 +284 +284 +43 +43 +43 +43 +SWE-bench Multimodal +510 +510 +510 +510 +6.036 +6.036 +6.036 +6.036 +55 +55 +55 +55 +265 +265 +265 +265 +186 +186 +186 +186 +SWE-gym +2438 +2438 +2438 +2438 +5.625 +5.625 +5.625 +5.625 +288 +288 +288 +288 +1456 +1456 +1456 +1456 +664 +664 +664 +664 +Lite +230 +230 +230 +230 +3.890 +3.890 +3.890 +3.890 +67 +67 +67 +67 +156 +156 +156 +156 +4 +4 +4 +4 +\bugs +(LM Modify) +1000 +1000 +1000 +1000 +3.304 +3.304 +3.304 +3.304 +441 +441 +441 +441 +542 +542 +542 +542 +17 +17 +17 +17 +\bugs +(LM Rewrite) +1000 +1000 +1000 +1000 +5.272 +5.272 +5.272 +5.272 +68 +68 +68 +68 +796 +796 +796 +796 +136 +136 +136 +136 +\bugs +(Procedural) +1000 +1000 +1000 +1000 +3.596 +3.596 +3.596 +3.596 +374 +374 +374 +374 +603 +603 +603 +603 +23 +23 +23 +23 +\bugs +(PR Mirror) +1000 +1000 +1000 +1000 +4.876 +4.876 +4.876 +4.876 +206 +206 +206 +206 +619 +619 +619 +619 +175 +175 +175 +175 +\bugs +(Combine) +1000 +1000 +1000 +1000 +5.720 +5.720 +5.720 +5.720 +52 +52 +52 +52 +716 +716 +716 +716 +232 +232 +232 +232 +Table 14: +The score is averaged over all task instances, where +easy +/ +med +/ +hard +corresponds to +1 +1 +1 +1 +/ +5 +5 +5 +5 +/ +9 +9 +9 +9 +. +For +\bugs +, we sample +1000 +1000 +1000 +1000 +task instances per bug strategy. +Figure +14 +summarizes our findings for difficulties across different SWE-bench style datasets. +We provide a more thorough rundown of task instances per difficulty level in Table +14 +. 
+We find that different +\bugs +bug generation methods yield different levels of difficulty. +LM Modify bugs are consistently rated to be easy; from several manual spot checks, we notice that while the prompt for LM Modify provides several examples of types of bugs and does not name specific issues to create, the large majority of bugs created by this strategy are simple variable assignment mistakes (e.g. +a=a; b=b +is changed to +a=b; b=a +). +An open-ended prompt like ours does not actually yield high diversity in terms of mistakes created. +Procedural modifications are, as expected, the next easiest, as the types of bugs created by this strategy are finite. +PR Mirrors and LM Rewrites yield much harder tasks, confirmed not only by our bug rating model, but also the lower average resolve rate on these tasks by our expert model (SWE-agent + Claude 3.7 Sonnet). +Finally, aggregating smaller functions together is a simple but effective strategy for creating bugs that are rated as more complex. +This effect aligns with our original expectations; generally, bugs that require editing more functions and files tend to be rated as more difficult. +\bugs +can be used to create task instances with a range of difficulties. +Appendix F +Experiments +In this section, we provide additional details about the configurations and parameters used to generate trajectories with an expert model and run inference on a fine-tuned model. +We then provide additional ablations and analyses about the +\bugs +dataset and the agents trained on +\bugs +. +F.1 +Training Details +Rejection sampling fine-tuning. +Our fine-tuning setup heavily inherits from the work of +Pan et al. ( +2024 +) +. +We perform full-parameter fine-tuning using the +torchtune +(PyTorch, +2024 +) +library, with learning rate +5e-5 +, maximum 3 epochs, and max context length of 32768. +Training was carried out on Modal +(Modal, +2025 +) +on 2-8 NVIDIA H100 80G GPUs. 
+As discussed in Section +3 +, the procedure for rejection sampling fine-tuning (RFT) is as follows. +We first generate expert demonstrations/trajectories using SWE-agent and a “strong” model (e.g. Claude 3.7 Sonnet, GPT 4o) on +\bugs +task instances. +Of these, we then only train a student model on the trajectories corresponding to resolved instances. +SWE-agent configuration. +We use two different configurations, one for generating trajectories with an expert model, and a separate one for running inference on the fine-tuned Qwen, student models. +The configurations are generally quite similar, with minor differences around how LMs’ responses are elicited, the parsing mechanism for an LM response, constraints around message sizes, and the system prompt. +We will first review the information common to both configurations. +The prompt template informing an agent of the task’s nature and problem statement is included in Figure +F.1 +. +This prompt is very similar to the original SWE-agent prompt used in +Yang et al. ( +2024a +) +. +The prompt templates for showing environment feedback are identical as well. +If there is execution output, the text is simply preceded by +OBSERVATION: [output] +. +If there is no output (e.g +rm -r +succeeds silently), then the agent is informed “Your command ran successfully and did not produce any output”. +The agent computer interface (ACI) provided is also identical; SWE-agent provides LM with access to three general tools: +• +bash +: Execute a bash command in terminal. +• +str_replace_editor +: A tool for viewing, creating, and editing files. +• +submit +: A special keyword for the LM to indicate the task is completed or if it is unable to proceed further with the task. +Task Instance Prompt provided to SWE-agent +< +< +< +uploaded_files +> +> +> +{{working_dir}} +< +< +< +/uploaded_files +> +> +> +I’ve uploaded a python code repository in the directory {{working_dir}}. 
Consider the following PR description: +< +< +< +pr_description +> +> +> +{{problem_statement}} +< +< +< +/pr_description +> +> +> +Can you help me implement the necessary changes to the repository so that the requirements specified in the +< +< +< +pr_description +> +> +> +are met? +I’ve already taken care of all changes to any of the test files described in the +< +< +< +pr_description +> +> +> +. This means you DON’T have to modify the testing logic or any of the tests in any way! +Your task is to make the minimal changes to non-tests files in the {{working_dir}} directory to ensure the +< +< +< +pr_description +> +> +> +is satisfied. +Follow these steps to resolve the issue: +1. As a first step, it might be a good idea to find and read code relevant to the +< +< +< +pr_description +> +> +> +2. Create a script to reproduce the error and execute it with ‘python +< +< +< +filename.py +> +> +> +‘ using the bash tool, to confirm the error +3. Edit the source code of the repo to resolve the issue +4. Rerun your reproduce script and confirm that the error is fixed! +5. Think about edgecases and make sure your fix handles them as well +Your thinking should be thorough and so it’s fine if it’s very long. +\captionof +figure +A copy of the prompt provided to an LM via SWE-agent informing the LM of the nature of the task, the task description itself, and several tips on how to proceed. +We briefly review the distinctions. +First, tool invocation works differently for expert versus student models. +For the Claude and GPT series models that are used as experts, we use function calling for models to invoke the aforementioned tools. +On the other hand, the student model is asked to generate a response with XML tags to delineate the thought and action. 
+Therefore, when fine-tuning on expert trajectories, a key processing step is to convert the expert trajectories’ function calling format into the XML style response — fine-tuning +directly +on the expert trajectories does not work. +We note that we use these particular settings because, as of the publication of this paper, this tool setting reflects the absolute state-of-the-art performance achieved with an open source agent system (SWE-agent) and any existing LM (Claude 3.7 Sonnet). +It is certainly possible to explore more tool designs and experiment with different formatting calls, as many existing prior works, notably +Yang et al. ( +2024a +) +, have done. +However, given the focus of our work, we do not repeat such a “hyperparameter sweep” across configurations for the agent system, as this effort is expensive and has already been performed, with results suggesting that the configuration we are using is ideal for expert-level performance. +For generating trajectories with expert models, we run with a maximum of +75 +steps and a cost limit of $ +2.00 +. +A run terminates automatically when either of these limits is reached or the context window of the expert model is exceeded. +The overwhelming majority of automatic terminations are due to the +75 +maximum steps limit. +For running inference with student models, we run with a maximum of +75 +steps or a cost limit of $ +2.00 +(Footnote 3: We include the cost limit in addition to the step limit to provide realistic behavior with respect to handling long context. To calculate a cost value for our model, we use the gpt-4o cost function as of April 2025.) +, where the run similarly terminates when either the steps, cost or context window limit is reached. +For the student model, per LM inference call, we truncate the message history to only keep the +5 +most recent tool outputs.
+While we occasionally sample trajectories with the expert model set at various temperatures, for the student model, the temperature is fixed at +0.0 +. +F.2 +Evaluation Datasets +SWE-bench. +SWE-bench is a widely used benchmark that evaluates AI systems on their ability to resolve GitHub issues +(Jimenez et al., +2024b +) +. +Given a codebase along with a description of a bug or feature, the AI system is asked to modify the codebase in such a way that the issue presented in the description is resolved. +SWE-bench consists of +2294 +such task instances, collected from real world pull requests (PRs) and issues in +12 +GitHub repositories that are predominantly Python. +As discussed in Section +3 +, the Lite and Verified subsets are curated from the main SWE-bench repository with the goal of making evaluation either more efficient or more reliable. +Since evaluation on the entirety of SWE-bench is fairly costly and does not have as many comparable references, we do not evaluate +SWE-agent-LM-32B +on the entire SWE-bench test set. +SWE-bench Multimodal. +SWE-bench Multimodal applies the SWE-bench collection strategy to +12 +additional predominantly JavaScript and TypeScript GitHub repositories, where task instances are associated with issues that have visual asset(s) in them +(Yang et al., +2024b +) +. +The evaluation dataset consists of +510 +task instances. +While the original work evaluates vision language models (VLMs) specifically, we do not evaluate +SWE-agent-LM-32B +on this dataset because the model, being based on Qwen +2.5 +Coder Instruct, does not have the ability to process images as inputs.
+jqlang/jq +9 +redis/redis +12 +micropython/micropython +5 +valkey-io/valkey +4 +nlohmann/json +1 +fmtlib/fmt +11 +C/C++ +42 +prometheus/prometheus +8 +caddyserver/caddy +14 +gin-gonic/gin +8 +hashicorp/terraform +5 +gohugoio/hugo +7 +Go +42 +briannesbitt/carbon +10 +laravel/framework +13 +phpoffice/phpspreadsheet +10 +php-cs-fixer/php-cs-fixer +10 +PHP +43 +apache/druid +5 +reactivex/rxjava +1 +apache/lucene +9 +projectlombok/lombok +17 +google/gson +9 +javaparser/javaparser +2 +Java +43 +babel/babel +5 +mrdoob/three.js +3 +vuejs/core +5 +preactjs/preact +17 +axios/axios +6 +immutable-js/immutable-js +2 +facebook/docusaurus +5 +JS/TS +43 +rubocop/rubocop +16 +jekyll/jekyll +5 +faker-ruby/faker +2 +fastlane/fastlane +7 +fluent/fluentd +12 +jordansissel/fpm +2 +Ruby +44 +tokio-rs/axum +7 +nushell/nushell +5 +sharkdp/bat +8 +burntsushi/ripgrep +2 +uutils/coreutils +5 +tokio-rs/tokio +9 +astral-sh/ruff +7 +Rust +43 +Table 15: +Number of task instances per repository and language in the SWE-bench Multilingual evaluation set. +The entire dataset includes +300 +300 +300 +300 +task instances covering +9 +9 +9 +9 +languages. +SWE-bench Multilingual. +SWE-bench Multilingual is an evaluation dataset consisting of 300 task instances that we introduce with this work. +A single author carried out SWE-bench’s collection strategy for +42 +42 +42 +42 +additional GitHub repositories, covering the following +9 +9 +9 +9 +programming languages: JavaScript, TypeScript, C, C++, Go, Java, PHP, Ruby, and Rust. These repositories span a wide range of application domains, including web frameworks, data storage and processing tools, core utilities, and widely used libraries. A brief summary of the dataset is presented in Table +15 +. +Like SWE-bench Verified, we curate the dataset by excluding task instances deemed by a team of three authors to have ambiguous or underspecified issue text. +Each task instance edits (meaning additions and removals) on average +48 +48 +48 +48 +lines of code. 
+Similar to SWE-bench and +\bugs +, the median number of Fail-to-Pass tests is one. +We introduce SWE-bench Multilingual to: +1. +Provide a benchmark to evaluate model and agent performance across a variety of programming languages and application domains. Existing agent systems often rely on Python-specific tooling, effectively overfitting to the original SWE-bench +(Yang et al., +2024b +) +. Although SWE-bench Multimodal addresses this to some degree, its focus on visual inputs is a confounding factor for text-only evaluation of software engineering capabilities. +2. +Remain fully compatible with SWE-bench, so current users can adopt it without changing infrastructure. +3. +Keep the dataset small enough to run quickly. While concurrent work like +Zan et al. ( +2025 +) +provides more task instances in multiple languages, we purposely constrain the number of task instances so that the dataset is easy to run quickly. +In § +F.4 +, we briefly discuss how performance by existing state of the art methods for SWE-bench is markedly worse on SWE-bench Multilingual, then offer some clear directions for potential next steps to build better agentic coding models that would involve extending +\bugs +. +F.3 +Trajectory Dataset Breakdown +Purpose +Bug Gen. +Issue Gen. +# Instances +Temp. +# Traj. 
+claude-3-7-sonnet-20250219 +Ablation +LM (Modify) +LM +1000 +0 +605 +(Bug Type) +LM (Rewrite) +LM +1000 +0 +507 +Procedural +LM +1000 +0 +745 +PR Mirrors +LM +1000 +0 +557 +Ablation +PR Mirrors +Fixed +600 +0 +259 +(Issue Type) +PR Mirrors +F2P Test +600 +0 +390 +PR Mirrors +Original +600 +0 +328 +PR Mirrors +LM +600 +0 +319 +Ablation +Procedural +LM +1000 +0 +721 +(Repositories) +Procedural +LM +1000 +0 +709 +Procedural +LM +1000 +0 +723 +Procedural +LM +1000 +0 +707 +Final Dataset +LM (Rewrite) +LM +3574 +0 +1003 +Curation +PR Mirrors +LM +1049 +0 +349 +claude-3-5-sonnet-20240620 +Compare with prior work +All +LM +800 +0 +535 +gpt-4o-2024-08-06 +Compare with prior work +All +LM +200 +0 +89 +Table 16: +Breakdown of trajectories sampled from +\bugs +. +Trajectories were generated from subsets of +\bugs +that were either for the purpose of ablations or performance. +All trajectories were generated with a maximum of +75 +steps and a $ +2 +cost limit. +Bug Type +Count +Combine (File) +123 +Combine (Module) +7 +LM (Modify) +11 +LM (Rewrite) +1532 +Procedural +1495 +PR Mirror +1848 +Table 17: +Bug types represented in final training dataset. +Repository +Count +Repository +Count +getmoto/moto +378 +sqlfluff/sqlfluff +122 +pandas-dev/pandas +320 +pylint-dev/astroid +110 +conan-io/conan +243 +pydicom/pydicom +103 +pydantic/pydantic +209 +tobymao/sqlglot +101 +iterative/dvc +181 +pygments/pygments +99 +dask/dask +139 +scanny/python-pptx +98 +Table 18: +Top ten repositories by number of trajectories represented in final dataset for main result. +We provide a thorough review of the dataset of SWE-agent trajectories released with this work in Table +16 +. +The majority are generated with +claude-3-7-sonnet-20250219 +.
+To compare with prior work, a minority were generated with +claude-3-5-sonnet-20240620 +and +gpt-4o-2024-08-06 +. +As mentioned in Section +4 +, to guard against the easy data bias phenomenon, we impose a per-instance cap of +3 +, meaning for any task instance, we include at most +3 +trajectories successfully resolving that task instance in our fine-tuning dataset. +From the pool of trajectories reflected in Table +16 +, we curate a set of +5000 +trajectories that we then use to train +SWE-agent-LM-32B +. +Tables +18 +and +17 +show what repositories and bug types are represented in the final training dataset. +In total, +123 +repositories are represented, with at least +10 +trajectories from +91 +repositories. +Trajectories are on average +58 +turns long, meaning an LM typically takes +29 +actions for a given demonstration trajectory. +We visualize this distribution in Figure +15 +. +Figure 15: +Distribution of number of turns for trajectories represented in the final dataset. +F.4 +Training Analyses +We provide additional experiments and discussions around training +SWE-agent-LM-32B +. +Pass@k trend line. +To calculate the Pass@1 score discussed in our main result, we ran SWE-agent with +SWE-agent-LM-32B +six times. +In Figure +16 +, we observe increasing performance at higher values of +k +, a phenomenon that reflects observations in prior works across LMs for software engineering, code generation, web navigation, and theorem proving. +While we do not explore work around inference time scaling and training a separate verifier model to select the best solution candidate generated by multiple roll-outs, as done in +Pan et al. ( +2024 +) +and +Jain et al. ( +2025 +) +, +SWE-agent-LM-32B +is fully compatible with the generate-then-select pipelines explored by such works.
+Given its strong Pass@1 performance, +SWE-agent-LM-32B +would likely be quite competitive for Best@k results as well. +As mentioned before, all trajectories generated in the course of +\bugs +have been released publicly, which the community might find useful for training better verifiers. +Rejection sampling fine-tuning ablation. +To confirm that rejection sampling fine-tuning leads to better performance on the downstream task, we compare against a setting where we randomly sample +n +training points with no filtering criteria, at +n = [100, 200, 400, 800, 1600] +and fine-tune the same student model (Qwen +2.5 +Coder Instruct +32 +B). +We then run SWE-agent with each student model on the SWE-bench Verified dataset three times, with the “% Resolved” corresponding to the Pass@1 score. +We show results in Figure +17 +, which confirms that fine-tuning only on trajectories corresponding to successfully resolved tasks is better than randomly sampling trajectories. +SWE-bench Multilingual performance. +To assess how well +SWE-agent-LM-32B +and existing models generalize to non-Python coding domains, we evaluate the performance of our model, Qwen +2.5 +Coder Instruct 32B, and Claude +3.7 +Sonnet with SWE-agent on our new dataset, which we introduced in Section +F.2 +. +Out of +300 +task instances, we found that Claude +3.7 +Sonnet achieved a +43 +% Pass@1 resolve rate, which is significantly better than +SWE-agent-LM-32B +( +8.4 +%) and Qwen +2.5 +Coder Instruct ( +6.5 +%). +SWE-agent-LM-32B +does not demonstrate a significant improvement over the baseline model.
+Through several spot checks of different trajectories, we came to a working hypothesis that while the rejection sampling fine-tuning process had improved the model’s ability to carry out multi-turn interactions in this task setting, there were instances where code edits reflected syntax closer to Python despite code and files viewed in previous steps clearly not being written in Python. +While the result for +SWE-agent-LM-32B +on SWE-bench Multilingual is clearly subpar, we are excited by such a finding, as it motivates future work on top of +\bugs +. +To elaborate, we expect that the path to open agent coding models capable of generalizing to many repositories and languages will be paved by more data and better training techniques, both of which +\bugs +is very capable of facilitating. +First, regarding data, although we wrote +\bugs +to be Python centric, the collection methodology and bug generation techniques (especially LM based methods) should be readily transferable to other repositories. +Second, the negative result on SWE-bench Multilingual provides a clear impetus for exploring whether better training techniques could lead to models that are trained on one code domain (e.g., Python), but can generalize to many languages and repositories. +Figure 16: +SWE-agent-LM-32B +Pass@k curve on SWE-bench Verified. +We observe higher % resolved when considering +more runs. +Figure 17: +Rejection sampling fine-tuning +leads to better performance than random sampling +of trajectories for training. +F.5 +Agent Behavioral Studies +F.5.1 +Turn counts and cost +While agents are frequently quoted with a singular cost-per-instance number, this can be very misleading in the case of SWE-agent-LM-32B. +Because most of the failed instances fail due to termination by the cost or turn count limit, the average cost and turn counts depend strongly on these limits (see Fig. +18 +). +We can also chart the number of resolved instances vs. step limits.
+To avoid reevaluating the agent with multiple step limits, we use one run with step limit 75 and then assume that a successful agent run that terminates after step +n +would have failed when restricted by a limit smaller than +n +. +This chart corroborates the point made in section +3 +: SWE-agent-LM-32B has a higher resolution rate for very low step limits. +Figure 18: +The average step count depends strongly on the prescribed step limit. +Figure 19: +Number of successful instances submitted before a given step limit. +F.5.2 +Analysis of agent action space +Reduction to +base commands. +In addition to the dedicated tools provided to the agent as part of the agent computer interface (Section +F.1 +), the agent can execute arbitrary bash commands. +This makes quantitative analyses of the agent action space challenging. +For example, the agent might issue commands like +PYTHONPATH=/testbed/repo cd /testbed/repo && python3 reproduce.py +. +We have found the following procedure for determining a +base command +to be effective for meaningfully describing the action: +1. +Strip any environment variable manipulation from the beginning of the command. +2. +When multiple commands are chained with +&& +or semicolons, only consider the last command. +3. +Remove all arguments. Because some commands have subcommands (e.g., +git checkout +), we apply several basic heuristics to determine whether to keep the first or the first two words. +Repetitive actions. +We determine the longest repetitive sequence of actions by determining the longest sequence of identical base commands within the agent actions. +Note that this means that e.g., +str_replace_editor view +actions that target different files are considered to be repetitive actions as far as this analysis is concerned. +F.5.3 +Failure mode analysis +Categorizing the failure mode proceeds as shown in Figure +20 +: +1.
+Error conditions: +If the agent terminates due to an error (environment errors, inability of the LM to correctly format its messages, etc.) or because it exceeded its maximum context window, we return the +error +or +context +category. +2. +Early termination: +If the agent was terminated because of a step or cost limit, we return one of the +stuck … +subcategories. Note that the SWE-agent still attempts to extract a submission (list of changes/patch). +We determine the subcategory based on which part of the workflow agentic loop was terminated: +(a) +If no source (i.e., non-test) file was modified +4 +4 +4 +We exclude added files because solving SWE-bench instances always requires +changes +to existing files. +and no attempt at testing was made, we return +stuck at localization +. If test commands were run (i.e., +python +, +pytest +, …, or similar commands), we return +stuck at reproduction +. +(b) +If source files +were +modified, we check whether the changes include changes to all source files that are modified in the gold patch. If not, we return +incorrect localization (stuck) +, else +incorrect edit (stuck) +. +3. +Successful submission: +If the agent terminated and submitted a solution naturally, we return +incorrect localization +or +incorrect edit +, depending on whether the changes from the submitted patch included changes to all files from the SWE-bench gold patch. +Figure 20: +Categorizing failure modes +F.5.4 +Mitigating repetitive actions +As described in section +4 +, +SWE-agent-LM-32B +frequently shows highly repetitive actions for unresolved instances. +In light of this, it seems promising to investigate whether agent scaffolding interventions can be used to mitigate the problem and increase the success rates. +We make the following modification to the agent scaffold: +• +We add warning messages to the observation (command output) if a base command is repeated four ( +str_replace_editor view +) or six (any other base command) times. 
The warning message advises trying different commands, and in particular suggests locating relevant context using +find +or +grep +. +• +If the warning messages do not break the string of repetitive base commands and the repetition length reaches 6 ( +str_replace_editor view +) or 8 (any other base command), every following action is resampled up to 10 times, stopping at the first base command that is distinct from the previous ones. +To further increase the likelihood of breaking the cycle, we inject assistant messages or raise the temperature if the repetition length reaches 7 or 9. +This effectively reduces the number of repetitive actions (see Fig. +21 +). +However, the overall number of resolved instances drops slightly to 192 ( +38.4% +). +Variations of the above strategies yield similar outcomes: while repetition is suppressed, success rates do not improve substantially. +This may suggest that repetitive actions are better understood as +symptoms +of the model’s difficulty in solving an instance (such as when the instance is out-of-distribution or particularly challenging) rather than constituting intrinsic failure modes. +Figure 21: +Scaffold interventions can drastically reduce the number of repetitive actions. +Appendix G +Miscellaneous +Teaser figure description. +We briefly describe how the left hand graph of Figure +1 +, which depicts scaling of task instance collection for +\bugs +vs. SWE-bench, was created. +For +\bugs +, we simply collected the number of task instances for each repository. +For SWE-bench, we ran the SWE-bench task instance candidate collection script on all +128 +repositories, which first crawls all PRs from a given repository. +Then, each PR that edits at least one Python file and changes at least one testing-related file is converted into a candidate task instance. +Finally, based on the average task instance yield rate reported in +Jimenez et al.
( +2024b +) +, we estimate the number of viable task instances to be +20 +% of the candidates. +We then determine the number of task instances for +n +repositories at intervals of +5 +repositories ranging from +5 +to +250 +, where the repositories are sorted by number of stars. +In other words, the first five repositories we account for in the figure are the five with the fewest stars out of the +128 +repositories used. +Extended related works. +We discuss additional related works briefly, primarily about similar work towards synthesizing trajectories for training LM agents, but for the domain of web tasks. +To improve the interactive capabilities of open source LMs +(Chen et al., +2023 +) +, prior works have also explored trajectory generation techniques for web benchmarks and settings +(Xie et al., +2024 +; Yao et al., +2023a +; Zhou et al., +2024 +) +. +For web navigation, existing strategies rely on (1) performing random walks which are then labeled retroactively with instructions +(Xiang et al., +2023 +; Murty et al., +2024 +) +, (2) using online web tutorials as a source of indirect supervision for generating synthetic trajectories +(Ou et al., +2024 +) +, or (3) collecting human demonstrations +(Shen et al., +2024 +; Xu et al., +2024 +) +. +These procedures do not translate well to the software engineering setting; random sequences of command line interactions usually do not achieve meaningful effects on a codebase. +Our cursory efforts around replaying trajectories synthesized from online code edit sequences (e.g. GitHub commit histories) were unsuccessful due to the limited information available, which primarily captures file-level changes without reflecting the underlying skills, decision-making, or the broader context of a software development process.
+Our exploration of using SWE-agent to automatically determine installation and testing specifications for a repository is heavily influenced by two research directions - automatic execution environment construction using LMs +(Bogin et al., +2024 +; Eliseeva et al., +2025 +; Vergopoulos et al., +2025 +) +, and generating unit tests using LMs +(Mündler et al., +2025 +) +. +Although relatively much less than SWE-bench style collection, +\bugs +still requires minimal amounts of human labor (around +8 +8 +8 +8 +minutes total per repository). +As we expand +\bugs +to more repositories and languages, we are continuing to consider how to completely automate the environment construction process end to end. \ No newline at end of file diff --git a/research/notes/chain-of-world-world-model-thinking-in-latent-motion.md b/research/notes/chain-of-world-world-model-thinking-in-latent-motion.md new file mode 100644 index 0000000000000000000000000000000000000000..2434e10ce9d7b9f354ea6be6addeb3df1eebec9b --- /dev/null +++ b/research/notes/chain-of-world-world-model-thinking-in-latent-motion.md @@ -0,0 +1,3280 @@ +--- +title: 'Chain of World: World Model Thinking in Latent Motion' +id: chain-of-world-world-model-thinking-in-latent-motion +tags: +- deepread +created: '2026-06-10T00:31:05.101951Z' +source: https://arxiv.org/html/2603.03195 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:31:05.101809Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +Chain of World: World Model Thinking in Latent Motion +Title: +Content selection saved. 
Describe the issue below: +Description: +License: arXiv.org perpetual non-exclusive license +arXiv:2603.03195v1 [cs.CV] 03 Mar 2026 +Chain of World: World Model Thinking in Latent Motion +Fuxiang Yang +1,2 +Donglin Di +2 +Lulu Tang +3,6 +Xuancheng Zhang +2 +Lei Fan +4 +Hao Li +2 +Wei Chen +2 +Tonghua Su +1,5 +Baorui Ma +2 +1 +Harbin Institute of Technology +2 +Li Auto +3 +Beijing Academy of Artificial Intelligence (BAAI) +4 +University of New South Wales +5 +Chongqing Research Institute of HIT +6 +Peking University +hityangfx@foxmail.com, donglin.ddl@gmail.com, lulutang_@outlook.com +xczhang.thu@gmail.com, lei.fan1@unsw.edu.au, {lihao43, chenwei10}@lixiang.com +thsu@hit.edu.cn, mabaorui2014@gmail.com +Work done during an internship at Li Auto.Corresponding author.Project leader and corresponding author. +Abstract +Vision-Language-Action (VLA) models are a promising path toward embodied intelligence, yet they often overlook the predictive and temporal-causal structure underlying visual dynamics. +World-model VLAs address this by predicting future frames, but waste capacity reconstructing redundant backgrounds. +Latent-action VLAs encode frame-to-frame transitions compactly, but lack temporally continuous dynamic modeling and world knowledge. +To overcome these limitations, we introduce CoWVLA (Chain-of-World VLA), a new “Chain of World” paradigm that unifies world-model temporal reasoning with a disentangled latent motion representation. +First, a pretrained video VAE serves as a latent motion extractor, explicitly factorizing video segments into structure and motion latents. +Then, during pre-training, the VLA learns from an instruction and an initial frame to infer a continuous latent motion chain and predict the segment’s terminal frame. +Finally, during co-fine-tuning, this latent dynamic is aligned with discrete action prediction by jointly modeling sparse keyframes and action sequences in a unified autoregressive decoder. 
+This design preserves the world-model benefits of temporal reasoning and world knowledge while retaining the compactness and interpretability of latent actions, enabling efficient visuomotor learning. +Extensive experiments on robotic simulation benchmarks show that CoWVLA outperforms existing world-model and latent-action approaches and achieves moderate computational efficiency, highlighting its potential as a more effective VLA pretraining paradigm. +The project website can be found at https://fx-hit.github.io/cowvla-io. +1 +Introduction +Figure 1 +: +Comparison of VLA pretraining strategies. +(a) +World Model +: It predicts future visual frames, leading to redundant background reconstruction. +(b) +Latent Action +: It learns the frame-to-frame transition using a visual encoder +E +E +, but lacks temporally continuous reasoning. +(c) +CoWVLA +: Our method first uses a video encoder +E +E +to decompose each video segment into motion and structure latents, and then trains the VLM to infer latent motion and predict the terminal frame of the segment given the instruction and the initial frame. +Embodied intelligence aims to build agents that can perceive, understand, and act in the physical world. +Vision-Language-Action (VLA) models represent a significant step toward this goal, unifying multimodal perception and motor control into end-to-end transformers +[ +61 +, +24 +, +3 +, +34 +] +. +While effective at mapping visual observations and language instructions directly to actions for many tasks, standard VLAs lack the future prediction capabilities that humans possess, which has spurred interest in enriching them with predictive world models +[ +1 +, +5 +] +. +A prominent approach integrates world models into VLAs by predicting future visual frames to explicitly model environmental dynamics, as illustrated in Figure +1 +(a). 
+Methods such as WorldVLA +[ +7 +] +, UniVLA +[ +50 +] +, and FlowVLA +[ +58 +] +typically built on large-scale autoregressive transformers, learn to anticipate future states and thus benefit action policy learning. +While effective, this paradigm has fundamental limitations. +It requires modeling entire visual frames containing substantial static and redundant background pixels, leading to near-trivial pixel replication rather than focusing on meaningful motion and dynamic change. +Furthermore, quantizing images +[ +15 +] +into discrete tokens results in excessively long sequences and severe training inefficiency when multiple frames are used. +From a cognitive standpoint, such frame prediction is misaligned with how humans model the world: we reason about motion and interactions rather than rebuilding every pixel in memory. +This observation raises an important question: +can we build a more compact, abstract, and dynamic form of world modeling? +The latent action paradigm +[ +54 +, +12 +, +6 +, +11 +] +offers compelling inspiration. +As shown in Figure +1 +(b), it encodes frame-to-frame transitions as latent actions, which serve as abstract motion carriers for world modeling, enabling large-scale pretraining using the pseudo-action labels built from videos. +However, we identify two critical limitations in the current latent action paradigm compared to world models. +First, world models perform temporally continuous dynamic modeling, whereas existing latent actions often focus only on the change between two frames +[ +54 +, +12 +, +6 +] +. +Second, world models, through future frame prediction, learn generalizable knowledge for task execution and common sense about the world. +In contrast, latent actions only encode “how to move”, but lack an understanding of what is moving, where the motion happens, or how the scene should evolve after the motion. 
+To address these limitations, we propose Chain-of-World VLA (CoWVLA), which establishes a new paradigm that unifies the advantages of both approaches, as shown in Figure 1(c). +Our key insight is that effective world modeling requires both the compactness of motion representations and the temporal continuity and world knowledge of frame prediction. +We argue that it is possible to extract continuous and compact motion representations from video clips, suggesting the need for a model capable of decoupling the content structure and motion in videos. +Such motion representations serve as carriers for perceiving essential dynamic changes and further enable the model to reason about keyframes after temporal evolution, thereby preserving crucial visual landmarks. +Specifically, our approach employs a pretrained video VAE as a latent motion extractor, which explicitly disentangles each video segment into structure and motion representations, providing compact and interpretable supervision for downstream visuomotor learning. +We then train a unified VLA decoder through two stages. +During the pre-training stage, the model learns to infer latent dynamics and predict the terminal frame of a video segment given the instruction and initial frame, thereby establishing a dynamics-aware world prior in the latent motion space. +During the subsequent co-fine-tuning stage, this prior is further aligned with discrete action prediction by jointly modeling sparse keyframes and action sequences in a unified autoregressive manner. +This design combines the interpretability and compactness of latent motion with the temporal reasoning and world knowledge of world models, achieving efficient and robust visuomotor learning without reconstructing redundant intermediate frames.
+In summary, our contributions are as follows: +• We present CoWVLA, establishing the “Chain-of-World” paradigm that unifies world modeling and latent action learning through continuous latent-motion sequences and terminal keyframe prediction. +• We introduce a structure-motion disentangled latent prior that yields interpretable, continuous, and effective dynamic representations. +• We conduct extensive experiments demonstrating that CoWVLA achieves state-of-the-art performance across multiple benchmarks, surpassing existing world-model and latent-action approaches. +2 Related Work +Vision-Language-Action Models. +Deep learning has been widely applied in various industrial scenarios, such as visual anomaly detection [17, 16]. +Recent vision-language-action (VLA) models have rapidly advanced toward directly generating actions from visual and language inputs within a unified framework [61, 24, 34, 25, 36, 35, 22, 3, 18, 41, 46]. +RT-2 [61] pioneered this direction by treating robotic control as a sequence modeling problem, fine-tuning a pretrained vision-language model on robotic data to output discretized action tokens. +This approach was scaled up by RT-X [34], which demonstrated the benefits of joint training across diverse robot platforms and tasks. OpenVLA [24, 25] further democratized this effort with an open-source implementation. +FAST [35] introduced a unified frequency-domain formulation for discretizing actions, enhancing temporal correlation in discrete control. +Meanwhile, another line of research explores continuous trajectory generation [13, 3, 28, 21]. +They leverage diffusion or flow-matching models to generate continuous, high-frequency action sequences. +However, most existing methods primarily focus on action space modeling, with limited capability to capture how the environment evolves. +Figure 2: Overview of the CoWVLA framework.
+CoWVLA consists of two core components: a latent motion extractor and a VLA decoder. +The latent motion extractor, implemented as a video VAE, disentangles each video segment into a structure latent $z_{s}$ and two directional motion latents $z_{m}^{h}$ and $z_{m}^{w}$, which are concatenated into a unified latent motion vector $z_{m}$. +The VLA decoder performs unified autoregressive modeling over multimodal sequences. +During pre-training, the model takes the instruction and initial frame as input, and uses a learnable motion query $Q$ to predict the latent motion $\hat{z}_{m}$ while reconstructing the terminal frame of the video segment. +During co-fine-tuning, the input expands into alternating keyframe–action pairs; $Q$ continues to aggregate temporally continuous latent dynamics, guiding multi-step action generation under sparse visual observations. +World Models for Robotics. +World models are commonly employed to capture environment states and their future evolution, and have been widely applied in areas such as autonomous driving [48, 51], image and video generation [5, 31, 53, 45, 42, 14], and robotics [50, 7, 1, 38, 19, 56]. +When combined with VLA models, most approaches [52, 8, 57, 7, 50, 58] rely on predicting future visual states to provide implicit world knowledge and demonstrate improved performance in robotic manipulation. +UVA [29] further jointly optimizes video prediction and action prediction using diffusion models, enhancing both visual reasoning and control inference efficiency. +However, these methods require reconstructing full visual frame sequences, leading to high computational cost and heavy resource consumption. +Latent Actions for Robotics. +Latent-action methods learn a compact latent transition between two frames to model environment dynamics.
+LAPA [54] introduces a three-stage framework (including latent action quantization, latent pretraining, and action fine-tuning), leveraging large-scale pseudo-action supervision to improve learning of real-world robotic control. MoTo [12] follows this paradigm with enhancements in motion quantization and real action quality. +TLA [6] further disentangles task-relevant and task-irrelevant motion factors. +However, these approaches generally restrict latent action modeling to frame pairs, limiting their ability to capture long-range temporal dynamics. Although villa-X [11] extends latent actions to multi-frame settings, it still generates one latent action per local frame pair, resulting in limited temporal consistency. +Moreover, the latent action representations inevitably encode static appearance and contextual details. +While TLA [6] mitigates this issue by decoupling task relevance, an ideal latent space should explicitly separate structure from motion, producing cleaner and more interpretable action representations. +Video Compression and Decoupling. +Recent methods in video representation learning have increasingly focused on compressing visual information into disentangled latent spaces that separately encode spatial structure and temporal motion [53, 40, 26, 55, 49]. +The design of our latent motion space is inspired by these advances. +Models like CMD [55] and VidTwin [49] have successfully disentangled overall content and dynamic information in a highly compressed latent space. +This factorization provides a compact, continuous, and meaningful representation of how scenes evolve. +While these models were developed for video generation, we are the first to hypothesize and demonstrate that their pretrained latent motion space can serve as a powerful dynamic prior for a robotic world model.
+3 Method +3.1 Overall Framework +We consider a robotic manipulation task that involves executing a sequence of actions conditioned on a language instruction and visual observations. +The instruction is denoted as $T$. +The raw action sequence is $\mathbf{A}_{1:t}=\{a_{1},\ldots,a_{t}\}$. +To enable discrete sequence modeling, the action sequence $\mathbf{A}_{1:t}$ is partitioned into consecutive chunks of fixed length $l_{a}$, i.e., $\mathbf{A}_{1:t}=\bigcup_{j=1}^{N}\mathbf{A}^{j},\quad\mathbf{A}^{j}=\{a_{(j-1)l_{a}+1},\ldots,a_{jl_{a}}\}$, and each chunk $\mathbf{A}^{j}$ is then quantized into a discrete token sequence $\mathbf{A}_{q}^{j}$, using the FAST [35] algorithm. +The corresponding raw visual observation sequence is represented as $\mathbf{V}_{1:t}=\{v_{1},\ldots,v_{t}\}$, where each frame $v_{i}\in\mathbb{R}^{H\times W\times 3}$. +We extract the first frame of each action chunk as a keyframe: $\tilde{\mathbf{V}}=\{\tilde{v}_{j}\}_{j=1}^{N}=\{v_{(j-1)l_{a}+1}\}_{j=1}^{N},$ where each $\tilde{v}_{j}$ is subsequently quantized into a visual token $\tilde{v}_{q}^{j}$ using VQGAN [15]. +Additionally, a learnable motion query token $Q\in\mathbb{R}^{D_{Q}}$ is introduced as a world dynamics query, whose hidden representation summarizes past context and provides a future dynamics-aware conditioning signal for generating subsequent vision or action tokens. +The overall framework consists of two models and three training stages.
+The first model is the latent motion extractor (video VAE paradigm), which encodes a video sub-sequence $\mathbf{V}_{1:f}$ into an intermediate latent $z\in\mathbb{R}^{d_{z}\times f\times h\times w}$, and decomposes it into a structural feature $z_{s}$ and two directional motion features $z_{m}^{h}$ and $z_{m}^{w}$. +The two motion components are concatenated to form a unified latent motion vector $z_{m}\in\mathbb{R}^{D_{m}}$, providing the ground-truth supervision. +The second model is the VLA decoder (Transformer-decoder paradigm), which performs unified autoregressive next-token prediction across modalities. +During pre-training, the input sequence is organized as $[T,v_{q}^{1},Q,v_{q}^{f}]$. +The final hidden representation corresponding to the query token $Q$, obtained from the VLA decoder, is fed into an MLP to predict the latent motion $\hat{z}_{m}$. +This stage enables the model to infer latent dynamics and future observations from language and the initial visual input. +During co-fine-tuning, we use alternating keyframes and action tokens, e.g., $[T,\tilde{v}_{q}^{1},Q,\mathbf{A}_{q}^{1},\tilde{v}_{q}^{2},\mathbf{A}_{q}^{2},\ldots]$. +The model continues to predict a latent motion vector $\hat{z}_{m}$ at the $Q$ position. As a result, the model maintains explicit dynamics reasoning under sparse keyframe observations and generates stable multi-step actions from compact latent representations. +3.2 Latent Motion Extractor +To encode temporal dynamics in a compact latent space, we adopt a pretrained video variational autoencoder [49] as the latent motion extractor. +As illustrated in Figure 2, the extractor achieves structure–motion disentanglement through two dedicated branches.
+Given a video segment $\mathbf{V}_{1:f}$, the encoder produces a latent tensor $z\in\mathbb{R}^{d_{z}\times f\times h\times w}.$ +The structure branch employs a Q-Former [27] module with a set of learnable queries $\{q_{i}\}_{i=1}^{n_{q}}$ to aggregate global semantics and low-frequency dynamics along the temporal dimension, yielding $z_{s}\in\mathbb{R}^{d_{s}\times n_{q}\times h_{s}\times w_{s}},\quad n_{q}\leq f.$ +The motion branch operates along spatial dimensions: several convolutional layers reduce the dimension of $z$ and produce $z^{\prime}\in\mathbb{R}^{d_{m}\times f\times h_{m}\times w_{m}}$. +Then, spatial averaging $\mu(\cdot)$ is applied independently along the height and width axes to extract directional motion embeddings: $z_{m}^{h}=\mu_{h}(z^{\prime})\in\mathbb{R}^{d_{m}\times f\times w_{m}},\quad z_{m}^{w}=\mu_{w}(z^{\prime})\in\mathbb{R}^{d_{m}\times f\times h_{m}}.$ +These two motion components are concatenated and flattened to form a unified latent motion representation: $z_{m}\in\mathbb{R}^{D_{m}},\quad D_{m}=f\times d_{m}\times(h_{m}+w_{m}).$ +In the decoder stage, the three latent components $(z_{s},z_{m}^{h},z_{m}^{w})$ are upsampled through convolutional and MLP layers to the same spatial and temporal size, summed together, and then fed into the decoder to reconstruct $\hat{\mathbf{V}}_{1:f}$.
+The training objective follows the original VAE design [49], combining reconstruction loss $\mathcal{L}_{\text{rec}}$, perceptual loss $\mathcal{L}_{p}$, adversarial loss $\mathcal{L}_{\text{GAN}}$, and KL-divergence regularization loss $\mathcal{L}_{\text{KL}}$ to preserve temporal consistency and visual realism: $$\mathcal{L}_{\text{vae}}=\mathcal{L}_{\text{rec}}+\lambda_{p}\mathcal{L}_{p}+\lambda_{\text{GAN}}\mathcal{L}_{\text{GAN}}+\lambda_{\text{KL}}\mathcal{L}_{\text{KL}}. \tag{1}$$ +Through explicit structure–motion disentanglement and mild adaptation, the extractor yields a compact, interpretable, and transferable latent representation well-suited for robotic scenarios, providing effective supervision for downstream VLA pre-training and co-fine-tuning. +3.3 Pre-training to Think in Latent Motion +The pre-training stage aims to align language and initial visual observations with latent motion representations, enabling the model to reason about continuous temporal dynamics in the latent space and predict the terminal frame of the video segment. +Given a continuous video segment $\mathbf{V}_{1:f}=\{v_{1},\ldots,v_{f}\}$, the latent motion extractor produces a latent motion supervision signal $z_{m}$. +Its first and last frames are quantized into discrete visual tokens, denoted as $v_{q}^{1}$ and $v_{q}^{f}$, respectively. +Based on this, we organize the input sequence to the VLA decoder as: $[T,v_{q}^{1},Q,v_{q}^{f}],$ where $T$ denotes the instruction, $v_{q}^{1}$ represents the initial observation, $Q$ is a learnable motion query token, and $v_{q}^{f}$ corresponds to the visual state reached from $v_{1}$ after applying the underlying motion encoded by $z_{m}$.
+During the forward pass, the hidden state at the query position is fed to an MLP to predict the latent motion $\hat{z}_{m}$. +To prevent information leakage, causal masking is applied so that $Q$ only attends to $\{T,v_{q}^{1}\}$ while being masked from $v_{q}^{f}$. +The training objective contains latent motion supervision and terminal-frame visual consistency: $$\mathcal{L}_{\text{pretrain}}=\|\hat{z}_{m}-z_{m}\|_{2}^{2}+\sum_{x\in\{1,f\}}\mathrm{CE}(\hat{v}_{q}^{x},v_{q}^{x}), \tag{2}$$ +where the first term enforces that the latent representation extracted at $Q$ accurately summarizes the continuous motion from $v_{1}$ to $v_{f}$, while the second ensures that the model forms a coherent prediction of the resulting future state. +Through this stage, the model learns to infer latent temporal dynamics directly from language and the initial frame, thus establishing a dynamics-aware prior for subsequent action modeling. +3.4 Co-Fine-Tuning for Aligning Latent Dynamics with Action Policies +After the pre-training stage establishes a dynamics-aware prior in the latent motion space, the co-fine-tuning stage further aligns latent motion reasoning with discrete action modeling in a unified autoregressive framework, enabling stable multi-step control under sparse keyframe observations. +Given a continuous video sequence $\mathbf{V}_{1:f}$ and its corresponding action sequence $\mathbf{A}_{1:f}$, we extract $N=f/l_{a}$ keyframes and quantize them into visual tokens: $\tilde{\mathbf{V}}_{q}=\{\tilde{v}_{q}^{1},\ldots,\tilde{v}_{q}^{N}\},$ where $\tilde{v}_{q}^{j}=v_{q}^{(j-1)l_{a}+1}$.
+We further quantize the action sequence using FAST [35]: $\mathbf{A}_{1:f}\ \xrightarrow{\text{FAST}}\ \{\mathbf{A}_{q}^{1},\ldots,\mathbf{A}_{q}^{N}\}.$ +The input sequence adopts a “single-$Q$ for the full window” design: $[T,\ \tilde{v}_{q}^{1},\ Q,\ \mathbf{A}_{q}^{1},\ \tilde{v}_{q}^{2},\ \mathbf{A}_{q}^{2},\ \ldots,\ \mathbf{A}_{q}^{N}],$ where the query token $Q$ appears only once after the first keyframe and serves as a latent dynamics aggregator for the entire temporal horizon. The decoder autoregressively predicts both action and visual tokens; the hidden state at $Q$ is passed through an MLP to produce a single latent motion vector $\hat{z}_{m}$, enforcing consistency between latent dynamics and subsequent predictions. +As in pre-training, causal masking prevents $Q$ from attending to future keyframes and actions, compelling the model to reason over latent dynamics rather than directly peeking at future states. +The co-fine-tuning objective consists of three terms: $$\mathcal{L}_{\text{finetune}}=\sum_{j=1}^{N}\mathrm{CE}\!\left(\hat{\mathbf{A}}_{q}^{j},\ \mathbf{A}_{q}^{j}\right)+\lambda_{1}\left\|\hat{z}_{m}-z_{m}(\mathbf{V}_{1:f})\right\|_{2}^{2}+\lambda_{2}\sum_{j=1}^{N}\mathrm{CE}\!\left(\hat{\tilde{v}}_{q}^{j},\ \tilde{v}_{q}^{j}\right). \tag{3}$$ +Here, $z_{m}(\mathbf{V}_{1:f})$ is a continuous latent motion supervision signal produced by the pretrained extractor. +The first term ensures accurate execution of discrete actions.
+The second term encourages the latent representation at the query token to faithfully capture the continuous dynamics from $v_{1}$ to $v_{f}$. +The third term anchors motion predictions to sparse visual checkpoints, maintaining consistent state transitions driven by the predicted dynamics. +Table 1: Comparison of different methods on the LIBERO [32] and SimplerEnv-WidowX [30] benchmarks. +The best and the second-best values for each metric are bold and underlined respectively. +Model | LIBERO: SPATIAL | OBJECT | GOAL | LONG | Avg. | SimplerEnv-WidowX: Stack Block | Put Carrot | Put Spoon | Put Eggplant | Avg. +OpenVLA [24] | 0.849 | 0.884 | 0.792 | 0.537 | 0.765 | 0.000 | 0.000 | 0.000 | 0.041 | 0.010 +SpatialVLA [36] | 0.882 | 0.899 | 0.786 | 0.555 | 0.781 | 0.292 | 0.250 | 0.167 | 1.000 | 0.427 +CogACT [28] | 0.960 | 0.874 | 0.868 | 0.846 | 0.887 | 0.150 | 0.508 | 0.717 | 0.675 | 0.513 +Dita [21] | 0.842 | 0.963 | 0.854 | 0.638 | 0.824 | – | – | – | – | – +$\pi_{0}$ [3] | 0.968 | 0.988 | 0.958 | 0.852 | 0.942 | 0.167 | 0.000 | 0.291 | 0.625 | 0.401 +$\pi_{0}$-FAST [35] | 0.964 | 0.968 | 0.886 | 0.602 | 0.855 | 0.108 | 0.219 | 0.291 | 0.666 | 0.483 +GR00T N1 [2] | 0.944 | 0.976 | 0.930 | 0.906 | 0.939 | 0.167 | 0.458 | 0.625 | 0.208 | 0.495 +w/ Latent Actions +LAPA [54] | – | – | – | – | – | 0.542 | 0.458 | 0.708 | 0.583 | 0.573 +villa-X [11] | 0.975 | 0.970 | 0.915 | 0.745 | 0.901 | 0.613 | 0.463 | 0.779 | 0.646 | 0.625 +TLA [6] | 0.965 | 0.968 | 0.956 | 0.920 | 0.952 | 0.028 | 0.556 | 0.528 | 0.806 | 0.480 +w/ World Model +WorldVLA [7] | 0.856 | 0.890 | 0.826 | 0.590 | 0.791 | – | – | – | – | – +CoT-VLA [57] | 0.875 | 0.916 | 0.876 | 0.690 | 0.811 | – | – | – | – | – +UniVLA [50] | 0.960 | 0.992 | 0.932 | 0.914 | 0.950 | 0.292 | 0.625 | 0.833 | 1.000 | 0.687 +FlowVLA [58] | 0.932 | 0.950 | 0.916 | 0.726 | 0.881 | 0.625 | 0.625 | 0.708 | 1.000 | 0.740 +Ours | 0.972 | 0.978 | 0.946 | 0.928 | 0.956 | 0.625 | 0.667 | 0.792 | 0.958 | 0.760 +4 Experiments +4.1 Benchmarks +LIBERO.
+The LIBERO [32] benchmark is designed for studying knowledge transfer in multitask and lifelong robot learning, requiring both declarative knowledge about objects and spatial relations and procedural knowledge about motion and behaviors. +It contains four task suites: LIBERO-Spatial emphasizes spatial reasoning by placing a bowl based on its location, LIBERO-Object focuses on object recognition via picking and placing distinct objects, LIBERO-Goal tests procedural learning with varying task goals under fixed objects, and LIBERO-Long contains ten long-horizon tasks with diverse objects, layouts, and goals. +SimplerEnv. +SimplerEnv [30] is a collection of manipulation evaluation environments for common real-world robot setups, showing strong correlation with real-robot performance. It enables assessing the transferability and generalization of models trained on real-world video data. We evaluate on four tasks using a 7-DoF WidowX robotic arm. +4.2 Implementation Details +Our latent motion extractor is built upon a pretrained video VAE (VidTwin [49]) and is further fine-tuned on a robot-centric dataset consisting of 237k videos (details provided in the appendix). +Each video segment is uniformly sampled to 16 frames and resized to $224\times 224$. +The structure latent $z_{s}$ has a shape of $4\times 16\times 7\times 7$, while the directional motion embeddings $z_{m}^{h}$ and $z_{m}^{w}$ have shapes of $8\times 16\times 7$. +The motion latent dimension is $D_{m}=16\times 8\times(7+7)=1792$. +The backbone of our VLA model follows the design of UniVLA [50] and is based on the 8.5B-parameter VLM Emu3 [47]. +Visual observations are quantized into discrete tokens using VQGAN [15], while actions are partitioned into chunks and discretized into tokens using the FAST algorithm [35].
+During the pre-training stage, we trained the model using the aforementioned 237k videos with pretrained Emu3 initialization. +From each video, we extracted a frame sequence of length $f=16$, where the first and last frame tokens supervise visual modeling, and the latent motion extracted from VidTwin provides the latent-motion supervision. +We trained using a batch size of 256 for 10k steps. +During the co-fine-tuning stage, we initialized from the pretrained checkpoint and trained on the benchmark-specific datasets. +For the LIBERO benchmark, we used the mixed data from the four task suites curated by OpenVLA [24], including both third-person and wrist-mounted views. +We trained the model with a batch size of 128 for 8k iterations, resized all images to $200\times 200$, set the action chunk length to $l_{a}=10$, and used $\lambda_{1}=0.1$ and $\lambda_{2}=0.01$. +For SimplerEnv, we trained the model on the Bridge V2 dataset [43] with a batch size of 128 for 12k iterations. +Single-view images were resized to $256\times 256$, the action chunk length was set to $l_{a}=5$, and we used $\lambda_{1}=0.1$ and $\lambda_{2}=0$. +In the co-fine-tuning stage, we set $N=2$, where two visual observations and two corresponding ground-truth action chunks were used. +Further training details and supplementary results are provided in the appendix. +Table 2: Evaluation of VAE-Reconstructed Videos and downstream fine-tuning performance on SimplerEnv-WidowX [30]. +Model | Reconstruction Metrics: PSNR $\uparrow$ | SSIM $\uparrow$ | LPIPS $\downarrow$ | Simulation Evaluation: Stack Block | Put Carrot | Put Spoon | Put Eggplant | Average +Pretrain | 32.7 | 0.923 | 0.122 | 0.458 | 0.750 | 0.792 | 0.917 | 0.729 +Finetune | 33.4 | 0.934 | 0.123 | 0.625 | 0.667 | 0.792 | 0.958 | 0.760 +Figure 3: Visualization of the disentangled motion and structure latents.
+We select two frames ($t_{1}$ and $t_{2}$) and show the original (Orig.) and reconstructed (Recon.) frames. +“M. Recon.” and “S. Recon.” denote the reconstructions obtained by decoding only the motion latent or only the structure latent, respectively. +The structure latent preserves the global scene layout, whereas the motion latent captures motion and fine-grained temporal details. +4.3 Comparison with SOTA Methods +We compared CoWVLA against three representative categories of methods: VLA baselines (OpenVLA [24], SpatialVLA [36], CogACT [28], Dita [21], $\pi_{0}$ [3], $\pi_{0}$-FAST [35], GR00T N1 [2]), latent-action approaches (LAPA [54], villa-X [11], TLA [6]), and world-model approaches (WorldVLA [7], CoT-VLA [57], UniVLA [50], FlowVLA [58]). +These methods respectively model: (i) actions directly, (ii) frame-to-frame latent transitions, and (iii) pixel/token-level future frames. +They collectively represent the main paradigms in current VLA pretraining and provide strong and fair comparison points. +The results are shown in Table 1. +Overall, our CoWVLA achieves SOTA performance with superior cross-domain robustness. +We observe that TLA achieves a strong 0.952 on LIBERO but significantly drops to 0.480 on SimplerEnv, while FlowVLA is strong on SimplerEnv (0.740) but noticeably weaker on LIBERO (0.881). UniVLA shows a more balanced performance (0.950/0.687). +In contrast, CoWVLA achieves 0.956/0.760 on the two benchmarks, outperforming UniVLA on both and demonstrating higher absolute performance and greater cross-domain stability.
+4.4 Latent Motion Analysis +In this subsection, we analyze the effectiveness of the proposed disentangled latent space from three perspectives: the separation of structure and motion factors, the improved adaptiveness of the motion latent after fine-tuning on robot data, and the enhanced capability of modeling future dynamics. These results collectively verify that our latent motion representation provides a clearer physical prior and stronger action reasoning ability. +Effective decoupling of structure and motion latent. +As shown in Figure 3, we reconstruct frames using only the motion latent (M. Recon.) or only the structure latent (S. Recon.). +The structure latent preserves global scene layout and object appearance, whereas the motion latent captures robot arm trajectories and fine-grained temporal dynamics. +Figure 4 provides additional evidence through cross-reconstruction. +Since motion cues are subtle in individual frames, we visualize the pixel-wise differences, which highlight the motion-affected regions and show that injecting the motion latent alters only the dynamic parts while keeping the static structure intact. +These visualizations demonstrate that our latent space effectively separates content structure and dynamic information, providing a more interpretable representation for downstream visuomotor reasoning. +Fine-tuning on robot data improves motion latent quality. +As presented in Table 2, fine-tuning the latent motion extractor on robot data not only improves reconstruction quality (higher PSNR and SSIM) but also boosts downstream performance. In the SimplerEnv-WidowX evaluation, the average task success rate increases from 0.729 to 0.760. This confirms that motion latents adapted to the robot domain contain higher-quality dynamic cues that benefit policy learning. +Motion latent enhances dynamic modeling for future frame prediction.
+As illustrated in Figure 5, we visualize the future frame predictions under different pretraining strategies. +From top to bottom, the tasks in each subfigure are: i) pick up the black bowl from the table center and place it on the plate, ii) sweep into a pile. +World-model-based approaches reconstruct redundant background pixels and therefore struggle to focus on interactive motion, while single-goal-frame prediction lacks supervision of temporal evolution and often produces unstable goal frames. +This leads both strategies to easily generate results with no changes, such as Figure 5(b) Task i. +In contrast, our model leverages the motion latent as a “chain of world” during reasoning, achieving physically plausible future states that align more closely with the instructions. +Figure 4: Cross-reconstruction visualization. +We extract the structure latent from the static video in the first row and the motion latent from the robot-arm motion video in the second row. +By combining the two latents, we reconstruct the video shown in the third row. +We compute the difference between the cross-reconstructed frames and the static frames to highlight the changed regions, which correspond to the robot arm’s motion. +Figure 5: Comparative visualization of future-frame prediction strategies. +There are two tasks demonstrated: i) pick up the black bowl from the table center and place it on the plate, and ii) sweep into a pile. +(a) The world-model approach predicts five future frames. +(b) The single-goal-frame approach predicts one goal frame. +(c) Our method reasons through a learned motion latent $z_{m}$, producing more reasonable and instruction-aligned frames. +4.5 Ablation and Efficiency Analysis +In this section, we conduct an in-depth analysis of key modules, hyperparameter settings, and training efficiency.
+Experiments in Table 3 and Table 4 adhere to a unified dataset and training configuration, with a batch size of 256 for 10k steps during the pre-training phase and a batch size of 128 for 8k steps during the co-fine-tuning phase. +In Table 3, we provide a unified comparison of the effectiveness of latent action, world model, and our proposed method. +In Table 4, we analyze the effect of the loss weighting ratio between the latent motion loss ($\lambda_{1}$) and the visual token loss ($\lambda_{2}$) on task success rates during the co-fine-tuning stage. +In addition, we analyze the pre-training cost and task success rate of different methods in Figure 6. +The main conclusions are as follows. +Table 3: Ablation study on the LIBERO [32] benchmark. +Config | Variant | Spatial | Object | Goal | Long | Average +Latent Action | w/o LA | 0.622 | 0.146 | 0.694 | 0.328 | 0.448 +Latent Action | LAPA style | 0.718 | 0.852 | 0.804 | 0.488 | 0.716 +Latent Action | villa-X style | 0.840 | 0.904 | 0.834 | 0.668 | 0.812 +Latent Action | structure latent | 0.856 | 0.898 | 0.822 | 0.692 | 0.817 +Latent Action | motion latent | 0.916 | 0.932 | 0.886 | 0.774 | 0.877 +World Model | UniVLA style | 0.958 | 0.978 | 0.932 | 0.898 | 0.942 +World Model | CoT-VLA style | 0.942 | 0.964 | 0.950 | 0.838 | 0.924 +Ours | motion | 0.960 | 0.980 | 0.922 | 0.882 | 0.936 +Ours | motion & cot | 0.948 | 0.974 | 0.958 | 0.906 | 0.947 +i) Our latent motion modeling significantly outperforms existing latent action methods. +The “Latent Action” part of Table 3 compares several baselines. +The “w/o LA” variant, which skips pre-training and fine-tunes directly on LIBERO data, achieves the lowest average success rate (0.448). “LAPA style” (0.716) and “villa-X style” (0.812) both outperform the “w/o LA” variant, with “villa-X style” achieving stronger performance by modeling richer multi-frame information. +Our method separates the latent into a “structure latent” (0.817) capturing content and texture, and a “motion latent” (0.877) encoding dynamic information.
+Modeling with the cleaner motion latent notably improves the task success rate. +ii) World model methods show stronger overall performance than latent action methods. +In the “World Model” part of Table 3, both “UniVLA style” (pretrained with six frames) and “CoT-VLA style” (pretrained with initial and target frames) achieve higher success rates (0.942 and 0.924, respectively) than the methods in the “Latent Action” category. +Notably, “UniVLA style”, which uses more frames, performs better, indicating that world model methods have a distinct advantage in temporal modeling and learning knowledge of environmental evolution. +iii) Our method achieves superior performance to latent action and world models. +The “Ours” part in Table 3 presents two configurations of our method. Both use latent motion supervision during pre-training and set $\lambda_{1}=0.1,\lambda_{2}=0$ during fine-tuning (i.e., using only real action and latent motion losses). The “motion” configuration does not use the final frame $v_{f}$ during pre-training and achieves a success rate of 0.936. +In contrast, the “motion & cot” configuration adds supervision from $v_{f}$ during pre-training and improves the success rate to 0.947. +This yields two conclusions: first, introducing latent motion during the fine-tuning phase effectively guides the inference of real actions; second, introducing $v_{f}$ as an evolutionary target during pre-training significantly enhances the model’s perception and understanding of environmental evolution. +Table 4: Ablation study of loss weights on the LIBERO [32] benchmark.
+$\lambda_{1}$ | $\lambda_{2}$ | Spatial | Object | Goal | Long | Average +0.0 | 0.0 | 0.922 | 0.962 | 0.862 | 0.742 | 0.872 +0.1 | 0.0 | 0.960 | 0.980 | 0.922 | 0.882 | 0.936 +1.0 | 0.0 | 0.958 | 0.970 | 0.950 | 0.902 | 0.945 +0.1 | 0.05 | 0.954 | 0.972 | 0.944 | 0.914 | 0.946 +0.1 | 0.01 | 0.970 | 0.964 | 0.958 | 0.926 | 0.955 +1.0 | 0.01 | 0.970 | 0.956 | 0.934 | 0.922 | 0.946 +Figure 6: Comparison of pre-training efficiency and task performance on LIBERO [32] across different methods. +Blue and orange circles denote world-model and latent-action baselines, respectively, while green circles denote our configurations. +Circle size indicates training-time GPU memory usage. +Our method balances pre-training efficiency and performance, achieving a higher success rate with moderate computational efficiency. +iv) Balancing latent motion and visual token losses during co-fine-tuning further improves performance. +Table 4 presents an ablation study on the loss weights $\lambda_{1}$ (latent motion) and $\lambda_{2}$ (visual token) during the co-fine-tuning stage, based on the same pretrained model. +First, we fix $\lambda_{2}=0$ to analyze the impact of $\lambda_{1}$. When $\lambda_{1}=0$ (no latent motion loss), the success rate is only 0.872. As $\lambda_{1}$ increases from 0.1 to 1.0, the success rate improves from 0.936 to 0.945, indicating that the guiding effect of latent motion is strengthening. +Next, we introduce the visual token loss weight $\lambda_{2}$. +By comparing ($\lambda_{1}=0.1,\lambda_{2}=0.05$) at 0.946 and ($\lambda_{1}=0.1,\lambda_{2}=0.01$) at 0.955, we find that the weight for visual token prediction should not be too high. +Then we tune $\lambda_{1}=1.0$ and $\lambda_{2}=0.01$, achieving an average success rate of 0.946.
+This indicates that simultaneously introducing latent motion ($\lambda_{1}=0.1$) and a low-weighted visual token prediction ($\lambda_{2}=0.01$) during the fine-tuning phase most effectively guides the inference of real actions. +v) Our method balances pre-training efficiency and performance. +As shown in Figure 6, we compare several methods from Table 3 in terms of training speed, GPU memory usage, and task success rate (batch size = 4 per GPU). +UniVLA is the slowest and most memory-intensive, while LAPA is the fastest but less successful. +Our method has two configurations: “motion” without $v_{f}$ achieves the second-fastest speed and slightly lower performance than UniVLA, and “motion & cot” with $v_{f}$ achieves a better balance of efficiency and performance, surpassing UniVLA in both. +5 Conclusion +In this work, we presented CoWVLA, which for the first time integrates the temporal reasoning capability of world models with a disentangled latent motion representation, enabling world modeling directly in a structure–motion separated latent space. +By introducing the Chain-of-World paradigm, our method predicts a continuous latent motion chain and a terminal keyframe from the instruction and initial observation, compactly capturing temporal evolution and physical dynamics without reconstructing intermediate pixels. +Extensive experiments on LIBERO and SimplerEnv benchmarks demonstrate that CoWVLA outperforms both world-model and latent-action approaches, while offering improved dynamic consistency and visuomotor grounding, thereby providing a more efficient pretraining route toward general-purpose robotic manipulation. +Limitations. +Despite its promising results, our approach still has limitations. +The latent motion space remains dependent on the quality and domain coverage of the pretrained video VAE, which may introduce distribution mismatch in new environments.
+Moreover, the model relies on a large VLA backbone and substantial computational resources. +We believe exploring more lightweight and scalable architectures, as well as further enhancing the coupling between latent dynamics and action learning, will broaden the applicability of our method to real-world robotics. +6 +Acknowledgments +This work was supported by the National Natural Science Foundation of China (Grant No. 62277011), Project of Chongqing MEITC (Grant No. YJX-2025001001009), and CAAI-CANN Open Fund, developed on OpenI Community. +References +Assran et al. [2025] +Mido Assran, Adrien Bardes, David Fan, et al. +V-jepa 2: Self-supervised video models enable understanding, prediction and planning. +arXiv preprint arXiv:2506.09985 +, 2025. +Bjorck et al. [2025] +Johan Bjorck, Fernando Castañeda, Nikita Cherniadev, Xingye Da, Runyu Ding, et al. +Gr00t n1: An open foundation model for generalist humanoid robots. +arXiv preprint arXiv:2503.14734 +, 2025. +Black et al. [2024] +Kevin Black, Noah Brown, Danny Driess, et al. +π +0 +\pi_{0} +: A vision-language-action flow model for general robot control. +arXiv preprint arXiv:2410.24164 +, 2024. +Brohan et al. [2022] +Anthony Brohan, Noah Brown, et al. +Rt-1: Robotics transformer for real-world control at scale. +arXiv preprint arXiv:2212.06817 +, 2022. +Bruce et al. [2024] +Jake Bruce, Michael D Dennis, Ashley Edwards, Jack Parker-Holder, et al. +Genie: Generative interactive environments. +In +ICML +, 2024. +Bu et al. [2025] +Qingwen Bu, Yanting Yang, Jisong Cai, et al. +Learning to act anywhere with task-centric latent actions. +In +RSS +, 2025. +Cen et al. [2025] +Jun Cen, Chaohui Yu, Hangjie Yuan, Yuming Jiang, Siteng Huang, et al. +Worldvla: Towards autoregressive action world model. +arXiv preprint arXiv:2506.21539 +, 2025. +Cheang et al. [2024] +Chi-Lam Cheang, Guangzeng Chen, Ya Jing, Tao Kong, et al. +Gr-2: A generative video-language-action model with web-scale knowledge for robot manipulation. 
+arXiv preprint arXiv:2410.06158 +, 2024. +Chen et al. [2023] +Lili Chen, Shikhar Bahl, and Deepak Pathak. +Playfusion: Skill acquisition via diffusion from language-annotated play. +In +CoRL +, pages 2012–2029, 2023. +Chen et al. [2024] +Lawrence Yunliang Chen, Simeon Adebola, and Ken Goldberg. +Berkeley UR5 demonstration dataset, 2024. +Chen et al. [2025a] +Xiaoyu Chen, Hangxing Wei, Pushi Zhang, Chuheng Zhang, Kaixin Wang, et al. +villa-X: enhancing latent action modeling in vision-language-action models. +arXiv preprint arXiv:2507.23682 +, 2025a. +Chen et al. [2025b] +Yi Chen, Yuying Ge, Yizhuo Li, Yixiao Ge, Mingyu Ding, Ying Shan, and Xihui Liu. +Moto: Latent motion token as the bridging language for robot manipulation. +In +ICCV +, 2025b. +Chi et al. [2023] +Cheng Chi, Siyuan Feng, Yilun Du, Zhenjia Xu, et al. +Diffusion policy: Visuomotor policy learning via action diffusion. +In +RSS +, 2023. +Di et al. [2025] +Donglin Di, He Feng, Wenzhang Sun, Yongjia Ma, Hao Li, Wei Chen, Lei Fan, Tonghua Su, and Xun Yang. +Dh-facevid-1k: A large-scale high-quality dataset for face video generation. +In +ICCV +, pages 12124–12134, 2025. +Esser et al. [2021] +Patrick Esser, Robin Rombach, and Bjorn Ommer. +Taming transformers for high-resolution image synthesis. +In +CVPR +, pages 12873–12883, 2021. +Fan et al. [2025a] +Lei Fan, Dongdong Fan, Zhiguang Hu, Yiwen Ding, Donglin Di, Kai Yi, Maurice Pagnucco, and Yang Song. +Manta: A large-scale multi-view and visual-text anomaly detection dataset for tiny objects. +In +CVPR +, pages 25518–25527, 2025a. +Fan et al. [2025b] +Lei Fan, Junjie Huang, Donglin Di, Anyang Su, Tianyou Song, Maurice Pagnucco, and Yang Song. +Salvaging the overlooked: Leveraging class-aware contrastive learning for multi-class anomaly detection. +In +ICCV +, pages 21419–21428, 2025b. +Gao et al. [2025a] +Chongkai Gao, Zixuan Liu, Zhenghao Chi, Junshan Huang, Xin Fei, Yiwen Hou, Yuxuan Zhang, Yudi Lin, Zhirui Fang, and Lin Shao. 
+VLA-OS: Structuring and dissecting planning representations and paradigms in vision-language-action models. +In +NeurIPS +, 2025a. +Gao et al. [2025b] +Shenyuan Gao, Siyuan Zhou, Yilun Du, Jun Zhang, and Chuang Gan. +Adaworld: Learning adaptable world models with latent actions. +In +ICML +, 2025b. +Gu et al. [2023] +Jiayuan Gu, Fanbo Xiang, Xuanlin Li, Zhan Ling, Xiqiang Liu, Tongzhou Mu, Yihe Tang, Stone Tao, Xinyue Wei, Yunchao Yao, et al. +Maniskill2: A unified benchmark for generalizable manipulation skills. +arXiv preprint arXiv:2302.04659 +, 2023. +Hou et al. [2025] +Zhi Hou, Tianyi Zhang, Yuwen Xiong, Haonan Duan, et al. +Dita: Scaling diffusion transformer for generalist vision-language-action policy. +In +ICCV +, 2025. +Intelligence et al. [2025] +Physical Intelligence, Kevin Black, Noah Brown, et al. +π +0.5 +\pi_{0.5} +: a vision-language-action model with open-world generalization. +arXiv preprint arXiv:2504.16054 +, 2025. +Kalashnikov et al. [2018] +Dmitry Kalashnikov, Alex Irpan, Peter Pastor, Julian Ibarz, et al. +Scalable deep reinforcement learning for vision-based robotic manipulation. +In +CoRL +, pages 651–673, 2018. +Kim et al. [2024] +Moo Jin Kim, Karl Pertsch, Siddharth Karamcheti, Ted Xiao, Ashwin Balakrishna, et al. +OpenVLA: An open-source vision-language-action model. +In +CoRL +, 2024. +Kim et al. [2025] +Moo Jin Kim, Chelsea Finn, and Percy Liang. +Fine-tuning vision-language-action models: Optimizing speed and success. +In +RSS +, 2025. +Lew et al. [2025] +Jaihyun Lew, Jooyoung Choi, Chaehun Shin, Dahuin Jung, and Sungroh Yoon. +Disentangled motion modeling for video frame interpolation. +In +AAAI +, pages 4607–4615, 2025. +Li et al. [2023] +Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. +Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. +In +ICML +, pages 19730–19742. PMLR, 2023. +Li et al. [2024a] +Qixiu Li, Yaobo Liang, Zeyu Wang, Lin Luo, et al. 
+CogACT: A foundational vision-language-action model for synergizing cognition and action in robotic manipulation. +arXiv preprint arXiv:2411.19650 +, 2024a. +Li et al. [2025] +Shuang Li, Yihuai Gao, Dorsa Sadigh, and Shuran Song. +Unified video action model. +In +RSS +, 2025. +Li et al. [2024b] +Xuanlin Li, Kyle Hsu, Jiayuan Gu, Oier Mees, Karl Pertsch, et al. +Evaluating real-world robot manipulation policies in simulation. +In +CoRL +, 2024b. +Lin et al. [2024] +Bin Lin, Yunyang Ge, Xinhua Cheng, et al. +Open-sora plan: Open-source large video generation model. +arXiv preprint arXiv:2412.00131 +, 2024. +Liu et al. [2023] +Bo Liu, Yifeng Zhu, Chongkai Gao, Yihao Feng, Qiang Liu, Yuke Zhu, and Peter Stone. +LIBERO: Benchmarking knowledge transfer for lifelong robot learning. +In +NeurIPS +, 2023. +Mees et al. [2022] +Oier Mees, Lukas Hermann, Erick Rosete-Beas, and Wolfram Burgard. +Calvin: A benchmark for language-conditioned policy learning for long-horizon robot manipulation tasks. +RA-L +, 7(3):7327–7334, 2022. +O’Neill et al. [2024] +Abby O’Neill, Abdul Rehman, Abhiram Maddukuri, Abhishek Gupta, Abhishek Padalkar, et al. +Open x-embodiment: Robotic learning datasets and rt-x models: Open x-embodiment collaboration. +In +ICRA +, pages 6892–6903, 2024. +Pertsch et al. [2025] +Karl Pertsch, Kyle Stachowicz, Brian Ichter, Danny Driess, et al. +Fast: Efficient action tokenization for vision-language-action models. +arXiv preprint arXiv:2501.09747 +, 2025. +Qu et al. [2025] +Delin Qu, Haoming Song, Qizhi Chen, Yuanqi Yao, Xinyi Ye, et al. +Spatialvla: Exploring spatial representations for visual-language-action model. +In +RSS +, 2025. +Rosete-Beas et al. [2023] +Erick Rosete-Beas, Oier Mees, Gabriel Kalweit, Joschka Boedecker, and Wolfram Burgard. +Latent plans for task-agnostic offline reinforcement learning. +In +CoRL +, pages 1838–1849, 2023. +Routray et al. [2026] +Sandeep Routray, Hengkai Pan, Unnat Jain, Shikhar Bahl, and Deepak Pathak. 
+Vipra: Video prediction for robot actions. +In +ICLR +, 2026. +Shah et al. [2023] +Rutav Shah, Roberto Martín-Martín, and Yuke Zhu. +Mutex: Learning unified policies from multimodal task specifications. +arXiv preprint arXiv:2309.14320 +, 2023. +Shi et al. [2024] +Xiaoyu Shi, Zhaoyang Huang, Fu-Yun Wang, et al. +Motion-i2v: Consistent and controllable image-to-video generation with explicit motion modeling. +In +ACM SIGGRAPH +, pages 1–11, 2024. +Sun et al. [2025] +Shibo Sun, Xue Li, Donglin Di, Mingjie Wei, Lanshun Nie, Wei-Nan Zhang, Dechen Zhan, Yang Song, and Lei Fan. +Llapa: A vision-language model framework for counterfactual-aware procedural planning. +In +ACM MM +, pages 5020–5029, 2025. +Sun et al. [2024] +Zhenhong Sun, Junyan Wang, Zhiyu Tan, Daoyi Dong, Hailan Ma, Hao Li, and Dong Gong. +Eggen: Image generation with multi-entity prior learning through entity guidance. +In +ACM MM +, pages 6637–6645, 2024. +Walke et al. [2023] +Homer Rich Walke, Kevin Black, Tony Z Zhao, et al. +Bridgedata v2: A dataset for robot learning at scale. +In +CoRL +, pages 1723–1736, 2023. +Wan et al. [2025] +Team Wan, Ang Wang, Baole Ai, Bin Wen, Chaojie Mao, et al. +Wan: Open and advanced large-scale video generative models. +arXiv preprint arXiv:2503.20314 +, 2025. +Wang et al. [2024a] +Junyan Wang, Zhenhong Sun, Zhiyu Tan, Xuanbai Chen, Weihua Chen, Hao Li, Cheng Zhang, and Yang Song. +Towards effective usage of human-centric priors in diffusion models for text-based human image generation. +In +CVPR +, pages 8446–8455, 2024a. +Wang et al. [2026a] +Kun Wang, Xiao Feng, Mingcheng Qu, and Tonghua Su. +Hmvla: Hyperbolic multimodal fusion for vision-language-action models. +arXiv preprint arXiv:2602.02533 +, 2026a. +Wang et al. [2024b] +Xinlong Wang, Xiaosong Zhang, Zhengxiong Luo, Quan Sun, et al. +Emu3: Next-token prediction is all you need. +arXiv preprint arXiv:2409.18869 +, 2024b. +Wang et al. 
[2024c] +Yuqi Wang, Jiawei He, Lue Fan, Hongxin Li, Yuntao Chen, and Zhaoxiang Zhang. +Driving into the future: Multiview visual forecasting and planning with world model for autonomous driving. +In +CVPR +, pages 14749–14759, 2024c. +Wang et al. [2025] +Yuchi Wang, Junliang Guo, Xinyi Xie, Tianyu He, Xu Sun, and Jiang Bian. +Vidtwin: Video vae with decoupled structure and dynamics. +In +CVPR +, pages 22922–22932, 2025. +Wang et al. [2026b] +Yuqi Wang, Xinghang Li, Wenxuan Wang, Junbo Zhang, Yingyan Li, Yuntao Chen, Xinlong Wang, and Zhaoxiang Zhang. +Unified vision-language-action model. +In +ICLR +, 2026b. +Wei et al. [2024] +Julong Wei, Shanshuai Yuan, Pengfei Li, Qingda Hu, Zhongxue Gan, and Wenchao Ding. +Occllama: An occupancy-language-action generative world model for autonomous driving. +arXiv preprint arXiv:2409.03272 +, 2024. +Wu et al. [2024a] +Hongtao Wu et al. +Unleashing large-scale video generative pre-training for visual robot manipulation. +In +ICLR +, 2024a. +Wu et al. [2024b] +Jialong Wu, Shaofeng Yin, Ningya Feng, Xu He, Dong Li, Jianye Hao, and Mingsheng Long. +iVideoGPT: Interactive videogpts are scalable world models. +In +NeurIPS +, pages 68082–68119, 2024b. +Ye et al. [2025] +Seonghyeon Ye, Joel Jang, Byeongguk Jeon, Sejune Joo, Jianwei Yang, et al. +Latent action pretraining from videos. +In +ICLR +, 2025. +Yu et al. [2024] +Sihyun Yu, Weili Nie, De-An Huang, Boyi Li, Jinwoo Shin, and Anima Anandkumar. +Efficient video diffusion models via content-frame motion-latent decomposition. +In +ICLR +, 2024. +Zhang et al. [2025] +Wenyao Zhang, Hongsi Liu, Zekun Qi, Yunnan Wang, et al. +Dreamvla: A vision-language-action model dreamed with comprehensive world knowledge. +In +NeurIPS +, 2025. +Zhao et al. [2025] +Qingqing Zhao, Yao Lu, Moo Jin Kim, Zipeng Fu, Zhuoyang Zhang, Yecheng Wu, et al. +Cot-VLA: Visual chain-of-thought reasoning for vision-language-action models. +In +CVPR +, pages 1702–1713, 2025. +Zhong et al. 
[2025] +Zhide Zhong, Haodong Yan, Junfeng Li, Xiangchen Liu, Xin Gong, et al. +Flowvla: Visual chain of thought-based motion reasoning for vision-language-action models. +arXiv preprint arXiv:2508.18269 +, 2025. +Zhou et al. [2023] +Gaoyue Zhou, Victoria Dean, Mohan Kumar Srirama, Aravind Rajeswaran, Jyothish Pari, Kyle Hatch, Aryan Jain, Tianhe Yu, Pieter Abbeel, Lerrel Pinto, et al. +Train offline, test online: A real robot learning benchmark. +arXiv preprint arXiv:2306.00942 +, 2023. +Zhu et al. [2023] +Yifeng Zhu, Abhishek Joshi, Peter Stone, and Yuke Zhu. +Viola: Imitation learning for vision-based manipulation with object proposal priors. +In +CoRL +, pages 1199–1210, 2023. +Zitkovich et al. [2023] +Brianna Zitkovich, Tianhe Yu, Sichun Xu, Peng Xu, et al. +RT-2: Vision-language-action models transfer web knowledge to robotic control. +In +CoRL +, pages 2165–2183, 2023. +\thetitle +Supplementary Material +1 +Implementation Details +1.1 +Datasets +We collected high-quality robot manipulation data for fine-tuning the Latent Motion Extractor (LME) and training the VLA, with the datasets summarized in Table +1 +. Most of the data comes from the OXE +[ +34 +] +dataset, and we additionally include the Calvin +[ +33 +] +and Libero +[ +32 +] +simulation datasets. For LME fine-tuning, we use only episode frames. In the VLA pre-training stage, we use both episode frames and text instructions. Following UniVLA +[ +50 +] +, we adopt different sampling intervals for each dataset to ensure that the temporal gap between keyframes is approximately one second. We then uniformly sample 16 frames from the continuous frames covered by six keyframes for pre-training. Throughout this stage, only third-person view data is used, excluding wrist-camera views. +During the VLA co-fine-tuning stage, we train on the benchmark-specific training sets using text instructions, frames, and actions. 
For example, the BridgeV2 dataset [43] is used for the SimplerEnv-Bridge evaluation [30], while the Libero [32] evaluation uses the mixed data of four Libero task suites processed by OpenVLA [24]. In addition, the appendix includes extended experiments using the Fractal dataset [4] for the SimplerEnv-Google Robot [30] evaluation and the Calvin dataset [33] for the Calvin evaluation, covering both ABCD$\rightarrow$D and ABC$\rightarrow$D task settings. Across the co-fine-tuning experiments, Bridge and Google Robot training use only third-person views, while Libero and Calvin use both third-person and wrist views. +1.2 Training Details +For LME fine-tuning, we start from the VidTwin [49] pretrained model and fine-tune it on the video data from the datasets listed in Table 1. +We use 4 A800 GPUs with a per-GPU batch size of 4, randomly sampling 16 frames per video. +Each frame is resized to 224$\times$224. +The KL loss weight is set to 1e-6, and the reconstruction loss is reduced using the mean over all elements rather than the default reduction over the batch dimension only. +We randomly sample 1000 videos from the training set as a validation set and select the checkpoint with the lowest reconstruction loss. +The final model corresponds to the checkpoint trained for one epoch plus 20k iterations. +For VLA pre-training, we initialize from the 8.5B Emu3 [47] pretrained checkpoint and train on the datasets in Table 1. +The training is performed on 32 A800 GPUs with a per-GPU batch size of 8. +Image observations are resized to 256$\times$256. +We use the first and last frames of each video clip together with one learnable motion query, and the maximum sequence length is set to 2500 tokens. +We train for 10k iterations in total, which takes roughly 24 hours. +For VLA co-fine-tuning, we follow the evaluation protocols from UniVLA [50] for each benchmark.
+We load the checkpoint from the VLA pre-training stage and train with 16 A800 GPUs, using a batch size of 8 per GPU and full-parameter fine-tuning. +The maximum sequence length is set to 3200 tokens. +For SimplerEnv-WidowX [30], we use BridgeV2 [43] data with images resized to 256$\times$256 and train for 12k iterations. +For SimplerEnv-Google Robot [30], Fractal [4] images are resized to 240$\times$192, and training continues for 16k iterations. +For Libero [32], images are resized to 200$\times$200, and training runs for 8k iterations. +For Calvin [33], third-person views are resized to 200$\times$200 and wrist views to 80$\times$80, with training conducted for 12k iterations. +The per-iteration training time across these configurations is similar; for example, Libero training takes about 25 hours for 8k iterations. +Overall, each configuration requires roughly one to two days of training. +Table 1: Training datasets. +Dataset Name | Count +Berkeley Autolab Ur5 [10] | 892 +Bridgev2 [43] | 24879 +Cmu Play Fusion [9] | 576 +Fractal [4] | 65530 +Kuka [23] | 84202 +Maniskill [20] | 30029 +Taco Play [37] | 3242 +Toto [59] | 899 +Utaustin Mutex [39] | 1500 +Viola [60] | 135 +Calvin [33] | 22966 +Libero [32] | 1693 +Total | 236543 +Table 2: Long-horizon robotic manipulation evaluation on the CALVIN [33] benchmark. +Methods marked with ${\dagger}$ are from our re-implementation. +Method +Task +Tasks Completed in a Row +Avg.
Len +↑ +\uparrow +1 +2 +3 +4 +5 +UniVLA +† +[ +50 +] +ABCD +→ +\rightarrow +D +0.988 +0.934 +0.883 +0.829 +0.764 +4.398 +Ours +0.972 +0.939 +0.894 +0.859 +0.809 +4.473 +TLA +[ +6 +] +ABC +→ +\rightarrow +D +0.955 +0.858 +0.754 +0.669 +0.565 +3.800 +Dita +[ +21 +] +0.945 +0.825 +0.728 +0.613 +0.500 +3.610 +UniVLA +† +[ +50 +] +0.972 +0.902 +0.826 +0.741 +0.661 +4.102 +Ours +0.968 +0.912 +0.844 +0.779 +0.708 +4.211 +Table 3 +: +Evaluation on SimplerEnv-Google Robot +[ +30 +] +across various manipulation tasks. +Model +Pick +Move +Drawer +Place +Average +OpenVLA +[ +24 +] +0.180 +0.563 +0.630 +0.000 +0.343 +SpatialVLA +[ +36 +] +0.860 +0.779 +0.574 +0.090 +0.576 +MoTo +[ +12 +] +0.740 +0.604 +0.431 +0.000 +0.444 +villa-X +[ +11 +] +0.987 +0.750 +0.593 +0.056 +0.597 +UniVLA +[ +50 +] +0.870 +0.565 +0.194 +0.167 +0.449 +\rowcolor +gray!20 +Ours +0.923 +0.676 +0.428 +0.407 +0.609 +Figure 1 +: +Sensitivity analysis of +N +N +and +l +a +l_{a} +on LIBERO. +Table 4 +: +Comparison between our latent motion representation and Wan 2.1 VAE latent +𝐳 +\mathbf{z} +on LIBERO. +Variant +Pre-training +Co-fine-tuning +Spatial +Object +Goal +Long +Average +Ours +latent motion + terminal frame ++ latent motion +0.948 +0.974 +0.958 +0.906 +0.947 +Wan2.1 VAE +[ +44 +] +latent +𝐳 +\mathbf{z} ++ terminal frame ++ latent +𝐳 +\mathbf{z} +0.938 +0.950 +0.922 +0.868 +0.920 +Figure 2 +: +Cross-Recon visualization on LIBERO +[ +32 +] +. +The first six columns show temporally sampled frames from three rows: Structure (top), Motion (middle), and Cross-Recon (bottom). +The Cross-Recon videos are generated by combining the static appearance from the Structure video with the motion representation extracted from the Motion video, revealing the transferred motion patterns. +Each Cross-Recon frame is overlaid with a motion heatmap to highlight dynamic regions. 
+The last column presents three summary maps: motion heatmaps obtained by averaging and maximizing per-frame absolute differences between Cross-Recon and Structure, and the end-effector trajectory estimated from the motion regions. +Figure 3: Cross-Recon visualization on SimplerEnv [30] and Bridgev2 [43]. +Figure 4: Visualization of latent-motion clusters and corresponding video examples. +(a) Unsupervised clustering results of clip-level motion trajectories. Each subplot shows the average 2D motion trajectory (obtained from the first two PCA components of the accumulated frame-wise motion deltas) for one cluster. +(b) Representative video examples from clusters. +Clusters 1 and 2 correspond to monotonic downward-like or upward-like motions, whereas Clusters 3 and 4 exhibit rightward-like or leftward-like behaviors. +Figure 5: Comparative visualization of future-frame prediction strategies. +1.3 Interpretation of the World Model and the Latent Motion Chain +Our method combines a world model formulation with latent action modeling. +The world model component consists of two stages: pre-training and co-fine-tuning. +During pre-training, the world model is not action-conditioned. +This follows the representation adopted by UniVLA [50] and FlowVLA [58], where the world model predicts future environment evolution given a language instruction and an initial state, rather than explicit actions. +During the co-fine-tuning stage, we introduce an action-conditioned formulation: $p(v^{t+1}\mid v^{t},A^{t})$. +Our latent motion does not explicitly perform multi-step rollouts. Instead, it provides a continuous and decoupled motion encoding over a temporal window, which can be interpreted as an implicit motion chain.
+2 Additional Results +2.1 Analysis of keyframes and action chunk size +We evaluate the number of sparse keyframes $N\in\{1,2,3,4,5\}$ and action chunk sizes $l_{a}\in\{5,10,20,25\}$ on LIBERO to understand the temporal granularity required by latent motion reasoning. +As shown in Figure 1, both hyperparameters exhibit a clear inverted-U trend. The best performance is achieved at $(N=2,l_{a}=10)$, corresponding to a $\sim$20-frame ($\approx$2 s) temporal horizon. +When using only one keyframe ($N=1$), performance drops significantly across all suites, especially on long-horizon tasks, indicating that the latent motion becomes under-constrained. Increasing $N$ to 2 provides sufficient visual anchoring and yields the largest improvement. However, further increasing $N$ gradually degrades performance. With dense observations, the model can rely on short-term visual matching instead of inferring motion dynamics, weakening the benefit of latent temporal reasoning. +A similar phenomenon appears for action chunk size. Small chunks ($l_{a}=5$) reduce temporal abstraction and make the policy closer to step-wise imitation. Large chunks ($l_{a}\geq 20$) introduce high uncertainty in future evolution, particularly harming the long-horizon tasks. The intermediate chunk size ($l_{a}=10$) achieves the best trade-off between predictability and abstraction. +Overall, the results suggest that the proposed model performs best when sparse observations provide partial constraints while still requiring the model to infer continuous evolution. This supports our design motivation: the latent motion token serves as a dynamics aggregator over a medium temporal window rather than dense frame tracking or one-step prediction.
+2.2 Comparison with Other Video VAEs +To further analyze the role of latent motion representations, we replace VidTwin with the VAE from Wan 2.1 [44] and conduct a controlled comparison. Specifically, we use the latent $\mathbf{z}$ extracted by the Wan 2.1 VAE as auxiliary supervision during both pre-training and co-fine-tuning. +The Wan 2.1 VAE is trained on large-scale video data and therefore incorporates rich generic video priors. As shown in Table 4, this variant achieves an average success rate of 0.920 on LIBERO. While competitive, it remains inferior to our latent motion design (0.947). +2.3 CALVIN +Calvin [33] is an open-source simulated benchmark built on PyBullet, designed for learning long-horizon, language-conditioned robotic manipulation tasks. +It provides a tabletop simulation environment containing 23 types of manipulation skills, such as lifting, pushing, rotating, and object relocation. +These skills must be executed in sequence to complete multi-step tasks, introducing substantial uncertainty and randomness, which makes Calvin a highly challenging evaluation benchmark. +The dataset includes a large number of expert demonstrations and is organized into multiple subsets. +In our experiments, we use the ABCD$\rightarrow$D and ABC$\rightarrow$D subsets, and during training, we only utilize demonstrations that include natural language descriptions of the actions. +Following the official evaluation protocol, all tests consist of 1000 episodes, each containing a sequence of five sub-tasks specified by natural language instructions. +The main results are presented in Table 2. Our method achieves an average success length of 4.473 on the ABCD$\rightarrow$D task and 4.211 on the ABC$\rightarrow$D task. +For a fair comparison, we reproduced UniVLA [50] using the training sets listed in Table 1, and followed a fine-tuning setup with 16 A800 GPUs and a per-GPU batch size of 8.
+Under the same training configuration, our approach outperforms UniVLA [50]. +2.4 SimplerEnv-Google Robot +We also evaluate our method on the SimplerEnv-Google Robot benchmark. +The evaluation primarily follows the visual matching protocol, which assesses the alignment between real and simulated visual appearances by overlaying real-world images onto simulated backgrounds and adjusting the textures of foreground objects and the robot within the simulator. +This benchmark includes four tasks: pick coke can, move near, open/close drawer, and place in closed drawer. +The main results are shown in Table 3. +Our method achieves an average success rate of 0.609, outperforming UniVLA [50], villa-X [11], MoTo [12], and other baselines. +Here, UniVLA refers to our reproduction. +Our method surpasses UniVLA on all four tasks and shows a particularly large improvement on the place in closed drawer task. +Figure 6: An Intel RealSense camera and a Realman RM75B robot. +Figure 7: Comparison between data collection and real-world deployment during testing. +2.5 More Visualization +We provide extended visualizations for the latent motion analysis presented in Section 4.4, with the main results shown in Figures 2, 3, 4, and 5. +Effective decoupling of structure and motion latents. +Figures 2 and 3 analyze representative samples from the Libero and Bridge datasets. The first six columns display temporally sampled frames from three rows: Structure (top), Motion (middle), and Cross-Reconstruction (bottom). The Cross-Recon videos are synthesized by combining the static appearance from the Structure video with the motion representation extracted from the Motion video, thereby revealing transferred motion patterns. +Each Cross-Recon frame is overlaid with a motion heatmap to highlight dynamic regions.
+The final column summarizes three diagnostic maps: motion heatmaps computed by averaging and maximizing the per-frame absolute differences between Cross-Recon and Structure, as well as the end-effector trajectory estimated from the activated motion regions. As shown, the highlighted areas consistently follow the movement of the robot arm in the Motion video. +In the video results, these regions fluctuate over time; for clarity in static visualization, we display aggregated highlights in the figures. +We further analyze the distribution of motion latents, as shown in Figure +4 +. To derive an interpretable trajectory representation from high-dimensional motion latents, we first extract per-frame motion features from each video clip and accumulate framewise differences to obtain a temporal sequence describing the overall motion trend of the clip. These sequences are then resampled to a fixed length across all clips and standardized globally. We subsequently apply PCA to the sequence features and take the first two principal components as a 2D trajectory for each clip. This representation preserves the dynamic structure encoded in the latent space while enabling clear comparison across clips. +Figure +4 +(a) shows unsupervised clustering of all motion trajectories in the 2D PCA space. To obtain cluster-level canonical shapes, we temporally align trajectories within each cluster via resampling and plot their mean curves along with 95% confidence intervals. Distinct trajectory patterns emerge across clusters—such as monotonic rises, two-stage reversals, and multi-phase back-and-forth motions—indicating that the model’s motion latent captures high-level motion semantics. +To further validate the semantic consistency within each cluster, we randomly sample two video clips per cluster and visualize three uniformly sampled frames from each clip, as shown in Figure +4 +(b). 
The clips within the same cluster exhibit highly similar motion trends in appearance, confirming that the structure of the motion-latent space yields meaningful discrimination among different action patterns. +Motion latent enhances dynamic modeling for future frame prediction. +As shown in Figure +5 +, we further visualize future frame predictions under different pretraining strategies. From top to bottom, the examples correspond to four tasks: +i) pick up the chocolate pudding and place it in the basket, +ii) pour, +iii) open the fridge, and +iv) put the banana inside the drawer. +In Figure +5 +(a), world-model-based approaches suffer from reconstructing redundant background pixels, which can draw attention away from critical interactions and motion cues. As a result, the predicted future frames sometimes remain nearly unchanged, such as in tasks (ii) and (iii). +Figure +5 +(b) shows that predicting only the target frame often leads to unstable generation due to the absence of intermediate evolution steps: in task (i), the target frame nearly collapses back to the initial frame, and in task (iii), only one door of the fridge is generated. +In contrast, our method leverages the motion latent +$z_{m}$ +as a chain-of-thought for motion, providing stronger guidance for future-frame prediction. The generated final frames align more accurately with the intended task instructions. +3 +Real-Robot Experiments +Experimental Setup. +As shown in Figure +6 +, we use the Realman RM75B robot, which is equipped with 7 degrees of freedom and a single gripper. +An Intel RealSense camera is used to capture RGB images. +We set up a cup-grasping experiment and collected a total of 127 episodes, consisting of 65,382 frames with corresponding actions. +Each episode contains an average of 515 frames, corresponding to approximately 20 seconds in the real world.
+The dataset mainly includes grasping cups of four different colors, with the number of episodes per color as follows: red 31, blue 39, yellow 24, and purple 33. +Figure +7 +(a) shows some collected data. +During training, all images are cropped and resized to 256×256. The action chunk size is set to 10. We train the model for 2k steps using 16 GPUs with a per-GPU batch size of 8. +The data were collected in the afternoon and evening and then used for model training. +Testing was conducted the following day. +As shown in Figure +7 +, the lighting conditions differ somewhat between data collection and real-world deployment. +We found that the model was still able to correctly execute instructions under different lighting conditions. +Figure +7 +(b) shows two test cases in its first two rows: grasping a red/purple cup and placing it on a plate. Their background lighting differs from the training data, but the model is still able to execute the tasks successfully. +BETA \ No newline at end of file diff --git a/research/notes/composer-2-technical-report.md b/research/notes/composer-2-technical-report.md new file mode 100644 index 0000000000000000000000000000000000000000..5453ad04a61dbb89a036afe96567830e34e26ff3 --- /dev/null +++ b/research/notes/composer-2-technical-report.md @@ -0,0 +1,2518 @@ +--- +title: Composer 2 Technical Report +id: composer-2-technical-report +tags: +- deepread +created: '2026-06-10T00:23:34.475868Z' +source: https://arxiv.org/html/2603.24477 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:23:34.475642Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +Composer 2 Technical Report +Title: +Content selection saved.
Describe the issue below: +Description: +License: CC BY 4.0 +arXiv:2603.24477v2 [cs.SE] 26 Mar 2026 +Composer 2 Technical Report +Cursor Research Team +1 +Introduction +Composer 2 is a specialized model designed for agentic software engineering. The model demonstrates strong long-term planning and coding intelligence while maintaining the ability to efficiently solve problems for interactive use. +The model scores strongly on CursorBench, our benchmark of real-world software engineering (Figure +1 +), while also scoring at frontier levels on public software engineering benchmarks such as SWE-bench Multilingual +[Jimenez +et al. +, +2024 +] +and Terminal-Bench +[Merrill +et al. +, +2026 +] +. +The model is trained in two phases: first, continued pretraining to improve the model’s knowledge and latent coding ability, followed by large-scale reinforcement learning to improve end-to-end coding performance through stronger reasoning, accurate multi-step execution, and coherence on long-horizon realistic coding problems. +A core tenet of Composer training is to emulate real-world user challenges as closely as possible to minimize train-test mismatch. We develop infrastructure to support training in the same Cursor harness that is used by the deployed model, with equivalent tools and structure, and use environments that match real problems closely. To measure the ability of the model on increasingly difficult tasks, we introduce a benchmark derived from real software engineering problems in large codebases including our own. +Composer 2 is a frontier-level coding model and demonstrates a process for training strong domain-specialized models. On our CursorBench evaluations the model achieves a major improvement in accuracy compared to previous Composer models (61.3). On public benchmarks the model scores 61.7 on Terminal-Bench and 73.7 on SWE-bench Multilingual in our harness, comparable to state-of-the-art systems. 
+Figure 1: +Composer 2 improves greatly from previous Composer models, achieving performance competitive with state-of-the-art models. +By specializing entirely on coding ability, Composer attains such performance while being lower cost to serve than state-of-the-art model API pricing. See Section +5 +for detailed evaluations. +2 +Background and Related Work +Generating code has been a standout application of large language models +Feng +et al. +[ +2020 +]; Clement +et al. +[ +2020 +]; Chen +et al. +[ +2021 +]; Li +et al. +[ +2022 +] +. Code provides a rich source of challenging training data that has supplemented language data in most large models +Fried +et al. +[ +2023 +]; Li +et al. +[ +2023 +]; Lozhkov +et al. +[ +2024 +]; Rozière +et al. +[ +2023 +]; Guo +et al. +[ +2024 +]; DeepSeek-AI [ +2024a +]; Allal +et al. +[ +2023 +]; Nijkamp +et al. +[ +2023 +]; Hui +et al. +[ +2024 +]; Wang +et al. +[ +2021 +, +2023 +]; Team +et al. +[ +2024 +]; Mishra +et al. +[ +2024 +] +. Early applications of code generation typically focused on autocomplete applications. Subsequently, instruction tuning turned models into coding assistants +Luo +et al. +[ +2024 +]; Wei +et al. +[ +2024 +]; Zhuo +et al. +[ +2025 +]; Muennighoff +et al. +[ +2024 +] +capable of responding to user requests. In the last year, software engineering +agents +have achieved widespread adoption, pushing models beyond chat to autonomously navigate repositories and solve complex engineering tasks +Yang +et al. +[ +2024 +, +2025 +]; Wang +et al. +[ +2025 +]; Qian +et al. +[ +2024 +]; Hong +et al. +[ +2023 +] +. +Software engineering agents aim to autonomously act to solve a given task prompt. 
Given an environment, i.e., a codebase and an isolated container for code execution, along with a prompt +$x$ +giving the agent its task, an agent produces a rollout consisting of a series of actions +$a_{1},\ldots,a_{T}$ +, each of which makes one or more tool calls and yields responses +$y_{1},\ldots,y_{T}$ +. Tool calls may modify the underlying environment, and the result of a rollout is the final state of this environment. Each action +$a_{i}$ +is selected by sampling from a language model policy +$\pi_{\theta}(a_{i}\mid x,a_{1},y_{1},\ldots,a_{i-1},y_{i-1})$ +, after which a reward is given based on the code’s correctness, succinctness, and conformance to software engineering principles. In contrast to more constrained settings like competitive programming, a strong software engineering agent must perform non-trivial exploration, write its own tests, and construct the minimal changes necessary to solve the task prompt. +Composer 2 has access to a small set of general tools that allow it to read and edit files, run shell commands, search the codebase using grep or semantic search, and search the web. Its prompt includes a system message, the tool call format specification, recent file information, past user messages, and the current task. The most common end result of this process is a set of changes to files in the codebase environment, although there are many other common use cases, such as answering questions, writing plans, resolving version control issues, or monitoring long-running jobs. +Our main research thrust for Composer 2 investigates how scaling model training can reliably improve performance on real-world coding. +We target this through two distinct training phases: continued pretraining (Section +3 +), and asynchronous reinforcement learning (Section +4 +).
To measure progress, we construct a suite of challenging benchmarks (Section +5 +). +3 +Continued Pretraining +The continued pretraining stage aims to improve the language model’s base knowledge, specifically in the domain of coding. Such continued pretraining has long been demonstrated to drastically improve downstream performance +Gururangan +et al. +[ +2020 +]; Howard and Ruder [ +2018 +] +. Taking this a step further, recent models use a staged training approach, progressively filtering towards higher quality data +Hoffmann +et al. +[ +2022 +]; Touvron +et al. +[ +2023 +]; Ye and others [ +2024 +] +. While we start with base models naturally trained with large amounts of code data, we find that additional supervised learning reliably improves knowledge benchmarks and leads to improved coding performance of the final coding agent. +We used internal evaluations and inference performance considerations to select a base model. Our evaluations measure internal codebase perplexity, coding knowledge, and state tracking. For more details, see Appendix +B +. These evaluations led us to select Kimi K2.5 +Team [ +2026 +] +, a 1.04T parameter / 32B active parameter Mixture-of-Experts model as our base model for Composer 2. +3.1 +Training +We extend Kimi K2.5 with a continued pretraining stage on a large code-dominated data mix. The purpose of this stage is to provide a base model for the subsequent agentic RL training by specializing the model on coding knowledge and capabilities. We divide this stage into three phases. We spend the bulk of compute at 32k token sequence length, followed by a shorter long-context extension phase to 256k sequence length, and finally a short SFT phase on targeted coding tasks. Training was performed in MXFP8 on NVIDIA B300s using the AdamW optimizer. See Section +6.1 +for more training details. During training, we measure the evaluation loss on our internal codebase. 
We see that the loss decreases log-linearly over the course of the training run. +Continued pretraining ultimately serves to improve downstream RL performance, and the connection between the two stages is an area of active research. We study the relationship between codebase perplexity and RL performance by applying our continued pretraining recipe to Qwen3-Coder-30B-A3B +Team [ +2025e +] +. Continued pretraining is performed at three logarithmically spaced compute levels: small, medium, and large. Each of these checkpoints then undergoes SFT on a small dataset, followed by an identical RL run. Figure +2 +(left) shows the relationship between the final loss after SFT and the RL reward after a fixed number of steps, demonstrating that cross-entropy loss is indeed predictive of downstream RL performance. +Figure 2: +Continued pretraining translates to downstream RL performance. +Left: We study this relationship on a smaller Qwen model, examining checkpoints trained on a varying number of tokens. Right: The model undergoes a steady decrease in training perplexity. +Multi-Token Prediction +To serve the model faster in production, we train additional Multi-Token Prediction (MTP) layers +Gloeckle +et al. +[ +2024 +]; DeepSeek-AI [ +2024b +] +to use with speculative decoding. We initialize the MTP layers from scratch and train them on the same data mix. To speed up convergence, we train the MTP layers with self-distillation, teaching the model to predict the exact logit distribution of the main LM head at each position. To ensure that this process generalizes, the MTP layers are trained atop a checkpoint cut from the middle of the continued pretraining run. During the final two phases (long-context and SFT), the MTP layers are included and trained jointly with the rest of the model. +4 +Reinforcement Learning +Figure 3: +RL training tasks. +Composer 2 is trained by reinforcement learning on a large set of coding tasks. 
+These tasks are run in environments that emulate real Cursor sessions as closely as possible (see Section +6.2 +for infrastructure details). +At a high level, RL training consists of sampling a problem, simulating a group of rollouts from the agent with different solutions, and then updating the model weights based on solution quality. +We create a problem distribution that reflects the most common use cases. Figure +3 +shows the breakdown in terms of task category. Notably, our training distribution captures many aspects of software engineering absent from popular AI coding benchmarks. In later stages of training, we use simple heuristics—such as number of turns and thinking tokens of rollouts—to upsample increasingly harder data points. +4.1 +Asynchronous RL Training +Our reinforcement learning pipeline is built around learning from large-scale policy gradients while maintaining stability. +We use a policy gradient algorithm with multiple samples per prompt +Shao +et al. +[ +2024 +]; Ahmadian +et al. +[ +2024 +] +and a fixed group size. +We operate in the single-epoch regime, i.e., the same prompt is never trained on twice. +We utilize Adam as our underlying optimizer and update the full parameter set. RL training operates in a highly asynchronous regime with independent training and rollout generation workers (see Section +6.2 +for details). +A number of policy gradient variants have been proposed in prior literature +Yu +et al. +[ +2025 +]; Zheng +et al. +[ +2025 +]; MiniMax [ +2025 +]; Liu +et al. +[ +2025a +] +. As in Dr. GRPO +Liu +et al. +[ +2025a +] +, we found that it is crucial to minimize the bias in the gradients that can arise from transforming the underlying advantage. Following this work, we remove the length standardization term from GRPO as it introduces a length bias. 
We do not normalize group advantages by their standard deviation, as it results in the degenerate case where small behavioral differences get massively upweighted within a group where every rollout achieves equal correctness. +Yu +et al. +[ +2025 +] +proposed to mask out rollouts that exceed the maximum sequence length. Some subsequent works employed this masking +Liu +et al. +[ +2025b +]; Golubev +et al. +[ +2025 +] +, while other works found it to yield mixed results. For instance, +Liu +et al. +[ +2025a +] +found that masking overlong rollouts shows limited effectiveness on long-tail reasoning tasks but increases the accuracy and clarity of responses in medium and short-length reasoning tasks, and +Du +et al. +[ +2025 +] +found that overlong masking caused output length to grow too quickly. We did not see benefits with overlong masking at small scale and opted not to mask rollouts that exceed the maximum sequence length. Our self-summary system (discussed below) also limits the occurrence of these cases in practice. +Since agent rollouts can be very long, especially when aiming for long-horizon coherency, it is important that our system maintains stability in the highly asynchronous regime. Our main strategy is to minimize how off-policy the samples become. On the infrastructure side, this divergence is reduced via fast weight synchronization and in-flight weight updates, similar to PipelineRL +Piché +et al. +[ +2025 +] +. Inference workers are capable of updating weights mid-rollout, which means later tokens in a rollout are likely less off-policy. To reduce further divergence between the sampling and training policy, we replay MoE routing +Ma +et al. +[ +2025 +] +. We discuss the implementation of our asynchronous RL pipeline in Section +6.2 +. +Figure 4: +Comparison of estimators of +KL +​ +( +p +∥ +q +) +\mathrm{KL}(p\,\|\,q) +for two synthetic Gaussian distributions with unit variance and different means. +Similar to prior work +Shao +et al. 
+[ +2024 +]; Team [ +2025d +] +, we use a Kullback–Leibler divergence for regularization, +$\mathrm{KL}(q\,\|\,p)=\mathbb{E}_{x\sim q}\left[-\log r(x)\right]$, where $r(x)=p(x)/q(x)$. +Many open-source implementations of RL estimate KL with the estimator +$k_{3}=(r-1)-\log r$ +, defined in +Schulman [ +2020 +] +. The +$k_{3}$ +estimator is an unbiased estimator of KL and reduces variance when +$p$ +and +$q$ +are close. However, Amini et al. show in +[Amini +et al. +, +2025 +, Figure 1] +that the variance increases drastically as +$p$ +and +$q$ +diverge. See Figure +4 +: for large KL values, the variance of the estimate is extremely large. (The +$k_{2}$ +estimator does not suffer from variance blow-up, but is biased.) Therefore, we use the standard estimator +$k_{1}=-\log r$ +instead. +Figure 5: +Both average and best-of-K performance increase over the RL training period. +The above curves are reported on a held-out evaluation set, along with CursorBench tasks. Performance steadily improves throughout RL training. Importantly, we do not observe a tradeoff between average performance and best-of-K performance. +A growing body of recent literature has argued that RL on LLMs often improves average performance primarily by concentrating probability mass on already-known successful trajectories, sometimes at the cost of policy entropy and output diversity +Yue +et al. +[ +2025 +]; Liang +et al. +[ +2026 +]; Chen +et al. +[ +2025 +]; Wen +et al. +[ +2026 +]; Tajwar +et al. +[ +2026 +] +. Under this view, improvements at best-of-K may be limited because the model becomes better at selecting one high-confidence solution rather than expanding the set of reachable correct solutions.
Against this backdrop, our results are notable: rather than observing a trade-off in which average reward rises while best-of-K remains flat, we find that our training improves both statistics as shown in Figure +5 +. This suggests that, in our setting, RL is not merely reweighting a fixed pool of reasoning paths, but is also improving the model’s effective coverage of correct solutions under repeated sampling. +Self-Summarization +To enable Composer 2 to work across long horizons, we use the self-summarization technique introduced in Composer 1.5 +Team [ +2025b +] +. Each training rollout can involve multiple generations chained together by summaries, rather than a single prompt–response pair. We use the final reward for all tokens produced by the model in the chain. This upweights both the agent responses in good trajectories and also the self-summarizations that made them work. At the same time, poor summaries that lose critical information are downweighted. As Composer trains, it learns to use self-summaries to process more information, even with a limited context window. For hard examples, it often self-summarizes multiple times. In our experiments, we find that self-summary consistently reduces the error compared to using separate prompt-based compaction, while using significantly fewer tokens and reusing the KV cache. +4.2 +Agent Behavior +While the primary goal of RL training is to improve model intelligence, we also aim to produce a model that provides a good developer experience. +This is affected by the communication style of the model as well as the time and resources it takes to answer a question. +Figure 6: +Nonlinear penalties push the model to be quick on easy tasks and think more on hard tasks. +For behavior and communication, we apply an array of auxiliary rewards to ensure the model provides a good experience. 
These include rewards for coding style and communication, as well as product-specific penalties for poor tool calls, such as creating to-do list items and then leaving them unfinished. During RL training, we monitor the model for emergent behaviors and occasionally introduce additional behavior rewards as needed. For example, we observed that the model would start to leave long chains-of-thought in comments or collapse to using the terminal tool only. +To incentivize the model to produce solutions quickly on easy requests while allowing it to think longer on hard requests, we add an increasing, concave-down nonlinear length penalty to the reward: +$$C_{\text{length}\{k,q\}}(x)=\frac{(1+kx)^{1-q}-1}{k(1-q)},$$ +where +$k$ +and +$q$ +are hyperparameters which define the curvature of the penalty, and the input +$x$ +is a weighted combination of thinking tokens, tool calling tokens, tool output tokens, final message tokens, number of tool calls, and number of turns of a rollout. +The nonlinearity reflects that on easy tasks, achievable with only a few tool calls, every additional bit of effort is felt more acutely than in long-horizon tasks, where the agent might iterate for hundreds of tool calls. See Figure +6 +for some examples of the nonlinear curves produced by this equation. +We find that utilizing such length penalties enables the model to learn particularly efficient behaviors, e.g., making multiple tool calls in parallel. +5 +Real-World Evaluation with CursorBench +The application of coding agents has evolved rapidly over the past year, expanding from simple, tightly-scoped edits to complex debugging, large-scale refactoring, and feature development. +At Cursor, we have observed that performance on public evaluation benchmarks often correlates only loosely with the real-world utility of these models.
+We attribute this misalignment to four primary factors: +• +Domain Mismatch: +As the capabilities of coding agents expand, static benchmarks often fail to capture the full spectrum of developer workflows. +For instance, SWE-bench and its variants predominantly focus on isolated bug-fixing. +Terminal-Bench covers a wider range of task types, but many of its tasks (e.g., computing chess moves) are abstract puzzles rather than typical software engineering operations. +• +Prompt Over-specification: +Public benchmarks are typically highly specified, assuming a narrow set of correct solutions. +In contrast, real developer requests are often underspecified and admit multiple valid architectural approaches. +Consequently, public benchmarks either penalize correct alternative solutions or rely on unnaturally explicit prompts that bypass the challenge of interpreting ambiguous intent. +• +Data Contamination and Overfitting: +Because public benchmarks are constructed from historical scrapes of open-source repositories, they are frequently leaked into model training mixtures, artificially inflating scores. +Recently, OpenAI suspended reporting SWE-bench Verified results after finding evidence that frontier models could generate gold patches from memory +74 +. +Beyond contamination, the fixed and narrow nature of these benchmarks can compress performance differences: for instance, Haiku 4.5 achieves 73.3% on SWE-bench Verified, very close to GPT-5’s 74.9%, misaligning with accuracy on broader and more diverse task distributions like Terminal-Bench. +• +Narrow Evaluation Scope: +Existing coding evaluations predominantly measure functional correctness. +In practice, developers also heavily weigh code quality, readability, latency, cost, and the quality of the agent’s interactive behavior throughout a session. +(a) +Lines changed in reference diff. +(b) +Problem description length. 
+Figure 7: +Compared to public benchmarks, CursorBench tasks have less-specified task prompts, and require an order of magnitude more code changes. +We find this better represents the complexity and ambiguity of real-world software engineering requests. +To address these limitations, we introduce CursorBench, an internal evaluation suite comprising tasks drawn from actual coding sessions of our engineering team. +Because these tasks originate from real agent sessions rather than curated public repositories, CursorBench better reflects the true distribution of software engineering tasks while completely avoiding train-set contamination. +Furthermore, rather than relying solely on functional correctness, we evaluate models using specific metrics targeting code quality, execution efficiency, and interactive agent behavior in realistic settings. +Figure +7 +highlights the structural differences between CursorBench and public evaluation sets. +CursorBench tasks necessitate substantially more extensive code modifications, with a median of 181 lines changed compared to just 7–10 lines for SWE-bench Verified and Multilingual (Figure +7(a) +). +At the same time, CursorBench prompts are also more underspecified, featuring a median description length of only 390 characters versus 1,185–3,055 characters for public benchmarks (Figure +7(b) +). +This combination of broad execution scope and high intent ambiguity accurately reflects the intrinsic difficulty of real-world software engineering, where developers must frequently synthesize context from production logs, sparse user bug reports, and large existing codebases to derive a solution. +Figures +8 +and +12 +show representative examples: one requires diagnosing a build-tool transpilation bug in a retry loop from a terse bug report and observability logs, while the other requires designing a tuned heuristic detector over hundreds of chat responses to quantify a subtle streaming regression and discover its hidden invariants. 
+⬇ +// +executeScoringRollout.ts +- +linked +code +snippet +from +the +problem +statement +for +( +let +attempt += +1; +attempt +<= +MAX_RETRIES +; +attempt +++) +{ +try +{ +const +request += +new +ScoringRequest +(...); +const +{ +ctx +: +Ctx +, +startSpan +: +taskSpan +} += +ctx +. +span +( +"scoring" +); +using _taskSpan = taskSpan.start(); +const +result += +await +executeScoring +(...); +let +rawOutput += +"" +; +if +( +result +. +response +) +{ +rawOutput += +result +. +response +. +join +( +"\n" +); +} +const +parsed += +parseOutput +( +rawOutput +); +if +( +parsed +. +parseError +) +{ +lastError += +parsed +. +parseError +; +ctx +. +warn +({ +error +: +lastError +}, +"Error, +will +retry" +); +if +( +attempt +< +MAX_RETRIES +) +{ +continue +; +} +} +// +... +} +catch +( +error +) +{ +/* +... +*/ +} +} +Problem statement: +scoring attempt 2 and attempt 3 succeeded but i get “failed after 3 attempts. Last error: [canceled] User aborted request” error at the end +@executeScoringRollout.ts (1084-1118) +check if there is some bug in this +Please see datadog logs at @logs and fix +Figure 8: +Example CursorBench task +(truncated and obfuscated from our evaluation pipeline). The agent receives a terse bug report and must cross-reference the source code with production observability logs to diagnose the failure. The logs also contain unrelated production service warnings which are a red herring: the true root cause is an esbuild 0.20.2 downleveling bug for +using +. The transpiled output lowers the highlighted declaration into +var +-scoped error state that is not reset between retry iterations, causing stale failure state to be re-thrown from the generated +finally +block even after later attempts succeed. +New CursorBench iterations are continually developed by our team. +As user workflows evolve and agent capabilities improve, we regularly update the evaluation set to remain aligned with how developers actually use the product. 
+Figure +9 +shows how the benchmark has grown in complexity across iterations: compared to earlier versions of CursorBench, tasks from CursorBench-3 involve changing more than twice as many files and lines of code on average. +In addition to increased problem size, the distribution of task types has also shifted, as developers increasingly delegate long-running command execution, experiment monitoring, and data analysis to agents. +This continual refresh ensures that our evaluations remain aligned with the shifting frontier of real-world difficulty and not saturated. +Finally, we complement our primary CursorBench evaluation with a suite of targeted evaluations covering other aspects of coding agent quality and behavior. These include an intent evaluation, which assesses how the model handles ambiguous prompts; an instruction-following evaluation, which measures how well the model follows system prompts, user prompts, rules, and skills; an eager editing evaluation, which tests how the model responds to questions where it should avoid editing code; a code quality evaluation, which judges the quality of both code and comments; and an interruption evaluation, which quantifies how well the model handles mid-rollout interruptions and user feedback. We develop these evaluations by identifying important dimensions of agent behavior, selecting data points that elicit them, and writing rubrics to measure performance. +Figure 9: +Evolution of CursorBench across iterations. +Each version incorporates more complex requests. CursorBench-3 more than doubles the median task size from the initial version, shown as the relative percent change in the bottom bar. +6 +Infrastructure +6.1 +Training Infrastructure +Parallelism. +Previous Composer training stacks combined Fully Sharded Data Parallelism (FSDP) +Rajbhandari +et al. +[ +2020 +]; Zhao +et al. +[ +2023 +] +, Expert Parallelism (EP) +Shazeer +et al. +[ +2017 +]; Fedus +et al. 
+[
2022
]
, and Tensor Parallelism (TP) Shoeybi et al. [2019].
In the original MoE design, EP reused the same rank group as TP, so EP was not an independent scaling axis.
This coupling kept the implementation simple, but constrained support for larger MoE configurations and would unnecessarily enable activation sharding in the continued pretraining phase, even when activation memory pressure is modest.
Composer 2 instead uses Context Parallelism (CP) Liu et al. [2024]; Jacobs et al. [2023] as the primary long-context scaling axis. CP requires less communication than TP and improves compute efficiency by preserving full hidden dimensions in various projections; in contrast, TP produces less efficient skinny local matrix multiplications. There are a few tricks we use to implement CP efficiently in the Multi-Head Latent Attention (MLA) architecture. To minimize communication overhead, we compute local KV latent vectors, all-gather the latent vectors across CP ranks, and then compute the KV projections. Although this replicates the projection on all CP ranks, the projection is small and reduces CP communications, allowing us to fully overlap CP communications with the computation of the Q projection. Additionally, while naive CP causes load imbalance during causal attention as later tokens have to attend to more tokens, we use the technique from Liu et al. [2024] to address this: we split the sequence into $2\times\text{CP}$ chunks, and the $i$-th rank processes chunks $i$ and $2\times\text{CP}-1-i$, resulting in roughly equal work during causal attention for all ranks. Finally, the context parallelism dimension is folded into the FSDP dimension, allowing us to use CP ranks to reduce per-GPU parameter/state memory usage.
Composer 2 also introduces a more flexible expert-parallel design by decoupling EP from TP. This requires using different meshes for sharding dense layers and expert weights.
EP is formed from DP and CP capacity, enabling support for larger expert-parallel degrees and making expert-grouped GEMMs more efficient with larger per-rank token batches. We use EP=8, CP=2 for the continued pretraining phase and EP=8, CP=8 for the RL phase. We use DeepEP to implement high-throughput token dispatch/combine +Zhao +et al. +[ +2025 +] +. DeepEP communication buffers have relatively low overhead, and DeepEP’s kernel uses 20 SMs by default, leaving headroom for concurrent compute. We also quantize the tokens to MXFP8 (discussed below) before dispatch for more efficient communication, which does not affect our precision since we already perform our expert computations in MXFP8. We keep the combine at BF16 for increased precision. To maximize compute–communication overlap, tokens are split into microbatches and pipelined across separate communication and compute streams. +Finally, we found that it was critical for different DP ranks to have similar amounts of compute to achieve high utilization. In continued pretraining, DP balance is easily achieved with fixed sequence lengths. In RL, different rollouts of different prompts can result in very different sequence lengths, so before each training step, we run a global sequence packing stage to ensure balanced DP compute load. The packing algorithm takes into account the increased attention costs of longer sequences. +Kernels. +Figure 10: +Overview of a single grouped GEMM training flow in our Mixture-of-Experts layer. Each colored block represents a single kernel launch. +Composer 2 training uses in-house kernels written in CUDA, PTX, and ThunderKittens/ParallelKittens +Spector +et al. +[ +2025 +]; Sul +et al. +[ +2025a +] +. The kernels primarily optimize low-precision training of the mixture-of-experts (MoE) layer. Our training recipe uses both MXFP8 +Open Compute Project [ +2023 +] +and NVFP4 +NVIDIA [ +2025 +] +precision formats. 
We exclusively target NVIDIA Blackwell GPUs for block-scaled tensor-core matrix multiplications (i.e., in-hardware dequantization during systolic-array matrix multiplication). Figure +10 +illustrates a single grouped GEMM training flow within our MoE layer. +For the MoE forward pass, we use a novel variant of NVFP4: values are quantized from BF16 into FP4E2M1 using FP8E4M3 per-block scales (block size = 16) and FP32 +per-token +scales. We found the original NVFP4 format, which uses FP32 per-tensor scales, fragile for two reasons. First, per-tensor scaling makes training batch-variant, collapsing numerical precision and causing the RL training to diverge. Second, inter-token scale values leak future token information into past tokens, resulting in biased gradients. Despite adding latency to the quantization and GEMM epilogue, per-token scaling thus proved to be the more effective scheme. +For the MoE backward pass, we use the standard MXFP8 format with FP8E4M3 values and FP8E8M0 scales per 32-element block. We can do this because of the asymmetry in RL training. On the forward pass, it is necessary that the trainer match the inference for numerical stability. We therefore use trainer NVFP4 in order to support fast inference. The backward pass, however, runs only on the training cluster. This is not a bottleneck on system-wide RL efficiency, so we can afford higher precision to improve training stability. +Finally, the choice of hardware-level math precision mattered considerably. For NVFP4 quantization, we found that using IEEE-compliant floating-point arithmetic (e.g., +__fdiv_rn +) is critical; using fast-approximation alternatives causes training to diverge after roughly a hundred RL steps. Conversely, using the fast-approximation path (e.g., +__fdividef +) for MXFP8 quantization has not caused any divergence since our initial training of Composer 1, so we select it for the best performance. 
+We actively open-source our kernel implementations and support community efforts to improve the GPU kernel ecosystem. We collaborated closely with Colfax to implement the Flash Attention 4 backward kernel for the QK 192 / V 128 configuration (a.k.a. the "DeepSeek shapes"), which has been merged into the public repository +Jay Shah [ +2026 +] +. We also actively support the development of ThunderKittens in collaboration with the Hazy Research group at Stanford +Sul +et al. +[ +2025a +, +c +, +b +]; Sul and Ré [ +2026 +] +. Recently, we open-sourced the state-of-the-art BF16, MXFP8, and NVFP4 GEMM implementations into ThunderKittens +HazyResearch [ +2026 +] +. Finally, we share our knowledge on quantization and MoE kernel implementation through online posts +Team [ +2025a +] +. +6.2 +RL Infrastructure +Our RL infrastructure consists of four decoupled services: training, environments, inference, and evaluations. A decoupled service stack enables larger-scale global training, high availability, and independent scaling and sharding. The production training job for Composer 2 spanned 3 regions for GPU compute and 4 regions for CPU compute. +Training +We use a fully asynchronous, high-throughput training stack built on Ray +Moritz +et al. +[ +2018 +] +and PyTorch +Paszke +et al. +[ +2019 +] +. A centralized reconciler performs slot-based sample lifecycle state management, moving samples through a pipeline of distributed executors and implementing scheduling policies that balance sample generation throughput with policy staleness. We design all services within the trainer around the concept of futures, which allow for eager execution of computation when upstream dependencies are ready. We leverage the Ray object store to hold samples that are ready for consumption by train workers, which allows for natural spilling to local NVMe storage when nodes have insufficient CPU memory. 
+To support large-scale post-training, all components within the trainer are fault-tolerant down to the process or process-group level. We run passive and active health checks on all nodes during training; upon detection of a hardware fault, we mark the node as unhealthy for scheduling but continue training with warm standby nodes. Decoupling training from inference and environment infrastructure naturally makes training more resilient to failures in these services; during the training run, we saw many cases where these services had partial or full outages without failing the training job. To minimize the number of training job restarts, we use a reactive configuration system and support live code updates on a per-process level; when new code is deployed, existing actors are drained of in-flight requests and transparently replaced. +Replaying long-running coding rollouts is expensive. To mitigate expensive failures on job-level faults, we perform policy-aware checkpointing at the rollout level and group level in addition to conventional checkpointing of model weights at the step level. For rollout checkpointing, we rely on memory snapshots of the codebase environment state, so that upon recovery, we can pass the reconstructed codebase environment to verifiers. For group checkpointing, we write sequences with advantages tagged with policy versions to NFS; upon job restart, the scheduler considers these when determining whether to dispatch new work or simply load ready groups. +Environments and Anyrun. +Stateful codebase environments are a first-class artifact of our post-training stack. Environments are run on top of Anyrun, an internal compute platform built for running untrusted code at scale. This is the same compute platform that powers Cloud Agents and Automations in the Cursor product. +All environment creation requests from the trainer are sent to a global service, which routes the request to an underlying Anyrun cluster. 
Our training workload is sharded across multiple Anyrun clusters for both instance availability and fault tolerance. Within a cluster, a distributed set of Anyrun managers schedule pods, scale cloud compute provisioned across multiple regions, and perform state reconciliation to manage hundreds of thousands of pods per cluster. Each pod is a dedicated Firecracker VM capable of running a full development environment, including a browser and GUI for computer use. We run pods on a large mixture of machine types and architectures (x86, ARM) to maximize instance availability. +Scheduling throughput is particularly important for the bursty nature of RL workloads. Each Anyrun cluster is capable of scheduling more than 500 pods per second while maintaining desired binpacking requirements. One challenge with a naive packing strategy is that the steady-state resource usage for a pod can be dramatically lower than its peak during startup and can also be bursty due to overcommits. To solve this, we monitor and schedule with awareness of live readings of hardware pressure (CPU, memory, disk) along with more conventional scheduling heuristics. +Anyrun supports forking and snapshotting of full coding environments at both the filesystem and memory level. This unlocks useful capabilities during RL, such as mid-trajectory rollout checkpointing and post-rollout state capture for future introspection. When a pod fork is requested, we attempt to first schedule the fork onto the same node; if not feasible due to space constraints, we live-migrate pod state to a node with capacity. +Egress is carefully controlled in environments to limit any external impact. Any access to the internet from a pod must go through Anygress, an internal service within Anyrun responsible for proxying traffic, enforcing granular request policies, and dropping sensitive headers. 
To better replicate real-world environments, Anygress operates transparently instead of relying on proxy environment variables by injecting a trusted root CA on pod startup and redirecting pod traffic at the TCP layer. +We train with tools that are representative of the harness in the Cursor client. Each codebase environment starts with a shared tool library that can be invoked over RPC. Some tools like semantic search have external dependencies and are handled outside of the environment. To support the full tool set available in the Cursor client, we maintain a shadow deployment of the Cursor backend that is used both during dataset preparation and rollouts. Sharing the production implementation in this way allows us to scale experiments and training safely while remaining faithful to the harness that Composer 2 will be deployed into. +There are cases where we want tool behavior to differ between training and production settings. Concrete examples include enforcing stricter tool argument checks to encourage more precise model behavior, and removing certain tools to improve model steerability. To achieve this, the set of available tools and the desired behavior of each tool are dynamically determined for each environment. +Inference and Weight Sync. +We partner with Fireworks AI to run RL inference. Because Kimi K2.5 is a Mixture-of-Experts model, numerical differences can cause different experts to be chosen in the inference engine forward pass and trainer forward pass. If the trainer and inference engine do not agree on expert routing for each token, log-probabilities computed during training may not match the distribution from which tokens were sampled, introducing noise into the policy gradient. To address this, we employ router replay +Zheng +et al. +[ +2025 +]; Ma +et al. 
+[
2025
]
: during inference, the engine returns the selected expert indices for every token at every MoE layer, and during the training forward pass the router’s expert assignment is overridden to match. The router still computes gating scores so that gradients flow through it. We extend the basic replay scheme by filtering out replayed experts whose gating scores fall below a plausibility threshold derived from the router’s own top-$k$ selections, replacing them with the router’s candidates; we found that this reduces p99 numerics mismatch between the inference and training forward passes.
Every training step, we synchronize updated weights to the inference engine by uploading to a shared S3 bucket. To minimize transfer size, we use delta compression: each rank caches its previous upload and transmits only the diff against the new weights. Because RL updates are small, even with full-parameter training these diffs compress to a handful of gigabytes for the 1T-parameter model. Uploading is fully sharded across all training ranks, allowing us to saturate the egress bandwidth of the training cluster; similarly, download on the Fireworks side is sharded across inference replicas. Compression, upload, and hotload signaling are fully pipelined in background workers so that training is never blocked. During the Composer 2 training run, we ran inference across geographically distributed clusters in the US and Europe. Each cluster independently downloads and reconstructs weights from the shared delta chain, requiring no direct connectivity to the training cluster, enabling world-scale distributed RL inference over commodity cloud storage.
Online Evaluations.
To provide faithful evaluations of our model during training, we run a pinned version of the production backend and Cursor client for each evaluation job.
This provides high confidence that model behavior during evals is an exact replication of what our end users see, and also allows us to iterate on the Cursor harness and model system prompt using the same infrastructure. For each training step we want to evaluate, we acquire a lease for an evaluation deployment, automatically move GPUs to that deployment, and perform a cross-region weight sync of the evaluation checkpoint from the training cluster where it resides to the inference deployment. +7 +Results +7.1 +CursorBench +We evaluate our models by running Cursor agents directly within Anyrun (Section +6.2 +), the same infrastructure that supports our reinforcement learning pipeline. +For each task in CursorBench, we initialize the codebase environment and initial task prompt, and we run the agent exactly as it would execute in our production environment. +Metrics. +We compute accuracy aggregated over all tasks across multiple passes of the evaluation set to reduce variance. +In addition to accuracy, we also measure efficiency metrics like completion tokens, end-to-end latency, and inference cost to ensure the model remains maximally useful for interactive developer workflows. +Figure 11: +On CursorBench, Composer 2 achieves a superior Pareto frontier in cost while remaining highly competitive in token efficiency. +For GPT-5.4, Codex-5.3, Opus 4.6, and Sonnet 4.6, we plot the high (circle), medium (triangle), and low (square) effort variants. +Table +1 +reports the accuracy of various models on CursorBench-3. +Composer 2 achieves 61.3%, representing a 37% relative improvement over Composer 1.5 and a 61% improvement over Composer 1. +Compared to its base model, Kimi K2.5, Composer 2 demonstrates a substantial accuracy boost, validating the effectiveness of our continued pretraining and reinforcement learning pipeline. +Furthermore, Composer 2 achieves accuracy competitive with the strongest frontier models despite being significantly cheaper at inference. 
+Figure 11 contextualizes these accuracy metrics against resource consumption.
Regarding token usage, Composer 2 generates trajectories comparable in length to other models while providing frontier-level accuracy, remaining highly token-efficient relative to other frontier models operating at similar accuracy levels.
However, due to differences in active parameter counts, raw token usage does not fully capture inference efficiency.
Since we do not have access to FLOPs used by API models, we provide the median inference cost per CursorBench task in Figure 11.
Here, Composer 2 achieves a Pareto-optimal trade-off: its inference cost is similar to smaller or low-effort variants of models, while its accuracy remains competitive with much larger frontier models.
Together, these results demonstrate that domain-specialized training can yield models that are simultaneously more accurate and more cost-effective than general-purpose alternatives for the demanding requirements of real-world software engineering.
Table 1:
Benchmark results across public and internal evaluation suites.
For third-party models, we present results in an (our harness / self-reported) format where both are available. For Anthropic models on Terminal-Bench, we report the Claude Code scores from the official leaderboard in place of our harness evaluation. Overall, Composer 2 achieves accuracy competitive with the strongest frontier models.
| Model | CursorBench | SWE-bench Multi. | Terminal-Bench |
| Composer 2 | 61.3 | 73.7 | 61.7 |
| Composer 1.5 | 44.2 | 65.9 | 47.9 |
| Composer 1 | 38.0 | 56.9 | 40.0 |
| Opus 4.6 High | 58.2 | 75.8 / 77.8 | 58.0 / 65.4 |
| Opus 4.5 High | 48.4 | 73.8 / 76.2 | 52.1 / 59.8 |
| GPT-5.4 | 63.9 | 76.8 / - | 66.5† / 75.1 |
| GPT-5.3 Codex | 59.1 | 74.8 / - | 64.8† / 77.3 |
| GPT-5.2 | 56.5 | 68.3 / - | 60.5 / 62.2 |
| GLM-5 | 42.7 | 66.9 / 73.3 | 59.6 / 56.2 |
| Kimi K2.5 | 36.0 | 65.1 / 73.0 | 47.3 / 50.8 |
† OpenAI safety filters refused 5 GPT-5.4 and 3 GPT-5.3-Codex tasks; refused problems scored as 0.
7.2
Public Benchmarks
We further evaluate Composer 2 on two public benchmarks: SWE-bench Multilingual and Terminal-Bench (Table 1, last two columns).
For Composer models, we compute scores using our own harness.
For third-party models, we report results as (our harness / self-reported) where both are available; for Anthropic models on Terminal-Bench, we use the official Claude Code leaderboard scores rather than our own harness evaluations.
For SWE-bench, we simply prepend “please solve this github issue” to the problem statement without instructions for writing or running test cases.
For Terminal-Bench, we augment the user prompt with solution formatting instructions on where files should be placed or environment should be set up.
On SWE-bench Multilingual, Composer 2 scores 73.7%, an improvement of 7.8 percentage points over Composer 1.5 and 16.8 percentage points over Composer 1.
On Terminal-Bench, Composer 2 achieves 61.7%, improving upon Composer 1.5 by 13.8 percentage points and Composer 1 by 21.7 percentage points.
Against its base model, Kimi K2.5, Composer 2 achieves similar performance on SWE-bench Multilingual and considerably improved performance on Terminal-Bench.
Overall, Composer 2’s performance on these public benchmarks remains highly competitive with other state-of-the-art models.
+Across both benchmarks, each successive Composer version shows consistent gains, demonstrating that continued investment in both pretraining and reinforcement learning yields compounding gains for agentic software engineering.
8
Conclusion
Composer 2 demonstrates that strong specialized models can be trained through continued pretraining and reinforcement learning. Starting from a strong general-purpose model, a model can be specialized to achieve frontier-level performance in agentic coding. The main insight, from both an algorithmic and infrastructure point of view, is to scale training while ensuring a close domain match with the target domain. We do this through careful domain benchmarking with CursorBench, harness and environment engineering, and behavioral reward development, along with rigorous infrastructure reliability.
The results of Composer 2 give reason for optimism about the improvement still available through further scaling. While Composer 2 marks a steady improvement over previous versions, there are many cases where the model shows intelligence or coherence behaviors that can be clearly improved. The model trained in this work is large (1.04T parameters, 32B active) but likely smaller than other proprietary models of comparable ability. We believe there remains considerable room for development both architecturally and algorithmically.
The scope of coding agents as a tool is also expanding from interactive problems to agentic tasks that would require hours of human time Kwa et al. [2025], with a general expectation that the horizon will grow quickly in the future Team [2025c]. For future Composer iterations, our team is focused on expanding the model’s ability to work on these problems by developing training methods for longer problems, both in the algorithms, to effectively utilize longer-term training signal, and in the infrastructure, to support faithful long-horizon problems.
References
A. Ahmadian, C. Cremer, M. Gallé, M.
Fadaee, J. Kreutzer, O. Pietquin, A. Üstün, and S. Hooker (2024) +Back to basics: revisiting reinforce-style optimization for learning from human feedback in llms +. +In +Proceedings of the 62nd Annual Meeting of the Association for Computational +Linguistics (Volume 1: Long Papers), ACL 2024, Bangkok, Thailand, +August 11-16, 2024 +, +pp. 12248–12267 +. +External Links: +Link +, +Document +Cited by: +§4.1 +. +A. AI (2025) +LoCoDiff-bench: long context diff reconstruction benchmark +. +Note: +https://abanteai.github.io/LoCoDiff-bench/ +Cited by: +2nd item +. +Z. AI (2026) +GLM-5: from vibe coding to agentic engineering +. +Note: +https://z.ai/blog/glm-5 +Cited by: +Appendix B +. +L. B. Allal, R. Li, D. Kocetkov, C. Mou, C. Akiki, C. M. Ferrandis, N. Muennighoff, M. Mishra, A. Gu, M. Dey, +et al. +(2023) +SantaCoder: don’t reach for the stars! +. +In +International Conference on Machine Learning, ICML 2023 Workshop on Knowledge and Logical Reasoning in the Era of Data-driven Learning +, +Cited by: +§2 +. +A. Amini, T. Vieira, and R. Cotterell (2025) +Better estimation of the kullback–leibler divergence between language models +. +In +The Thirty-ninth Annual Conference on Neural Information Processing Systems +, +External Links: +Link +Cited by: +§4.1 +. +M. Chen, J. Tworek, H. Jun, Q. Yuan, H. P. d. O. Pinto, J. Kaplan, H. Edwards, Y. Burda, N. Joseph, G. Brockman, +et al. +(2021) +Evaluating large language models trained on code +. +arXiv preprint arXiv:2107.03374 +. +External Links: +Link +Cited by: +§2 +. +Z. Chen, X. Qin, Y. Wu, Y. Ling, Q. Ye, W. X. Zhao, and G. Shi (2025) +Pass@k training for adaptively balancing exploration and exploitation of large reasoning models +. +arXiv preprint arXiv:2508.10751 +. +External Links: +Document +, +Link +Cited by: +§4.1 +. +C. Clement, D. Drain, J. Timcheck, A. Svyatkovskiy, and N. Sundaresan (2020) +PyMT5: multi-mode translation of natural language and python code with transformers +. 
+In +Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) +, +pp. 9052–9065 +. +Cited by: +§2 +. +DeepSeek-AI (2024a) +DeepSeek-coder-v2: breaking the barrier of closed-source models in code intelligence +. +arXiv preprint arXiv:2406.11931 +. +External Links: +Link +Cited by: +§2 +. +DeepSeek-AI (2024b) +DeepSeek-v3 technical report +. +arXiv preprint arXiv:2412.19437 +. +External Links: +Link +Cited by: +§3.1 +. +DeepSeek-AI (2025) +DeepSeek-v3.2: pushing the frontier of open large language models +. +arXiv preprint arXiv:2512.02556 +. +External Links: +Link +Cited by: +Appendix B +. +D. Du, S. Liu, T. Yang, S. Chen, and Y. Li (2025) +UloRL: an ultra-long output reinforcement learning approach for advancing large language models’ reasoning abilities +. +arXiv preprint arXiv:2507.19766 +. +External Links: +Link +Cited by: +§4.1 +. +W. Fedus, B. Zoph, and N. Shazeer (2022) +Switch transformers: scaling to trillion parameter models with simple and efficient sparsity +. +Journal of Machine Learning Research +23 +( +120 +), +pp. 1–39 +. +External Links: +Link +Cited by: +§6.1 +. +Z. Feng, D. Guo, D. Tang, N. Duan, X. Feng, M. Gong, L. Shou, B. Qin, T. Liu, D. Jiang, +et al. +(2020) +Codebert: a pre-trained model for programming and natural languages +. +In +Findings of the association for computational linguistics: EMNLP 2020 +, +pp. 1536–1547 +. +Cited by: +§2 +. +D. Fried, A. Aghajanyan, J. Lin, S. Wang, E. Wallace, F. Shi, R. Zhong, S. Yih, L. Zettlemoyer, and M. Lewis (2023) +InCoder: A generative model for code infilling and synthesis +. +In +The Eleventh International Conference on Learning Representations, +ICLR 2023, Kigali, Rwanda, May 1-5, 2023 +, +External Links: +Link +Cited by: +§2 +. +F. Gloeckle, B. Y. Idrissi, B. Rozière, D. Lopez-Paz, and G. Synnaeve (2024) +Better & faster large language models via multi-token prediction +. 
+In +Forty-first International Conference on Machine Learning, ICML 2024, +Vienna, Austria, July 21-27, 2024 +, +External Links: +Link +Cited by: +§3.1 +. +A. Golubev, M. Trofimova, S. Polezhaev, I. Badertdinov, M. Nekrashevich, A. Shevtsov, S. Karasik, S. Abramov, A. Andriushchenko, F. Fisin, S. Skvortsov, and B. Yangel (2025) +Training long-context, multi-turn software engineering agents with reinforcement learning +. +arXiv preprint arXiv:2508.03501 +. +External Links: +Link +Cited by: +§4.1 +. +D. Guo, Q. Zhu, D. Yang, Z. Xie, K. Dong, W. Zhang, G. Chen, X. Bi, Y. Wu, Y. K. Li, +et al. +(2024) +DeepSeek-coder: when the large language model meets programming–the rise of code intelligence +. +arXiv preprint arXiv:2401.14196 +. +External Links: +Link +Cited by: +§2 +. +S. Gururangan, A. Marasović, S. Swayamdipta, K. Lo, I. Beltagy, D. Downey, and N. A. Smith (2020) +Don’t stop pretraining: adapt language models to domains and tasks +. +In +Proceedings of the 58th Annual Meeting of the Association for Computational +Linguistics, ACL 2020, Online, July 5-10, 2020 +, +pp. 8342–8360 +. +External Links: +Link +, +Document +Cited by: +§3 +. +HazyResearch (2026) +ThunderKittens gemm kernels +. +External Links: +Link +Cited by: +§6.1 +. +J. Hoffmann, S. Borgeaud, A. Mensch, E. Buchatskaya, T. Cai, E. Rutherford, D. de Las Casas, L. A. Hendricks, J. Welbl, A. Clark, +et al. +(2022) +Training compute-optimal large language models +. +In +Advances in Neural Information Processing Systems 35: Annual Conference +on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, +LA, USA, November 28 - December 9, 2022 +, +Cited by: +§3 +. +S. Hong, M. Zhuge, J. Chen, X. Zheng, Y. Cheng, J. Wang, C. Zhang, Z. Wang, S. K. S. Yau, Z. Lin, +et al. +(2023) +MetaGPT: meta programming for a multi-agent collaborative framework +. +In +The twelfth international conference on learning representations +, +Cited by: +§2 +. +J. Howard and S. 
Ruder (2018) +Universal language model fine-tuning for text classification +. +In +Proceedings of the 56th Annual Meeting of the Association for Computational +Linguistics, ACL 2018, Melbourne, Australia, July 15-20, 2018, Volume +1: Long Papers +, +pp. 328–339 +. +External Links: +Link +, +Document +Cited by: +§3 +. +B. Hui, J. Yang, Z. Cui, J. Yang, D. Liu, L. Zhang, T. Liu, J. Zhang, B. Yu, K. Lu, +et al. +(2024) +Qwen2.5-coder technical report +. +arXiv preprint arXiv:2409.12186 +. +Cited by: +§2 +. +S. A. Jacobs, M. Tanaka, C. Zhang, M. Zhang, S. L. Song, S. Rajbhandari, and Y. He (2023) +Deepspeed ulysses: system optimizations for enabling training of extreme long sequence transformer models +. +arXiv preprint arXiv:2309.14509 +. +Cited by: +§6.1 +. +Jay Shah (2026) +Flash attention pull request #2270 +. +External Links: +Link +Cited by: +§6.1 +. +C. E. Jimenez, J. Yang, A. Wettig, S. Yao, K. Pei, O. Press, and K. R. Narasimhan (2024) +SWE-bench: can language models resolve real-world github issues? +. +In +The Twelfth International Conference on Learning Representations +, +External Links: +Link +Cited by: +§1 +. +T. Kwa, B. West, J. Becker, A. Deng, K. Garcia, M. Hasin, S. Jawhar, M. Kinniment, N. Rush, S. V. Arx, R. Bloom, T. Broadley, H. Du, B. Goodrich, N. Jurkovic, L. H. Miles, S. Nix, T. Lin, N. Parikh, D. Rein, L. J. K. Sato, H. Wijk, D. M. Ziegler, E. Barnes, and L. Chan (2025) +Measuring ai ability to complete long tasks +. +Note: +https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/ +Cited by: +§8 +. +R. Li, L. B. Allal, Y. Zi, N. Muennighoff, D. Kocetkov, C. Mou, M. Marone, C. Akiki, J. Li, J. Chim, +et al. +(2023) +StarCoder: may the source be with you! +. +Trans. Mach. Learn. Res. +2023 +. +External Links: +Link +Cited by: +§2 +. +Y. Li, D. Choi, J. Chung, N. Kushman, J. Schrittwieser, R. Leblond, T. Eccles, J. Keeling, F. Gimeno, A. Dal Lago, +et al. +(2022) +Competition-level code generation with alphacode +. 
+Science +378 +( +6624 +), +pp. 1092–1097 +. +Cited by: +§2 +. +X. Liang, Z. Li, Y. Gong, Y. Shen, Y. N. Wu, Z. Guo, and W. Chen (2026) +Beyond pass@1: self-play with variational problem synthesis sustains RLVR +. +In +The Fourteenth International Conference on Learning Representations, +ICLR 2026 +, +External Links: +Link +Cited by: +§4.1 +. +H. Liu, M. Zaharia, and P. Abbeel (2024) +Ring attention with blockwise transformers for near-infinite context +. +In +The Twelfth International Conference on Learning Representations, +ICLR 2024, Vienna, Austria, May 7-11, 2024 +, +External Links: +Link +Cited by: +§6.1 +. +Z. Liu, C. Chen, W. Li, P. Qi, T. Pang, C. Du, W. S. Lee, and M. Lin (2025a) +Understanding r1-zero-like training: a critical perspective +. +arXiv preprint arXiv:2503.20783 +. +External Links: +Link +Cited by: +§4.1 +, +§4.1 +. +Z. Liu, Z. Yang, Y. Chen, C. Lee, M. Shoeybi, B. Catanzaro, and W. Ping (2025b) +AceReason-nemotron 1.1: advancing math and code reasoning through sft and rl synergy +. +arXiv preprint arXiv:2506.13284 +. +External Links: +Link +Cited by: +§4.1 +. +A. Lozhkov, R. Li, L. B. Allal, F. Cassano, J. Lamy-Poirier, N. Tazi, A. Tang, D. Pykhtar, J. Liu, Y. Wei, +et al. +(2024) +StarCoder 2 and the stack v2: the next generation +. +arXiv preprint arXiv:2402.19173 +. +External Links: +Link +Cited by: +§2 +. +Z. Luo, C. Xu, P. Zhao, Q. Sun, X. Geng, W. Hu, C. Tao, J. Ma, Q. Lin, and D. Jiang (2024) +WizardCoder: empowering code large language models with evol-instruct +. +In +The Twelfth International Conference on Learning Representations, +ICLR 2024, Vienna, Austria, May 7-11, 2024 +, +External Links: +Link +Cited by: +§2 +. +W. Ma, H. Zhang, L. Zhao, Y. Song, Y. Wang, Z. Sui, and F. Luo (2025) +Stabilizing MoE reinforcement learning by aligning training and inference routers +. +arXiv preprint arXiv:2510.11370 +. +External Links: +2510.11370 +, +Link +Cited by: +§4.1 +, +§6.2 +. +M. A. Merrill, A. G. Shaw, N. Carlini, B. Li, H. Raj, I. 
Bercovich, L. Shi, J. Y. Shin, T. Walshe, E. K. Buchanan, +et al. +(2026) +Terminal-bench: benchmarking agents on hard, realistic tasks in command line interfaces +. +In +The Fourteenth International Conference on Learning Representations, +ICLR 2026 +, +External Links: +Link +Cited by: +§1 +. +MiniMax (2025) +MiniMax-m1: scaling test-time compute efficiently with lightning attention +. +arXiv preprint arXiv:2506.13585 +. +External Links: +Link +Cited by: +§4.1 +. +M. Mishra, M. Stallone, G. Zhang, Y. Shen, A. Prasad, A. M. Soria, M. Merler, P. Selvam, S. Surendran, S. Singh, +et al. +(2024) +Granite code models: a family of open foundation models for code intelligence +. +arXiv preprint arXiv:2405.04324 +. +Cited by: +§2 +. +P. Moritz, R. Nishihara, S. Wang, A. Tumanov, R. Liaw, E. Liang, M. Elibol, Z. Yang, W. Paul, M. I. Jordan, and I. Stoica (2018) +Ray: A distributed framework for emerging AI applications +. +In +13th USENIX Symposium on Operating Systems Design and Implementation, +OSDI 2018, Carlsbad, CA, USA, October 8-10, 2018 +, +A. C. Arpaci-Dusseau and G. Voelker (Eds.) +, +pp. 561–577 +. +External Links: +Link +Cited by: +§6.2 +. +N. Muennighoff, Q. Liu, A. R. Zebaze, Q. Zheng, B. Hui, T. Y. Zhuo, S. Singh, X. Tang, L. von Werra, and S. Longpre (2024) +OctoPack: instruction tuning code large language models +. +In +The Twelfth International Conference on Learning Representations, +ICLR 2024, Vienna, Austria, May 7-11, 2024 +, +External Links: +Link +Cited by: +§2 +. +E. Nijkamp, B. Pang, H. Hayashi, L. Tu, H. Wang, Y. Zhou, S. Savarese, and C. Xiong (2023) +CodeGen: an open large language model for code with multi-turn program synthesis +. +In +The Eleventh International Conference on Learning Representations, +ICLR 2023, Kigali, Rwanda, May 1-5, 2023 +, +External Links: +Link +Cited by: +§2 +. +NVIDIA (2025) +Pretraining large language models with nvfp4 +. +arXiv preprint arXiv:2509.25149 +. +External Links: +Link +Cited by: +§6.1 +. 
+Open Compute Project (2023) +OCP microscaling formats (mx) specification version 1.0 +. +Note: +https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf +Cited by: +§6.1 +. +A. Paszke, S. Gross, F. Massa, A. Lerer, J. Bradbury, G. Chanan, T. Killeen, Z. Lin, N. Gimelshein, L. Antiga, +et al. +(2019) +Pytorch: an imperative style, high-performance deep learning library +. +Advances in neural information processing systems +32 +. +Cited by: +§6.2 +. +A. Piché, E. Kamalloo, R. Pardinas, X. Chen, and D. Bahdanau (2025) +PipelineRL: faster on-policy reinforcement learning for long sequence generation +. +arXiv preprint arXiv:2509.19128 +. +External Links: +Link +Cited by: +§4.1 +. +C. Qian, W. Liu, H. Liu, N. Chen, Y. Dang, J. Li, C. Yang, W. Chen, Y. Su, X. Cong, +et al. +(2024) +Chatdev: communicative agents for software development +. +In +Proceedings of the 62nd annual meeting of the association for computational linguistics (volume 1: Long papers) +, +pp. 15174–15186 +. +Cited by: +§2 +. +S. Rajbhandari, J. Rasley, O. Ruwase, and Y. He (2020) +ZeRO: memory optimizations toward training trillion parameter models +. +In +SC20: International Conference for High Performance Computing, Networking, Storage and Analysis +, +pp. 1–16 +. +Cited by: +§6.1 +. +B. Rozière, J. Gehring, F. Gloeckle, S. Sootla, I. Gat, X. E. Tan, Y. Adi, J. Liu, R. Sauvestre, T. Remez, +et al. +(2023) +Code llama: open foundation models for code +. +arXiv preprint arXiv:2308.12950 +. +External Links: +Link +Cited by: +§2 +. +J. Schulman (2020) +Approximating KL divergence +. +Note: +Blog post +Cited by: +§4.1 +. +Z. Shao, P. Wang, Q. Zhu, R. Xu, J. Song, M. Zhang, Y. K. Li, Y. Wu, and D. Guo (2024) +DeepSeekMath: pushing the limits of mathematical reasoning in open language models +. +arXiv preprint arXiv:2402.03300 +. +External Links: +Link +Cited by: +§4.1 +, +§4.1 +. +N. Shazeer, A. Mirhoseini, K. Maziarz, A. Davis, Q. Le, G. Hinton, and J. 
Dean (2017) +Outrageously large neural networks: the sparsely-gated mixture-of-experts layer +. +In +International Conference on Learning Representations (ICLR) +, +External Links: +Link +Cited by: +§6.1 +. +M. Shoeybi, M. Patwary, R. Puri, P. LeGresley, J. Casper, and B. Catanzaro (2019) +Megatron-LM: training multi-billion parameter language models using model parallelism +. +arXiv preprint arXiv:1909.08053 +. +Cited by: +§6.1 +. +B. F. Spector, S. Arora, A. Singhal, A. Parthasarathy, D. Y. Fu, and C. Ré (2025) +ThunderKittens: simple, fast, and adorable kernels +. +In +The Thirteenth International Conference on Learning Representations +, +External Links: +Link +Cited by: +§6.1 +. +S. H. Sul, S. Arora, B. F. Spector, and C. Ré (2025a) +ParallelKittens: systematic and practical simplification of multi-gpu ai kernels +. +arXiv preprint arXiv:2511.13940 +. +External Links: +Link +Cited by: +§6.1 +, +§6.1 +. +S. H. Sul, S. Arora, B. Spector, and C. Ré (2025b) +Loads and loads of fluffy kittens +. +External Links: +Link +Cited by: +§6.1 +. +S. H. Sul, D. Lim, B. Spector, and C. Ré (2025c) +One kernel for all your gpus +. +External Links: +Link +Cited by: +§6.1 +. +S. H. Sul and C. Ré (2026) +ThunderKittens 2.0: even faster kernels for your gpus +. +External Links: +Link +Cited by: +§6.1 +. +F. Tajwar, G. Zeng, Y. Zhou, Y. Song, D. Arora, Y. Jiang, J. Schneider, R. Salakhutdinov, H. Feng, and A. Zanette (2026) +Maximum likelihood reinforcement learning +. +arXiv preprint arXiv:2602.02710 +. +External Links: +Document +, +Link +Cited by: +§4.1 +. +C. Team, H. Zhao, J. Hui, J. Howland, N. Nguyen, S. Zuo, A. Hu, C. A. Choquette-Choo, J. Shen, J. Kelley, +et al. +(2024) +Codegemma: open code models based on gemma +. +arXiv preprint arXiv:2406.11409 +. +Cited by: +§2 +. +C. Team (2025a) +1.5x faster moe training with custom mxfp8 kernels +. +Note: +https://cursor.com/blog/kernels +Cited by: +§6.1 +. +C. Team (2025b) +Self-summarization for composer +. 
+Note: +https://cursor.com/blog/self-summarization +Cited by: +§4.1 +. +C. Team (2025c) +The third era of software +. +Note: +https://cursor.com/blog/third-era +Cited by: +§8 +. +K. Team (2025d) +Kimi k1.5: scaling reinforcement learning with LLMs +. +arXiv preprint arXiv:2501.12599 +. +Cited by: +§4.1 +. +K. Team (2026) +Kimi K2.5: visual agentic intelligence +. +arXiv preprint arXiv:2602.02276 +. +Cited by: +Appendix B +, +Appendix B +, +§3 +. +Q. Team (2025e) +Qwen3 technical report +. +arXiv preprint arXiv:2505.09388 +. +Cited by: +§3.1 +. +H. Touvron, T. Lavril, G. Izacard, X. Martinet, M. Lachaux, T. Lacroix, B. Rozière, N. Goyal, E. Hambro, F. Azhar, +et al. +(2023) +LLaMA: open and efficient foundation language models +. +arXiv preprint arXiv:2302.13971 +. +External Links: +Link +Cited by: +§3 +. +X. Wang, B. Li, Y. Song, F. F. Xu, X. Tang, M. Zhuge, J. Pan, Y. Song, B. Li, J. Singh, H. H. Tran, F. Li, R. Ma, M. Zheng, B. Qian, Y. Shao, N. Muennighoff, Y. Zhang, B. Hui, J. Lin, R. Brennan, H. Peng, H. Ji, and G. Neubig (2025) +OpenHands: an open platform for AI software developers as generalist agents +. +In +The Thirteenth International Conference on Learning Representations, +ICLR 2025, Singapore, April 24-28, 2025 +, +External Links: +Link +Cited by: +§2 +. +Y. Wang, H. Le, A. Gotmare, N. Bui, J. Li, and S. Hoi (2023) +Codet5+: open code large language models for code understanding and generation +. +In +Proceedings of the 2023 conference on empirical methods in natural language processing +, +pp. 1069–1088 +. +Cited by: +§2 +. +Y. Wang, W. Wang, S. Joty, and S. C. Hoi (2021) +Codet5: identifier-aware unified pre-trained encoder-decoder models for code understanding and generation +. +In +Proceedings of the 2021 conference on empirical methods in natural language processing +, +pp. 8696–8708 +. +Cited by: +§2 +. +Y. Wei, Z. Wang, J. Liu, Y. Ding, and L. Zhang (2024) +Magicoder: empowering code generation with OSS-Instruct +. 
+In +Forty-first International Conference on Machine Learning, ICML 2024, +Vienna, Austria, July 21-27, 2024 +, +Proceedings of Machine Learning Research +, +pp. 52632–52657 +. +External Links: +Link +Cited by: +§2 +. +X. Wen, Z. Liu, S. Zheng, Z. Xu, S. Ye, Z. Wu, X. Liang, Y. Wang, J. Li, Z. Miao, J. Bian, and M. Yang (2026) +Reinforcement learning with verifiable rewards implicitly incentivizes correct reasoning in base LLMs +. +In +The Fourteenth International Conference on Learning Representations, +ICLR 2026 +, +External Links: +Link +Cited by: +§4.1 +. +[74] +() +Why SWE-bench Verified no longer measures frontier coding capabilities — openai.com +. +Note: +https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified/ +[Accessed 24-03-2026] +Cited by: +3rd item +. +J. Yang, C. E. Jimenez, A. Wettig, K. Lieret, S. Yao, K. Narasimhan, and O. Press (2024) +SWE-agent: agent-computer interfaces enable automated software engineering +. +In +Advances in Neural Information Processing Systems 38: Annual Conference +on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, +BC, Canada, December 10 - 15, 2024 +, +External Links: +Link +Cited by: +§2 +. +J. Yang, K. Lieret, C. E. Jimenez, A. Wettig, K. Khandpur, Y. Zhang, B. Hui, O. Press, L. Schmidt, and D. Yang (2025) +SWE-smith: scaling data for software engineering agents +. +In +Advances in Neural Information Processing Systems 38: Annual Conference +on Neural Information Processing Systems 2025, NeurIPS 2025, +San Diego, CA, USA, December 1-4, 2025 +, +External Links: +Link +Cited by: +§2 +. +J. Ye +et al. +(2024) +Data mixing made efficient: a biannual survey of data mixing for LLM pre-training +. +arXiv preprint arXiv:2403.16952 +. +External Links: +Link +Cited by: +§3 +. +Q. Yu, Z. Zhang, R. Zhu, Y. Yuan, X. Zuo, Y. Yue, W. Dai, T. Fan, G. Liu, J. Liu, L. Liu, X. Liu, H. Lin, Z. Lin, B. Ma, G. Sheng, Y. Tong, C. Zhang, M. Zhang, R. Zhang, W. Zhang, H. Zhu, J. Zhu, J. Chen, J. Chen, C. 
Wang, H. Yu, Y. Song, X. Wei, H. Zhou, J. Liu, W. Ma, Y. Zhang, L. Yan, Y. Wu, and M. Wang (2025) +DAPO: an open-source LLM reinforcement learning system at scale +. +In +The Thirty-ninth Annual Conference on Neural Information Processing Systems, +NeurIPS 2025 +, +External Links: +Link +Cited by: +§4.1 +, +§4.1 +. +Y. Yue, Z. Chen, R. Lu, A. Zhao, Z. Wang, Y. Yue, S. Song, and G. Huang (2025) +Does reinforcement learning really incentivize reasoning capacity in LLMs beyond the base model? +. +In +The Thirty-ninth Annual Conference on Neural Information Processing Systems, +NeurIPS 2025 +, +Note: +Oral +External Links: +Link +Cited by: +§4.1 +. +C. Zhao, S. Zhou, L. Zhang, C. Deng, Z. Xu, Y. Liu, K. Yu, J. Li, and L. Zhao (2025) +DeepEP: an efficient expert-parallel communication library +. +GitHub +. +Note: +https://github.com/deepseek-ai/DeepEP +Cited by: +§6.1 +. +Y. Zhao, A. Gu, R. Varma, L. Luo, C. Huang, M. Xu, L. Wright, H. Shojanazeri, M. Ott, S. Shleifer, +et al. +(2023) +PyTorch fsdp: experiences on scaling fully sharded data parallel +. +arXiv preprint arXiv:2304.11277 +. +Cited by: +§6.1 +. +C. Zheng, S. Liu, M. Li, X. Chen, B. Yu, C. Gao, K. Dang, Y. Liu, R. Men, A. Yang, J. Zhou, and J. Lin (2025) +Group sequence policy optimization +. +arXiv preprint arXiv:2507.18071 +. +External Links: +2507.18071 +, +Link +Cited by: +§4.1 +, +§6.2 +. +T. Y. Zhuo, A. R. Zebaze, L. Von Werra, H. de Vries, Q. Liu, and N. Muennighoff (2025) +Parameter-efficient instruction tuning code large language models: an empirical study +. +In +ICLR 2025 Third Workshop on Deep Learning for Code +, +Cited by: +§2 +. 
+Appendix A +Contributors +The Composer research team consists of: +Aaron Chan, +Ahmed Shalaby, +Alexander Wettig, +Aman Sanger, +Andrew Zhai, +Anurag Ajay, +Ashvin Nair, +Charlie Snell, +Chen Lu, +Chen Shen, +Emily Jia, +Federico Cassano, +Hanpeng Liu, +Haoyu Chen, +Henry Wildermuth, +Jacob Jackson, +Janet Li, +Jediah Katz, +Jiajun Yao, +Joey Hejna, +Josh Warner, +Julius Vering, +Kevin Frans, +Lee Danilek, +Less Wright, +Lujing Cen, +Luke Melas-Kyriazi, +Michael Truell, +Michiel de Jong, +Naman Jain, +Nate Schmidt, +Nathan Wang, +Niklas Muennighoff, +Oleg Rybkin, +Paul Loh, +Phillip Kravtsov, +Rishabh Yadav, +Sahil Shah, +Sam Kottler, +Alexander M Rush, +Shengtong Zhang, +Shomil Jain, +Sriram Sankar, +Stefan Heule, +Stuart H. Sul, +Sualeh Asif, +Victor Rong, +Wanqi Zhu, +William Lin, +Yuchen Wu, +Yuri Volkov, +Yury Zemlyanskiy, +Zack Holbrook, +Zhiyuan Zhang +Appendix B +Base Model Selection +Before training, we evaluated several potential open-source base models including GLM-5 +AI [ +2026 +] +, Kimi K2.5 +Team [ +2026 +] +, and DeepSeek V3.2 +DeepSeek-AI [ +2025 +] +. Three base model evaluations contributed to our selection of Kimi K2.5: +• +Coding knowledge +: We score factual knowledge with an internal benchmark called FreshBench. FreshBench is a question-answer benchmark adversarially constructed against previous Composer models. We identify turns where Composer had to read library source code or perform a web search to solve a coding task. From these traces we create question-answer pairs, validating the answers with a web searching agent. +• +State tracking +: While editing a repository, coding agents often need to understand dozens of past file edits before taking an action. +LoCoDiff +AI [ +2025 +] +is a benchmark that asks the model to recreate the state of a file after many diffs, an important base skill for model long-term memory. State tracking is an internal benchmark similar to LoCoDiff built from our monorepo. 
+Instead of measuring raw accuracy, which we found sensitive to single-character errors, we report the average character-level distance. +• +Codebase perplexity +: We measure perplexity to determine the coding intelligence of the base model. +We use our private monorepo as an uncontaminated source, concatenating the files alphabetically and computing the sum of the negative log-likelihoods over a rolling window. +We intentionally do not consider coding agent benchmarks when testing base models. We find that such benchmarks are less predictive of final performance, as agentic and long-horizon capabilities can drastically change during the RL stage. +Table 2 +shows the results of the analysis. All three models considered perform quite well in these experiments. We selected Kimi K2.5 +Team [2026] +due to its generally strong performance as well as additional considerations such as its efficiency in our infrastructure. +| Model | FreshBench ↑ | State Tracking ↓ | Negative Log-Likelihood ↓ | +| --- | --- | --- | --- | +| DeepSeek V3.2 | 68.9% | 66 | 11.75M | +| Kimi K2.5 | 83.2% | 86 | 13.81M | +| GLM-5 | 79.2% | 92 | 14.11M | +| GPT-5.4 | 92.5% | 103 | - | +| Claude 4.6 Opus | 88.9% | 65 | - | +| Gemini 3 Flash | 84.5% | 27 | - | +| Claude 4.5 Sonnet | 80.1% | 69 | - | +| Claude 4.5 Haiku | 61.7% | 177 | - | +Table 2: +Base models evaluated on our internal benchmarks. +Negative log-likelihood is measured over our internal codebase. +Appendix C +CursorBench +C.1 +Streaming Prefix Detection +The following is another example CursorBench task. +Problem statement: +We’re seeing a weird streaming bug in some chat responses: +Now I +Now I need to updat +Now I need to update this. +Now I need to update this. I ha +Now I need to update this. I have the +Instead of getting proper streaming deltas, we get repeated growing prefixes like the snippet. I think this happens mostly inside think tokens. I want to know how common this is. Look at 954 response json files in @logs folder +Figure 12: +Example CursorBench task. 
+The agent must infer the failure mode from a partial symptom report, write a heuristic detection algorithm over 954 heterogeneous chat responses, and carefully tune that heuristic to recover an exact count of malformed prefix-streaming cases without overcounting normal incremental output. Additionally, a variant of the bug produces an “interleave stutter” where the initial prefix chain is only two lines long before stabilizing into a repeating line with incrementing repetitions, and the agent must carefully examine chat responses to discover this. +The following listing shows the algorithmic core of the reference diff for this task. +```python +MIN_CHAIN = 3 +MIN_SEED_LEN = 2 +MAX_SEED_LEN = 50 + +def find_prefix_chain(text: str) -> tuple[int, str] | None: +    if len(text) < 10: +        return None +    first_nl = text.find("\n") +    if first_nl < MIN_SEED_LEN or first_nl > MAX_SEED_LEN: +        return None +    seed = text[:first_nl] +    needle = "\n" + seed +    starts = [0] +    pos = 0 +    while True: +        idx = text.find(needle, pos) +        if idx == -1: +            break +        starts.append(idx + 1) +        pos = idx + 1 +    if len(starts) < MIN_CHAIN: +        return None +    ends = [s - 1 for s in starts[1:]] + [len(text)] +    chunks = [text[s:e] for s, e in zip(starts, ends)] +    chain = 1 +    for i in range(len(chunks) - 1): +        cur, nxt = chunks[i], chunks[i + 1] +        if len(cur) < len(nxt) and nxt.startswith(cur): +            chain += 1 +        else: +            break +    return (chain, seed) if chain >= MIN_CHAIN else None + +def iter_think_blocks(text: str): +    pos = 0 +    while True: +        open_idx = text.find("<think>", pos) +        if open_idx == -1: +            return +        close_idx = text.find("</think>", open_idx) +        if close_idx == -1: +            yield text[open_idx + 7:].lstrip("\n") +            return +        yield text[open_idx + 7:close_idx].lstrip("\n") +        pos = close_idx + 8 + +def has_prefix_streaming_bug(chat_response: str) -> bool: +    return any( +        find_prefix_chain(block) is not None +        for block in iter_think_blocks(chat_response) +    ) +``` 
+BETA \ No newline at end of file diff --git a/research/notes/cwm-an-open-weights-llm-for-research-on-code-generation-with-world-models.md b/research/notes/cwm-an-open-weights-llm-for-research-on-code-generation-with-world-models.md new file mode 100644 index 0000000000000000000000000000000000000000..5f6e44ee95c499ba91cf6eb1cea721d150f0143b --- /dev/null +++ b/research/notes/cwm-an-open-weights-llm-for-research-on-code-generation-with-world-models.md @@ -0,0 +1,16916 @@ +--- +title: 'CWM: An Open-Weights LLM for Research on Code Generation with World Models' +id: cwm-an-open-weights-llm-for-research-on-code-generation-with-world-models +tags: +- deepread +created: '2026-06-10T00:30:44.771058Z' +source: https://arxiv.org/html/2510.02387 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:30:44.770910Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +CWM: An Open-Weights LLM for Research on Code Generation with World Models +\metadata +[Inference Code] +github.com/facebookresearch/cwm +\metadata +[Model Weights] +ai.meta.com/resources/models-and-libraries/cwm-downloads +, +huggingface.co/facebook/cwm +, +../cwm-sft +, +../cwm-pretrain +CWM +: An Open-Weights LLM for Research on Code Generation with World Models +Meta FAIR CodeGen Team +( +September 29, 2025 +) +Abstract +We release Code World Model ( +CWM +), a 32-billion-parameter open-weights LLM, to advance research on code generation with world models. 
+To improve code understanding beyond what can be learned from training on static code alone, we mid-train +CWM +on a large amount of observation-action trajectories from Python interpreter and agentic Docker environments, and perform extensive multi-task reasoning RL in verifiable coding, math, and multi-turn software engineering environments. +With +CWM +, we provide a strong testbed for researchers to explore the opportunities world modeling affords for improving code generation with reasoning and planning in computational environments. +We present first steps of how world models can benefit agentic coding, enable step-by-step simulation of Python code execution, and show early results of how reasoning can benefit from the latter. +CWM +is a dense, decoder-only LLM trained with a context size of up to +131k +tokens. +Independent of its world modeling capabilities, +CWM +offers strong performance on general coding and math tasks: it reaches pass@1 scores of +65.8% +on SWE-bench Verified (with test-time scaling), +68.6% +on LiveCodeBench, +96.6% +on Math-500, and +76.0% +on AIME 2024. +To support further research on code world modeling, we release model checkpoints after mid-training, SFT, and RL. +1 +Introduction +Software development is one of the domains where Large Language Models (LLMs) have already had a significant real-world impact +(Cui et al., 2024; Bick et al., 2024). +They have quickly been adopted into the workflows of software engineers worldwide, and their capabilities are advancing fast: from only supporting programmers with small snippets of code to fixing issues or writing code bases autonomously +(Yeverechyahu et al., 2024; Handa et al., 2025). 
+However, reliably generating high-quality code remains a challenge even for the current generation of LLMs, with benchmarks consistently revealing shortcomings upon release +(Hendrycks et al., +2021a +; Chen et al., +2021 +; Aider Team, +2025 +; Jimenez et al., +2024 +) +. +We believe that advancing code generation with LLMs may require new training and modeling paradigms. Typically, code is treated the same as any other text data during pre-training: the model learns to predict code line by line, from left to right and top to bottom. We think this is not sufficient – to master coding, one must understand not just what code +looks like +but what it +does +when executed. Such skill is instrumental to the everyday work of software engineers: at a local level, they understand how the execution of a line of code changes the state of the local variables, and, at a global level, they can make predictions about how changes to a codebase affect program outputs. Yet, teaching LLMs such +code world modeling +capabilities is typically not considered before post-training. +Figure 1 +: +Overview of the +CWM +training stages and the model checkpoints that we release. We generally report performance of the final CWM (instruct, RL trained) model, except where otherwise stated. +We release Code World Model ( +CWM +), a new LLM for code generation and reasoning that has been trained on large amounts of code world modeling data. +Concretely, +CWM +is mid-trained on two different kinds of observation-action trajectories that capture important aspects of software development: Python code execution traces and agentic interactions with Docker environments. Mid-training on such data at scale should help improve coding performance by grounding our model’s predictions in the underlying dynamical systems and provide a superior starting point for RL. +For the Python execution data, actions are Python statements and observations contain the contents of the local variables. 
By training +CWM +on a trajectory of observation-action pairs conditioned on the code only as context, we directly teach the model how the execution of a line of Python affects the state of the local variables. Our premise here is that teaching +CWM +the semantics and not just syntax of programs should help with writing code as well as with reasoning tasks like verification, testing, and debugging. +We also train +CWM +on a large-scale collection of synthetically generated agentic interactions with computational environments. These trajectories are generated with our so-called ForagerAgent, which “forages” for data covering agentic software engineering scenarios such as implementing missing functionality or fixing bugs from error messages. Actions here are shell-like commands or code edits generated by the agent, and observations are responses from the running environment. While it is not uncommon for recent models to include similar data, this is mostly done at smaller scale during post-training +(Yang et al., 2025b). +Data from ForagerAgent, on the other hand, is large scale and included already during mid-training, helping shape internal representations ahead of post-training. +CWM +uses a dense, decoder-only Transformer architecture +(Vaswani et al., 2017; Radford et al., 2018) +with +32B +parameters, interleaved sliding window attention supporting up to +131k +tokens context size, and is trained over pre-, mid-, and post-training phases (see +Figure 1 +). With quantization, inference with +CWM +can be performed on a single +80 GB +NVIDIA H100. 
+Beyond world modeling capabilities, +CWM +achieves strong performance on general and agentic coding and reasoning tasks relative to other open-weights models of comparable size: it reaches pass@1 scores of +65.8% +on SWE-bench Verified (with test-time scaling; see +Figure 2 +), +68.6% +on LiveCodeBench-v5, +96.6% +on Math-500, +76.0% +on AIME 2024, and +94.3% +on CruxEval Output. +First and foremost, the release of +CWM +is meant to enable novel research on improving code generation with world modeling. +We are excited by this vision and our report provides early supportive evidence: we give examples of how world models can benefit agentic coding, enable step-by-step simulation of Python code execution, and show early results of how reasoning can benefit from the latter. +However, we believe the best is yet to come and hope to join forces with the open source research community to explore how world models can be used to leverage reasoning and planning to improve code generation. +To this end, we release both the final weights and intermediate checkpoints under a noncommercial research license. +Given +CWM +’s competitive performance, we conducted a preparedness assessment which concluded that +CWM +is unlikely to increase catastrophic risks beyond those present in the current model ecosystem. +Footnote 1: See +ai.meta.com/research/publications/cwm-preparedness +and +Section 8 +for details. +Figure 2 +: +On SWE-bench Verified, +CWM +outperforms open-weight models with similar parameter counts and is even competitive with much larger or closed-weight LLMs. +The base score for +CWM +is computed with a single attempt per instance (no retries, majority voting, or parallel candidates), averaged over multiple runs to reduce variance. 
+For “Test Time Scaling”, we generate multiple candidates in parallel and then submit one patch based on ranking. +The “Test Time Scaling” score for GPT-oss models is +high +reasoning budget, while the lower score is +low +. +(*: GPT-5 and GPT-oss use a custom subset of +477 +problems, while +CWM +is evaluated on the full set of +500 +problems.) +2 +Code world model datasets +CWM +is trained on a large variety of datasets across pre-, mid-, and post-training phases. We focus strongly on code and code world modeling data across all stages of training. We highlight two large-scale data collection efforts that empower +CWM +’s world modeling capabilities: Python execution traces and ForagerAgent. We refer to +Section 4 +for more traditional ingredients in our datamixes. +2.1 +Executable repository images: building repositories at scale +Figure 3 +: +CWM format for Python traces. Given a source code context and a marker of the trace starting point, +CWM +predicts a series of stack frames representing the Program states and the actions (executed code). +A core prerequisite for capturing Python execution traces and agentic trajectories in real-world software engineering tasks is executing code in repositories at scale. For isolation and repeatability, we build these repositories as Docker containers, referred to as +executable repository images +. These images contain a preconfigured environment capable of running repository code and tests without additional setup. +As manually building arbitrary GitHub repositories cannot scale to our desired dataset size, we apply both LLM- and CI-assisted methods. +For the former, an LLM-backed agent, denoted as RepoAgent, was tasked with setting up the development environment of a target repository, finding test files, and ensuring that a significant number of them could run and pass. To support its efforts, we provide RepoAgent with human-readable documentation extracted from the target repository. 
Although this further improves RepoAgent’s success rates, human-targeted documentation can suffer from inaccuracies due to lack of verifiability and insufficient maintenance incentives. In contrast, machine-targeted instructions must remain accurate for successful builds, with platforms like GitHub immediately signaling failures. +Therefore, we also developed the +Activ +(Act in virtual) pipeline to repurpose GitHub Actions CI execution for building executable repository images. This pipeline runs the workflows locally via the +act +(Lee, 2019) +library. Since many GitHub Actions workflows are not designed for third-party execution and not limited to CI builds, we modify the target repository’s source code and trigger an early exit after the completion of a single successful build. +As all GitHub Actions workflow jobs run simultaneously in individual containers and the build state is transient, we add or modify the repository’s pytest configuration files, to inject a fixture that is automatically run at test-execution time. This fixture captures the build state of the container running unit tests. We then commit and push the resulting image from each repository, as further detailed in +Section 13 +. +Running both RepoAgent and Activ methods in parallel, we created over +35k +unique executable repository images. +2.2 +Python tracing: neural code interpretation data +The first type of +CWM +data we present is memory tracing of Python programs. This involves gathering executable functions or executable repository images, and running them using different IO pairs or CI tests, while capturing the state of the memory, chiefly the local variables, after each line is executed. This process enables us to align code and execution trace to simulate observation-action data within the computational environment. 
Prior work empirically shows this approach is beneficial in improving general code generation and understanding capabilities +(Armengol-Estapé et al., 2025; Zhang et al.). +Neural code interpretation further has the potential to go beyond traditional interpreters, with applications such as tracing through unexecutable code or combining it with reasoning capabilities. +Next, we describe the different sources from which we gather execution trace data. +Function-level tracing. +We collect a dataset of Python functions from online sources and automatically generate input-output pairs with a combination of fuzzing and prompting Llama3-70B-Instruct. Our tracing process captures the state of the Python program (interpreter stack frames) at different intermediate execution points, corresponding to events of the Python interpreter (e.g., executed lines, return statements, exceptions). The final dataset contains over +120M +traced Python functions. +We post-process the raw traces to construct observation-action pairs. The observation contains local variables and stack frame metadata immediately prior to executing a line of code, the action is the specific Python line being executed, and the subsequent observation captures the resulting local variable states and additional event metadata such as return statements; we disregard global variables and external side effects. The variable values that do not change with respect to the previous step are summarized with an ellipsis. We prefix the trace data with the source code context. +Figure 3 +illustrates the CWM format for Python traces. Given a Python code context and a marker of the tracing initial point, the model follows with a series of Python stack frame predictions (in the form of a JSON-formatted dictionary with the local variables) and the corresponding actions (i.e., the part of the code that is being executed). 
The frame, action, and argument separators, as well as the trace context start indicator, are represented using custom tokens. +We refer the reader to Section 11 for trace prediction examples and Section 17 for a specification of this trace representation format. +We re-use both the tracing app and trace formatting for all other execution trace data described in the remainder of this section. +CodeContests solutions tracing. +We also generate tracing data for solutions to competitive programming problems. Concretely, we use Llama-3.1-70B-Instruct to generate Python solutions to training set problems in CodeContests (Li et al., 2022), reusing the framework of Gehring et al. (2025). Generations are filtered to ensure a balance of incorrect and correct submissions, leading to an overall count of 262k. We trace these solutions with inputs from the provided unit tests and filter out long traces with more than 10k line events or large traces taking up more than 1 MB of disk space, leaving us with 33k effective code snippets and 70k traces. +Repository-level tracing. +We also performed Python execution tracing for the unit tests of more than 21k available and traceable repository images. For a subset of these repositories, we use the repository’s git log to randomly select additional commits prior to our built commit. Since the build environment is configured for the current commit’s dependencies, older commits may fail to execute. We attempted tracing for up to 40 historical commits per repository but capped successful traces at 4 commits per repository to avoid over-representation of any single repository. This process resulted in around 70k execution-traced commits. +We post-process raw traces in two steps.
First, we “episodify” our traces, extracting function-level traces from raw pytest traces with configurable stack depth and stochastic step-in probability. When stochastic step-in occurs, function calls are probabilistically included in their parent trace rather than in a separate episode to simulate variable execution depth. In a second step, we then gather and compress the source code context from the target repository that is necessary for predicting the observation-action trajectory. +To the resulting context-trace pair, we then apply the same CWM formatting as before. +Natural language tracing. +Lastly, we generate a dataset of step-by-step descriptions of Python code execution in natural language rather than our strict JSON-like format from before. Natural language explanations of code execution are closer in domain to other LLM tasks, which we hope will simplify knowledge transfer to other contexts such as reasoning in code generation. This less-structured format also has other advantages, such as allowing for injection of semantic context (e.g., “this operation preserves the structure property of the max heap”) or for compressing traces by dynamically skipping less interesting parts of the trajectory (e.g., repeated logic within a for loop). We generate this data by prompting Qwen3-32B-FP8 (without thinking) (Yang et al., 2025a) to re-write execution traces from our function-level and CodeContests trace datasets. After removing cases where the final output prediction from Qwen diverges from the ground truth trace, we obtain 75M trajectories from standalone Python functions and 110k from CodeContests data. +2.3 ForagerAgent: agentic midtraining data generation +We mid-train CWM on a large-scale dataset of interactions between an LLM-based software engineering agent and a computational environment.
This data is generated with our so-called ForagerAgent, which collects multi-step trajectories by prompting an LLM with a software engineering task to solve in the context of a particular code repository. Exposing CWM to such data at large scale early on should improve subsequent post-training in similar environments, as model predictions should already be grounded in environment dynamics. +The actions available to the agent are derived from the standard SWE-Agent (Yang et al., 2024) toolset: (i) create a file, (ii) edit a file, (iii) run a bash command, and (iv) view or navigate inside a file. The trajectory is concluded once the LLM, either Llama3-70B-Instruct (Dubey et al., 2024) or Qwen3-235B-A22B (w/o thinking) (Yang et al., 2025a), believes the task has been solved or the number of tokens, turns, or API costs exceed a hard limit. Like the repository-level tracing data, ForagerAgent relies on our set of executable repository images (see Section 2.1) to seed problem generation. To avoid contamination, we filter out all repositories (and their forks) that are used in SWE-bench. The tasks presented to the model can be categorized into two groups: synthetic tasks and real-world tasks, which we call mutate-fix and issue-fix, respectively. +Mutate-fix tasks. For mutate-fix tasks, we start with a working codebase and then synthetically introduce a bug for the agent to fix. We begin by identifying functions (and methods – omitted for brevity below) that can be verified using the repository test suite. +As a first step, we filter these functions to the subset for which all unit tests pass successfully. +We then consider the following set of mutations to synthetically introduce a bug into these functions: +• Functions: remove either a portion of the function or the entire function. +• Arguments: remove arguments from the function definition or randomly re-order function call arguments.
+• Variables: sample a pair of variables in the function and swap all their occurrences. +• Statements: remove an import or return statement. +• Operators: replace operators (binary, unary, or boolean) in statements in the function. +We filter out mutations that cannot be applied for a given function by parsing the corresponding abstract syntax tree (AST). Lastly, we verify that applying the candidate mutation does in fact cause the associated unit tests to fail. +We can now use the mutation as a starting point for agentic data collection: we instruct the agent to inspect the mutated function, run its unit tests, and resolve the failing tests by fixing the bug. +Issue-fix tasks. For issue-fix tasks, we prompt the model to fix real issues in our set of repositories, using both issue and pull request data from GitHub. +We check out commits preceding bug-fixing PRs and task the agent with resolving failing unit tests, providing the corresponding GitHub issue descriptions for context. +We ensure unit tests are failing before the PRs and that their resolution is necessary and sufficient for addressing the issues. +Post-processing. +To avoid overfitting to repetitive interactions, we apply a near-deduplication of trajectories foraged from the same source repository: we first represent a trajectory by the concatenation of its actions, then encode the trajectory using MinHash, and lastly drop trajectories such that the pairwise Jaccard similarity for all encoded trajectories we keep is less than 0.5. Because our goal with the ForagerAgent data is to learn a comprehensive world model of agentic interactions with code environments, we do not filter trajectories based on whether they succeed at bug or issue resolution. Following the same motivation, we further train the model to predict both agent and environment turns, although we stochastically mask loss for 50% of observations as they exhibit limited diversity.
Overall, we are left with 3M trajectories obtained from 10.2k images, and we refer to Table 1 for more detailed statistics. +Table 1: Statistics of ForagerAgent trajectories. We gather 3M trajectories from 10.2k images and 3.15k underlying repositories. The trajectories are split 55–45 between issue- and mutate-fix tasks. +| Repos | Images | Trajectories | Issues-Fix | Mutate-Fix: Functions | Mutate-Fix: Arguments | Mutate-Fix: Variables | Mutate-Fix: Statements | Mutate-Fix: Operators | +| 3.15k | 10.2k | 3M | 55% | 7% | 9% | 6% | 11% | 12% | +3 Examples of code world modeling +Before introducing the CWM architecture and benchmark results more formally, in this section, we share a few example generations from the final model that illustrate our excitement for code world modeling. +For competitive programming, Figure 4 shows an example where CWM first creates an initial solution after reasoning about the problem statement, then constructs input-output pairs to assess the correctness of its solution, before finally comparing its own prediction against the actual results of program execution. +Although we did not train CWM for it directly, this is exactly the kind of reasoning about environment dynamics that we believe code world models can enable. +We are excited about future work that explicitly integrates reasoning about environment feedback to improve agentic code generation. +While CWM is trained to simulate the execution of Python code line-by-line, we believe the formal integration of trace prediction into natural language reasoning is interesting future work.
+Figure +˜ +5 +presents an early prototype of this, showing +CWM +already generalizes to such behavior under light prompting. +We believe that execution trace prediction enables +grounded +reasoning about code generation and execution, without requiring access to live execution environments. +This can take explicit forms, such as in our prototype, but we also envision more implicit forms of regularizing reasoning through world modeling. +We further plan to extend +CWM +’s trace prediction to create a “neural debugger” whose capabilities go beyond traditional debugging, such as jumping to future lines of code in constant time, predicting inputs to reach arbitrary states, or learning abstract representations of program states that support program verification, debugging, or generation. +Relatedly, we are excited about more direct uses of Python execution trace prediction to improve code generation. +Under appropriate prompting, +CWM +can be used to predict execution traces for functions that are not given directly in the context but are instead constrained only through natural language or formal asserts. +After trace prediction, the model then constructs the function body from the generated actions, which are the executed Python statements. +We show early examples of this in +Figures +˜ +11.26 +and +11.27 +. +This method of reasoning-via-tracing for program generation is reminiscent of work by +Hoare ( +1971 +); Dijkstra ( +1976 +) +on using logical reasoning about program execution to guide development rather than only using it to confirm correctness post-hoc. +⬇ +Okay +, +I +need +to +solve +this +problem +where +I +have +to +check +if +a +given +string +is +evenly +separated +. +The +definition +says +that +between +every +pair +of +the +same +characters +, +there +must +be +an +even +number +of +characters +. +So +, +for +example +, +in +" +abba +", +each +’ +a +’ +is +at +positions +0 +and +3. 
+The +distance +between +them +is +3-0 += +3, +but +the +number +of +characters +in +between +is +3-0-1 += +2, +which +is +even +. +So +that +works +. +But +in +" +aabba +", +the +first +two +’ +a +’ +s +are +at +positions +0 +and +1. +The +number +of +characters +between +them +is +0, +which +is +even +. +Let +me +double +check +. +But +the +example +says +it +’ +s +not +correct +. +Hmm +, +wait +maybe +I +’ +m +misunderstanding +the +problem +... +⬇ +< +tool +: +create +> +/ +code +/ +solution +. +cpp +# +include +< +bits +/ +stdc +++. +h +> +using +namespace +std +; +bool +is_evenly_separated +( +const +string +& +s +) +{ +unordered_map +< +char +, +int +> +prev_index +; +... + +⬇ +File +created +successfully +at +: +/ +code +/ +solution +. +cpp +(35 +lines +). +⬇ +... +Looking +back +, +the +code +reads +a +string +, +passes +it +to +the +function +. +Let +me +create +some +test +cases +. +Test +1: +’ +abba +’ +-> +expected +YES +-> +the +code +should +return +YES +. +Test +2: +’ +aabba +’ +-> +expected +NO +-> +because +when +it +processes +the +third +character +, +it +’ +s +the +third +occurrence +of +’ +a +’. +Wait +for +’ +aabba +’: +Indices +0: +a +( +occurs +once +) +index1 +: +a +now +( +occurs +twice +. +check +0 +and +1 +-> +difference +is +1 +-> +??? +But +difference +is +-1 +-> +mod +2 +is +-1 +% +Then +loop +reaches +index +4 +( +a +’ +s +third +occurrence +), +which +code +detects +count +for +a +which +is +already +2 +-> +returns +NO +. +So +code +should +output +NO +here +. +Other +test +: +’ +aaa +’ +( +NO +when +processing +third +a +). +test4 +: +’ +abcd +’ +-> +YES +( +each +character +appears +once +or +twice +in +non +- +same +parity +) +-> +code +OK +. +ttest5 +: +’ +aabb +’ +-> +the +two +a +’ +s +( +index +0 +and1 +-> +difference +1, +ok +). +Two +b +’ +s +( +index +2,3 +difference +-> +1 +is +ok +). +Code +returns +YES +. +Now +let +’ +s +compile +and +run +the +code +. 
+⬇ +< +tool +: +bash +> +g +++ +- +std += +c +++17 +- +O2 +solution +. +cpp +- +o +solution +; +echo +- +e +" +abba +\ +n +" +| +./ +solution +echo +- +e +" +aabba +\ +n +" +| +./ +solution +echo +- +e +" +aaa +\ +n +" +| +./ +solution +echo +- +e +" +abcd +\ +n +" +| +./ +solution +echo +- +e +" +aabb +\ +n +" +| +./ +solution + +⬇ +YES +NO +NO +YES +YES +⬇ +All +the +test +cases +match +the +expected +results +. +The +code +works +correctly +for +these +samples +and +edge +cases +. +For +very +large +inputs +, +the +code +has +O +( +n +) +time +and +O +(1) +space +( +since +there +are +26 +possible +characters +, +hash +maps +would +use +at +most +26 +entries +). +Therefore +, +the +solution +is +efficient +and +should +pass +all +test +cases +, +including +those +up +to +1 +e6 +characters +long +. +The +approach +checks +for +each +character +occurrence +and +ensures +that +each +pair +of +occurrences +meet +the +parity +requirement +. +Also +, +it +immediately +terminates +upon +finding +a +third +occurrence +or +a +pair +with +same +parity +indices +, +which +is +crucial +for +efficient +processing +. +I +’ +m +confident +that +there +are +no +bugs +, +and +the +solution +meets +the +problem +’ +s +requirements +. +⬇ +The +approach +involves +checking +each +character +’ +s +occurrences +to +ensure +that +no +character +appears +more +than +twice +, +and +any +pair +of +occurrences +of +a +character +must +be +an +odd +number +of +positions +apart +( +i +. +e +., +their +indices +have +different +parities +). +‘‘‘ +cpp +# +include +< +bits +/ +stdc +++. +h +> +using +namespace +std +; +bool +is_evenly_separated +( +const +string +& +s +) +{... +Figure 4 +: +Example of +CWM +solving a competitive programming problem in agentic fashion. The reasoning steps are shortened and some details are omitted due to space constraints. (Tool calls in purple, environment feedback in orange, and reasoning in blue.) 
+⬇ +Given +a +python +code +function +and +an +assert +statement +containing +a +specific +input +, +provide +the +assertion +with +the +exact +literal +output +that +the +function +returns +with +that +input +. +Do +not +include +any +mathematical +expressions +or +function +calls +-- +only +the +final +literal +value +. +Your +response +should +be +solely +the +assertion +, +enclosed +within +[ +ANSWER +] +and +[/ +ANSWER +] +tags +. +You +are +a +computational +world +model +and +can +predict +the +program +execution +. +Your +execution +trace +prediction +format +MUST +follow +this +structure +: +1. +The +execution +trace +prediction +starts +with +the +<| +trace_context_start +|> +token +and +ends +with +a +final +<| +frame_sep +|> +token +. +2. +For +each +code +execution +step +: +- +Begin +with +<| +frame_sep +|> +followed +by +the +event +token +which +can +be +<| +call_sep +|>, +<| +line_sep +|>, +<| +return_sep +|> +or +<| +exception_sep +|>. +- +After +<| +call_sep +|> +or +<| +line_sep +|> +put +the +local +variable +states +as +dictionary +in +JSON +format +followed +by +the +<| +action_sep +|> +token +and +the +current +source +code +line +. +- +After +<| +return_sep +|>, +<| +exception_sep +|> +directly +put +the +<| +action_sep +|> +token +and +the +current +source +code +line +followed +by +an +<| +arg_sep +|> +token +and +the +return +or +exception +arguments +. +3. +Provide +the +full +assertion +with +the +correct +output +that +you +obtained +after +<| +return_sep +|> +in +[ +ANSWER +] +and +[/ +ANSWER +] +tags +Here +is +an +example +of +how +you +would +predict +the +output +of +the +program +using +your +trace +prediction +capability +: +Python +function +: +def +f +( +a +, +b +): +y += +a +for +i +in +range +( +b +): +y ++= +y +* +i +return +y +assert +f +(1,3) +== +?? 
+< +think +> +your +internal +reasoning + +Let +’ +s +verify +this +by +putting +the +code +into +a +trace +context +and +call +the +function +in +the +main +() +function +and +then +trace +the +execution +of +the +main +function +. +We +indicate +the +entry +point +of +the +execution +trace +with +a +# +<< +START_OF_TRACE +marker +. +def +f +( +a +, +b +): +y += +a +for +i +in +range +( +b +): +y ++= +y +* +i +return +y +def +main +(): +# +<< +START_OF_TRACE +return +f +(1,3) +<| +frame_sep +|><| +call_sep +|>{}<| +action_sep +|> +def +main +(): +# +<< +START_OF_TRACE +<| +frame_sep +|><| +line_sep +|>{}<| +action_sep +|> +return +f +(1,3) +<| +frame_sep +|><| +call_sep +|>{" +a +": +"1", +" +b +": +"3"}<| +action_sep +|> +def +f +( +a +, +b +): +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +".."}<| +action_sep +|> +y += +a +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +"..", +" +y +": +"1"}<| +action_sep +|> +for +i +in +range +( +b +): +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +"..", +" +y +": +"..", +" +i +": +"0"}<| +action_sep +|> +y ++= +y +* +i +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +"..", +" +y +": +"..", +" +i +": +".."}<| +action_sep +|> +for +i +in +range +( +b +): +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +"..", +" +y +": +"..", +" +i +": +"1"}<| +action_sep +|> +y ++= +y +* +i +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +"..", +" +y +": +"2", +" +i +": +".."}<| +action_sep +|> +for +i +in +range +( +b +): +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +"..", +" +y +": +"..", +" +i +": +"2"}<| +action_sep +|> +y ++= +y +* +i +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +"..", +" +y +": +"6", +" +i +": +".."}<| +action_sep +|> +for +i +in +range +( +b +): +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +"..", +" +y +": +"..", +" +i +": +".."}<| +action_sep +|> +return +y +<| +frame_sep +|><| +return_sep 
+|><| +action_sep +|> +return +y +<| +arg_sep +|>"6"<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +f +(1,3) +<| +arg_sep +|>"6"<| +frame_sep +|> +Now +let +us +analyze +the +trace +. +The +return +argument +of +the +function +call +f +(1,3) +in +the +main +() +function +is +"6" +in +JSON +format +, +so +the +return +value +is +6. +[ +ANSWER +] +assert +f +(1,3) +== +6 +[/ +ANSWER +] +Python +function +: +def +f +( +d +, +k +): +new_d += +{} +for +key +, +val +in +d +. +items +(): +if +key +< +k +: +new_d +[ +key +] += +val +return +new_d +assert +f +({1: +2, +2: +4, +3: +3}, +3) +== +?? +⬇ +Let +’ +s +analyze +the +function +call +f +({1: +2, +2: +4, +3: +3}, +3). +The +function +f +takes +a +dictionary +d +and +a +key +k +. +It +creates +a +new +dictionary +new_d +and +iterates +over +the +items +in +d +. +For +each +key +- +value +pair +, +if +the +key +is +less +thank +, +it +adds +that +key +- +value +pair +to +new_d +. +In +this +case +, +the +dictionary +is +{1: +2, +2: +4, +3: +3} +and +the +key +is +3. +We +need +to +check +each +key +in +the +dictionary +to +see +if +it +’ +s +less +than +3. +- +Key +1: +1 +< +3 +is +True +, +so +we +add +1: +2 +to +new_d +. +- +Key +2: +2 +< +3 +is +True +, +so +we +add +2: +4 +to +new_d +. +- +Key +3: +3 +< +3 +is +False +, +so +we +skip +3: +3. +So +, +the +resulting +dictionary +new_d +should +be +{1: +2, +2: +4}. + +⬇ +Let +’ +s +verify +this +by +putting +the +code +into +a +trace +context +and +call +the +function +in +the +main +() +function +and +then +trace +the +execution +of +the +main +function +. +We +indicate +the +entry +point +of +the +execution +trace +with +a +# +<< +START_OF_TRACE +marker +. +def +f +( +d +, +k +): +new_d += +{} +for +key +, +val +in +d +. 
+items +(): +if +key +< +k +: +new_d +[ +key +] += +val +return +new_d +def +main +(): +# +<< +START_OF_TRACE +return +f +({1: +2, +2: +4, +3: +3}, +3) +<| +frame_sep +|><| +call_sep +|>{}<| +action_sep +|> +def +main +(): +# +<< +START_OF_TRACE +<| +frame_sep +|><| +line_sep +|>{}<| +action_sep +|> +return +f +({1: +2, +2: +4, +3: +3}, +3) +<| +frame_sep +|><| +call_sep +|>{" +d +": +"{1: +2, +2: +4, +3: +3}", +" +k +": +"3"}<| +action_sep +|> +def +f +( +d +, +k +): +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +".."}<| +action_sep +|> +new_d += +{} +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"{}"}<| +action_sep +|> +for +key +, +val +in +d +. +items +(): +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"..", +" +key +": +"1", +" +val +": +"2"}<| +action_sep +|> +if +key +< +k +: +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"..", +" +key +": +"..", +" +val +": +".."}<| +action_sep +|> +new_d +[ +key +] += +val +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"{1: +2}", +" +key +": +"..", +" +val +": +".."}<| +action_sep +|> +for +key +, +val +in +d +. +items +(): +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"..", +" +key +": +"2", +" +val +": +"4"}<| +action_sep +|> +if +key +< +k +: +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"..", +" +key +": +"..", +" +val +": +".."}<| +action_sep +|> +new_d +[ +key +] += +val +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"{1: +2, +2: +4}", +" +key +": +"..", +" +val +": +".."}<| +action_sep +|> +for +key +, +val +in +d +. 
+items +(): +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"..", +" +key +": +"3", +" +val +": +"3"}<| +action_sep +|> +if +key +< +k +: +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"..", +" +key +": +"..", +" +val +": +".."}<| +action_sep +|> +for +key +, +val +in +d +. +items +(): +<| +frame_sep +|><| +line_sep +|>{" +d +": +"..", +" +k +": +"..", +" +new_d +": +"..", +" +key +": +"..", +" +val +": +".."}<| +action_sep +|> +return +new_d +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +new_d +<| +arg_sep +|>"{1: +2, +2: +4}"<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +f +({1: +2, +2: +4, +3: +3}, +3) +<| +arg_sep +|>"{1: +2, +2: +4}"<| +frame_sep +|> +Now +let +us +analyze +the +trace +. +The +return +argument +of +the +function +call +f +({1: +2, +2: +4, +3: +3}, +3) +in +the +main +() +function +is +"{1: +2, +2: +4}" +in +JSON +format +, +so +the +return +value +is +{1: +2, +2: +4}. +[ +ANSWER +] +assert +f +({1: +2, +2: +4, +3: +3}, +3) +== +{1: +2, +2: +4} +[/ +ANSWER +] +Figure 5 +: +Example with execution trace prediction and reasoning. In this example, we add an execution trace example to the prompt. After reasoning in natural language about the code, the model uses its execution trace prediction capability to confirm the correct return value +{1: 2, 2: 4} +. We encode all special tokens (e.g. +<|frame_sep|> +) as such. +We refer to +Section +˜ +11 +for additional example trajectories from +CWM +, including self-correcting SWE behavior, a comparison of the advantages and disadvantages of strict trace prediction compared to natural language (and why we think their combination is worth exploring), further demonstrations of how we can adapt trace prediction to elicit code generation, and an example of using +CWM +to predict program termination. 
+(a) CWM architecture: GQA with alternating local (8k) and dynamic global (131k) sliding window attention. +(b) Overview of the CWM training and inference types for inputs and outputs. +Figure 6: Figures illustrating the CWM Transformer architecture and the main types of data introduced in the different training steps and used at inference time. +4 CWM: architecture, pre-training, and scaling laws +We next share details on the CWM architecture, final pre-training recipe, and scaling law experiments. Specific details about our efficient training infrastructure can be found in Section 6. +4.1 Architecture and hyper-parameters +Model architecture. +CWM is a 32-billion-parameter dense decoder-only model. +We choose a dense architecture over sparse alternatives for ease-of-use in downstream open source research. +CWM uses an alternating pattern of local and global attention blocks interleaved in a 3:1 ratio (see Figure 6(a)) with sliding window sizes of 8192 and 131 072 tokens respectively. Transformer blocks use Grouped-Query-Attention (Ainslie et al., 2023) with 48 query heads and 8 key-value heads. We use SwiGLU activation functions (Shazeer, 2020), RMSNorm (Zhang and Sennrich, 2019) with pre-normalization, Rotary Positional Embedding (RoPE) (Su et al., 2021), and we train with full document-causal masking. To support long-context modeling, we follow Roziere et al. (2023) and Xiong et al. (2023) and apply Scaled RoPE with $\theta = 1\,\mathrm{M}$ and scale factor 16 from mid-training onwards. We give a full overview of CWM parameters and architecture choices in Table 2. +Training hyper-parameters.
+We train the model with the AdamW optimizer (Loshchilov and Hutter, 2019) with $\beta_{1}=0.9$, $\beta_{2}=0.95$, weight decay of 0.1, and gradient clipping at norm 1.0. +After 2000 steps of linear warmup, we use a cosine decay learning rate schedule with peak learning rate $8\times10^{-4}$ and decaying by a factor of $100\times$ over the training horizon. The cosine decay schedule is calculated for a total training duration of 13T tokens, with the last 5T tokens of the scheduler used during mid-training. +Key hyper-parameters were determined using scaling laws, which we detail in Section 4.3. +Table 2: Key hyper-parameters of the 32B CWM. +| Parameter | Value | +| Number of parameters | 32B | +| Layers | 64 | +| Hidden dimension | 6144 | +| Intermediate dimension | 21 504 | +| Number of attention heads / dimension | 48 / 128 | +| Number of key-value heads | 8 | +| Local window size | 8192 tokens | +| Max global context | 131 072 tokens | +| Activation function | SwiGLU | +| Normalization | RMSNorm (pre-norm) | +| Positional Encoding | Scaled RoPE ($\theta=10^{6}$, scale factor $=16$) | +| Vocabulary size | 128 256 tokens | +Tokenizer. +CWM uses the Llama 3 tokenizer (Dubey et al., 2024) which is a fast Byte-Pair Encoding tokenizer implemented with TikToken.[2] +[2] See https://github.com/openai/tiktoken. +The vocabulary contains 128 000 regular tokens as well as 256 reserved tokens. +We keep the control tokens from Llama 3 and leverage unused reserved tokens to support our tracing and reasoning use cases. +4.2 Two-stage pre-training +CWM pre-training consists of two stages sharing learning-rate scheduler and optimizer states but differing in datamix and maximum document lengths: +1.
+General pre-training: We begin with an initial pre-training phase on 8T tokens from a diverse range of mostly English sources, with an emphasis on coding data (making up about 30% of the mix) as well as STEM and general knowledge. We pre-train our model with a global batch size of 8.4M tokens and a context length of 8192 tokens.[3] +[3] Note that our “local” attention blocks are therefore effectively global during pre-training. +2. Code world model mid-training: We then mid-train the model for an additional 5T tokens. We here depart from our more generalist pre-training datamix and introduce a number of datasets in support of our code world modeling objectives. We mid-train with a global batch size of 33M tokens and maximum context length of 131k tokens.[4] +[4] We have observed lackluster performance when training on long-context data at smaller batch sizes. We speculate that increasing the batch size (in tokens) is beneficial for training on long-context data, as the decrease in the number of documents contained in each batch increases the variance in our gradient estimate. +Mid-training is the key stage for teaching code world modeling capabilities. +Next, we discuss the changes we make to the pre-training recipe during mid-training to optimize CWM performance. +Mid-training datamix. +For mid-training, we introduce the ForagerAgent and Python execution tracing data, our main CWM datasets introduced in Section 2, into the datamix.
We additionally include code- and reasoning-related data such as datasets derived from GitHub pull requests similar to SWE-RL (Wei et al., 2025), data from compiler intermediate representations (Cummins et al., 2024), Triton PyTorch kernels similar to Paliskara and Saroufim (2025), and formal mathematics in Lean covering statement and proof translation, as well as world modeling (see Section 18). +CWM-specific data makes up 30% of the overall mid-training datamix. +We further increase the fraction of general code data to 40% and keep 30% for rehearsal of the initial pre-training datamix, as this proved essential in retaining performance on standard evaluations. +Within the rehearsal fraction, we now upweight higher quality datasets such as those containing math or long context data, while making sure to avoid over-epoching. +We summarize the types of data used across CWM training stages in Figure 6(b). +Mid-training datamix ablations. +For many of the datasets introduced during mid-training we can afford to train for multiple epochs. To determine the desired number of epochs per dataset, we perform a series of scaling law experiments (Kaplan et al., 2020) that simulate different levels of epoching (Dubey et al., 2024). In agreement with the literature on this (Muennighoff et al., 2023), we generally find that multi-epoch training improves downstream task performance, albeit at diminishing returns, before eventually leading to overfitting. By selecting target epochs such that metrics indicate little to no diminishing returns, we arrive at between 1 and 4 target epochs per dataset. +The final proportion of a dataset in the mid-training mix is then calculated such that the desired number of epochs is reached at the end of mid-training.
+When estimating the number of steps per epoch, we account for both token packing (wrapping) for pre-training data and truncation for chat data. +Long-context mid-training. +A significant amount of the mid-training data is long-context, with about +30 +% +30\text{\,}\mathrm{\char 37\relax} +of documents exceeding +65 +k +65\text{\,}\mathrm{k} +tokens. +This motivates our decision to increase the maximum sequence length to +131 +k +131\text{\,}\mathrm{k} +tokens for all of mid-training. +Consequently, we do not need a dedicated long-context finetuning phase common in many other recipes +(Yang et al., +2025a +; Agarwal et al., +2025 +; Dubey et al., +2024 +) +. While our local-global pattern reduces the cost of long-context attention (see +Section +˜ +4.1 +), we still found that data-parallel workers with short-context documents would often wait for ranks with long-context data during distributed training. To improve iteration speeds, we “bucketize” all documents by sequence length, ensuring all workers draw documents from the same bucket at a given step. We choose the bucket boundaries as +( +0 +, +16385 +] +(0,16385] +, +( +16385 +, +65537 +] +(16385,65537] +, and +( +65537 +, +∞ +) +(65537,\infty) +tokens and take care that the marginal probability of sampling a dataset is unchanged from bucketization. Note that, to achieve further speedups, we limit the maximum global attention size to +32 768 +32\,768 +in the medium bucket. +4.3 +Scaling laws +Scaling laws for LLMs that predict model performance as a function of compute, data, and model size have been studied extensively +(Kaplan et al., +2020 +; Hoffmann et al., +2022 +; Bi et al., +2024 +) +. These empirical laws enable the estimation of the expected loss for a given compute budget, the identification of the optimal scaling strategy between model and data size, and an informed selection of training hyper-parameters. Following +Bi et al. 
(2024), we develop scaling laws for optimal hyper-parameter prediction for the pre-training of CWM.
We adopt the compute budget formula $C = M \cdot D$, where $M$ is the model size represented as the number of non-embedding FLOP per token and $D$ is the data scale corresponding to the total number of training tokens. For a decoder-only Transformer, the number of FLOP per token is approximated by
$$M = \underbrace{6N_{\text{ne}}}_{\text{linear term}} + \underbrace{6dLS}_{\text{attention term}}, \tag{1}$$
where $N_{\text{ne}}$ is the number of parameters excluding embeddings, $d$ is the model hidden dimension, $S$ is the sequence length, and $L$ is the number of layers. This formula explicitly accounts for the computational cost of self-attention, which constitutes a significant portion of the total compute, especially for smaller models and longer contexts where attention overhead is relatively more pronounced, as discussed in Bi et al. (2024). We refer to Section 14 for further detail.
Recent LLMs are trained beyond data-optimal regimes (Dubey et al., 2024) to optimize inference costs and produce smaller yet capable models. Gadre et al. (2024) show that models scale predictably for a fixed model-to-data ratio and advocate for scaling laws that mirror the setting of the final pretraining run. Therefore, we maintain a fixed model-to-data ratio of $D/M = 40$ across compute budgets, matching the target ratio of our $32$B parameter model pre-trained on $8\,\mathrm{T}$ tokens. This ratio is roughly $8$ times more data than would be compute optimal according to the Chinchilla paper (Hoffmann et al., 2022).
We conduct a quasi-random search over batch size and learning rate across eight increasing compute scales, ranging from $2\times 10^{18}$ to $2\times 10^{20}$ FLOP. For each scale, we keep the configurations within 1% of the best validation loss and fit the batch size $BS$ and learning rate $LR$ with respect to the compute budget $C$. Consistent with prior work, $BS$ grows and $LR$ declines gradually with $C$, while near-optimal hyper-parameters span a broad range. However, likely due to our different pre-training data, our equations for learning rate and batch size diverge from Bi et al. (2024):
$$\begin{split} LR(C) &= 19.29 \cdot C^{-0.177},\\ BS(C) &= 30.17 \cdot C^{0.231}. \end{split} \tag{2}$$
See Section 14 for additional details.

5 Post-training: SFT, RL algorithms and environments

Our post-training phase improves CWM’s ability to solve complex programming-related problems with reasoning, building on the internal code world model learned during earlier training stages. Concretely, we first perform supervised finetuning (SFT) to improve both reasoning and general instruction-following capabilities. We then carry out large-scale multi-task multi-turn reinforcement learning on coding contests, math questions, and software engineering environments. We describe the SFT stage, our RL algorithms, data and environments, and detail our joint RL training recipe. As we do not intend to develop a general-purpose chatbot, we deliberately omit an RLHF stage.

5.1 SFT

We perform SFT for $100\,\mathrm{B}$ tokens, distributed across $50\,\mathrm{k}$ steps with a global batch size of $2\,\mathrm{M}$ tokens and $32\,\mathrm{k}$ token sequence lengths.
+We share optimization hyperparameters with pre-training but change the learning rate schedule to +1 +k +1\text{\,}\mathrm{k} +steps of linear warmup followed by a constant learning rate of +1 +× +10 +− +5 +1\text{\times}{10}^{-5} +. +In preliminary experiments, keeping a constant learning rate achieved similar evaluation metrics to annealing with cosine schedules while enabling high learning rate training during RL. +We further observed a performance decrease when SFT-ing at longer sequence lengths. +We suspect this is due to the configuration of our dataloader which always sequence-packs inputs (per data-parallel rank and local batch) from a single dataset. For very small datasets and large context sizes, this reduces the amount of unique steps such datasets can be observed, which may negatively affect performance. +Datamix. +We train on a diverse mix of internal and open-access data during SFT, including standard instruction-following datasets. +About +30 +% +30\text{\,}\mathrm{\char 37\relax} +of the datamix is rehearsal from mid-training (which itself includes +30 +% +30\text{\,}\mathrm{\char 37\relax} +pre-training data). This is to avoid overfitting to the SFT distribution ahead of RL and retain +CWM +capabilities taught in mid-training. +Our datamix also contains agentic SWE RL trajectories (see +Section +˜ +5.3.1 +), some of which have been rejection-sampled from earlier iterations of the +CWM +itself. We have generally found it useful to iteratively improve the starting point for RL by including trajectories from earlier iterations in the next SFT. Similarly, we include external datasets with reasoning traces, as we have found the performance benefit from them carries through to our final post-RL model. Specifically, we use the OpenMathReasoning +(Moshkov et al., +2025 +) +and OpenCodeReasoning +(Ahmad et al., +2025 +) +datasets that rely on DeepSeek-R1 +(Guo et al., +2025 +) +. +Reasoning tokens. 
+For SFT training on reasoning data, we introduce +<|reasoning_thinking_start|> +and +<|reasoning_thinking_end|> +tokens that surround any reasoning text. Because we mask the loss on all +<|reasoning_thinking_start|> +tokens, the model does not learn to generate them. +This enables both reasoning and non-reasoning behavior for the +CWM +-SFT model: non-reasoning mode is active by default and reasoning mode can be activated by injecting +<|reasoning_thinking_start|> +into the beginning of assistant responses. +Note that we discontinue the use of these reasoning tokens during RL as explained in +Section +˜ +5.3 +. +5.2 +RL algorithm +We use a variant of Group Relative Policy Optimization (GRPO) to train +CWM +(Shao et al., +2024 +) +. +GRPO is a policy gradient method that uses the PPO loss +(Schulman et al., +2017 +) +in combination with Monte Carlo value estimation instead of a value model, as used in PPO. Many works have proposed improvements to and fixes of GRPO +(Yu et al., +2025 +; Liu et al., +2025 +; Hu et al., +2025 +; Mistral-AI et al., +2025 +) +. +We incorporate a number of these and include further changes to support multi-turn RL and efficient asynchronous RL. A formal description of our RL algorithm can be found in +Section +˜ +12 +. +Differences from GRPO. +We deviate from the original GRPO algorithm in the following ways: +• +Multi-turn: +GRPO was originally developed for single turn (prompt +→ +\rightarrow +response) environments. Instead, we use a multi-turn variant where the sequence contains both model- and environment-generated tokens after the prompt, resulting in the need for masking via +M +i +, +t +M_{i,t} +. Furthermore, whereas GRPO used the +reward +r +i +r_{i} +, we use the +return +R +i +R_{i} +(sum of rewards) in the advantage calculation. 
• Asynchronous: Whereas GRPO uses a synchronous setup, where nodes switch between generating batches of completions and training on them, we use asynchronous RL, resulting in much higher throughput.
• No $\sigma$ normalization: GRPO calculates the advantage as the centered and scaled terminal reward $\hat{A}_{i} = (r_{i} - \mu)/\sigma$, where $\mu$ and $\sigma$ are the mean and standard deviation of rewards in the batch. This introduces a difficulty bias (Liu et al., 2025), which we avoid by using the more conventional $\hat{A}_{i} = (R_{i} - \mu)$.
• No length normalization: As noted by Liu et al. (2025), dividing the loss by the trajectory length as done in GRPO leads to a length bias, whereby the agent is incentivized to increase the length on hard problems so as to lower the average loss. To avoid this bias, we divide by the maximum number of tokens in a trajectory, which matches the maximum context size of our model, $N = 131072$.
• Batching strategy: We batch by a maximum token limit instead of the common approach that keeps a fixed number of trajectories per batch. This change aims to improve efficiency and to stabilize training by lowering the variance in batch size between different optimization steps – this is important in combination with our removal of length normalization. As a result, different trajectories that belong to the same group might contribute to separate optimization steps. The proportion of groups that get split can be decreased by increasing the maximum token limit or the number of gradient accumulation steps, but in practice we observe this not to be an issue even when more than half of the groups are split.
• Clip-higher: Following Yu et al. (2025), we use a higher upper clip value, $\varepsilon_{\text{high}} = 0.25$ and $\varepsilon_{\text{low}} = 0.2$, to prevent entropy collapse.
• No KL: Using clip-higher to prevent entropy collapse, we found it unnecessary to use KL regularization.[5]
[Footnote 5: We recommend using the k2 estimator (Schulman, 2020), whose gradient is an unbiased estimator of the forward $\operatorname{KL}(\pi_{\theta}, \pi_{\text{old}})$, rather than the k3 estimator used by GRPO, whose gradient is an unbiased estimator of the reverse $\operatorname{KL}(\pi_{\text{old}}, \pi_{\theta})$.]
• Skip zero-advantage trajectories: The effective batch size is the number of tokens that do not have zero advantage and contribute to the gradient. We reduce variance in the effective batch size by skipping all zero-advantage trajectories.
• Skip stale trajectories: To limit the degree of off-policyness, we skip trajectories whose most recent tokens were generated from a policy more than $100$ training steps behind the current policy.
• Weighted mean return: We found that longer trajectories are more likely to fail (Hassid et al., 2025), leading to the majority of tokens having a negative advantage. To avoid biasing the token-averaged return, we compute $\mu$ as a length-weighted average.
• Gibberish detection: While gibberish typically leads to lower rewards and naturally decreases at the beginning of RL, it can increase later when some successful gibberish trajectories get reinforced, especially for agentic SWE RL. So we explicitly reject any trajectory containing any token $y_{t}$ that is both rare and generated with low probability: $\operatorname{id}(y_{t}) > 100{,}000$ and $\operatorname{logprob}(y_{t}) < -\log(128{,}256) - 2$, where $128{,}256$ is the vocabulary size and the thresholds are tuned for high precision. Gibberish typically consists of a window of tokens generated at low probability.
BPE tokens are sorted by merge order, where a large $\operatorname{id}$ corresponds to rare tokens. Generating such tokens suggests that the model is generating at high entropy and over-weighting rare tokens. This method stopped any increasing gibberish generation and performed better than detectors based on logprob and position alone.

5.3 RL environments & data

We consider four types of RL tasks: Agentic software engineering (SWE) (Section 5.3.1), Coding (Section 5.3.2), Agentic coding (Section 5.3.3), and Mathematics (Section 5.3.4). Each RL task is defined by a dataset (containing prompts, a verification suite like unit tests, and additional metadata) and an environment that the agent interacts with. We integrate these tasks into a joint RL training phase which we detail in Section 5.4. We further refer to Section 6.2 for implementation details regarding our environments and RL training infrastructure.
Our environments constitute partially observable Markov decision processes: a language model is employed as an agent, producing actions based on the preceding sequence of action-observation pairs and an initial prompt. All environments for training CWM utilize software-based verification of outcomes, producing a single terminal reward signal per rollout. In the remainder of this section, we describe the specification of each environment, i.e., prompt and observation design, reward function, and the corresponding datasets.
During reasoning RL, we discontinue the use of SFT reasoning tokens and replace them with clear-text `<think>` and `</think>` tags. Early RL experiments on top of the SFT model showed long initial reasoning traces and slow improvements. We attribute this to our SFT reasoning data, which enhances reasoning performance but limits exploration during RL training. Switching out reasoning tags resulted in shorter responses, higher starting entropies, and significantly improved final performance.
This suggests our approach leads to a best-of-both worlds scenario: the model’s familiarity with reasoning responses from SFT enables rapid improvements early on in RL, while the introduction of the new tokens allows the model to develop its own reasoning that is guided – but not restricted – by the SFT data. +5.3.1 +Agentic SWE +Figure 7 +: +SWE RL design. An agent solves software engineering tasks end-to-end through long-horizon agent-environment interactions via reasoning and tool use (up to 128 turns and +131 +k +131\text{\,}\mathrm{k} +context size). SWE RL employs a minimal toolset: +bash +as the core, with +edit +, +create +, and +submit +as lightweight +bash +plugins. The reward combines hidden test outcomes with patch similarity, where the similarity reward is applied when tests fail to provide auxiliary learning signals. +Design. +Agentic SWE RL substantially improves our model’s software reasoning and engineering capability (e.g., on SWE-bench Verified +(Jimenez et al., +2024 +) +) by enhancing the model’s agentic reasoning and tool-use skills (see +Section +˜ +20 +for the capability evolution during RL training). Its philosophy is to remain simple yet general: an LLM agent tackles a task end-to-end through reasoning and tool execution, without relying on task-specific post-processing. +The same design is applied to the agentic coding environment (see +Section +˜ +5.3.3 +). Each SWE RL trajectory has a single human user turn (besides the system prompt) containing the issue description and multiple turns of agent-environment interactions. +During training, we allow long-horizon interaction, with a maximum of +128 +128 +turns over a context window of +131 +k +131\text{\,}\mathrm{k} +tokens. 
+As shown in +Figure +˜ +7 +, the agent is equipped with four tools to solve a given task (e.g., software issue), where we embrace a minimal tool design centered on bash and editing, inspired by Sonnet 3.5 +(Anthropic, +2025 +) +: +• +bash +: executing commands in a stateful shell session, +• +edit +: modifying an existing file using the search/replace format used by Agentless +(Xia et al., +2024 +) +and Aider +(Aider Team, +2025 +) +, +• +create +: creating a new file in the sandbox, and +• +submit +: marking something (e.g., a file path) as the final submission according to the task requirement. +The runtime implementation of the tools follows SWE-agent +(Yang et al., +2024 +) +and OpenHands +(Wang et al., +2025 +) +, where +bash +is a stateful shell session running in a persistent server process, and serves as the main component, while other customized tools are treated as plugins that can be “de-sugared” into simple bash commands. For example, the +edit +and +create +tools are two standalone Python scripts, and the +submit +tool, when used for file paths, reduces to +cat +to retrieve the file content. +Figure 8 +: +SWE RL interaction example. +The agent interacts extensively with the repository sandbox through reasoning, exploration, editing, and test execution, submitting a final patch using +git diff +along with a summary. +The user prompt includes custom instructions for resolving software issues. +For example, the prompt shown in +Figure +˜ +8 +asks the agent to “…thoroughly explore the codebase, think hard, and leverage significant execution to verify correctness by writing comprehensive tests to validate your solution…”, which it follows in its subsequent actions. Notable differences from prior designs are that (1) our agent must generate the complete end-to-end patch directly via +git diff +rather than relying on task-specific post-processing, and (2) it must also produce a summary explaining how it resolves the issue, to improve clarity and usability. 
We also retain all reasoning turns for logical coherence. +Reward. +We adopt a hybrid reward for SWE RL. When all the hidden tests pass, the reward is +1 +1 +. If not, we adopt the patch similarity reward used in the SWE-RL paper +(Wei et al., +2025 +) +. Unlike the SWE-RL paper, which uses a continuous reward value, we apply a discrete and threshold-based design to improve the training stability by avoiding rewarding low-similarity patches. +In detail, when the computed similarity is above the threshold of +0.5 +0.5 +, the reward is +0 +, otherwise, the reward is +− +1 +-1 +. +This reward shaping showed benefits in early ablation, because a higher patch similarity incentivizes the model to localize the actual bugs more precisely and to produce a closer fix to the oracle patch. +This also helps the model to gain more learning signals from difficult issues for which it cannot produce any test-passing patch. +Data self-bootstrapping. +In SWE RL, +CWM +is required to solve software issues as a +reasoning agent +(i.e., through both reasoning and tool use). +However, such data does not exist in the public and the format is completely new to the model before RL and different to our ForagerAgent data, so early iterations of our model struggled to interact with the software environment across long horizons without making format errors. To address this, we perform an iterative self-bootstrapping process to collect high-quality agentic reasoning traces and supply them back to the joint SFT stage so that +CWM +can have a better prior distribution before RL. This process not only helps with format adherence but also significantly improves our model’s software engineering capability both before and after RL. +Figure 9 +: +SWE RL self-bootstrapping. Starting from a pre-RL checkpoint, we iteratively perform RL, rejection-sample high-quality reasoning traces, and feed them back into SFT. 
This process improves data quality and format adherence across iterations, raising success rates, and providing stronger initialization for joint RL. +As shown in +Figure +˜ +9 +, we start from a pre-RL +CWM +checkpoint (not the final +CWM +SFT) that has not been SFT-ed on any SWE trajectories in the reasoning agent format. +Over three main iterations, we perform RL and use the RL-ed model (i.e., the SWE expert in the figure) to do rejection sampling. We then select high-quality traces from the rejection samples using custom heuristics (e.g., long trajectories that pass all hidden tests without any tool use errors). Next, we perform SFT with this data on top of the original model. +This filtering helps reduce biases that RL fails to eliminate, such as the tendency to make editing mistakes. Then, iteratively, we start RL with the new SFT-ed model and collect higher-quality traces for the next round. Eventually, we include the final set of the traces into the joint SFT mix to prepare for the final joint RL. +This results in the final +CWM +SFT model. +For each iteration, we redo SFT on the original midtraining checkpoint and discard old trajectories. +Importantly, we find that the bootstrapped data greatly improve the performance of the SFT checkpoint on SWE-bench Verified. Without SWE RL traces, the SFT model hardly resolves any issues due to format errors. With more iterations of bootstrapping, the data quality improves significantly, and the success rate increases from +30 +30 +%, to +37 +37 +%, and to +43 +43 +% pass@1 over SWE-bench Verified. During earlier iterations, we record the offline pass rate for each instance and use it as the GRPO baseline in later iterations. This lets us set the group size to 1 and speed up each epoch. We find this technique leads to faster SWE RL training. In the final joint RL, we still perform online estimation of the GRPO baseline for consistency with other environments and for a higher performance ceiling. 
+Data sourcing and filtering. +We reuse the executable repository images from our mid-training data generation efforts ( +Section +˜ +2.1 +). Since issue solving requires additional metadata (e.g., issue text, base commit hashes, and diff patches), we join these repositories with publicly available issue and pull request metadata to create repository-issue pairs. The +git log +history enables us to create one-to-many repository-issue pairs. We also include publicly available training data such as SWE-Gym +(Pan et al., +2025 +) +and R2E-Gym +(Jain et al., +2025b +) +, further filtered by us for quality (e.g., removing non-verifiable instances whose tests cannot pass). All training data are decontaminated against SWE-bench Verified at repository-level granularity, see +Section +˜ +15 +. +We estimate the difficulty of each instance using the pass@1 score from +CWM +SFT, calculated over at least +32 +32 +samples. Instances with a pass@1 above +95 +95 +% are filtered out as easy, while those with a non-zero pass@1 are included in the primary dataset. Instances with a +0 +% pass@1 are placed in a secondary dataset that is sampled less frequently at the beginning. To make these hard problems solvable, we augment their prompts by adding the hidden test as a hint. This augmentation increases the pass@1 rate from +0 +% to approximately +30 +30 +%. Later in training, we remove hints from hard instances so the model learns to solve them from scratch. Finally, this process yields +12.6 +k +12.6\text{\,}\mathrm{k} +unique training instances: +6.9 +k +6.9\text{\,}\mathrm{k} +in the primary set and +5.7 +k +5.7\text{\,}\mathrm{k} +in the secondary. +5.3.2 +Coding +Figure 10 +: +Prompt template for math (left path) and competitive programming (right path) RL tasks. +Design. +RL for competitive programming aims to teach the model to write correct programs for challenging tasks and to reason about code and algorithms. 
Our competitive programming environment presents the problem to the agent in the first turn and optionally allows follow-up attempts, during which the environment provides execution feedback. It supports multiple programming languages and provides detailed feedback on syntax errors, timeouts, and incorrect test outputs. The environment terminates either when the maximum number of turns is reached or when the agent produces a correct solution. In the joint RL run, we limit the number of attempts to one but allow up to $64\,\mathrm{K}$ tokens in responses to enable extensive reasoning.
We adapt a lightweight prompt template which is shown in Figure 10. The system prompt asks for reasoning delimited by `<think>...</think>` in clear-text. The user prompt specifies the programming language and instructs the agent to put the code solution inside a markdown block.
Reward. We assign a reward of $-1$ for incorrect trajectories and $1$ for correct ones. A trajectory is correct if it meets all of the following criteria:
• Contains exactly one `</think>` tag, signaling successful reasoning completion.
• Contains exactly one markdown block in the model’s generated answer.
• The code solution passes all unit tests within the specified time and memory limits. We execute the unit tests in parallel using an internal code execution service on remote machines.
Data sourcing and filtering. We source coding problems from various programming contest websites. A problem typically consists of a problem description, limitations on the input and output domains, memory and time limits, and input-output examples. In addition, each problem comes with a set of tests that we use to verify the correctness of candidate solutions: a solution is considered correct only if it produces the expected output for every test input.
We decontaminate the coding problems against test benchmarks and de-duplicate them to ensure that each training problem is unique.
In both cases, we use MinHash-based similarity detection +6 +6 +6 +https://github.com/serega/gaoya +, applying word- or character-based matching depending on the length of each document. This process ensures the integrity of our evaluations, which is especially important for code generation and mathematical problems, where data contamination can significantly affect performance metrics. +We use Llama-3.3-70B-Instruct to identify and remove poorly posed problems, such as those containing gibberish, missing or truncated problem statements, or lacking input/output descriptions. We do not apply any difficulty-based filtering. After decontamination (see +Section +˜ +15 +), the final code RL dataset has +81 +k +81\text{\,}\mathrm{k} +prompts. +5.3.3 +Agentic coding +Figure 11 +: +Example interaction for the agentic coding RL environment. The agent uses reasoning and tools to solve competitive programming problems. Before generating a final solution, the agent summarizes the interaction. +The agentic coding environment combines the reasoning and tool use features of the SWE RL environment ( +Section +˜ +5.3.1 +) with the competitive programming setup described in +Section +˜ +5.3.2 +. The user prompt is customized for solving competitive programming tasks and explicitly asks the agent to write and run tests to check and improve the solution, as illustrated in +Figure +˜ +11 +. Different from SWE RL, there is no +submit +tool in this environment. Instead, the agent needs to provide the solution in its final response, which is then extracted for evaluation. We consider two programming languages, Python and C++, where the images used for agentic interaction are +python:3.11-slim +for Python and +python:3.11-bookworm +(with +gcc 12 +support) for C++. +The final solution is then evaluated using the same execution infrastructure as for the competitive programming environment. +5.3.4 +Mathematics +Design. 
Although not the main focus of this research work, we consider mathematical reasoning as another RL task to further strengthen and generalize CWM’s reasoning capabilities. We restrict these problems to questions that have definitive and easy-to-verify answers. Both the questions and answers are formulated in LaTeX, similar to much of the math content found on the web.
We also include a tool-enabled version of the math environment, adhering to the format described in Section 17. Here, the agent may invoke the Python interpreter with custom code. Standard output and error contents will form the next observation and the agent is prompted to continue solving the task. We impose a limit of $4$ tool calls per episode and a $10\,\mathrm{s}$ timeout per call.
The prompt template shares its system prompt with the coding environment but differs in the user prompt. As shown in Figure 10, the user prompt instructs the agent to place the final answer inside a LaTeX box.
Reward. Every trajectory is classified as either correct (reward $= 1$) or incorrect (reward $= -1$). Correctness is defined as:
• Exactly one `</think>` tag, signaling successful reasoning completion.
• Exactly one `$\boxed{}$` for the predicted answer.
• Our verifier emits True for the comparison between the predicted answer and the ground-truth answer.
Since there is no general normal form for mathematical expressions, the verifier checks whether the predicted answer is equivalent to the ground truth answer using a variety of heuristics detailed in Section 16.
Data sourcing and filtering. We gather math questions and answers from publicly available sources. To remove duplicates, we use the MinHash LSH algorithm to identify similar problems and verify that they have the same answer using our verifier.
We also filter out problems that were solved correctly in all attempts ( +32 +32 +out of +32 +32 +) by our SFT model in non-reasoning mode. This helps avoid wasting compute on easy problems and reduces the risk of reinforcing incorrect reasoning followed by a correctly memorized answer. The prompt set used for RL training contains a total of +278 +k +278\text{\,}\mathrm{k} +problem-answer pairs. +5.4 +Joint RL +Figure 12 +: +Async RL systems overview. Worker nodes generate trajectory batches from multiple RL environments and send them to trainer nodes via a transfer queue. Trainer nodes form training batches either from worker-provided data or the rehearsal mix, packing trajectories up to the maximum context length for a single gradient update. Environment execution and verification can occur locally on worker nodes or remotely on another cluster or in the cloud. +Finally, we train +CWM +using all of the above-mentioned RL tasks. As shown in +Figure +˜ +12 +, joint RL uses our asynchronous RL infrastructure: worker nodes generate +G +G +trajectories per prompt from multiple RL environments and send them to trainer nodes through the data transfer queue. The trainer nodes then form training batches either from these worker-provided batches or directly from the SFT datamix ( +Section +˜ +5.1 +) for rehearsal. We refer to +Section +˜ +6.2 +for further detail on engineering aspects of our asynchronous RL infrastructure. +Data and RL environment mix. +Worker nodes produce trajectories from three main data sources: software engineering, competitive programming, and mathematics. They use the four RL environments we describe in the previous sections, which we refer to as agentic SWE ( +Section +˜ +5.3.1 +), code ( +Section +˜ +5.3.2 +), agentic coding ( +Section +˜ +5.3.3 +), and +math ( +Section +˜ +5.3.4 +). Each data source may contain multiple datasets from different origins; however, all datasets within the same data source share a consistent format and problem domain. 
We sample $40\%$ of tasks from software engineering, $40\%$ from competitive programming, and $20\%$ from mathematics. Rehearsal batches constitute $1/3$ of the training data and are integrated with a standard negative log-likelihood loss, scaled by a factor of $0.1$ to match the gradient magnitudes obtained with GRPO (Section 5.2).
Three-stage training.
We split our joint RL training into three distinct stages. Between stages, we adapt the task distribution and employ custom reward shaping techniques.
• Stage 1 – Reasoning format bootstrapping: In the initial training stage, we soft-control the length of generations in math and coding tasks with an action length reward schedule. For the $40\%$ of tasks related to competitive programming, we evenly sample from four environments: code Python, code C++, agentic coding Python, and agentic coding C++ ($10\%$ each). For a subset of SWE tasks identified as challenging, we include a hint in the prompts and downsample their occurrence ($4\%$ of overall tasks; Section 5.3.1).
• Stage 2 – Increasing task diversity and data resampling: After $14\,125$ gradient steps, we increase the proportion of competitive programming tasks in the datamix to $50\%$ and reduce the fraction of SWE tasks to $30\%$. We also include additional environment variations for each task. Specifically, we add Rust, Go, Java, and JavaScript versions of the code environment, which, together with Python and C++, now constitute $25\%$ of the datamix. The other half of the competitive programming tasks use the agentic coding environment, to which we do not add new languages.
For the SWE data, we disable plugins with a $50\%$ chance, such that file edits require standard terminal commands. We further remove hints from the challenging SWE subset and oversample it in a $4:1$ ratio when plugins are used, and reverse this ratio when plugins are disabled. Competitive programming and SWE data sources are filtered to include only instances with a solve rate in $[0.1, 0.7]$, in order to maximize the learning signal. For math tasks, we enable Python tool calling for $2\%$ of the total datamix. At $16\,500$ steps, we apply filtering based on solve rate with the $[0.1, 0.7]$ interval to our math dataset as well. For SWE data, we create fine-grained subsets for each $0.1$ solve-rate interval from $(0.0, 0.7]$, such as $(0.1, 0.2]$ and $(0.6, 0.7]$, and sample harder examples more frequently using weights inversely proportional to the interval’s midpoint.
Hyperparameters.
After an initial linear warmup over 100 steps, we employ a learning rate of $2.5 \times 10^{-7}$ throughout training. The maximum batch size for each gradient step is $8.4\,\mathrm{M}$ tokens during the first stage and $16.8\,\mathrm{M}$ tokens for the second stage. Gradients are clipped to norm $0.1$. We use $G = 8$ rollouts per data point, and new model weights are broadcast to workers after 4 gradient steps. We list further GRPO-specific hyperparameters in Section 5.2.
Length reward scheduling.
In both the code and mathematics environments, we allow context lengths of up to $64\,\mathrm{k}$. We observe that, at the start of RL training, the model rapidly increases its response length, leading to inefficient token usage.
To address this, we penalize the reward for correct but overlong solutions similar to DAPO +(Yu et al., +2025 +) +, but gradually phase out this penalty over training. Specifically, we linearly interpolate the reward between +1 +1 +and +− +1 +-1 +for correct answers with a length that exceeds a soft maximum ( +8 +k +8\text{\,}\mathrm{k} +at the beginning of training) but is lower than the hard maximum of +64 +k +64\text{\,}\mathrm{k} +. This provides a dense reward signal to the model that incentivizes it to reduce its response length, while still providing a positive signal when the answer is correct. +During training, we gradually increase the soft maximum in a continuous manner until it is equal to the hard maximum after +10 +k +10\text{\,}\mathrm{k} +training steps. See +Figure +˜ +13 +for an illustration of this process. +Figure 13 +: +Length reward scheduling for RL training. The decaying threshold +δ +\delta +starts at +8 +k +8\text{\,}\mathrm{k} +at the start of training and linearly increases to its +64 +k +64\text{\,}\mathrm{k} +limit over +10 000 +10\,000 +steps. +6 +Code and infrastructure +This section discusses details of our training pipeline, including efficiency-related features leveraged for +CWM +training and the asynchronous RL architecture. +6.1 +Techniques for efficient training +CWM +is trained on H100s using a combination of Fully-Sharded Data Parallelism (FSDP) and Tensor Parallelism (TP), see +Table +˜ +3 +. We adopt FlashAttention-3 +(Dao et al., +2022 +; Dao, +2024 +) +to improve training speed and reduce memory overhead. Additionally, we incorporate several optimizations towards efficient training. +fp8 +matrix multiplication. +All linear layers in transformer blocks used +float8 +low-precision mode, similar to +Micikevicius et al. ( +2022 +) +, achieving twice the nominal FLOPs of +bfloat16 +on Hopper GPUs. (For RL training, fp8 precision hurt performance and hence we used +bfloat16 +for linear layers in transformer blocks.) 
We used dynamic “row-wise” scaling, also called “outer-vector” scaling; that is, operands were scaled along their reduction dimension. We used the e4m3 variant exclusively, and we disabled fast-accumulation throughout. The two matrix multiplication operations for the gradient computation in the backward pass use special setups: w.grad is computed in bfloat16 (which increased accuracy and removed the need to transpose its operands to satisfy float8 layout constraints, which make kernel fusion difficult); in.grad uses “tensor-wise” scaling for its weight operand, i.e., a single scaling factor for the whole tensor, which again makes transposition more efficient. In practice, because introducing scaling in the kernel epilogue worsens performance, we try to issue “unscaled” matrix multiplication kernels and perform the scaling as a manual post-processing step, which can be fused into subsequent kernels.
Reducing communication overhead for tensor parallelism.
We reduced the communication overhead of tensor parallelism (which we implement as sequence parallelism) by more effectively overlapping it with computation via decomposition and micropipelining, using PyTorch’s Asynchronous Tensor Parallel (Async-TP) feature (footnote 7: see “Async tensor parallelism in PyTorch with TorchTitan”), derived from xFormers (Lefaudeux et al., 2022) and originally inspired by Wang et al. (2022a). We implemented this optimization by hand for the matrix multiplication needed to compute w.grad during the backward pass (sharded along the reduction dimension), as there is no de facto support in PyTorch.
fp8 with tensor parallelism.
When TP is enabled, we adapted our fp8 recipe to further improve performance. We use “sub-row-wise” scaling where appropriate to align quantization boundaries with TP shards, which both avoids communication and improves accuracy.
We perform all-gathers in +fp8 +, which improves throughput and also enables fusing quantization into previous kernels (e.g., LayerNorm). During the backward pass, however, we sometimes all-gather the same data twice (once in +fp8 +and once in +bf16 +, since it will be consumed by two separate matmuls, one for each dtype). Because of Async-TP, however, this adds zero overhead and enables fusing quantization into previous kernels. +Reducing memory consumption. +We use PyTorch’s AutoAC +8 +8 +8 +Enabled by setting +torch._functorch.config.activation_memory_budget < 1 +. +for activation checkpointing, which is integrated in the “partitioner” layer of the +torch.compile +stack, and uses an integer-linear program solver to optimize the memory-versus-recomputation tradeoff given a user-provided budget. We also leverage PyTorch’s vocab- and loss-parallel helpers to reduce memory consumption. +Table 3 +: +Summary of the training setup for the different +CWM +training stages on H100 GPUs. +Phase +Seq. Length +Batch size +# GPUs +Shards +DP +TP +Pre-training +8 +k +8\text{\,}\mathrm{k} +8.4 +M +8.4\text{\,}\mathrm{M} +2048 +2048 +1024 +1024 +2 +2 +Mid-training +131 +k +131\text{\,}\mathrm{k} +33.6 +M +33.6\text{\,}\mathrm{M} +2048 +2048 +256 +256 +8 +8 +Supervised Fine-tuning +32 +k +32\text{\,}\mathrm{k} +2.1 +M +2.1\text{\,}\mathrm{M} +256 +256 +32 +8 +Reinforcement Learning +131 +k +131\text{\,}\mathrm{k} +8.4 +M +8.4\text{\,}\mathrm{M} +/ +16.8 +M +16.8\text{\,}\mathrm{M} +2560/4608 +64 +8 +6.2 +RL systems +We train our models using our own asynchronous distributed RL framework. The key distinction from the standard +llm +training lies in the data collection process: in RL, training data is gathered through rollouts where the agent interacts with an environment. +Rollouts. +As shown in +Figure +˜ +14 +, a rollout consists of a sequence of iterative agent-environment interactions. 
Each environment implements two methods: +• +start +: start a new episode by producing an initial state and an observation (prompt) based on a sample from the dataset. The state encapsulates the contents of the hidden environment along with any specific resources corresponding to the current episode. +• +step +: takes an action (sequence of tokens) leading to a state transition. The new observation includes all information visible to the agent and necessary during training or inference, such as the latest action, observation, and reward. +Figure 14 +: +Overview of how agents interact with RL environments to produce trajectories. +All interactions between the agent and the environment are token-based. In addition, the environment can suggest context switches to erase past history or restart from scratch, allowing multi-context trajectories. +Our environments adhere to a common +trajectory format +, which prescribes that a trajectory consists of a sequence of messages, whose format is detailed in +Section +˜ +17 +. +Training. +GPUs are divided into a set of +workers +that continuously perform rollouts and +trainers +that update the current policy. Workers send batches of trajectories to the trainers as soon as they are completed and trainers send updated model weights to the workers periodically. After a model update is received and applied on a worker, the worker continues generation of partially completed trajectories using the old KV-cache. This approach ensures continuously high GPU utilization (see +Figure +˜ +15 +) and has been used in our previous work +(Synnaeve et al., +2019 +; Gehring et al., +2025 +; Tang et al., +2025 +; Cohen et al., +2025 +) +and notable RL frameworks such as PipelineRL +(Piche et al., +2025 +) +. +Inference. +We use our own throughput-optimized inference backend FastGen +(Carbonneaux, +2025 +) +. 
+FastGen supports batched inference, CUDA graphs, paged attention +(Kwon et al., +2023 +) +, chunked prefills, host-side KV-cache, tensor parallelism, and CPU/GPU profiling. +In batched inference, one generates tokens for each sequence in a batch in parallel, continuing without synchronizing CUDA streams until a block of tokens (e.g., +32 +32 +) is completed. +After each block, completed sequences are truncated at stop tokens and returned, and new sequences are added to the batch so as to keep the batch size constant. +For more details, see +Carbonneaux ( +2025 +) +. +Parallelism. +We support various kinds of parallelism on both trainer and worker nodes. +Trainers operate largely as in pretraining (see +Section +˜ +4.2 +), supporting FSDP and TP. +Worker nodes are grouped with TP to perform batched inference. +Whereas all trainer GPUs are synchronized, the worker groups/model replicas operate asynchronously from each other and from the trainers. +Model transfer. +For efficient model transfers, we use our custom PyTorch distributed backend, moodist +9 +9 +9 +See +https://github.com/facebookresearch/moodist +. +(Mella, +2025 +) +. It implements efficient queues that transfer data directly between GPU and CPU memory via InfiniBand both within the same compute node and between different nodes. This facilitates transferring model weights directly from the trainer’s GPU memory to the worker’s CPU memory. With FSDP, each trainer has a shard of the model weights. These shards must be concatenated and sent to the workers. +Model transfer consists of three stages: +1. +Each trainer sends their local shard from GPU memory to the CPU memory of a single worker. +2. +The workers perform a distributed concatenation similar to an all-gather, such that each worker ends up with all model weights. +3. +Each worker then individually applies the new weights. +The trainers are only involved in the first stage, which minimizes the amount of time model transfer takes on the trainers. 
On the workers, the first and second stages both occur entirely in CPU memory, which allows them to overlap with generations. The third stage is simply a CPU to GPU memory copy, so it is reasonably fast. When TP is enabled, this process occurs individually for each data-parallel group. +Execution infrastructure. +Our training pipeline leverages an internal code execution service to safely execute tens of thousands of code snippets per second, in parallel across multiple programming languages and asynchronously in isolated containerized environments. This code execution service is integrated into our training loop to provide execution results including stdout, stderr, exit codes, and environment state as feedback to the LLM. +Containerized execution for agentic RL. +We use a custom tool-based execution environment for agentic reinforcement learning, enabling agents to interact with containerized environments through structured tool calls for agentic tasks. It features a core tool execution framework based on flexible container backends (e.g., Docker execution services or Modal +( +Modal Team, +) +), implementation of remote execution servers and clients as an interface to a persistent shell session, plugins that can be defined as standalone Python scripts invoked through bash, along with evaluation infrastructure for reward calculation or benchmarking. +Figure 15 +: +In +CWM +-RL, model weights can be updated at any time on the worker side: between trajectories, within a trajectory between steps, or even during token generation. Compared to traditional RL, this removes all synchronization overhead, maximizing worker throughput while minimizing idle time. In exchange for never blocking inference, we accept that trajectories will potentially use mixed weights, though frequent model updates ensure that generations remain reasonably on-policy. 
Different workers may not update their weights at the same time: the system waits for each worker to signal readiness before sending new weights to avoid memory overload. +7 +Experimental results +We begin this section by analyzing the impact of incorporating +CWM +data during mid-training for a small-scale ablation. +Next, we evaluate +CWM +and compare its performance against relevant baselines, focusing on coding and mathematical reasoning tasks. +We consider agentic evaluation for coding tasks, together with additional computation-oriented evaluations covering +(i) output prediction with execution traces and reasoning, (ii) full execution trace prediction, (iii) program termination prediction, and (iv) prediction and generation of algorithmic complexity. +Finally, we evaluate +CWM +considering established benchmarks for competitive programming, mathematical reasoning, non-reasoning evaluation, and long-context. +Unless otherwise mentioned, we use a temperature of +1.0 +1.0 +and top-p value of +0.95 +0.95 +for all evaluations. +7.1 +The impact of +CWM +data +To evaluate the effect of incorporating +CWM +data during mid-training, we perform ablations with +8 +B +8\text{\,}\mathrm{B} +parameter models trained for +7 +T +7\text{\,}\mathrm{T} +tokens. We first pre-trained one model for +6 +T +6\text{\,}\mathrm{T} +tokens and then studied different mid-training datamixes for the remaining +1 +T +1\text{\,}\mathrm{T} +tokens, ablating the two +CWM +datasets, ForagerAgent and Python execution trace data, as well as our Github PR trajectory data. +10 +10 +10 +One may wonder if using the non-agentic PR data alone is sufficient for reaching strong performance on SWE-bench Verified. +After mid-training, all variants underwent a fine-tuning phase comparable to our main setup for +CWM +described in +Section +˜ +5.1 +but excluding the RL phase. 
We report results on CruxEval-O, CruxEval-I, NLLs over SWE-bench Verified (SBV) oracle patches, and NLLs over agentic SBV trajectories (truncated to +32 +k +32\text{\,}\mathrm{k} +sequence length) for the models out of mid-training and pass@1 SBV numbers for the models after SFT. +The results in +Table +˜ +4 +show that the best performance across our set of metrics is achieved when using all datasets together. This effect carries over to our SBV evaluation of the SFT model, demonstrating how mid-training data choices can positively affect final model performance. Looking at the impact of individual datasets, we find the inclusion of the PR data helps oracle SBV NLLs and SBV pass@1 but not the agentic SBV trajectory NLLs or CruxEval. Further incorporating execution trace data significantly improves CruxEval-input and -output prediction but leaves all SBV-related metrics unaffected. Lastly, only the addition of ForagerAgent data improves agentic SBV NLLs. +The ForagerAgent data is further able to improve SBV pass@1 scores by another +3.7 +% +3.7\text{\,}\mathrm{\char 37\relax} +. +Table 4 +: +Our ablation study reveals a positive impact on performance from introducing GitHub PR trajectory, Python execution tracing, and ForagerAgent data during mid-training. We report results for CruxEval-output, CruxEval-input, NLLs on oracle SWE-bench Verified (SBV) trajectories, NLLs on agentic SBV trajectories, and SBV pass@1 scores. All results are for +8 +B +8\text{\,}\mathrm{B} +models, jointly pre-trained for +6 +T +6\text{\,}\mathrm{T} +tokens followed by +1 +T +1\text{\,}\mathrm{T} +tokens of mid-training ablation, with SBV pass@1 reported after an additional SFT phase. 
+PRs +Tracing +Forager +CruxEval-O +↑ +\uparrow +CruxEval-I +↑ +\uparrow +Oracle SBV NLL +↓ +\downarrow +Agentic SBV NLL (32k) +↓ +\downarrow +SBV +↑ +\uparrow +✗ +✗ +✗ +45.4 +44.1 +0.64 +0.39 +14.6 +✓ +✗ +✗ +44.6 +45.8 +0.55 +0.37 +18.6 +✓ +✓ +✗ +73.9 +51.5 +0.54 +0.38 +18.4 +✓ +✓ +✓ +74.5 +54.8 +0.54 +0.29 +22.1 +7.2 +Agentic evaluation +SWE-bench Verified. +Figures +˜ +2 +and +16 +show results for SWE-bench Verified. +CWM +achieves pass@1 resolve rates of +65.8 +% +65.8\text{\,}\mathrm{\char 37\relax} +with test-time-scaling and +53.9 +% +53.9\text{\,}\mathrm{\char 37\relax} +without test-time scaling (averaged over 4 runs). With test-time scaling, +CWM +outperforms open-weight models at similar size and is competitive to larger and proprietary models. The base score without test-time scaling also surpasses open-weight models with similar parameter counts and remains respectable even when comparing to much larger models such as GPT-oss-120B +(Agarwal et al., +2025 +) +, Qwen3-Coder +(Yang et al., +2025a +) +, and Kimi K2 +(Kimi Team et al., +2025 +) +. +Figure 16 +: +SWE-bench Verified pass@1 scores. +CWM +achieves best-in-class performance with and without test-time-scaling (tts), achieving +65.8 +% +65.8\text{\,}\mathrm{\char 37\relax} +and +53.9 +% +53.9\text{\,}\mathrm{\char 37\relax} +respectively. Note that GPT-oss scores are computed with respect to a limited subset of +477 +477 +out of +500 +500 +problems. +For Test-Time-Scaling (TTS) on SWE-bench Verified, we first generate +k +k +candidate solutions as well as +40 +40 +novel +unit tests in parallel agentic loops for each instance. +Like Agentless +(Xia et al., +2024 +) +, we ask the model to generate tests that verify patch correctness +and +reproduce the original bug, enabling us to filter out tests that fail to reproduce errors. Following SWE-RL +(Wei et al., +2025 +) +, we keep the top-5 majority tests for each instance. 
Since candidate solutions are often similar in the number of existing tests they pass, we prioritize the strongest candidates by keeping only those patches that pass the highest number of existing tests. We then execute the remaining patches on the filtered set of novel tests and select the patch with the highest pass rate for submission. In case of ties, we prioritize the majority patch, and if the tie remains, we choose the patch whose trajectory has fewer tokens. We refer to this approach as best@$k$.
In Figure 16, we report results for best@$k$ for $k = 16$, which achieves a $65.8\%$ resolve rate. As a simple alternative to best@$k$, we found that majority voting (Wang et al., 2022b) of candidate patches, based on exact string matching and without any test generation or execution, leads to a pass rate of $58.4\%$. In Figure 17(a), we report best@$k$ and pass@$k$ across different values of $k$. As expected, pass@$k$ improves monotonically with larger $k$, ultimately reaching a success rate of $80.4\%$ at $k = 40$. For best@$k$, performance improves sharply from $k = 2$ before plateauing around $k = 16$. For majority voting, performance improves gradually from $k = 2$ and plateaus at $k = 24$.
(a) (b)
Figure 17: (a) Test-time scaling (TTS) with both our best@$k$ method and majority voting can significantly increase pass@$1$ rates for CWM on SWE-bench Verified. (b) Accuracy of CWM on Aider Polyglot by programming language using the whole file edit format.
Alternative harnesses for SWE-bench Verified.
To better understand the robustness of CWM to the choice of evaluation harness and tool-calling implementations, we perform experiments with third-party approaches, namely Mini-SWE-Agent (Yang et al., 2024) and OpenHands (Wang et al., 2025). For both, we shorten and adapt the system prompt to better align with the SWE RL prompt (see Figure 11) and make sure to keep reasoning output as part of the message history. We configure both harnesses to use OpenAI function calling, which sends messages along with structured tool descriptions. When prompting the model, we format and append the available tools to the system prompt. When the model decides to call a tool, the call is parsed and returned in a tool_calls field in our response. This makes sure that tools are rendered with a syntax template suitable for prompting CWM.
For Mini-SWE-Agent, we follow the official budget of $250$ turns. For OpenHands, we report results for $40$, $128$, and $500$ turns. Additionally, we report results for our harness when limiting tool use to bash commands only. As Table 5 shows, although resolve rates degrade when using different agents, tool implementations, or limiting tool choices, CWM provides robust and reasonable performance across all setups.
Table 5: SWE-bench Verified resolve rates for alternative agentic harnesses are lower than the $53.9\%$ pass@1 achieved with our approach, but performance remains reasonable across the board.
| Harness | Configuration | Resolve Rate (%) |
| Mini-SWE-Agent | 250 turns | 37.6 |
| OpenHands | 40 turns | 36.0 |
| OpenHands | 128 turns | 42.6 |
| OpenHands | 500 turns | 40.8 |
| Ours (bash-only) | 128 turns | 42.1 |
| Ours | 128 turns | 53.9 |
Table 6: Results on Aider Polyglot for CWM and baselines from the official leaderboard.
+Model +Format +Pass 1@2 (%) +o3-pro (high) +Diff +84.9 +DeepSeek R1 (0528) +Diff +71.4 +Qwen3 235B A22B diff, no think +Diff +59.6 +Kimi K2 +Diff +59.1 +gpt-oss-120b (high) +Diff +41.8 +Qwen3-32B +Diff +40.0 +Gemini 2.0 Pro exp-02-05 +Whole File +35.6 +CWM +Whole File +35.1 +Grok 3 Mini Beta (low) +Whole File +34.7 +o1-mini-2024-09-12 +Whole File +32.9 +gpt-4.1-mini +Diff +27.1 +Codestral 25.01 +Whole File +11.1 +Multi-lingual coding. +The Aider Polyglot benchmark +(Aider Team, +2025 +) +measures coding ability across a diverse set of programming languages using challenging exercises from Exercism. +11 +11 +11 +See +https://exercism.org/ +. +The primary metric is the pass rate on the second attempt, allowing the model to iterate on test failures once. We make a few changes to the harness to align it with the +CWM +training distribution, such as removing hard-coded assistant messages from the history, concatenating adjacent messages of the same role, removing examples from the system prompt, turning off auto-linting and stripping of reasoning traces, and reiterating in the prompt that exact matches are needed. We evaluate with reasoning, at temperature +0.4 +0.4 +, and without test-time-scaling. Although Aider Polyglot may not fully qualify as a truly agentic benchmark – given its lack of dynamic tool use beyond code execution and limited interaction – we include it here in light of the self-correction capabilities that it allows for. +As shown in +Table +˜ +6 +, CWM achieves +35.1 +% +35.1\text{\,}\mathrm{\char 37\relax} +accuracy, comparable to other models in its class such as Qwen3-32B ( +40.0 +% +40.0\text{\,}\mathrm{\char 37\relax} +) and other models using the “whole file” edit format such as Gemini 2.0 Pro ( +35.6 +% +35.6\text{\,}\mathrm{\char 37\relax} +). +We also observe good generalization performance across the six languages tested in the benchmark, as shown in +Figure +˜ +17(b) +. 
Many top-performing models, such as o3-pro ( +84.9 +% +84.9\text{\,}\mathrm{\char 37\relax} +) +(OpenAI, +2025b +) +, DeepSeek R1 ( +71.4 +% +71.4\text{\,}\mathrm{\char 37\relax} +), and Qwen3 235B ( +59.6 +% +59.6\text{\,}\mathrm{\char 37\relax} +), achieve substantially higher scores using the “diff” edit format. However, +CWM +was not optimized for this format and does not reach competitive performance with it. +Terminal-Bench. +Another multi-turn agentic coding benchmark that is gaining in popularity is Terminal-Bench +(The Terminal-Bench Team, +2025 +) +. In Terminal-Bench, the agent is asked to solve various complex tasks by operating directly in a tmux session. +Again, we align the prompts and response parsing of the Terminus-1 +12 +12 +12 +See +https://www.tbench.ai/terminus +. +agent provided by the benchmark with our RL training phase: we modify the system prompt to use the tools that +CWM +was trained with (see +Figure +˜ +8 +) and parse the model output back into the format that Terminus-1 expects. We also include reasoning tokens from prior turns into the agent’s history. +In this setup, +CWM +achieves a +26.25 +% +26.25\text{\,}\mathrm{\char 37\relax} +accuracy with the Terminus 1 agent following the default budget of +50 +50 +turns. +Table +˜ +7 +shows this places +CWM +below o4-mini but above Gemini 2.5 Pro on the Terminal-Bench leaderboard. +Table 7 +: +Results on Terminal-Bench for +CWM +and baselines from the official leaderboard. +Model +Agent +Accuracy (%) +OpenAI-Multiple +OB-1 +59.0 +GPT-5 +OB-1 +49.0 +GPT-5 +Terminus 1 +30.0 +o4-mini +Goose +27.5 +CWM +Terminus 1 +26.3 +Gemini 2.5 Pro +Terminus 1 +25.3 +o4-mini +Terminus 1 +18.5 +Grok 3 Beta +Terminus 1 +17.5 +Gemini 2.5 Flash +Terminus 1 +16.8 +Qwen3-32B +TerminalAgent +15.5 +Table 8 +: +Execution trace prediction is competitive with reasoning for CruxEval-output pass@1 scores. 
For +CWM +, we use temperature +0.6 +0.6 +, top-p +0.95 +0.95 +, and +10 +10 +generations, while for +CWM +SFT we use greedy decoding. +Budget +Mode +CWM SFT +CWM +small +Language w/o CoT +67.8 +66.6 +Trace Step +59.1 +58.1 +large +Language w/ CoT +83.3 +94.3 +Trace Full +87.3 +87.7 +7.3 +Execution trace prediction +Next, we analyze the ability of +CWM +to perform trace prediction, analyze its prediction, and explore this ability to predict program termination. +CruxEval-O as execution trace prediction. +The following experiment evaluates +CWM +’s ability to predict Python execution traces using the format introduced in +Section +˜ +2.2 +. We prompt the model with functions and input arguments from the CruxEval test set, ask it to predict the function execution trace line-by-line, and then compare its output prediction to the ground truth. To elicit trace prediction, we construct prompts following our custom trace format, with the input containing the function as the code context, the call arguments as the state, and the line containing the function definition as the first action. +In addition to this “full” execution trace prediction scenario, we also study a single-“step” scenario, for which we ask the model to directly predict the return value of the function. This is achieved by replacing the +<|line_sep|> +token with +<|return_sep|> +. +We illustrate both formats in +Figure +˜ +11.22 +. We compare the “step” scenario to classic CruxEval-output prediction, which few-shot prompts the model to directly predict outputs given function definitions and inputs. +Consequently, we compare the “full” trace prediction mode to CruxEval-output with reasoning, which allows +CWM +to use reasoning as introduced in +Section +˜ +5.3 +before predicting the function output. +Our results in +Table +˜ +8 +show that large compute budgets, either allowing for execution trace prediction or reasoning, produce better results. 
CWM achieves a best score of $94.0\%$ in natural language reasoning mode, while full trace prediction achieves $88\%$. Note that language reasoning traces are significantly more verbose, using $1164$ tokens on average compared to $497$ tokens for full trace prediction. We also report results for CWM after SFT, which achieves its best result of $87.3\%$ using full execution trace prediction. Single-step trace prediction is not competitive with classic few-shot prompting for either CWM model.
Execution trace prediction analysis.
Following up on the previous paragraph, we present a detailed evaluation of the quality of the execution traces predicted by CWM for validation sets of CruxEval and our function-level data. Concretely, we measure the fraction of generated traces that follow our format (Valid Trace Format) and the observation (action) exact match accuracy (Observation (Action) Exact Match), which measures the number of observations (actions) exactly matching ground truth relative to the total number of observations (actions) per execution trace. Our trace format specifies the state as a JSON dump of a dictionary containing the local variables. We report the fraction of state predictions matching this format (Valid JSON Format). Additionally, Key (+Value) Match measures the average fraction of matching keys (and values) per state prediction.
The results in Table 9 show that CWM adheres to the correct trace and observation format for all data sources, achieving more than $99\%$ format matching across the board.
+CWM +is able to accurately predict the execution trace as well as intermediate observations and actions, which is reflected in scores larger than +96 +% +96\text{\,}\mathrm{\char 37\relax} +for Observation/Action Exact Match and larger than +97 +% +97\text{\,}\mathrm{\char 37\relax} +in Key (+Value) Match. +Table 9 +: +Detailed analysis of execution trace prediction with +CWM +and greedy decoding. We present a breakdown of the accuracy of the individual components of trace prediction for validation set inputs from CruxEval as well as our function-level data. +The CruxEval pass@ +1 +1 +score here differs from the one in +Table +˜ +8 +( +87.7 +% +87.7\text{\,}\mathrm{\char 37\relax} +) due to greedy decoding. +Overall, we find solid accuracy across state and action prediction. +CruxEval +Function-level +Output +pass@1 +88.0 +94.4 +Trace +Valid Trace Format +99.6 +100.0 +State Exact Match +96.9 +96.4 +Action Exact Match +96.5 +98.0 +States +Valid JSON Format +100.0 +100.0 +Key Match +99.1 +99.0 +Key+Value Match +98.1 +97.9 +Statistics +Avg State Length (Token) +11.7 +18.8 +Avg Action Length (Token) +11.2 +10.0 +7.4 +Program termination prediction +The question of +whether a program terminates is a reasoning problem which goes beyond what can be shown by considering individual finite traces as in +CWM +training: non-termination cannot be observed by executing a trace in finite time, and termination on all inputs cannot be feasibly observed by enumerating traces. +Figure +˜ +11.28 +in the Appendix illustrates termination reasoning, whereby +CWM +considers several concrete inputs before generalizing to the conclusion of terminating on all inputs. +We propose HaltEval-prelim, a novel benchmark obtained by automatically translating C programs with termination annotations into Python using LLaMA-3-70B via few-shot prompting. The C programs are sourced from the International Competition on Software Verification (SVCOMP) and the Termination Problems Database (TPDB). 
+13 +13 +13 +See +https://sv-comp.sosy-lab.org/ +and +https://termination-portal.org/wiki/TPDB +. +Each original problem comes with termination/non-termination annotations, which we manually verify are preserved during the Python translation phase and otherwise discard. +We obtained a balanced dataset consisting of +115 +115 +terminating ( +T +) and +115 +115 +non-terminating ( +NT +) Python programs. +We query LLMs to judge whether a program terminates (answer +#T +) or diverges (answer +f +​ +( +n +) +f(n) +where n leads to divergence, followed by the comment +#NT +). +We reward a divergence claim if +f +​ +( +n +) +f(n) +times out after 5 seconds. +If, however, the model predicts +#NT +, and the ground truth is +#T +, it is not rewarded, even if execution exceeds the timeout. For instance, if +f +f +’s ground truth is +#T +and +f +​ +( +42 +) +f(42) +runs for 7.5 million years and then terminates, our scoring (pass@1) will not reward a +#NT +claim for +f +​ +( +42 +) +f(42) +even though it trips timeout. +Our use of timeout as a proxy for divergence is similar to +Alon and David ( +2022 +) +in judging correct non-termination claims, but different in that timeout is not used as a ground truth for termination claims. This results in an eval that is conservative in the sense that it awards scores that could be higher than that given by a perfect oracle, but never lower. +It would be worth exploring replacing the ground truths and input validation by logical proofs of termination and non-termination +(Cook et al., +2011 +; Gupta et al., +2008 +) +. +Table +˜ +10 +reports results for +CWM +, Qwen3-32B, and Llama3-70B with direct prediction, prompted chain-of-thought (CoT), and reasoning (for +CWM +and Qwen3-32B only). “Reasoning” here means use of the + ... +format from RL. We report CoT prompting numbers to represent an attempt to approximate reasoning that is compatible with Llama3-70B. 
+As a reference, we also provide the scores of a constant classifier tagging all programs as terminating, which would obtain a pass@1 of +0.5. +When comparing +CWM +and Qwen3, results suggest Qwen3 reaches better direct and CoT performance; however, under the reasoning setup, both models improve significantly, reaching comparable performance of +∼0.94 +pass@1. +Table 10 +: +HaltEval-prelim pass@1 +scores for different LLMs in different prompting settings. For reasoning we use temperature +0.6, +top-p of +0.95, +and +10 +generations, while for direct and CoT predictions we use greedy decoding. +Constant +CWM +Qwen3-32B +Llama-3-70B +T +Direct +CoT +Reasoning +Direct +CoT +Reasoning +Direct +CoT +pass@1 +0.5 +0.37 +0.55 +0.94 +0.49 +0.68 +0.94 +0.43 +0.48 +We initially designed HaltEval-prelim under the assumption that termination would be difficult to assess, given its undecidability. The strong results achieved by both +CWM +and Qwen3-32B with reasoning were therefore unexpected. Still, these findings should be interpreted cautiously: the benchmark is based on small, self-contained programs and does not reflect the challenges of real-world software, where bugs must be detected in large and complex codebases. Hence, success on this preliminary dataset may not translate directly to practice. Moreover, termination in real systems is highly imbalanced – typically with hundreds or thousands of terminating loops for every non-terminating one – unlike the balanced distribution in our dataset +(Vanegue et al., +2025 +) +. +7.5 +Algorithmic complexity prediction +We evaluate +CWM +on two tasks from +BigO(Bench) +(Chambon et al., +2025 +) +: complexity prediction, determining the Big-O time/space complexity of existing code, and complexity generation, solving coding problems while adhering to specified complexity constraints.
We report all@ +1 +1 +scores, which require correct LLM output simultaneously across all possible complexity classes for a given problem. For complexity generation, we also report the pass@ +1 +1 +score with and without the complexity requirement (the solution still needs to be correct), and a best@ +1 +1 +score that corresponds to pass +@ +​ +1 +@1 +on the lowest complexity class of each problem, dismissing suboptimal classes. +Results for +CWM +, Qwen3-32B, Qwen3-coder-30B, and Gemma-3-27B are all presented in +Table +˜ +11 +. To ensure the comparison with external models remains as fair as possible, we choose to re-evaluate them alongside +CWM +in the same evaluation setting. +For both tasks, we use +BigO(Bench) +’s official setup, after performing a prompt ablation that did not seem to further boost performance. +On time complexity prediction, +CWM +achieves the best all@ +1 +1 +score of all compared models but fares worse for space complexity. In particular, looking at the official benchmark leaderboard, +14 +14 +14 +See +https://facebookresearch.github.io/BigOBench/leaderboard.html +at the time of writing. +CWM +ranks second overall on time complexity prediction (all@ +1 +1 +) across all reported models of all sizes. +For time complexity generation, +CWM +achieves the best overall pass@ +1 +1 +, best@ +1 +1 +, and all@ +1 +1 +scores for our set of models, and also ranks second in general looking at the official benchmark scores. +For space complexity generation, our model ranks first for pass@ +1 +1 +on code only, and second behind Qwen3-32B in terms of the remaining metrics. We note that +CWM +stands out in particular in time complexity reasoning, systematically outperforming other models across all metrics on both prediction and generation variants. 
Moreover, when complexity requirements are set aside, the model’s performance on code-only pass@ +1 +1 +degrades far less than for other models, indicating +CWM +is able to maintain focus on fundamental task requirements while effectively handling additional constraints. +Table 11 +: +BigOBench +results comparing +CWM +against Qwen3-32B (with reasoning), Qwen3-coder-30B, and Gemma-3-27B on complexity prediction and complexity generation, for both time and space complexity. +CWM +outperforms our set of baseline models for all metrics on time complexity prediction and generation. For space complexity generation, +CWM +performs best on code-only pass@ +1 +1 +and ranks second on the remaining metrics. +We refer to the main text for details on the task and metrics. +CWM +Qwen3-32B +Qwen3-coder-30B +Gemma-3-27B +Prediction +Time Complexity +- all@ +1 +1 +41.3 +39.0 +36.6 +37.7 +Space Complexity +- all@ +1 +1 +12.3 +15.1 +9.1 +13.1 +Generation +Time Complexity +Code Only - pass@ +1 +1 +76.1 +70.0 +43.8 +34.4 +Code & Complexity - pass@ +1 +1 +31.3 +29.1 +20.3 +13.3 +Code & Complexity - best@ +1 +1 +48.6 +43.5 +27.2 +15.2 +Code & Complexity - all@ +1 +1 +7.6 +6.5 +5.5 +2.1 +Space Complexity +Code Only - pass@ +1 +1 +73.2 +65.9 +45.1 +36.4 +Code & Complexity - pass@ +1 +1 +24.1 +25.5 +17.7 +14.6 +Code & Complexity - best@ +1 +1 +36.6 +39.6 +26.3 +20.6 +Code & Complexity - all@ +1 +1 +3.2 +5.1 +2.4 +1.5 +7.6 +Code and mathematical reasoning +We present results on LiveCodeBench (LCB, +Jain et al. ( +2025a +) +), concretely the LCBv5 and LCBv6 date ranges +01.10.2024-01.02.2025 +15 +15 +15 +For LCBv5, we report results starting from October to be consistent with the numbers reported by Qwen3. +and +01.08.2024-01.05.2025 +, in +Table +˜ +12 +. +16 +16 +16 +Results for Magistral were taken from the official reported numbers for 1.2 version, where no explicit dates were mentioned. +We here compare +CWM +to relevant baseline models with similar parameter counts. 
+Again, we observe highly competitive performance on par with Magistral-small-1.2 +(Rastogi et al., +2025 +) +, Qwen3-32B, and gpt-oss-20B +(Agarwal et al., +2025 +) +. +Table 12 +also contains pass@1 results for +CWM +on Math-500 +(Lightman et al., +2023 +) +, AIME24 +(OpenAI, +2024 +) +, and AIME25 – all averaged over +n=20 +samples. +CWM +performs slightly worse across the board, with a notable gap compared to gpt-oss-20B (high) on AIME. +In +Figure 18(a) +, we additionally report test-time scaling results using majority voting and short-3@k for +CWM +on AIME. +Short-m@k +(Hassid et al., +2025 +) +begins sampling +k +answers in parallel but stops sampling once the first +m +generations are complete, and then selects the most common answer among these m (three for short-3@k). +CWM +performance on AIME24 increases by up to +11% +at +k=10 +with majority voting. +Short-3@k achieves performance comparable to majority voting, while significantly reducing computational cost for a given +k +. +Table 12 +: +Agentic, code, and mathematical reasoning benchmarks. We compare +CWM +to baselines with roughly the same number of parameters. (†: LCB results for gpt-oss-20B (high) suffered from repeated time-outs due to repetitive reasoning, despite our prompt-tuning efforts – which boosted gpt-oss (low/medium) scores by about +10%.) +Magistral-small-1.2-24B +Qwen3-32B +gpt-oss-20B (low / med / high) +CWM +LCBv5 +70.0 +65.7 +54.2 / 66.9 / –† +68.6 +LCBv6 +61.6 +61.9 +47.3 / +62.0 +/ –† +63.5 +Math-500 +- +97.2 +– +96.6 +AIME24 +86.1 +81.4 +42.1 / 80.0 / +92.1 +* +76.0 +AIME25 +77.3 +72.9 +37.1 / 72.1 / +91.7 +* +68.2 +7.7 +Non-reasoning evaluations +Although our main focus with +CWM +is code world modeling, we also provide evaluation results of +CWM +on a set of standard tasks covering code, math, and general knowledge without reasoning mode enabled.
We here compare to models with similar parameter counts, such as Qwen3-32B or Gemma-3-27B, as baselines, and we use greedy generation instead of sampling at non-zero temperature. +The results in +Table +˜ +13 +show that +CWM +typically performs better than Gemma-3-27B, similar to Qwen2.5-32B, but worse than Qwen3-32B. An interesting exception to this is CruxEval-O +(Gu et al., +2024 +) +, where the introduction of the tracing data (see +Section +˜ +2.2 +) likely helps +CWM +gain an advantage. Note that we achieve even better results on CruxEval-Output when using reasoning (see +Section +˜ +7.3 +). +Next, we consider two long-context evaluation benchmarks: LoCoDiff +(Mentat AI Team, +2025 +) +and RULER +(Hsieh et al., +2024 +) +. In LoCoDiff, models are provided with the commit history of a specific file and asked to construct its final version. To succeed, models must follow the files’ evolution – from the initial commit, through diffs on multiple branches, to the resolution of merge conflicts. Performance is evaluated by the proportion of files for which the model reproduces the target version exactly. We compare +CWM +to DeepSeek-R1 0528, Claude Sonnet 4 +(Anthropic, +2025 +) +, Gemini 2.5 Pro 06-05 +(Comanici et al., +2025 +) +, Kimi K2, GPT-5, and gpt-oss-120B. Qwen3-32B is not on the leaderboard and has a shorter native context length. +For LoCoDiff, the results in +Figure +˜ +18(b) +show that, while all models suffer a degradation in performance as the sequence length increases, +CWM +provides better performance than DeepSeek-R1 0528 and gpt-oss-120B and is competitive with large scale commercial models (e.g., GPT-5 and Gemini 2.5 Pro) on both short and long sequences, with a significant gap to Claude Sonnet 4 only. We present results for RULER in +Section +˜ +19 +. 
+Table 13 +: +Performance of +CWM +and +CWM +Mid +\textsc{CWM}_{\textup{\tiny{Mid}}} +, ( +CWM +after mid-training), on a set of general, math, and coding tasks without any reasoning compared to a set of recent baseline models with similar parameter counts. +CWM +CWM +Mid +{}_{\textup{\tiny{Mid}}} +Qwen3-32B +Qwen2.5-32B +Gemma-3-27B +Llama-3-70B +Llama-4-Scout +MMLU +77.7 +73.6 +83.6 +83.3 +78.7 +79.3 +78.3 +MMLU-Pro +60.2 +52.3 +65.5 +55.1 +52.9 +53.8 +56.1 +GPQA +40.6 +31.7 +49.5 +48.0 +26.3 +– +40.4 +GSM8k +93.3 +84.7 +93.4 +92.9 +81.2 +83.7 +85.4 +HumanEval-Plus +75.0 +68.3 +72.1 +66.3 +55.8 +– +59.9 +MBPP +73.4 +67.8 +78.2 +73.6 +68.4 +66.2 +68.6 +CRUX-O +83.4 +78.9 +72.5 +67.8 +60.0 +– +61.9 +(a) +(b) +Figure 18 +: +(a) Test-time scaling on AIME24 with majority voting and short-3@k. See main text for details.(b) LoCoDiff results for +CWM +and baselines considering different sequence lengths buckets. +8 +Transparency, Risks & Limitations +8.1 +Transparency on external models and data +As mentioned previously in the relevant sections, we use data from external LLMs in four contexts: (i) ForagerAgent, (ii) trace-to-natural language conversion, (iii) function tracing, and (iv) the SFT phase. For the ForagerAgent, we employ Llama3-70B-Instruct +(Dubey et al., +2024 +) +and Qwen3-235B-A22B (without thinking) +(Yang et al., +2025a +) +as base models to interact with the computational environment. For converting raw Python traces into natural language, we use Qwen3-32B-FP8 (without thinking) +(Yang et al., +2025a +) +. For function tracing, we use Llama3-70B-Instruct to generate Python function inputs and to generate solutions for CodeContests data. Finally, during SFT, we incorporate trajectories from DeepSeek-R1 +(Guo et al., +2025 +) +through the OpenMathReasoning +(Moshkov et al., +2025 +) +and OpenCodeReasoning +(Ahmad et al., +2025 +) +datasets. 
We used mitigated versions of the OpenCodeReasoning and OpenMathReasoning datasets, where mitigations included algorithmic bias filtering and cybersecurity protections. We applied similar mitigations when using Qwen3-32B-FP8 to generate data for training. No external LLM tokens were used beyond those explicitly mentioned in these four contexts. +8.2 +Code World Model Preparedness Report +Despite its relatively small size of +32B +parameters, +CWM +outperforms open-weight models of similar size and is competitive with larger and proprietary models on verified software engineering benchmarks. To anticipate risks from this release, including potentially novel risks, we conducted an automated assessment of CWM capabilities relevant to the domains identified in our Frontier AI Framework +17 +17 +17 +See +https://ai.meta.com/static-resource/meta-frontier-ai-framework +. +that could present potentially catastrophic risks, namely Cyber and Chemical & Biological risks. As part of ongoing work to improve the robustness of our evaluations and the reliability of our models, we also include a preliminary propensity evaluation, with plans to expand this area in future assessments. +We performed this assessment by testing the relative performance of +CWM +against a set of popular and capable open-source models that represent a baseline of capabilities available in the open ecosystem: Qwen3-Coder-480B-A35B-Instruct +(Yang et al., +2025a +) +, Llama 4 Maverick +(Meta AI, +2025 +) +, and gpt-oss-120B +(OpenAI, +2025a +) +. +Based on the results of these assessments, we believe that the open-source release of +CWM +is unlikely to meaningfully increase risks related to Cybersecurity or Chemical & Biological threats beyond the current ecosystem baseline. +Additionally, our preliminary evaluations suggest that +CWM +shows undesirable propensities at rates comparable to most open-source models, though some models achieve substantially lower rates, e.g., gpt-oss-120B.
+These results indicate that +CWM +is within the “moderate” risk threshold for the catastrophic domains defined in Meta’s Frontier AI Framework. +17 +We share the details in the Code World Model Preparedness Report. +18 +18 +18 +Code World Model Preparedness Report, available at +https://ai.meta.com/research/publications/cwm-preparedness +. +8.3 +Limitations & future research +We explicitly release CWM as a research model under a noncommercial research license for the community to explore the opportunities afforded by world modeling and reasoning in computational environments. As such, our models come with a number of limitations which we outline below to help the research community make the most of +CWM +, while being aware of its shortcomings and avoiding accidental misuse. +As these are research-only models, they are not suitable for production use cases. Although we have performed some limited evaluations, we have not conducted a full range of possible evaluations for these models. The performance of +CWM +in production and real-world scenarios has not been evaluated by Meta. These models have not been fully evaluated or trained for user-facing interactions and they are not intended for such use. Researchers are recommended to exercise caution when deploying or using these models. +Similarly, +CWM +should not be used as a general-purpose assistant or chat model. While it was exposed to some level of instruction-following data during SFT, +CWM +has not undergone any thorough optimization for general chat-bot use, such as RLHF +(Ouyang et al., +2022 +) +. General chat use is not an intended use of CWM and generations may diverge from expectations and/or be inappropriate or inaccurate. Further, +CWM +training focuses strongly on code generation and reasoning with code. Thus, our models may be lacking in other domains such as factual knowledge or classic natural language tasks. 
+CWM +is not trained for use as a general-purpose assistant or chat model and has not been aligned on, or fully evaluated for, content risks. We make available system level protections – like Llama Guard, Prompt Guard, and Code Shield – as a solution to help manage content generation in research environments. +19 +19 +19 +See +https://www.llama.com/llama-protections +. +However, these system level protections alone are unlikely to be sufficient to enable production uses of +CWM +and further evaluations and fine-tuning may be required. +CWM +is intended to be used in English only. It is not multilingual and performance in other languages has not been evaluated or optimized. +Lastly, while we are excited about the opportunities that world modeling affords, these are only our first steps in this direction. Our code world modeling dataset collection efforts focus on explicit Python execution, and expanding this set to include other programming languages or symbolic execution is left for future work. Robust ways to leverage world model knowledge to improve performance across a variety of tasks via prompting or fine tuning is a ripe area for research. Similarly, planning with code world models, either using formal inference frameworks or informally during reasoning, is an exciting direction for research and core to our motivation for building +CWM +in the first place. In some sense, one might compare the current state of +CWMs +to LLMs before CoT +(Wei et al., +2023 +) +: the capabilities are there, we just need to find out how to make the most of them. +9 +Conclusion +Our vision is for Code World Models to bridge the gap between language‑level reasoning and executable semantics. We believe that coding and agentic use cases of LLMs will benefit from having a world model, a learned transition function between states conditioned on actions. +With the release of +CWM +, we present the first steps of this vision. 
+Our ablations already show that world modeling data, Python execution traces, and executable Docker environments can be directly beneficial for downstream task performance. +More broadly though, +CWM +provides a strong test-bed for future research in zero-shot planning, grounded chain-of-thought, and reinforcement learning with sparse, verifiable rewards. +Similar to our early results with execution trace prediction, we believe that the Python tracing world model enables research on reasoning about code generation, execution, correctness, and verification. +World models should improve reinforcement learning because agents that are already familiar with the dynamics of the environment can focus on learning which actions lead to rewards. +More research is needed to consistently leverage the benefits of incorporating world models into LLMs during pre-training across tasks. +Ultimately, models that can reason about the consequences of their actions should be much more efficient in their interactions with the environment which should allow for scaling the complexity of the tasks they perform. +Authors: Meta FAIR CodeGen Team +Alphabetic order for core contributors, from second author onward and excluding senior authors, and contributors. +Core contributors +Jade Copet +Quentin Carbonneaux +Gal Cohen +Jonas Gehring +Jacob Kahn +Jannik Kossen +Felix Kreuk +Emily McMilin +Michel Meyer +Yuxiang Wei +David Zhang +Kunhao Zheng +Contributors +Jordi Armengol-Estapé +Pedram Bashiri +Maximilian Beck +Pierre Chambon +Abhishek Charnalia +Chris Cummins +Juliette Decugis +Zacharias V. 
Fisches +François Fleuret +Fabian Gloeckle +Alex Gu +Michael Hassid +Daniel Haziza +Badr Youbi Idrissi +Christian Keller +Rahul Kindi +Hugh Leather +Gallil Maimon +Aram Markosyan +Francisco Massa +Pierre-Emmanuel Mazaré +Vegard Mella +Naila Murray +Keyur Muzumdar +Peter O’Hearn +Matteo Pagliardini +Dmitrii Pedchenko +Tal Remez +Volker Seeker +Marco Selvi +Oren Sultan +Sida Wang +Luca Wehrstedt +Ori Yoran +Lingming Zhang +Senior core contributors +Taco Cohen +Yossi Adi +Gabriel Synnaeve +References +Agarwal et al. (2025) +Sandhini Agarwal, Lama Ahmad, Jason Ai, Sam Altman, Andy Applebaum, Edwin +Arbus, Rahul K Arora, Yu Bai, Bowen Baker, Haiming Bao, et al. +gpt-oss-120b & gpt-oss-20b model card. +arXiv preprint arXiv:2508.10925 +, 2025. +Ahmad et al. (2025) +Wasi Uddin Ahmad, Sean Narenthiran, Somshubra Majumdar, Aleksander Ficek, +Siddhartha Jain, Jocelyn Huang, Vahid Noroozi, and Boris Ginsburg. +Opencodereasoning: Advancing data distillation for competitive +coding. +arXiv preprint arXiv:2504.01943 +, 2025. +Aider Team (2025) +Aider Team. +aider, 2025. +https://github.com/Aider-AI/aider +. +GitHub repository; accessed 2025-08-18. +Ainslie et al. (2023) +Joshua Ainslie, James Lee-Thorp, Michiel De Jong, Yury Zemlyanskiy, Federico +Lebrón, and Sumit Sanghai. +Gqa: Training generalized multi-query transformer models from +multi-head checkpoints. +arXiv preprint arXiv:2305.13245 +, 2023. +Alon and David (2022) +Yoav Alon and Cristina David. +Using graph neural networks for program termination. +In Abhik Roychoudhury, Cristian Cadar, and Miryung Kim, editors, +Proceedings of the 30th ACM Joint European Software Engineering +Conference and Symposium on the Foundations of Software Engineering, +ESEC/FSE 2022, Singapore, Singapore, November 14-18, 2022 +, pages 910–921. +ACM, 2022. +10.1145/3540250.3549095 +. +https://doi.org/10.1145/3540250.3549095 +. +Anthropic (2025) +Anthropic. +Claude 3.7 sonnet and claude code, February 2025. 
+https://www.anthropic.com/news/claude-3-7-sonnet +. +Anthropic (2025) +Anthropic. +Raising the bar on swe-bench verified with claude 3.5 sonnet. +2025. +https://www.anthropic.com/engineering/swe-bench-sonnet +. +Accessed 2025-08-18. +Aram H. Markosyan (2024) +Hugh Leather Aram H. Markosyan, Gabriel Synnaeve. +Leanuniverse: A library for consistent and scalable lean4 dataset +management, 2024. +Armengol-Estapé et al. (2025) +Jordi Armengol-Estapé, Quentin Carbonneaux, Tianjun Zhang, Aram H +Markosyan, Volker Seeker, Chris Cummins, Melanie Kambadur, Michael FP +O’Boyle, Sida Wang, Gabriel Synnaeve, et al. +What i cannot execute, i do not understand: Training and evaluating +llms on program execution traces. +arXiv preprint arXiv:2503.05703 +, 2025. +Austin et al. (2021) +Jacob Austin, Augustus Odena, Maxwell I. Nye, Maarten Bosma, Henryk +Michalewski, David Dohan, Ellen Jiang, Carrie J. Cai, Michael Terry, Quoc V. +Le, and Charles Sutton. +Program synthesis with large language models. +CoRR +, abs/2108.07732, 2021. +https://arxiv.org/abs/2108.07732 +. +Azerbayev et al. (2023) +Zhangir Azerbayev, Bartosz Piotrowski, Hailey Schoelkopf, Edward W. Ayers, +Dragomir Radev, and Jeremy Avigad. +Proofnet: Autoformalizing and formally proving undergraduate-level +mathematics, 2023. +https://arxiv.org/abs/2302.12433 +. +Bi et al. (2024) +Xiao Bi, Deli Chen, Guanting Chen, Shanhuang Chen, Damai Dai, Chengqi Deng, +Honghui Ding, Kai Dong, Qiushi Du, Zhe Fu, et al. +Deepseek llm: Scaling open-source language models with longtermism. +arXiv preprint arXiv:2401.02954 +, 2024. +Bick et al. (2024) +Alexander Bick, Adam Blandin, and David J Deming. +The rapid adoption of generative ai. +Technical report, National Bureau of Economic Research, 2024. +Bisk et al. (2020) +Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, and Yejin Choi. +PIQA: reasoning about physical commonsense in natural language. 
+In +The Thirty-Fourth AAAI Conference on Artificial +Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of +Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium +on Educational Advances in Artificial Intelligence, EAAI 2020, New York, +NY, USA, February 7-12, 2020 +, pages 7432–7439. AAAI Press, 2020. +10.1609/AAAI.V34I05.6239 +. +https://doi.org/10.1609/aaai.v34i05.6239 +. +Carbonneaux (2025) +Quentin Carbonneaux. +Fastgen, 2025. +https://github.com/facebookresearch/fastgen +. +Chambon et al. (2025) +Pierre Chambon, Baptiste Roziere, Benoit Sagot, and Gabriel Synnaeve. +Bigo(bench) – can llms generate code with controlled time and space +complexity?, 2025. +https://arxiv.org/abs/2503.15242 +. +Chen et al. (2021) +Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Pondé +de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, +Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy +Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, +Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens +Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias +Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, +William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor +Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher +Hesse, Andrew N. Carr, Jan Leike, Joshua Achiam, Vedant Misra, Evan Morikawa, +Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter +Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, and +Wojciech Zaremba. +Evaluating large language models trained on code. +CoRR +, abs/2107.03374, 2021. +https://arxiv.org/abs/2107.03374 +. +Clark et al. (2018) +Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa +Schoenick, and Oyvind Tafjord. +Think you have solved question answering? 
try arc, the AI2 +reasoning challenge. +CoRR +, abs/1803.05457, 2018. +http://arxiv.org/abs/1803.05457 +. +Cobbe et al. (2021) +Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz +Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, +Christopher Hesse, and John Schulman. +Training verifiers to solve math word problems. +CoRR +, abs/2110.14168, 2021. +https://arxiv.org/abs/2110.14168 +. +Cohen et al. (2025) +Taco Cohen, David W. Zhang, Kunhao Zheng, Yunhao Tang, Rémi Munos, and +Gabriel Synnaeve. +Soft policy optimization: Online off-policy RL for sequence models. +CoRR +, abs/2503.05453, 2025. +10.48550/ARXIV.2503.05453 +. +https://doi.org/10.48550/arXiv.2503.05453 +. +Comanici et al. (2025) +Gheorghe Comanici, Eric Bieber, Mike Schaekermann, Ice Pasupat, Noveen +Sachdeva, Inderjit Dhillon, Marcel Blistein, Ori Ram, Dan Zhang, Evan Rosen, +et al. +Gemini 2.5: Pushing the frontier with advanced reasoning, +multimodality, long context, and next generation agentic capabilities. +arXiv preprint arXiv:2507.06261 +, 2025. +CompFiles authors (2025) +CompFiles authors. +Compfiles. +https://github.com/dwrensha/compfiles +, 2025. +Cook et al. (2011) +Byron Cook, Andreas Podelski, and Andrey Rybalchenko. +Proving program termination. +Commun. ACM +, 54(5):88–98, 2011. +10.1145/1941487.1941509 +. +https://doi.org/10.1145/1941487.1941509 +. +Cui et al. (2024) +Zheyuan Cui, Mert Demirer, Sonia Jaffe, Leon Musolff, Sida Peng, and Tobias +Salz. +The Effects of Generative AI on High Skilled Work: Evidence from +Three Field Experiments with Software Developers. +SSRN eLibrary +, 2024. +10.2139/ssrn.4945566 +. +Cummins et al. (2024) +Chris Cummins, Volker Seeker, Dejan Grubisic, Baptiste Roziere, Jonas Gehring, +Gabriel Synnaeve, and Hugh Leather. +Meta large language model compiler: Foundation models of compiler +optimization. +arXiv preprint arXiv:2407.02524 +, 2024. +Dao (2024) +Tri Dao. 
+FlashAttention-2: Faster attention with better parallelism and work +partitioning. +In +International Conference on Learning Representations +(ICLR) +, 2024. +Dao et al. (2022) +Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. +FlashAttention: Fast and memory-efficient exact attention with +IO-awareness. +In +Advances in Neural Information Processing Systems +(NeurIPS) +, 2022. +Dijkstra (1976) +Edsger W. Dijkstra. +A Discipline of Programming +. +Prentice-Hall, 1976. +ISBN 013215871X. +https://www.worldcat.org/oclc/01958445 +. +Dua et al. (2019) +Dheeru Dua, Yizhong Wang, Pradeep Dasigi, Gabriel Stanovsky, Sameer Singh, and +Matt Gardner. +DROP: A reading comprehension benchmark requiring discrete +reasoning over paragraphs. +In Jill Burstein, Christy Doran, and Thamar Solorio, editors, +Proceedings of the 2019 Conference of the North American Chapter of the +Association for Computational Linguistics: Human Language Technologies, +NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and +Short Papers) +, pages 2368–2378. Association for Computational Linguistics, +2019. +10.18653/V1/N19-1246 +. +https://doi.org/10.18653/v1/n19-1246 +. +Dubey et al. (2024) +Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad +Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, +et al. +The llama 3 herd of models. +arXiv e-prints +, pages arXiv–2407, 2024. +Gadre et al. (2024) +Samir Yitzhak Gadre, Georgios Smyrnis, Vaishaal Shankar, Suchin Gururangan, +Mitchell Wortsman, Rulin Shao, Jean Mercat, Alex Fang, Jeffrey Li, Sedrick +Keh, et al. +Language models scale reliably with over-training and on downstream +tasks. +arXiv preprint arXiv:2403.08540 +, 2024. +Gao et al. 
(2025) +Bofei Gao, Feifan Song, Zhe Yang, Zefan Cai, Yibo Miao, Qingxiu Dong, Lei Li, +Chenghao Ma, Liang Chen, Runxin Xu, Zhengyang Tang, Benyou Wang, Daoguang +Zan, Shanghaoran Quan, Ge Zhang, Lei Sha, Yichang Zhang, Xuancheng Ren, +Tianyu Liu, and Baobao Chang. +Omni-math: A universal olympiad level mathematic benchmark for +large language models. +In +The Thirteenth International Conference on Learning +Representations, ICLR 2025, Singapore, April 24-28, 2025 +. OpenReview.net, +2025. +https://openreview.net/forum?id=yaqPf0KAlN +. +Gehring et al. (2025) +Jonas Gehring, Kunhao Zheng, Jade Copet, Vegard Mella, Quentin Carbonneaux, +Taco Cohen, and Gabriel Synnaeve. +Rlef: Grounding code llms in execution feedback with reinforcement +learning, 2025. +https://arxiv.org/abs/2410.02089 +. +Gu et al. (2024) +Alex Gu, Baptiste Rozière, Hugh Leather, Armando Solar-Lezama, Gabriel +Synnaeve, and Sida I. Wang. +Cruxeval: A benchmark for code reasoning, understanding and +execution. +arXiv preprint arXiv:2401.03065 +, 2024. +Guo et al. (2025) +Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, +Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. +Deepseek-r1: Incentivizing reasoning capability in llms via +reinforcement learning. +arXiv preprint arXiv:2501.12948 +, 2025. +Gupta et al. (2008) +Ashutosh Gupta, Thomas A. Henzinger, Rupak Majumdar, Andrey Rybalchenko, and +Ru-Gang Xu. +Proving non-termination. +In George C. Necula and Philip Wadler, editors, +Proceedings of +the 35th ACM SIGPLAN-SIGACT Symposium on Principles of Programming +Languages, POPL 2008, San Francisco, California, USA, January 7-12, 2008 +, +pages 147–158. ACM, 2008. +10.1145/1328438.1328459 +. +https://doi.org/10.1145/1328438.1328459 +. +Handa et al. (2025) +Kunal Handa, Alex Tamkin, Miles McCain, Saffron Huang, Esin Durmus, Sarah Heck, +Jared Mueller, Jerry Hong, Stuart Ritchie, Tim Belonax, et al. +Which economic tasks are performed with ai? 
evidence from millions of +claude conversations. +arXiv preprint arXiv:2503.04761 +, 2025. +Hassid et al. (2025) +Michael Hassid, Gabriel Synnaeve, Yossi Adi, and Roy Schwartz. +Don’t overthink it. preferring shorter thinking chains for improved +llm reasoning. +arXiv preprint arXiv:2505.17813 +, 2025. +Hendrycks et al. (2021a) +Dan Hendrycks, Steven Basart, Saurav Kadavath, Mantas Mazeika, Akul Arora, +Ethan Guo, Collin Burns, Samir Puranik, Horace He, Dawn Song, et al. +Measuring coding challenge competence with apps. +arXiv preprint arXiv:2105.09938 +, 2021a. +Hendrycks et al. (2021b) +Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric +Tang, Dawn Song, and Jacob Steinhardt. +Measuring mathematical problem solving with the MATH dataset. +In Joaquin Vanschoren and Sai-Kit Yeung, editors, +Proceedings +of the Neural Information Processing Systems Track on Datasets and Benchmarks +1, NeurIPS Datasets and Benchmarks 2021, December 2021, virtual +, +2021b. +https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html +. +Hoare (1971) +C. A. R. Hoare. +Proof of a program: FIND. +Commun. ACM +, 14(1):39–45, 1971. +10.1145/362452.362489 +. +https://doi.org/10.1145/362452.362489 +. +Hoffmann et al. (2022) +Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor +Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes +Welbl, Aidan Clark, et al. +Training compute-optimal large language models. +arXiv preprint arXiv:2203.15556 +, 2022. +Hsieh et al. (2024) +Cheng-Ping Hsieh, Simeng Sun, Samuel Kriman, Shantanu Acharya, Dima Rekesh, Fei +Jia, Yang Zhang, and Boris Ginsburg. +Ruler: What’s the real context size of your long-context language +models? +arXiv preprint arXiv:2404.06654 +, 2024. +Hu et al. (2025) +Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, Xiangyu Zhang, and Heung-Yeung +Shum. 
+Open-reasoner-zero: An open source approach to scaling up +reinforcement learning on the base model, 2025. +https://arxiv.org/abs/2503.24290 +. +Jain et al. (2025a) +Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida +Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica. +Livecodebench: Holistic and contamination free evaluation of large +language models for code. +In +The Thirteenth International Conference on Learning +Representations, ICLR 2025, Singapore, April 24-28, 2025 +. OpenReview.net, +2025a. +https://openreview.net/forum?id=chfJJYC3iL +. +Jain et al. (2025b) +Naman Jain, Jaskirat Singh, Manish Shetty, Liang Zheng, Koushik Sen, and Ion +Stoica. +R2e-gym: Procedural environments and hybrid verifiers for scaling +open-weights swe agents. +arXiv preprint arXiv:2504.07164 +, 2025b. +Jimenez et al. (2024) +Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir +Press, and Karthik R. Narasimhan. +Swe-bench: Can language models resolve real-world github issues? +In +The Twelfth International Conference on Learning +Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024 +. +OpenReview.net, 2024. +https://openreview.net/forum?id=VTF8yNQM66 +. +Kaplan et al. (2020) +Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon +Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. +Scaling laws for neural language models. +arXiv preprint arXiv:2001.08361 +, 2020. +Kimi Team et al. (2025) +Kimi Team, Yifan Bai, Yiping Bao, Guanduo Chen, Jiahao Chen, Ningxin +Chen, Ruijue Chen, Yanru Chen, Yuankun Chen, Yutian Chen, et al. +Kimi k2: Open agentic intelligence. +arXiv preprint arXiv:2507.20534 +, 2025. +Kwon et al. (2023) +Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, +Joseph E. Gonzalez, Hao Zhang, and Ion Stoica. +Efficient memory management for large language model serving with +pagedattention. 
+In +Proceedings of the ACM SIGOPS 29th Symposium on Operating +Systems Principles +, 2023. +Kydlicek et al. (2025) +Hynek Kydlicek, Alina Lozovskaya, Nathan Habib, and Clémentine Fourrier. +Math-verify, 2025. +https://github.com/huggingface/Math-Verify +. +Lee (2019) +Casey Lee. +act, 2019. +https://github.com/nektos/act +. +Lefaudeux et al. (2022) +Benjamin Lefaudeux, Francisco Massa, Diana Liskovich, Wenhan Xiong, Vittorio +Caggiano, Sean Naren, Min Xu, Jieru Hu, Marta Tintore, Susan Zhang, et al. +xformers: A modular and hackable transformer modelling library, 2022. +Li et al. (2024) +Tianle Li, Wei-Lin Chiang, Evan Frick, Lisa Dunlap, Tianhao Wu, Banghua Zhu, +Joseph E. Gonzalez, and Ion Stoica. +From crowdsourced data to high-quality benchmarks: Arena-hard and +benchbuilder pipeline. +CoRR +, abs/2406.11939, 2024. +10.48550/ARXIV.2406.11939 +. +https://doi.org/10.48550/arXiv.2406.11939 +. +Li et al. (2022) +Yujia Li, David H. Choi, Junyoung Chung, Nate Kushman, Julian Schrittwieser, +Rémi Leblond, Tom Eccles, James Keeling, Felix Gimeno, Agustin Dal +Lago, Thomas Hubert, Peter Choy, Cyprien de Masson d’Autume, Igor Babuschkin, +Xinyun Chen, Po-Sen Huang, Johannes Welbl, Sven Gowal, Alexey Cherepanov, +James Molloy, Daniel J. Mankowitz, Esme Sutherland Robson, Pushmeet Kohli, +Nando de Freitas, Koray Kavukcuoglu, and Oriol Vinyals. +Competition-level code generation with alphacode. +CoRR +, abs/2203.07814, 2022. +10.48550/ARXIV.2203.07814 +. +https://doi.org/10.48550/arXiv.2203.07814 +. +Lightman et al. (2023) +Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, +Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. +Let’s verify step by step. +In +The Twelfth International Conference on Learning +Representations +, 2023. +Lin et al. (2025) +Yong Lin, Shange Tang, Bohan Lyu, Jiayun Wu, Hongzhou Lin, Kaiyu Yang, Jia Li, +Mengzhou Xia, Danqi Chen, Sanjeev Arora, and Chi Jin. 
+Goedel-prover: A frontier model for open-source automated theorem +proving, 2025. +https://arxiv.org/abs/2502.07640 +. +Liu et al. (2025) +Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun +Lee, and Min Lin. +Understanding r1-zero-like training: A critical perspective. +arXiv preprint arXiv:2503.20783 +, 2025. +Loshchilov and Hutter (2019) +Ilya Loshchilov and Frank Hutter. +Decoupled weight decay regularization. +In +International Conference on Learning Representations +, 2019. +https://openreview.net/forum?id=Bkg6RiCqY7 +. +mathlib Community (2020) +The mathlib Community. +The lean mathematical library. +In +Proceedings of the 9th ACM SIGPLAN International Conference +on Certified Programs and Proofs +, CPP 2020, page 367–381, New York, NY, +USA, 2020. Association for Computing Machinery. +ISBN 9781450370974. +10.1145/3372885.3373824 +. +https://doi.org/10.1145/3372885.3373824 +. +Mella (2025) +Vegard Mella. +Moodist, 2025. +https://github.com/facebookresearch/moodist +. +Mentat AI Team (2025) +Mentat AI Team. +Locodiff-bench: Natural long-context code benchmark, 2025. +https://github.com/AbanteAI/LoCoDiff-bench +. +Meta AI (2025) +Meta AI. +Llama 4 model card. +https://github.com/meta-llama/llama-models/blob/main/models/llama4/MODEL_CARD.md +, +2025. +Accessed: 2025-09-18. +Meurer et al. (2017) +Aaron Meurer, Christopher P. Smith, Mateusz Paprocki, Ondřej +Čertík, Sergey B. Kirpichev, Matthew Rocklin, AMiT Kumar, Sergiu +Ivanov, Jason K. Moore, Sartaj Singh, Thilina Rathnayake, Sean Vig, Brian E. +Granger, Richard P. Muller, Francesco Bonazzi, Harsh Gupta, Shivam Vats, +Fredrik Johansson, Fabian Pedregosa, Matthew J. Curry, Andy R. Terrel, +Štěpán Roučka, Ashutosh Saboo, Isuru Fernando, Sumith Kulal, +Robert Cimrman, and Anthony Scopatz. +Sympy: symbolic computing in python. +PeerJ Computer Science +, 3:e103, January 2017. +ISSN 2376-5992. +10.7717/peerj-cs.103 +. +https://doi.org/10.7717/peerj-cs.103 +. +Micikevicius et al. 
(2022) +Paulius Micikevicius, Dusan Stosic, Neil Burgess, Marius Cornea, Pradeep Dubey, +Richard Grisenthwaite, Sangwon Ha, Alexander Heinecke, Patrick Judd, John +Kamalu, et al. +Fp8 formats for deep learning. +arXiv preprint arXiv:2209.05433 +, 2022. +Mihaylov et al. (2018) +Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. +Can a suit of armor conduct electricity? A new dataset for open +book question answering. +In Ellen Riloff, David Chiang, Julia Hockenmaier, and Jun’ichi +Tsujii, editors, +Proceedings of the 2018 Conference on Empirical +Methods in Natural Language Processing, Brussels, Belgium, October 31 - +November 4, 2018 +, pages 2381–2391. Association for Computational +Linguistics, 2018. +10.18653/V1/D18-1260 +. +https://doi.org/10.18653/v1/d18-1260 +. +Mistral-AI et al. (2025) +Mistral-AI, Abhinav Rastogi, Albert Q. Jiang, Andy Lo, Gabrielle Berrada, +Guillaume Lample, Jason Rute, Joep Barmentlo, Karmesh Yadav, Kartik +Khandelwal, Khyathi Raghavi Chandu, Léonard Blier, Lucile Saulnier, Matthieu +Dinot, Maxime Darrin, Neha Gupta, Roman Soletskyi, Sagar Vaze, Teven Le Scao, +Yihan Wang, Adam Yang, Alexander H. 
Liu, Alexandre Sablayrolles, Amélie +Héliou, Amélie Martin, Andy Ehrenberg, Anmol Agarwal, Antoine Roux, Arthur +Darcet, Arthur Mensch, Baptiste Bout, Baptiste Rozière, Baudouin De +Monicault, Chris Bamford, Christian Wallenwein, Christophe Renaudin, +Clémence Lanfranchi, Darius Dabert, Devon Mizelle, Diego de las Casas, +Elliot Chane-Sane, Emilien Fugier, Emma Bou Hanna, Gauthier Delerce, Gauthier +Guinet, Georgii Novikov, Guillaume Martin, Himanshu Jaju, Jan Ludziejewski, +Jean-Hadrien Chabran, Jean-Malo Delignon, Joachim Studnia, Jonas Amar, +Josselin Somerville Roberts, Julien Denize, Karan Saxena, Kush Jain, Lingxiao +Zhao, Louis Martin, Luyu Gao, Lélio Renard Lavaud, Marie Pellat, Mathilde +Guillaumin, Mathis Felardos, Maximilian Augustin, Mickaël Seznec, Nikhil +Raghuraman, Olivier Duchenne, Patricia Wang, Patrick von Platen, Patryk +Saffer, Paul Jacob, Paul Wambergue, Paula Kurylowicz, Pavankumar Reddy +Muddireddy, Philomène Chagniot, Pierre Stock, Pravesh Agrawal, Romain +Sauvestre, Rémi Delacourt, Sanchit Gandhi, Sandeep Subramanian, Shashwat +Dalal, Siddharth Gandhi, Soham Ghosh, Srijan Mishra, Sumukh Aithal, Szymon +Antoniak, Thibault Schueller, Thibaut Lavril, Thomas Robert, Thomas Wang, +Timothée Lacroix, Valeriia Nemychnikova, Victor Paltz, Virgile Richard, +Wen-Ding Li, William Marshall, Xuanyu Zhang, and Yunhao Tang. +Magistral, 2025. +https://arxiv.org/abs/2506.10910 +. +(68) +Modal Team. +Modal: High-performance ai infrastructure. +https://modal.com/docs +. +Accessed 2025-08-18. +Moshkov et al. (2025) +Ivan Moshkov, Darragh Hanley, Ivan Sorokin, Shubham Toshniwal, Christof Henkel, +Benedikt Schifferer, Wei Du, and Igor Gitman. +Aimo-2 winning solution: Building state-of-the-art mathematical +reasoning models with openmathreasoning dataset. +arXiv preprint arXiv:2504.16891 +, 2025. +Moura and Ullrich (2021) +Leonardo de Moura and Sebastian Ullrich. +The lean 4 theorem prover and programming language. 
+In André Platzer and Geoff Sutcliffe, editors, +Automated +Deduction – CADE 28 +, pages 625–635, Cham, 2021. Springer International +Publishing. +ISBN 978-3-030-79876-5. +Muennighoff et al. (2023) +Niklas Muennighoff, Alexander Rush, Boaz Barak, Teven Le Scao, Nouamane Tazi, +Aleksandra Piktus, Sampo Pyysalo, Thomas Wolf, and Colin A Raffel. +Scaling data-constrained language models. +Advances in Neural Information Processing Systems +, +36:50358–50376, 2023. +OpenAI (2024) +OpenAI. +Learning to reason with llms, September 2024. +https://openai.com/index/learning-to-reason-with-llms/ +. +OpenAI (2025a) +OpenAI. +gpt-oss-120b & gpt-oss-20b model card. +arXiv preprint arXiv:2508.10925 +, 2025a. +OpenAI (2025b) +OpenAI. +Introducing OpenAI o3 and o4-mini, April 2025b. +https://openai.com/index/introducing-o3-and-o4-mini/ +. +Ouyang et al. (2022) +Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela +Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. +Training language models to follow instructions with human feedback. +Advances in neural information processing systems +, +35:27730–27744, 2022. +Paliskara and Saroufim (2025) +Sahan Paliskara and Mark Saroufim. +Kernelbook, 5 2025. +https://huggingface.co/datasets/GPUMODE/KernelBook +. +Pan et al. (2025) +Jiayi Pan, Xingyao Wang, Graham Neubig, Navdeep Jaitly, Heng Ji, Alane Suhr, +and Yizhe Zhang. +Training software engineering agents and verifiers with swe‑gym. +In +Proceedings of the 42nd International Conference on Machine +Learning (ICML 2025) +, 2025. +https://arxiv.org/abs/2412.21139 +. +arXiv:2412.21139, accepted at ICML 2025. +Piche et al. (2025) +Alex Piche, Rafael Pardinas, Ehsan Kamalloo, and Dzmitry Bahdanau. +Pipeline RL: fast LLM agent training, 2025. +https://huggingface.co/blog/ServiceNow/pipelinerl +. +Radford et al. (2018) +Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. +Improving language understanding by generative pre-training. 
+Technical report, OpenAI, 2018. +https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf +. +Rastogi et al. (2025) +Abhinav Rastogi, Albert Q Jiang, Andy Lo, Gabrielle Berrada, Guillaume Lample, +Jason Rute, Joep Barmentlo, Karmesh Yadav, Kartik Khandelwal, Khyathi Raghavi +Chandu, et al. +Magistral. +arXiv preprint arXiv:2506.10910 +, 2025. +Rein et al. (2023) +David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe +Pang, Julien Dirani, Julian Michael, and Samuel R. Bowman. +GPQA: A graduate-level google-proof q&a benchmark. +CoRR +, abs/2311.12022, 2023. +10.48550/ARXIV.2311.12022 +. +https://doi.org/10.48550/arXiv.2311.12022 +. +Roziere et al. (2023) +Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, +Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez, +et al. +Code llama: Open foundation models for code. +arXiv preprint arXiv:2308.12950 +, 2023. +Saavedra et al. (2024) +Nuno Saavedra, André Silva, and Martin Monperrus. +Gitbug-actions: Building reproducible bug-fix benchmarks with github +actions. +In +Proceedings of the 2024 IEEE/ACM 46th International +Conference on Software Engineering: Companion Proceedings +, ICSE-Companion +’24, page 1–5, New York, NY, USA, 2024. Association for Computing +Machinery. +ISBN 9798400705021. +10.1145/3639478.3640023 +. +https://doi.org/10.1145/3639478.3640023 +. +Sakaguchi et al. (2020) +Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. +Winogrande: An adversarial winograd schema challenge at scale. +In +The Thirty-Fourth AAAI Conference on Artificial +Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of +Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium +on Educational Advances in Artificial Intelligence, EAAI 2020, New York, +NY, USA, February 7-12, 2020 +, pages 8732–8740. AAAI Press, 2020. +10.1609/AAAI.V34I05.6399 +. 
+https://doi.org/10.1609/aaai.v34i05.6399 +. +Schulman (2020) +John Schulman. +Approximating kl divergence, 2020. +https://joschu.net/blog/kl-approx.html +. +Schulman et al. (2017) +John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. +Proximal policy optimization algorithms. +arXiv preprint arXiv:1707.06347 +, 2017. +Shao et al. (2024) +Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei +Zhang, Mingchuan Zhang, Y. K. Li, Y. Wu, and Daya Guo. +Deepseekmath: Pushing the limits of mathematical reasoning in open +language models, 2024. +https://arxiv.org/abs/2402.03300 +. +Shazeer (2020) +Noam Shazeer. +Glu variants improve transformer, 2020. +https://arxiv.org/abs/2002.05202 +. +Su et al. (2021) +Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. +Roformer: Enhanced transformer with rotary position embedding, 2021. +https://arxiv.org/abs/2104.09864 +. +Synnaeve et al. (2019) +Gabriel Synnaeve, Jonas Gehring, Zeming Lin, Daniel Haziza, Nicolas Usunier, +Danielle Rothermel, Vegard Mella, Da Ju, Nicolas Carion, Laura Gustafson, +et al. +Growing up together: Structured exploration for large action spaces. +2019. +Talmor et al. (2019) +Alon Talmor, Jonathan Herzig, Nicholas Lourie, and Jonathan Berant. +Commonsenseqa: A question answering challenge targeting commonsense +knowledge. +In Jill Burstein, Christy Doran, and Thamar Solorio, editors, +Proceedings of the 2019 Conference of the North American Chapter of the +Association for Computational Linguistics: Human Language Technologies, +NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and +Short Papers) +, pages 4149–4158. Association for Computational Linguistics, +2019. +10.18653/V1/N19-1421 +. +https://doi.org/10.18653/v1/n19-1421 +. +Tang et al. (2025) +Yunhao Tang, Kunhao Zheng, Gabriel Synnaeve, and Rémi Munos. +Optimizing language models for inference time objectives using +reinforcement learning. 
+CoRR +, abs/2503.19595, 2025. +10.48550/ARXIV.2503.19595 +. +https://doi.org/10.48550/arXiv.2503.19595 +. +The Terminal-Bench Team (2025) +The Terminal-Bench Team. +Terminal-bench: A benchmark for ai agents in terminal environments, +Apr 2025. +https://github.com/laude-institute/terminal-bench +. +Vanegue et al. (2025) +Julien Vanegue, Jules Villard, Peter O’Hearn, and Azalea Raad. +Non-termination proving: 100 million loc and beyond, 2025. +https://arxiv.org/abs/2509.05293 +. +Vaswani et al. (2017) +Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, +Aidan N Gomez, Łukasz Kaiser, and Illia Polosukhin. +Attention is all you need. +Advances in neural information processing systems +, 30, 2017. +Wang et al. (2022a) +Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake +Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao +Zhang, et al. +Overlap communication with dependent computation via decomposition in +large deep learning models. +In +Proceedings of the 28th ACM International Conference on +Architectural Support for Programming Languages and Operating Systems, Volume +1 +, pages 93–106, 2022a. +Wang et al. (2025) +Xingyao Wang, Boxuan Li, Yufan Song, Frank F. Xu, Xianu Tang, Mingchen Zhuge, +Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, Hoang H. Tran, Fuqiang Li, +Ren Ma, Mingzhang Zheng, Bill Qian, Yanjun Shao, Niklas Muennighoff, Yizhe +Zhang, Binyuan Hui, Junyang Lin, Robert Brennan, Hao Peng, Heng Ji, and +Graham Neubig. +Openhands: An open platform for AI software developers as +generalist agents. +In +The Thirteenth International Conference on Learning +Representations +, 2025. +https://openreview.net/forum?id=OJd3ayDDoF +. +Wang et al. (2022b) +Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, Sharan Narang, +Aakanksha Chowdhery, and Denny Zhou. +Self-consistency improves chain of thought reasoning in language +models. +arXiv preprint arXiv:2203.11171 +, 2022b. +Wei et al. 
(2023) +Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, +Ed Chi, Quoc Le, and Denny Zhou. +Chain-of-thought prompting elicits reasoning in large language +models, 2023. +https://arxiv.org/abs/2201.11903 +. +Wei et al. (2024) +Jason Wei, Nguyen Karina, Hyung Won Chung, Yunxin Joy Jiao, Spencer Papay, +Amelia Glaese, John Schulman, and William Fedus. +Measuring short-form factuality in large language models. +CoRR +, abs/2411.04368, 2024. +10.48550/ARXIV.2411.04368 +. +https://doi.org/10.48550/arXiv.2411.04368 +. +Wei et al. (2025) +Yuxiang Wei, Olivier Duchenne, Jade Copet, Quentin Carbonneaux, Lingming Zhang, +Daniel Fried, Gabriel Synnaeve, Rishabh Singh, and Sida I. Wang. +Swe-rl: Advancing llm reasoning via reinforcement learning on open +software evolution. +arXiv preprint arXiv:2502.18449 +, 2025. +Xia et al. (2024) +Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang. +Agentless: Demystifying llm-based software engineering agents. +arXiv preprint arXiv:2407.01489 +, 2024. +Xiong et al. (2023) +Wenhan Xiong, Jingyu Liu, Igor Molybog, Hejia Zhang, Prajjwal Bhargava, Rui +Hou, Louis Martin, Rashi Rungta, Karthik Abinav Sankararaman, Barlas Oguz, +et al. +Effective long-context scaling of foundation models. +arXiv preprint arXiv:2309.16039 +, 2023. +Yang et al. (2025a) +An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen +Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. +Qwen3 technical report. +arXiv preprint arXiv:2505.09388 +, 2025a. +Yang et al. (2024) +John Yang, Carlos E Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, +Karthik Narasimhan, and Ofir Press. +Swe-agent: Agent-computer interfaces enable automated software +engineering. +Advances in Neural Information Processing Systems +, +37:50528–50652, 2024. +Yang et al. (2025b) +John Yang, Kilian Lieret, Carlos E Jimenez, Alexander Wettig, Kabir Khandpur, +Yanzhe Zhang, Binyuan Hui, Ofir Press, Ludwig Schmidt, and Diyi Yang. 
+Swe-smith: Scaling data for software engineering agents. +arXiv preprint arXiv:2504.21798 +, 2025b. +Yeverechyahu et al. (2024) +Doron Yeverechyahu, Raveesh Mayya, and Gal Oestreicher-Singer. +The impact of large language models on open-source innovation: +Evidence from github copilot. +arXiv preprint arXiv:2409.08379 +, 2024. +Ying et al. (2025) +Huaiyuan Ying, Zijian Wu, Yihan Geng, Zheng Yuan, Dahua Lin, and Kai Chen. +Lean workbook: A large-scale lean problem set formalized from natural +language math problems, 2025. +https://arxiv.org/abs/2406.03847 +. +Yu et al. (2025) +Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Tiantian +Fan, Gaohong Liu, Lingjun Liu, Xin Liu, et al. +Dapo: An open-source llm reinforcement learning system at scale. +arXiv preprint arXiv:2503.14476 +, 2025. +Yue et al. (2024) +Albert S. Yue, Lovish Madaan, Ted Moskovitz, DJ Strouse, and Aaditya K. Singh. +HARP: A challenging human-annotated math reasoning benchmark. +CoRR +, abs/2412.08819, 2024. +10.48550/ARXIV.2412.08819 +. +https://doi.org/10.48550/arXiv.2412.08819 +. +Zellers et al. (2019) +Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. +Hellaswag: Can a machine really finish your sentence? +In Anna Korhonen, David R. Traum, and Lluís Màrquez, +editors, +Proceedings of the 57th Conference of the Association for +Computational Linguistics, ACL 2019, Florence, Italy, July 28- August 2, +2019, Volume 1: Long Papers +, pages 4791–4800. Association for Computational +Linguistics, 2019. +10.18653/V1/P19-1472 +. +https://doi.org/10.18653/v1/p19-1472 +. +Zhang and Sennrich (2019) +Biao Zhang and Rico Sennrich. +Root mean square layer normalization +. +Curran Associates Inc., Red Hook, NY, USA, 2019. +(113) +David W Zhang, Michaël Defferrard, Corrado Rainone, and Roland Memisevic. +Grounding code understanding in step-by-step execution. +https://openreview.net/forum?id=MUr7Fl93QS +. +Zheng et al. 
(2022) +Kunhao Zheng, Jesse Michael Han, and Stanislas Polu. +Minif2f: a cross-system benchmark for formal olympiad-level +mathematics, 2022. +https://arxiv.org/abs/2109.00110 +. +Zheng et al. (2023) +Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, +Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang, +Joseph E. Gonzalez, and Ion Stoica. +Judging llm-as-a-judge with mt-bench and chatbot arena. +In Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz +Hardt, and Sergey Levine, editors, +Advances in Neural Information +Processing Systems 36: Annual Conference on Neural Information Processing +Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023 +, +2023. +http://papers.nips.cc/paper_files/paper/2023/hash/91f18a1287b398d378ef22505bf41832-Abstract-Datasets_and_Benchmarks.html +. +\beginappendix +10 +Acknowledgments +The authors thank Ariel Stolerman, Ayelet Regev Dabah, Dani Shames, Tamir Meyer and Nadav Azaria for support in building executable repository images at scale; Jeff Yang, Yonatan Komornik and Tarun Anand for support in curating GitHub PR and Issue metadata; Qian Liang, Meng Zhang, Hanwen Zha, Ananya Saxena, Emily Dinan, Melanie Kambadur for the support in data preparation; Yining Yang, Sten Sootla, Chris Waterson and Michael Jiang for support in the development of RepoAgent and additional repository images; +Eslam Elnikety, Jamie Cahill, Christine Wang, Don Landrum, Sadman Fahmid, Andrew Hamiel, Ned Newton, Andrii Golovei, Rashmi Narasimha, Zack Leman, Mehrdad Mahdavi, Leon Yang, Joshua Fink, Sargun Dillon, Jeff Hanson and Zach Wentz for the internal sandboxing platform and the code execution and Docker execution services built atop it, enabling secure and massively parallel execution of untrusted code; Mathurin Videau, Leonid Shamis, Jeremy Reizenstein, Maria Lomeli, Lucca Bertoncini, Vivien Cabannes, Charles Arnal and Pascal Kesseli for their contributions to the CWM research 
codebase and training and evaluation infrastructure; Julien Vanegue for advice on practical aspects of the halting problem; +Daniel Fried and Rishabh Singh for support in designing and developing Agentic SWE RL; +the Modal team – especially Jonathon Belotti, Matthew Saltz, Colin Weld, Peyton Walters, Deven Navani, Michael Waskom, Advay Pal, Akshat Bubna, Alec Powell, Lucy Zhang, and Eric Zhang – for extensive support with remote execution, infrastructure, and platform stability; +Lovish Madaan, Binh Tang, Viktor Kerkez, Rishabh Agarwal, Alan Schelten, Xuewei Wang and Jeremy Fu for support with mathematical expression comparison code. +11 +CWM +Examples +Extending +Section +˜ +3 +, we here present additional examples of using +CWM +for SWE reasoning, trace prediction, and a combination of the two. +Reasoning agent. +Figure +˜ +11.19 +shows an example of +CWM +solving an SWE-bench Verified problem in a bash-only environment, which is more challenging than environments that provide dedicated tools for common tasks such as file editing. In this example, the model makes incorrect edits in the initial turns but realizes its error and restores the original file state using +git checkout +, followed by producing a correct edit with +sed +. +Figure +˜ +11.20 +demonstrates that +CWM +can leverage test execution to verify patch correctness before submission. In this specific example, the agent makes sure that the changes it makes do not break any existing functionality. Only after this verification, the agent submits the patch and generates a summary. +Lastly, +Figure +˜ +11.21 +shows the default SWE RL setting where +CWM +is paired with the +edit +tool. In this example, +CWM +performs extensive reasoning before making the edit. The +edit +tool then provides agent-friendly feedback showing the surrounding code after the change. +Python execution trace prediction. 
+Figures +˜ +11.23 +and +11.24 +showcase Python execution trace prediction at inference time and compare it to reasoning about program execution in natural language. +For +Figure +˜ +11.23 +, a Python list is modified while iterating over it. +In execution trace prediction mode, the model tracks all list modifications and predicts the output correctly. +With natural language reasoning, the model fails to predict the correct return value – even though it appears to recognize the list modification during reasoning. +Conversely, +Figure +˜ +11.24 +presents an example requiring the evaluation of a complex Python statement. +Execution trace prediction fails to correctly predict the outcome of the statement in a single prediction step, but natural language reasoning breaks down the complex statement into simpler expressions and then combines those into a correct result. +We believe that combining the groundedness of trace prediction with the flexibility of natural language reasoning makes for interesting future research. +Figure +˜ +11.25 +demonstrates how +CWM +’s execution trace prediction capabilities allow it to function as a neural Python debugger. +We think that equipping +CWM +with debugging capabilities that are not available with traditional debuggers, such as skipping loops in constant time, jumping to arbitrary lines of code, or predicting inputs to reach arbitrary states is highly interesting future work. +Figure +˜ +11.22 +shows how we use execution trace prediction for CruxEval output prediction in our experiments in +Section +˜ +7.3 +. +Figure +˜ +11.26 +demonstrates how +CWM +’s Python execution trace prediction capability can be used for code generation. +By specifying a set of +asserts +consistent with the desired behavior and simply pretending to +import +the desired function, without actually giving a function definition, +CWM +starts to generate actions consistent with the desired function. 
+It is possible that the model acquired this capability of jointly tracing and generating code because for some tracing data we do not include the source context of third-party libraries. +We expand on this in +Figure +˜ +11.27 +, the example of mixing tracing and code generation discussed earlier in Section +3 +. +Future work could build on +CWM +’s capabilities here and explore how execution trace prediction can be used to improve code generation. +Program Termination. +Figure +˜ +11.28 +illustrates termination reasoning, whereby CWM considers several concrete inputs before generalizing to the conclusion of terminating on all inputs. +⬇ +The +issue +persists +due +to +an +indentation +error +. +I +need +to +correct +the +indentation +in +the +sky_coordinate +. +py +file +. +However +, +since +the +’ +edit +’ +tool +is +not +available +, +I +’ +ll +try +a +different +approach +. +Let +me +check +the +current +state +of +the +__getattr__ +method +in +sky_coordinate +. +py +. +⬇ +< +tool +: +bash +> +sed +- +n +890,910 +p +/ +testbed +/ +astropy +/ +coordinates +/ +sky_coordinate +. +py + +⬇ +< +output +> +return +getattr +( +self +. +_sky_coord_frame +, +attr +) +# +Try +to +interpret +as +a +new +frame +for +transforming +. +frame_cls += +frame_transform_graph +. +lookup_name +( +attr +) +if +frame_cls +is +not +None +and +self +. +frame +. +is_transformable_to +( +frame_cls +): +return +self +. +transform_to +( +attr +) +# +Fail +return +self +. +__getattribute__ +( +attr +) +f +"’{ +self +. +__class__ +. +__name__ +}’ +object +has +no +attribute +’{ +attr +}’" +) +def +__setattr__ +( +self +, +attr +, +val +): +# +This +is +to +make +anything +available +through +__getattr__ +immutable +if +" +_sky_coord_frame +" +in +self +. +__dict__ +: +if +self +. +_is_name +( +attr +): +raise +AttributeError +( +f +"’{ +attr +}’ +is +immutable +") +if +not +attr +. +startswith +(" +_ +") +and +hasattr +( +self +. +_sky_coord_frame +, +attr +): +setattr +( +self +. 
+_sky_coord_frame +, +attr +, +val +) +return + +< +budget +> +* +Remaining +turns +: +82 +* +Remaining +tokens +: +85000 + +⬇ +The +issue +is +that +the +line +‘ +return +self +. +__getattribute__ +( +attr +)‘ +is +not +properly +replacing +the +original +‘ +raise +AttributeError +‘ +line +, +and +there +’ +s +an +extra +line +with +‘ +f +"’{ +self +. +__class__ +. +__name__ +}’ +object +has +no +attribute +’{ +attr +}’"‘ +that +’ +s +causing +a +syntax +error +. +I +need +to +correctly +replace +the +entire +block +. +Let +me +try +to +fix +this +by +restoring +the +original +file +and +applying +the +change +again +carefully +. +⬇ +< +tool +: +bash +> +git +checkout +/ +testbed +/ +astropy +/ +coordinates +/ +sky_coordinate +. +py + +⬇ +< +output +> +Updated +1 +path +from +the +index + +< +budget +> +* +Remaining +turns +: +81 +* +Remaining +tokens +: +84837 + +⬇ +Now +that +the +sky_coordinate +. +py +file +has +been +restored +, +I +’ +ll +apply +the +correct +change +using +sed +, +making +sure +to +replace +the +entire +raise +AttributeError +block +with +the +correct +line +. +The +original +code +at +line +898 +is +: +# +Fail +raise +AttributeError +( +f +"’{ +self +. +__class__ +. +__name__ +}’ +object +has +no +attribute +’{ +attr +}’" +) +I +need +to +replace +this +with +: +# +Fail +return +self +. +__getattribute__ +( +attr +) +Let +me +use +sed +to +do +this +correctly +. +⬇ +< +tool +: +bash +> +sed +- +i +’898,900 +c +\ +# +Fail +\ +n +return +self +. +__getattribute__ +( +attr +)’ +/ +testbed +/ +astropy +/ +coordinates +/ +sky_coordinate +. +py + +Figure 11.19 +: +Example intermediate turns in SWE RL (bash-only). The agent learns to correct indentation mistakes from wrong edits by restoring the original file state using +git checkout +, followed by applying a correct edit using +sed +. +⬇ +The +code +ran +without +errors +, +and +‘ +model +. +dual_coef_ +‘ +was +printed +, +which +should +be +an +empty +CSR +matrix +. 
+However +, +the +output +is +empty +, +which +suggests +that +‘ +model +. +dual_coef_ +‘ +is +indeed +an +empty +matrix +. +Now +, +I +need +to +verify +that +this +fix +doesn +’ +t +break +any +other +functionality +. +To +do +that +, +I +’ +ll +run +the +tests +for +the +SVM +module +in +sklearn +to +ensure +that +all +tests +pass +. +⬇ +< +tool +: +bash +> +python +- +m +pytest +sklearn +/ +svm +/ +tests +/ + +⬇ +============================= +test +session +starts +============================== +platform +linux +-- +Python +3.6.13, +pytest +-6.2.4, +py +-1.11.0, +pluggy +-0.13.1 +rootdir +: +/ +testbed +, +configfile +: +setup +. +cfg +collected +135 +items +sklearn +/ +svm +/ +tests +/ +test_bounds +. +py +................... +[ +14% +sklearn +/ +svm +/ +tests +/ +test_sparse +. +py +............................... +[ +37% +sklearn +/ +svm +/ +tests +/ +test_svm +. +py +.......................................... +[ +68% +........................................... +[100% +======================= +135 +passed +, +15 +warnings +in +4.31 +s +======================= +⬇ +All +tests +have +passed +, +which +suggests +that +the +fix +is +correct +and +doesn +’ +t +break +any +existing +functionality +. +Now +, +I +need +to +create +a +patch +file +that +includes +the +changes +made +to +‘ +sklearn +/ +svm +/ +base +. +py +‘. +⬇ +< +tool +: +bash +> +git +diff +sklearn +/ +svm +/ +base +. +py +> +svm_patch +. +patch + +⬇ +( +No +output +) +⬇ +The +patch +file +has +been +created +. +Now +, +I +’ +ll +submit +it +as +the +final +solution +. +⬇ +< +tool +: +submit +> +./ +svm_patch +. +patch + +⬇ +The +following +patch +content +is +marked +as +your +final +submission +: +< +submission +> +diff +-- +git +a +/ +sklearn +/ +svm +/ +base +. +py +b +/ +sklearn +/ +svm +/ +base +. +py +index +64 +cebe837 +..67 +dd8e38f +100644 +--- +a +/ +sklearn +/ +svm +/ +base +. +py ++++ +b +/ +sklearn +/ +svm +/ +base +. 
+py +@@ +-287,8 ++287,11 +@@ +class +BaseLibSVM +( +BaseEstimator +, +metaclass += +ABCMeta +): +n_SV += +self +. +support_vectors_ +. +shape +[0] +dual_coef_indices += +np +. +tile +( +np +. +arange +( +n_SV +), +n_class +) +- +dual_coef_indptr += +np +. +arange +(0, +dual_coef_indices +. +size ++ +1, +- +dual_coef_indices +. +size +/ +n_class +) ++ +if +dual_coef_indices +. +size +== +0: ++ +dual_coef_indptr += +np +. +array +([0, +0], +dtype += +np +. +int32 +) ++ +else +: ++ +dual_coef_indptr += +np +. +arange +(0, +dual_coef_indices +. +size ++ +1, ++ +dual_coef_indices +. +size +/ +n_class +) +self +. +dual_coef_ += +sp +. +csr_matrix +( +( +dual_coef_data +, +dual_coef_indices +, +dual_coef_indptr +), +( +n_class +, +n_SV +)) + +Review +the +patch +content +and +ensure +it +correctly +resolves +the +issue +. +If +necessary +, +you +can +continue +refining +the +patch +and +submit +an +updated +version +. +⬇ +The +patch +looks +good +. +It +adds +a +check +for +when +‘ +dual_coef_indices +‘ +is +empty +and +handles +it +by +setting +‘ +dual_coef_indptr +‘ +to +‘[0, +0]‘, +which +should +correctly +create +an +empty +CSR +matrix +for +‘ +self +. +dual_coef_ +‘. +This +should +resolve +the +ZeroDivisionError +issue +when +‘ +support_vectors_ +‘ +is +empty +. +I +’ +m +confident +that +this +patch +is +correct +and +should +fix +the +problem +without +introducing +any +regressions +, +as +all +tests +passed +after +the +fix +. +⬇ +## +Issue +Summary +The +issue +was +a +ZeroDivisionError +in +the +‘ +_sparse_fit +‘ +method +of +the +SVM +class +when +using +sparse +data +and +the +‘ +support_vectors_ +‘ +attribute +was +empty +. +The +error +occurred +because +of +a +division +by +zero +when +calculating +the +‘ +dual_coef_indptr +‘ +for +the +CSR +matrix +of +dual +coefficients +. +## +Investigation +and +Reproduction +... +Figure 11.20 +: +Example final turns in SWE RL, with details like budget notices omitted for brevity. 
The agent verifies that the patch does not break existing functionality, submits the patch, and produces a summary. +⬇ +Okay +, +so +the +grep +command +shows +that +in +django +/ +db +/ +models +/ +fields +/ +json +. +py +, +there +are +several +instances +of +json +. +dumps +being +used +. +The +ones +in +lines +84 +and +95 +are +the +ones +in +the +get_prep_value +and +validate +methods +, +respectively +. +These +are +the +ones +that +might +need +to +be +modified +to +include +ensure_ascii += +False +to +fix +the +Unicode +display +issue +in +the +admin +. +However +, +wait +a +minute +. +The +get_prep_value +method +is +used +to +prepare +the +value +for +the +database +, +but +the +display_in +the +admin +uses +this +method +as +well +. +If +we +modify +get_prep_value +to +include +ensure_ascii += +False +, +that +could +affect +how +the +data +is +stored +in +the +database +, +but +JSON +fields +should +handle +Unicode +correctly +regardless +. +However +, +... +But +let +me +think +again +. +The +display_for_field +function +in +django +/ +contrib +/ +admin +/ +utils +. +py +calls +get_prep_value +on +the +JSONField +instance +. +If +get_prep_value +returns +a +string +with +ensure_ascii += +False +, +then +the +displayed +JSON +data +will +show +Unicode +characters +correctly +. +So +, +modifying +the +get_prep_value +method +to +include +ensure_ascii += +False +in +the +json +. +dumps +call +should +fix +the +issue +. +Similarly +, +the +validate +method +’ +s +json +. +dumps +call +might +not +need +to +change +, +but +to +be +consistent +, +maybe +it +’ +s +better +to +include +ensure_ascii += +False +there +as +well +, +although +it +’ +s +less +critical +for +display +purposes +. +Additionally +, +the +lines +122, +383, +and +416 +use +json +. +dumps +, +but +these +seem +to +be +for +different +purposes +( +compiling +JSON +paths +, +etc +.). 
+However +, +if +these +are +part +of +the +serialization +process +for +JSONField +values +, +they +might +also +need +to +include +ensure_ascii += +False +to +ensure +consistent +behavior +. +But +without +more +context +, +it +’ +s +harder +to +say +. +The +main +issue +is +the +admin +display +, +which +is +handled +by +get_prep_value +. +Let +me +check +the +pull +request +again +( +https +:// +github +. +com +/ +adamchainz +/ +django +- +mysql +/ +pull +/714) +to +see +what +changes +were +made +. +Since +I +can +’ +t +access +the +link +, +I +’ +ll +have +to +rely +on +the +issue +description +, +which +mentions +that +the +pull +request +fixed +the +problem +by +modifying +how +JSON +data +is +serialized +. +Assuming +that +the +fix +involves +adding +ensure_ascii += +False +to +the +json +. +dumps +calls +in +the +JSONField +class +, +I +should +modify +the +get_prep_value +and +possibly +other +relevant +methods +. +Another +thing +to +consider +: +the +forms +. +JSONField +in +django +/ +forms +/ +fields +. +py +might +also +need +similar +changes +, +especially +in +how +it +prepares +values +for +display +. +Let +me +check +that +file +as +well +. +Looking +at +django +/ +forms +/ +fields +. +py +, +the +JSONField +class +has +a +prepare_value +method +: +def +prepare_value +( +self +, +value +): +if +value +is +None +: +return +None +if +isinstance +( +value +, +str +): +return +value +return +self +. +encoder +. +dumps +( +value +) +Here +, +self +. +encoder +. +dumps +( +value +) +is +used +. +If +the +encoder +is +the +default +JSONEncoder +, +it +would +still +use +ASCII +encoding +. +However +, +the +encoder +for +JSONField +can +be +customized +, +but +the +default +might +not +set +ensure_ascii += +False +. 
+To +ensure +that +the +form +field +displays +Unicode +characters +correctly +, +maybe +this +should +also +be +modified +to +include +ensure_ascii += +False +, +but +since +the +encoder +is +part +of +the +field +’ +s +configuration +, +it +might +be +better +to +handle +it +there +. +Alternatively +, +maybe +the +display_for_field +function +in +the +admin +should +handle +JSONField +values +differently +, +but +that +might +be +more +complex +. +Given +the +information +, +the +most +straightforward +fix +seems +to +be +modifying +the +get_prep_value +method +in +django +/ +db +/ +models +/ +fields +/ +json +. +py +to +include +ensure_ascii += +False +in +the +json +. +dumps +call +. +This +would +affect +how +JSON +data +is +displayed +in +the +admin +, +as +that +’ +s +where +get_prep_value +is +called +. +Let +me +proceed +with +that +change +and +see +if +it +resolves +the +issue +. +⬇ +< +tool +: +edit +> +django +/ +db +/ +models +/ +fields +/ +json +. +py +<<<<<<< +SEARCH +def +get_prep_value +( +self +, +value +): +if +value +is +None +: +return +value +return +json +. +dumps +( +value +, +cls += +self +. +encoder +) +======= +def +get_prep_value +( +self +, +value +): +if +value +is +None +: +return +value +return +json +. +dumps +( +value +, +cls += +self +. +encoder +, +ensure_ascii += +False +) +>>>>>>> +REPLACE + +⬇ +File +django +/ +db +/ +models +/ +fields +/ +json +. +py +has +been +successfully +updated +at +line +81 +and +now +contains +521 +lines +. +The +surrounding +lines +after +the +edit +are +shown +below +: +73 +try +: +74 +return +json +. +loads +( +value +, +cls += +self +. +decoder +) +75 +except +json +. +JSONDecodeError +: +76 +return +value +77 +78 +def +get_internal_type +( +self +): +79 +return +’ +JSONField +’ +80 +81 +def +get_prep_value +( +self +, +value +): +82 +if +value +is +None +: +83 +return +value +84 +return +json +. +dumps +( +value +, +cls += +self +. 
+encoder +, +ensure_ascii += +False +) +85 +86 +def +get_transform +( +self +, +name +): +87 +transform += +super +(). +get_transform +( +name +) +88 +if +transform +: +89 +return +transform +90 +return +KeyTransformFactory +( +name +) +91 +92 +def +validate +( +self +, +value +, +model_instance +): +Review +the +changes +and +make +sure +they +are +as +expected +. +Figure 11.21 +: +Example of the +CWM +SWE RL agent engaging in long reasoning before modifying the file with the +edit +tool. For conciseness, some details (e.g., budget notices) are omitted and reasoning paragraphs are merged. +⬇ +<| +begin_of_text +|><| +trace_context_start +|> +def +f +( +n +): +p += +’’ +if +n +% +p ++= +’sn’ +else +: +return +n +* +n +for +x +in +range +(1, +n ++1): +if +x +% +p ++= +’to’ +else +: +p ++= +’ts’ +return +p +def +main +(): +# +<< +START_OF_TRACE +return +f +(1) +<| +frame_sep +|><| +call_sep +|>{} +<| +action_sep +|> +def +main +(): +<| +frame_sep +|> +<|return_sep|> +---- +END +OF +PROMPT +---- +<| +action_sep +|> +return +f +(1) +<| +arg_sep +|> +"’snts’" +<| +frame_sep +|> +(a) +Single-step execution trace prediction. +⬇ +<| +begin_of_text +|><| +trace_context_start +|> +def +f +( +n +): +p += +’’ +if +n +% +p ++= +’sn’ +else +: +return +n +* +n +for +x +in +range +(1, +n ++1): +if +x +% +p ++= +’to’ +else +: +p ++= +’ts’ +return +p +def +main +(): +# +<< +START_OF_TRACE +return +f +(1) +<| +frame_sep +|><| +call_sep +|>{}<| +action_sep +|> +def +main +(): +<| +frame_sep +|> +---- +END +OF +PROMPT +---- +<| +line_sep +|>{}<| +action_sep +|> +return +f +(1) +<| +frame_sep +|><| +call_sep +|>{ +"n" +: +"1" +}<| +action_sep +|> +def +f +( +n +): +<| +frame_sep +|><| +line_sep +|>{ +"n" +: +".." +}<| +action_sep +|> +p += +’’ +<| +frame_sep +|><| +line_sep +|>{ +"n" +: +".." +, +"p" +: +"’’" +} +<| +action_sep +|> +if +n +% +<| +frame_sep +|><| +line_sep +|>{ +"n" +: +".." +, +"p" +: +".." 
+} +<| +action_sep +|> +p ++= +’sn’ +<| +frame_sep +|><| +line_sep +|>{ +"n" +: +".." +, +"p" +: +"’sn’" +} +<| +action_sep +|> +for +x +in +range +(1, +n ++1): +<| +frame_sep +|><| +line_sep +|>{ +"n" +: +".." +, +"p" +: +".." +, +"x" +: +"1" +} +<| +action_sep +|> +if +x +% +<| +frame_sep +|><| +line_sep +|>{ +"n" +: +".." +, +"p" +: +".." +, +"x" +: +".." +} +<| +action_sep +|> +p ++= +’ts’ +<| +frame_sep +|><| +line_sep +|>{ +"n" +: +".." +, +"p" +: +"’snts’" +, +"x" +: +".." +} +<| +action_sep +|> +for +x +in +range +(1, +n ++1): +<| +frame_sep +|><| +line_sep +|>{ +"n" +: +".." +, +"p" +: +".." +, +"x" +: +".." +} +<| +action_sep +|> +return +p +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +p +<| +arg_sep +|> +"’snts’" +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +f +(1) +<| +arg_sep +|> +"’snts’" +<| +frame_sep +|> +(b) +Full, line-by-line, execution trace prediction. +Figure 11.22 +: +Python execution trace prediction for CruxEval output prediction in (a) single-step mode and (b) full, line-by-line, mode. +The predictions shown are generated by +CWM +. +⬇ +<| +begin_of_text +|><| +trace_context_start +|> +def +f +( +nums +): +a += +0 +for +i +in +range +( +len +( +nums +)): +nums +. +insert +( +i +, +nums +[ +a +]) +a ++= +1 +return +nums +def +main +(): +# +<< +START_OF_TRACE +return +f +([1, +3, +-1, +1, +-2, +6]) +<| +frame_sep +|><| +call_sep +|>{}<| +action_sep +|> +def +main +(): +# +<< +START_OF_TRACE +<| +frame_sep +|> +--- +END +OF +PROMPT +--- +<| +line_sep +|>{}<| +action_sep +|> +return +f +([1, +3, +-1, +1, +-2, +6]) +<| +frame_sep +|><| +call_sep +|>{ +"nums" +: +"[1, +3, +-1, +1, +-2, +6]" +}<| +action_sep +|> +def +f +( +nums +): +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +}<| +action_sep +|> +a += +0 +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +"0" +}<| +action_sep +|> +for +i +in +range +( +len +( +nums +)): +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." 
+, +"a" +: +".." +, +"i" +: +"0" +}<| +action_sep +|> +nums +. +insert +( +i +, +nums +[ +a +]) +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +"[1, +1, +3, +-1, +1, +-2, +6]" +, +"a" +: +".." +, +"i" +: +".." +}<| +action_sep +|> +a ++= +1 +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +"1" +, +"i" +: +".." +}<| +action_sep +|> +for +i +in +range +( +len +( +nums +)): +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +".." +, +"i" +: +"1" +}<| +action_sep +|> +nums +. +insert +( +i +, +nums +[ +a +]) +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +"[1, +1, +1, +3, +-1, +1, +-2, +6]" +, +"a" +: +".." +, +"i" +: +".." +}<| +action_sep +|> +a ++= +1 +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +"2" +, +"i" +: +".." +}<| +action_sep +|> +for +i +in +range +( +len +( +nums +)): +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +".." +, +"i" +: +"2" +}<| +action_sep +|> +nums +. +insert +( +i +, +nums +[ +a +]) +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +"[1, +1, +1, +1, +3, +-1, +1, +-2, +6]" +, +"a" +: +".." +, +"i" +: +".." +}<| +action_sep +|> +a ++= +1 +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +"3" +, +"i" +: +".." +}<| +action_sep +|> +for +i +in +range +( +len +( +nums +)): +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +".." +, +"i" +: +"3" +}<| +action_sep +|> +nums +. +insert +( +i +, +nums +[ +a +]) +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +"[1, +1, +1, +1, +1, +3, +-1, +1, +-2, +6]" +, +"a" +: +".." +, +"i" +: +".." +}<| +action_sep +|> +a ++= +1 +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +"4" +, +"i" +: +".." +}<| +action_sep +|> +for +i +in +range +( +len +( +nums +)): +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +".." +, +"i" +: +"4" +}<| +action_sep +|> +nums +. 
+insert +( +i +, +nums +[ +a +]) +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +"[1, +1, +1, +1, +1, +1, +3, +-1, +1, +-2, +6]" +, +"a" +: +".." +, +"i" +: +".." +}<| +action_sep +|> +a ++= +1 +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +"5" +, +"i" +: +".." +}<| +action_sep +|> +for +i +in +range +( +len +( +nums +)): +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +".." +, +"i" +: +"5" +}<| +action_sep +|> +nums +. +insert +( +i +, +nums +[ +a +]) +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +"[1, +1, +1, +1, +1, +1, +1, +3, +-1, +1, +-2, +6]" +, +"a" +: +".." +, +"i" +: +".." +}<| +action_sep +|> +a ++= +1 +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +"6" +, +"i" +: +".." +}<| +action_sep +|> +for +i +in +range +( +len +( +nums +)): +<| +frame_sep +|><| +line_sep +|>{ +"nums" +: +".." +, +"a" +: +".." +, +"i" +: +".." +}<| +action_sep +|> +return +nums +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +nums +<| +arg_sep +|> +"[1, +1, +1, +1, +1, +1, +1, +3, +-1, +1, +-2, +6]" +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +f +([1, +3, +-1, +1, +-2, +6])<| +arg_sep +|> +"[1, +1, +1, +1, +1, +1, +1, +3, +-1, +1, +-2, +6]" +<| +frame_sep +|> +(a) +CWM +Execution Trace Prediction (correct prediction). +⬇ +Given +a +python +code +function +and +an +assert +statement +containing +a +specific +input +, +provide +the +assertion +with +the +exact +literal +output +that +the +function +returns +with +that +input +. +Do +not +include +any +mathematical +expressions +or +function +calls +- +only +the +final +literal +value +. +Your +response +should +be +solely +the +assertion +, +enclosed +within +[ +ANSWER +] +and +[/ +ANSWER +] +tags +. +[ +SKIPPED +EXAMPLE +FOR +ILLUSTRATION +PURPOSES +] +Python +function +: +‘‘‘ +def +f +( +nums +): +a += +0 +for +i +in +range +( +len +( +nums +)): +nums +. 
+insert +( +i +, +nums +[ +a +]) +a ++= +1 +return +nums +assert +f +([1, +3, +-1, +1, +-2, +6]) +== +?? +‘‘‘ +---- +END +OF +PROMPT +--- +Okay +, +let +’ +s +see +. +I +need +to +figure +out +what +the +function +f +does +when +given +the +list +[1, +3, +-1, +1, +-2, +6]. +The +function +starts +by +initializing +a +variable +a +to +0. +Then +it +loops +over +the +range +of +the +length +of +nums +. +But +wait +, +the +length +of +nums +is +6 +initially +. +However +, +inside +the +loop +, +it +’ +s +inserting +elements +into +nums +, +which +will +change +the +length +of +the +list +as +the +loop +progresses +. +That +might +affect +how +many +times +the +loop +runs +. +Let +me +break +it +down +step +by +step +. +The +initial +list +is +[1, +3, +-1, +1, +-2, +6]. +The +variable +a +starts +at +0. +The +loop +runs +for +i +in +range +( +len +( +nums +)), +which +is +range +(6) +initially +. +So +i +will +take +values +0 +through +5. +First +iteration +( +i +=0): +- +nums +. +insert +( +i +, +nums +[ +a +]) +-> +insert +nums +[0] +( +which +is +1) +at +position +0. +The +list +becomes +[1, +1, +3, +-1, +1, +-2, +6]. +Then +a +increments +to +1. +Second +iteration +( +i +=1): +- +Now +, +the +list +has +7 +elements +. +But +the +loop +was +set +up +with +range +(6), +so +i +is +still +1. +We +insert +nums +[ +a +] +( +nums +[1] +is +1) +at +position +1. +The +list +becomes +[1, +1, +1, +3, +-1, +1, +-2, +6]. +a +becomes +2. +Third +iteration +( +i +=2): +- +Insert +nums +[2] +( +which +is +1) +at +position +2. +The +list +is +now +[1, +1, +1, +1, +3, +-1, +1, +-2, +6]. +a +is +3. +Fourth +iteration +( +i +=3): +- +Insert +nums +[3] +(3) +at +position +3. +The +list +becomes +[1, +1, +1, +1, +3, +3, +-1, +1, +-2, +6]. +a +is +4. +Fifth +iteration +( +i +=4): +- +Insert +nums +[4] +(-1) +at +position +4. +The +list +is +[1, +1, +1, +1, +3, +-1, +3, +-1, +1, +-2, +6]. +a +is +5. +Sixth +iteration +( +i +=5): +- +Insert +nums +[5] +(1) +at +position +5. 
+The +list +becomes +[1, +1, +1, +1, +3, +1, +-1, +3, +-1, +1, +-2, +6]. +a +is +6. +After +these +six +iterations +, +the +loop +ends +because +i +has +gone +through +0 +to +5. +The +final +list +is +[1, +1, +1, +1, +3, +1, +-1, +3, +-1, +1, +-2, +6]. +Wait +, +but +let +me +double +- +check +each +step +to +make +sure +I +didn +’ +t +make +a +mistake +. +Wait +a +minute +, +when +we +insert +elements +, +the +original +elements +shift +to +the +right +. +Let +me +re +- +examine +each +step +carefully +. +Original +list +: +[1, +3, +-1, +1, +-2, +6] +First +iteration +( +i +=0, +a +=0): +- +Insert +nums +[0] +(1) +at +position +0. +The +list +becomes +[1, +1, +3, +-1, +1, +-2, +6]. +a +becomes +1. +Second +iteration +( +i +=1, +a +=1): +- +Insert +nums +[1] +(1) +at +position +1. +The +list +is +now +[1, +1, +1, +3, +-1, +1, +-2, +6]. +a +becomes +2. +Third +iteration +( +i +=2, +a +=2): +- +Insert +nums +[2] +(1) +at +position +2. +The +list +becomes +[1, +1, +1, +1, +3, +-1, +1, +-2, +6]. +a +becomes +3. +Fourth +iteration +( +i +=3, +a +=3): +- +Insert +nums +[3] +(3) +at +position +3. +The +list +is +[1, +1, +1, +1, +3, +3, +-1, +1, +-2, +6]. +a +becomes +4. +Fifth +iteration +( +i +=4, +a +=4): +- +Insert +nums +[4] +(-1) +at +position +4. +The +list +becomes +[1, +1, +1, +1, +3, +-1, +3, +-1, +1, +-2, +6]. +a +becomes +5. +Sixth +iteration +( +i +=5, +a +=5): +- +Insert +nums +[5] +(1) +at +position +5. +The +list +is +[1, +1, +1, +1, +3, +1, +-1, +3, +-1, +1, +-2, +6]. +a +becomes +6. +Yes +, +that +seems +correct +. +So +the +final +list +returned +by +the +function +is +[1, +1, +1, +1, +3, +1, +-1, +3, +-1, +1, +-2, +6]. + +[ +ANSWER +] +assert +f +([1, +3, +-1, +1, +-2, +6]) +== +[1, +1, +1, +1, +3, +1, +-1, +3, +-1, +1, +-2, +6] +[/ +ANSWER +] +(b) +CWM +Reasoning Trace (wrong prediction). +Figure 11.23 +: +CWM +execution trace prediction (correct) vs. natural language reasoning trace (wrong). 
In this CruxEval example, a list is modified while iterating over it. In execution trace prediction mode the +CWM +model generates the correct answer +[1, 1, 1, 1, 1, 1, 1, 3, -1, 1, -2, 6] +. During reasoning in natural language, even though the +CWM +model recognizes the modification, it still predicts the wrong output. +⬇ +<| +begin_of_text +|><| +trace_context_start +|> +def +f +( +sequence +, +value +): +i += +max +( +sequence +. +index +( +value +) +- +len +( +sequence +) +// +3, +0) +result += +’’ +for +j +, +v +in +enumerate +( +sequence +[ +i +:]): +if +v +== +’+’ +: +result ++= +value +else +: +result ++= +sequence +[ +i ++ +j +] +return +result +def +main +(): +# +<< +START_OF_TRACE +return +f +( +’hosu’ +, +’o’ +)<| +frame_sep +|><| +call_sep +|>{}<| +action_sep +|> +def +main +(): +# +<< +START_OF_TRACE +<| +frame_sep +|> +--- +END +OF +PROMPT +--- +<| +line_sep +|>{}<| +action_sep +|> +return +f +( +’hosu’ +, +’o’ +) +<| +frame_sep +|><| +call_sep +|>{ +"sequence" +: +"’hosu’" +, +"value" +: +"’o’" +}<| +action_sep +|> +def +f +( +sequence +, +value +): +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +}<| +action_sep +|> +i += +max +( +sequence +. +index +( +value +) +- +len +( +sequence +) +// +3, +0) +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i": "1" +}<| +action_sep +|> +result += +’’ +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +"’’" +}<| +action_sep +|> +for +j +, +v +in +enumerate +( +sequence +[ +i +:]): +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +".." +, +"j" +: +"0" +, +"v" +: +"’o’" +}<| +action_sep +|> +if +v +== +’+’ +: +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +".." +, +"j" +: +".." +, +"v" +: +".." 
+}<| +action_sep +|> +result ++= +sequence +[ +i ++ +j +] +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +"’o’" +, +"j" +: +".." +, +"v" +: +".." +}<| +action_sep +|> +for +j +, +v +in +enumerate +( +sequence +[ +i +:]): +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +".." +, +"j" +: +"1" +, +"v" +: +"’s’" +}<| +action_sep +|> +if +v +== +’+’ +: +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +".." +, +"j" +: +".." +, +"v" +: +".." +}<| +action_sep +|> +result ++= +sequence +[ +i ++ +j +] +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +"’os’" +, +"j" +: +".." +, +"v" +: +".." +}<| +action_sep +|> +for +j +, +v +in +enumerate +( +sequence +[ +i +:]): +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +".." +, +"j" +: +"2" +, +"v" +: +"’u’" +}<| +action_sep +|> +if +v +== +’+’ +: +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +".." +, +"j" +: +".." +, +"v" +: +".." +}<| +action_sep +|> +result ++= +sequence +[ +i ++ +j +] +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +"’osu’" +, +"j" +: +".." +, +"v" +: +".." +}<| +action_sep +|> +for +j +, +v +in +enumerate +( +sequence +[ +i +:]): +<| +frame_sep +|><| +line_sep +|>{ +"sequence" +: +".." +, +"value" +: +".." +, +"i" +: +".." +, +"result" +: +".." +, +"j" +: +".." +, +"v" +: +".." 
+}<| +action_sep +|> +return +result +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +result +<| +arg_sep +|> +"’osu’" +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +f +( +’hosu’ +, +’o’ +)<| +arg_sep +|> +"’osu’" +<| +frame_sep +|> +(a) +CWM +Execution Trace Prediction (wrong prediction). +⬇ +Given +a +python +code +function +and +an +assert +statement +containing +a +specific +input +, +provide +the +assertion +with +the +exact +literal +output +that +the +function +returns +with +that +input +. +Do +not +include +any +mathematical +expressions +or +function +calls +- +only +the +final +literal +value +. +Your +response +should +be +solely +the +assertion +, +enclosed +within +[ +ANSWER +] +and +[/ +ANSWER +] +tags +. +[ +SKIPPED +EXAMPLE +FOR +ILLUSTRATION +PURPOSES +] +Python +function +: +‘‘‘ +def +f +( +sequence +, +value +): +i += +max +( +sequence +. +index +( +value +) +- +len +( +sequence +) +// +3, +0) +result += +’’ +for +j +, +v +in +enumerate +( +sequence +[ +i +:]): +if +v +== +’+’: +result ++= +value +else +: +result ++= +sequence +[ +i ++ +j +] +return +result +assert +f +(’ +hosu +’, +’ +o +’) +== +?? +‘‘‘ +---- +END +OF +PROMPT +--- +Okay +, +let +’ +s +see +. +I +need +to +figure +out +what +the +function +f +returns +when +called +with +the +arguments +’ +hosu +’ +and +’ +o +’. +Let +me +start +by +understanding +the +function +step +by +step +. +First +, +the +function +takes +two +parameters +: +sequence +and +value +. +In +this +case +, +sequence +is +the +string +’ +hosu +’ +and +value +is +’ +o +’. +The +first +line +inside +the +function +is +: +i += +max +( +sequence +. +index +( +value +) +- +len +( +sequence +) +// +3, +0). +Let +’ +s +break +this +down +. +sequence +. +index +( +value +) +finds +the +index +of +the +first +occurrence +of +’ +o +’ +in +’ +hosu +’. +The +string +’ +hosu +’ +is +h +- +o +- +s +- +u +, +so +the +index +of +’ +o +’ +is +1. 
+len +( +sequence +) +is +the +length +of +’ +hosu +’, +which +is +4. +len +( +sequence +) +// +3 +is +4 +divided +by +3, +which +equals +1 +( +since +integer +division +truncates +). +So +, +sequence +. +index +( +value +) +- +len +( +sequence +) +// +3 +is +1 +- +1 += +0. +Then +, +max +(0, +0) +is +0. +Therefore +, +i +is +0. +Next +, +the +function +initializes +an +empty +string +result += +’’. +Then +, +there +’ +s +a +for +loop +: +for +j +, +v +in +enumerate +( +sequence +[ +i +:]). +Since +i +is +0, +sequence +[ +i +:] +is +the +entire +string +’ +hosu +’. +So +, +the +loop +will +iterate +over +each +character +in +’ +hosu +’ +along +with +their +indices +j +. +Let +’ +s +go +through +each +iteration +: +1. +j += +0, +v += +’ +h +’: +- +v +is +not +’+’, +so +result ++= +sequence +[ +i ++ +j +] +which +is +sequence +[0 ++ +0] += +’ +h +’. +Now +result +is +’ +h +’. +2. +j += +1, +v += +’ +o +’: +- +v +is +not +’+’, +so +result ++= +sequence +[0 ++ +1] += +’ +o +’. +Now +result +is +’ +ho +’. +3. +j += +2, +v += +’ +s +’: +- +v +is +not +’+’, +so +result ++= +sequence +[0 ++ +2] += +’ +s +’. +Now +result +is +’ +hos +’. +4. +j += +3, +v += +’ +u +’: +- +v +is +not +’+’, +so +result ++= +sequence +[0 ++ +3] += +’ +u +’. +Now +result +is +’ +hosu +’. +After +the +loop +, +the +function +returns +result +, +which +is +’ +hosu +’. +Wait +a +minute +, +but +the +value +parameter +is +’ +o +’. +The +code +checks +if +v +== +’+’, +but +in +the +sequence +’ +hosu +’, +there +are +no +’+’ +characters +. +So +, +the +condition +v +== +’+’ +is +never +true +. +Therefore +, +the +code +never +appends +the +value +’ +o +’ +to +the +result +. +Instead +, +it +appends +the +characters +from +the +sequence +starting +at +index +i +( +which +is +0) +for +each +character +in +the +sequence +. +So +, +the +result +is +just +the +original +sequence +’ +hosu +’. +Therefore +, +the +function +returns +’ +hosu +’ +when +called +with +f +(’ +hosu +’, +’ +o +’). 
+ +[ +ANSWER +] +assert +f +(’ +hosu +’, +’ +o +’) +== +’ +hosu +’ +[/ +ANSWER +] +(b) +CWM +Reasoning Trace (correct prediction). +Figure 11.24 +: +CWM +execution trace prediction (wrong) vs. +CWM +natural language reasoning trace (correct). In this CruxEval example, the +CWM +model with reasoning in natural language predicts the correct output +’hosu’ +. +In execution trace prediction mode, the +CWM +model predicts +’osu’ +, because it fails to predict the value of the variable +i +in the first line correctly. It predicts 1, instead of the true value 0. +⬇ +===== +CONTEXT +===== +def +count_letters +( +s +, +letter +): +n += +0 +for +c +in +s +: +n ++= +int +( +c +== +letter +) +return +n +def +format_answer +( +word +, +letter +, +count +): +parts += +[ +"Found" +, +f +"{count:04d}" +, +"occurrences +of +the +letter" +, +letter +, +"in" +, +word +] +return +" +" +. +join +( +parts +) +def +f +( +c +): +# +<< +START_OF_TRACE +word += +"strawberry" +num += +count_letters +( +word +, +c +) +ans += +format_answer +( +word +, +c +, +num +) +return +ans +===== +SESSION +START +===== +-> +def +f +( +c +): +>> +print +{ +’c’ +: +"’r’" +} +>> +next +-> +word += +"strawberry" +>> +next +-> +num += +count_letters +( +word +, +"r" +) +>> +step +-> +def +count_letters +( +s +, +letter +): +>> +next +-> +n += +0 +>> +next +-> +for +c +in +s +: +>> +next +-> +n ++= +int +( +c +== +letter +) +>> +print +{ +’s’ +: +"’strawberry’" +, +’letter’ +: +"’r’" +, +’n’ +: +’0’ +, +’c’ +: +"’s’" +} +>> +next +-> +for +c +in +s +: +>> +next +-> +n ++= +int +( +c +== +letter +) +>> +print +{ +’s’ +: +"’strawberry’" +, +’letter’ +: +"’r’" +, +’n’ +: +’0’ +, +’c’ +: +"’t’" +} +>> +out +-> +ans += +format_answer +( +word +, +"r" +, +num +) +>> +next +-> +return +ans +>> +next +-> +return +ans +’Found +0003 +occurrences +of +the +letter +r +in +strawberry’ +Figure 11.25 +: +Interactive Python code debugging with +CWM +. 
As a showcase of +CWM +’s execution tracing capabilities ( +Section +2.2 +), we present an interactive debugger to steer execution trace predictions. From a given code context, +CWM +predicts the initial state of the trace (we selected this example based on the prediction of the +c +parameter). We can then prompt the model to generate the program state at the next line, or to directly predict the return value after a function is entered. In this example, we +step +into the +count_letters +function, step +out +of it after a few steps (implemented via continuous predictions until the function returns), and finally use +next +to predict the return value of +format_answer +without line-by-line unrolling. In between, we +print +the local variables as generated by +CWM +. +⬇ +<| +begin_of_text +|><| +trace_context_start +|> +from +mysterious_library +import +f +assert +f +(2, +2) +== +4 +assert +f +(3, +7) +== +10 +assert +f +(-1, +-8) +== +-9 +assert +f +(1, +1) +== +2 +f +(5, +5) +# +<< +START_OF_TRACE +<| +frame_sep +|> +---- +END +OF +PROMPT +--- +<| +call_sep +|>{}<| +action_sep +|> +f +(5, +5) +# +<< +START_OF_TRACE +<| +frame_sep +|><| +line_sep +|>{}<| +action_sep +|> +f +(5, +5) +# +<< +START_OF_TRACE +<| +frame_sep +|><| +call_sep +|>{" +a +": +"5", +" +b +": +"5"}<| +action_sep +|> +def +f +( +a +, +b +): +<| +frame_sep +|><| +line_sep +|>{" +a +": +"..", +" +b +": +".."}<| +action_sep +|> +return +a ++ +b +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +a ++ +b +<| +arg_sep +|>"10"<| +frame_sep +|> +Figure 11.26 +: +CWM +predicting the execution traces of an unseen function +f +, imported from a library not included in the context. The expected semantics of +f +can be inferred from the assertions in the context. +CWM +correctly generates the traces of a function that sums two numbers.
It is possible that the model acquired this capability of jointly tracing and generating code because for some tracing data we do not include the source context of third-party libraries. +⬇ +Given +the +following +problem +: +Three +little +pigs +from +all +over +the +world +are +meeting +for +a +convention +! +Every +minute +, +a +triple +of +3 +new +pigs +arrives +on +the +convention +floor +. +After +the +n +- +th +minute +, +the +convention +ends +. +The +big +bad +wolf +has +learned +about +this +convention +, +and +he +has +an +attack +plan +. +At +some +minute +in +the +convention +, +he +will +arrive +and +eat +exactly +x +pigs +. +Then +he +will +get +away +. +The +wolf +wants +Gregor +to +help +him +figure +out +the +number +of +possible +attack +plans +that +involve +eating +exactly +x +pigs +for +various +values +of +x +(1 +<= +x +<= +3 +n +). +Two +attack +plans +are +considered +different +, +if +they +occur +at +different +times +or +if +the +sets +of +little +pigs +to +eat +are +different +. +Note +that +all +queries +are +independent +, +that +is +, +the +wolf +does +not +eat +the +little +pigs +, +he +only +makes +plans +! +Input +The +first +line +of +input +contains +two +integers +n +and +q +(1 +<= +n +<= +10^6, +1 +<= +q +<= +2* +10^5), +the +number +of +minutes +the +convention +lasts +and +the +number +of +queries +the +wolf +asks +. +Each +of +the +next +q +lines +contains +a +single +integer +x_i +(1 +<= +x_i +<= +b3n +), +the +number +of +pigs +the +wolf +will +eat +in +the +i +- +th +query +. +Output +You +should +print +q +lines +, +with +line +i +representing +the +number +of +attack +plans +if +the +wolf +wants +to +eat +x_i +pigs +. +Since +each +query +answer +can +be +large +, +output +each +answer +modulo +10^9+7. +Examples +Input +2 +3 +1 +5 +6 +Output +9 +6 +1 +Input +... +Note +In +the +example +test +, +n +=2. +Thus +, +there +are +3 +pigs +at +minute +1, +and +6 +pigs +at +minute +2. 
+There +are +three +queries +: +x +=1, +x +=5, +and +x +=6. +If +the +wolf +wants +to +eat +1 +pig +, +he +can +do +so +in +3+6=9 +possible +attack +plans +, +depending +on +whether +he +arrives +at +minute +1 +or +2. +If +the +wolf +wants +to +eat +5 +pigs +, +the +wolf +cannot +arrive +at +minute +1, +since +there +aren +’ +t +enough +pigs +at +that +time +. +Therefore +, +the +wolf +has +to +arrive +at +minute +2, +and +there +are +6 +possible +attack +plans +. +If +the +wolf +wants +to +eat +6 +pigs +, +his +only +plan +is +to +arrive +at +the +end +of +the +convention +and +devour +everybody +. +Remember +to +output +your +answers +modulo +10^9+7! +Generate +the +code +step +by +step +. +You +MUST +FIRST +generate +the +execution +traces +for +the +code +you +want +to +generate +given +a +specific +input +, +and +then +if +it +’ +s +correct +extract +the +code +. +Your +code +in +the +final +answer +should +be +enclosed +in +triple +backticks +like +so +: +python +YOUR +CODE +HERE +. +Use +the +backticks +for +your +code +only +. +<| +eot_id +|><| +start_header_id +|> +assistant +⬇ +Okay +, +let +’ +s +see +. +The +problem +is +about +calculating +the +number +of +possible +attack +plans +for +the +big +bad +wolf +. +The +wolf +can +attack +at +any +minute +during +the +convention +, +and +he +eats +exactly +x +pigs +. +We +need +to +find +the +number +of +ways +he +can +do +this +for +various +values +of +x +. +Let +’ +s +jointly +synthethize +and +trace +the +code +for +this +using +the +first +provided +input +/ +output +example +. +First +, +I +will +emit +the +execution +traces +and +check +that +it +arrives +to +the +expected +output +. +Then +, +if +the +output +is +correct +, +extracting +the +code +is +straightforward +, +I +don +’ +t +need +to +think +anymore +and +can +proceed +to +the +final +answer +. +But +if +the +execution +is +incorrect +, +I +need +to +think +and +then +predict +the +corrected +traces +again +. 
+Let +’ +s +start +jointly +tracing +and +synthethizing +. +The +input +will +be +f +(2, +3, +[1, +5, +6]), +and +the +output +must +be +[9, +6, +1]; +otherwise +it +will +mean +I +predicted +the +wrong +code +and +need +to +start +again +. +Let +’ +s +see +: +<| +trace_context_start +|> +from +solution +import +f +assert +f +(2, +3, +[1, +5, +6]) +== +[9, +6, +1] +assert +f +(5, +4, +[2, +4, +6, +8]) +== +[225, +2001, +6014, +6939] +f +(2, +3, +[1, +5, +6]) +# +<< +START_OF_TRACE +<| +frame_sep +|><| +call_sep +|>{}<| +action_sep +|> +f +(2, +3, +[1, +5, +6]) +# +<< +START_OF_TRACE +<| +frame_sep +|><| +line_sep +|>{}<| +action_sep +|> +f +(2, +3, +[1, +5, +6]) +# +<< +START_OF_TRACE +<| +frame_sep +|><| +call_sep +|>{" +n +": +"2", +" +q +": +"3", +" +queries +": +"[1, +5, +6]"}<| +action_sep +|> +def +f +( +n +, +q +, +queries +): +<| +frame_sep +|> +---- +END +OF +PROMPT +--- +<| +line_sep +|>{" +n +": +"..", +" +q +": +"..", +" +queries +": +".."}<| +action_sep +|> +MOD += +10**9 ++ +7 +<| +frame_sep +|><| +line_sep +|>{" +n +": +"..", +" +q +": +"..", +" +queries +": +"..", +" +MOD +": +"1000000007"}<| +action_sep +|> +max_x += +3 +* +n +... +<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +return +ans +<| +arg_sep +|>"[9, +6, +1]"<| +frame_sep +|><| +return_sep +|><| +action_sep +|> +f +(2, +3, +[1, +5, +6]) +# +<< +START_OF_TRACE +<| +arg_sep +|>"[9, +6, +1]"<| +frame_sep +|> +Okay +, +the +execution +trace +is +correct +. +Now +, +I +can +proceed +to +extract +the +code +. 
+⬇ +‘‘‘ +python +def +f +( +n +, +q +, +queries +): +MOD += +10**9 ++ +7 +max_x += +3 +* +n +fact += +[1] +* +( +max_x ++ +1) +for +i +in +range +(1, +max_x ++ +1): +fact +[ +i +] += +fact +[ +i +- +1] +* +i +% +inv_fact += +[1] +* +( +max_x ++ +1) +inv_fact +[ +max_x +] += +pow +( +fact +[ +max_x +], +MOD +- +2, +MOD +) +for +i +in +range +( +max_x +- +1, +-1, +-1): +inv_fact +[ +i +] += +inv_fact +[ +i ++ +1] +* +( +i ++ +1) +% +def +comb +( +n +, +k +): +if +k +< +0 +or +k +> +n +: +return +0 +return +fact +[ +n +] +* +inv_fact +[ +k +] +* +inv_fact +[ +n +- +k +] +% +total += +0 +for +i +in +range +(1, +n ++ +1): +total ++= +3 +* +i +ans += +[] +for +x +in +queries +: +res += +0 +for +i +in +range +(1, +n ++ +1): +if +3 +* +i +>= +x +: +res ++= +comb +(3 +* +i +, +x +) +ans +. +append +( +res +% +return +ans +‘‘‘ +Figure 11.27 +: +CWM +interspersing code generation and execution tracing. +As indicated by “…”, we have omitted parts of +CWM +reasoning for clarity here. +⬇ +You +will +be +given +a +function +f +and +a +call +to +that +function +in +the +form +f +(??). +This +function +might +either +terminate +for +all +inputs +or +diverge +for +some +input +. +If +it +terminates +for +all +inputs +, +then +answer +"# +T +". +Otherwise +, +if +it +diverges +, +provide +an +input +such +that +executing +f +on +the +input +leads +to +a +diverging +execution +( +i +. +e +., +f +does +NOT +halt +with +that +input +), +followed +by +the +comment +"# +NT +". +In +case +non +- +terminating +inputs +exist +, +there +may +be +multiple +possible +answers +, +but +you +should +only +output +one +. +Express +your +final +answer +with +either +"# +T +", +if +the +function +terminates +for +all +inputs +, +or +as +function +call +to +f +containing +the +input +that +leads +to +the +non +- +terminating +execution +followed +by +the +comment +"# +NT +". 
+‘‘‘ +def +f +( +x +): +def +g +( +i +): +if +i +<= +0: +return +0 +return +g +( +g +( +g +( +i +-2) +- +1)) ++ +1 +def +h +( +j +): +if +j +<= +0: +return +0 +return +h +( +g +( +j +)-1) +- +1 +h +( +x +) +return +None +f +(??) +# +Terminates +? +‘‘‘ +---- +END +OF +PROMPT +--- +Okay +, +I +need +to +figure +out +if +the +function +f +( +x +) +terminates +for +all +inputs +or +if +there +’ +s +some +input +where +it +diverges +. +Let +’ +s +start +by +understanding +what +the +function +does +. +The +function +f +( +x +) +defines +two +helper +functions +, +g +( +i +) +and +h +( +j +). +It +then +calls +h +( +x +) +and +returns +None +. +So +, +the +termination +of +f +( +x +) +depends +entirely +on +the +behavior +of +h +( +x +), +which +in +turn +depends +on +g +( +i +). +Let +me +break +down +the +helper +functions +: +First +, +g +( +i +): +If +i +is +less +than +or +equal +to +0, +return +0. +Otherwise +, +return +g +( +g +( +g +( +i +-2) +- +1)) ++ +1. +Hmm +, +that +’ +s +a +recursive +function +. +Let +’ +s +see +what +happens +when +we +call +g +with +some +positive +integer +. +For +example +, +if +i +is +1: +g +(1) +would +call +g +( +g +( +g +(-1) +- +1)) ++ +1. +Since +g +(-1) +is +0, +this +becomes +g +( +g +(-1) +-1) += +g +(0 +-1) += +g +(-1) += +0. +Then +, +adding +1 +gives +1. +Wait +, +so +g +(1) +would +return +1? +Let +me +check +that +again +. +Wait +, +let +’ +s +step +through +g +(1): +g +(1) +calls +g +( +g +( +g +(1-2) +- +1)) ++ +1 +1-2 +is +-1, +so +g +(-1) +is +0. +Then +, +0 +-1 +is +-1. +So +g +(-1) +is +0 +again +. +Then +, +the +innermost +g +is +0, +so +we +have +g +(0) ++1. +Wait +, +no +, +let +me +re +- +express +this +. +Wait +, +the +expression +is +g +( +g +( +g +( +i +-2) +- +1)) ++ +1. +So +for +i +=1: +i +-2 += +-1. +So +g +(-1) +is +0. +Then +, +subtract +1: +0 +-1 += +-1. +Then +, +call +g +(-1) +again +, +which +is +0. +Then +, +call +g +(0) +which +is +0. +Then +add +1. 
+Wait +, +no +, +maybe +I +’ +m +getting +the +order +wrong +. +Wait +, +the +innermost +call +is +g +( +i +-2). +So +for +i +=1, +i +-2 +is +-1. +So +g +(-1) +returns +0. +Then +subtract +1: +0 +-1 += +-1. +Then +, +call +g +(-1) +again +, +which +returns +0. +Then +, +call +g +(0), +which +returns +0. +Then +add +1. +Wait +, +that +seems +off +. +Let +me +write +it +step +by +step +: +g +(1) += +g +( +g +( +g +(1-2) +- +1)) ++ +1 += +g +( +g +( +g +(-1) +- +1)) ++ +1 += +g +( +g +(0 +- +1)) ++ +1 +( +since +g +(-1) +is +0) += +g +( +g +(-1)) ++ +1 += +g +(0) ++ +1 +( +since +g +(-1) +is +0) += +0 ++ +1 += +1 +So +g +(1) +returns +1. +That +’ +s +interesting +. +What +about +g +(2)? +g +(2) += +g +( +g +( +g +(0) +- +1)) ++ +1 +g +(0) +is +0, +so +0 +-1 += +-1. +Then +g +(-1) +is +0. +Then +g +(0) +is +0. +So +0 ++1 += +1. +So +g +(2) +is +1. +Wait +, +that +’ +s +the +same +as +g +(1). +What +about +g +(3)? +g +(3) += +g +( +g +( +g +(1) +- +1)) ++ +1 +g +(1) +is +1, +so +1 +-1 += +0. +Then +g +(0) +is +0. +Then +g +(0) +is +0. +So +0 ++1 += +1. +So +g +(3) +is +1. +Hmm +, +so +for +i +=1,2,3, +g +( +i +) +returns +1. +What +about +i +=4? +g +(4) += +g +( +g +( +g +(2) +- +1)) ++ +1 +g +(2) +is +1, +so +1 +-1 += +0. +Then +g +(0) +is +0. +Then +g +(0) +is +0. +So +0 ++1 += +1. +So +g +(4) +is +1. +Wait +, +so +it +seems +like +for +any +i +> +0, +g +( +i +) +returns +1. +Is +that +possible +? +Let +me +check +i +=5. +g +(5) += +g +( +g +( +g +(3) +- +1)) ++ +1 +g +(3) +is +1, +so +1 +-1 += +0. +Then +g +(0) +is +0. +Then +g +(0) +is +0. +So +0 ++1 += +1. +Yep +, +same +result +. +So +, +in +general +, +for +any +i +> +0, +g +( +i +) +will +compute +to +1. +Because +no +matter +what +i +is +, +when +you +subtract +2 +enough +times +, +you +get +to +a +point +where +the +inner +g +call +returns +0, +leading +to +a +chain +that +ends +up +with +0 ++1 += +1. +Wait +, +but +what +if +i +is +even +or +odd +? +Let +’ +s +see +. 
+Let +’ +s +take +i +=6: +g +(6) += +g +( +g +( +g +(4) +- +1)) ++ +1 +g +(4) +is +1, +so +1 +-1 += +0. +Then +g +(0) +is +0. +Then +g +(0) +is +0. +So +0 ++1 += +1. +Same +result +. +So +regardless +of +the +input +i +( +as +long +as +it +’ +s +positive +), +g +( +i +) +returns +1. +That +’ +s +a +key +insight +. +So +g +( +i +) +is +a +function +that +returns +0 +if +i +<=0, +and +1 +otherwise +. +Now +, +moving +on +to +h +( +j +): +h +( +j +) +is +defined +as +: +If +j +<= +0, +return +0. +Otherwise +, +return +h +( +g +( +j +) +- +1) +- +1. +But +since +g +( +j +) +is +1 +for +any +j +> +0, +then +g +( +j +) +-1 +is +0. +So +for +any +j +> +0, +h +( +j +) +becomes +h +(0) +-1. +But +h +(0) +is +0, +so +h +( +j +) += +0 +-1 += +-1. +Wait +, +but +then +h +( +j +) +for +j +> +0 +would +be +-1. +But +then +, +if +we +call +h +again +with +that +value +? +Wait +, +let +’ +s +step +through +h +( +j +) +for +j +> +0. +Let +’ +s +take +j +=1: +h +(1) += +h +( +g +(1) +- +1) +-1 +g +(1) +is +1, +so +1 +-1 += +0. +Then +h +(0) +is +0. +So +0 +-1 += +-1. +So +h +(1) +returns +-1. +What +about +j +=2? +h +(2) += +h +( +g +(2) +-1) +-1 +g +(2) +is +1, +so +1 +-1 += +0. +h +(0) +is +0. +0 +-1 += +-1. +So +h +(2) +is +-1. +Same +for +any +j +> +0. +So +h +( +j +) +for +j +> +0 +is +always +-1. +So +when +f +( +x +) +calls +h +( +x +), +if +x +is +positive +, +h +( +x +) +returns +-1, +and +then +f +returns +None +. +If +x +is +0 +or +negative +, +h +( +x +) +returns +0, +and +then +f +returns +None +. +Wait +, +but +this +seems +too +straightforward +. +So +in +all +cases +, +h +( +x +) +terminates +. +Because +even +if +x +is +a +very +large +number +, +h +( +x +) +just +does +a +constant +number +of +steps +. +For +example +, +h +(1000) +would +compute +g +(1000) +which +is +1, +subtract +1 +to +get +0, +then +call +h +(0) +which +is +0, +subtract +1 +to +get +-1. +So +it +’ +s +a +finite +number +of +steps +. 
+Therefore +, +h +( +x +) +always +terminates +, +regardless +of +the +input +x +. +Therefore +, +the +function +f +( +x +) +will +always +terminate +, +no +matter +what +input +x +is +given +. +So +the +answer +should +be +"# +T +". + +< +answer +> +# +T +Figure 11.28: CWM predicting termination. The prompt features a terminating example in HaltEval-prelim. After emitting a reasoning trace, which considers behavior on several specific inputs, CWM correctly predicts termination on all inputs. +12 RL algorithm +Given a prompt $x$, we perform $G$ rollouts, producing a set of trajectories (i.e., token sequences) $\{y_{1},y_{2},\ldots,y_{G}\}$. +In general, rollouts are multi-turn, so the trajectories $y_{i}$ consist of a prompt $x$ followed by a sequence of actions and observations. We use the binary mask $M_{i,t}$ to signal whether token $y_{i,t}$ was generated by the agent ($M_{i,t}=1$) or environment (initial prompt and later observations; $M_{i,t}=0$). +The first input required by the PPO loss is an estimate of the advantage. We denote by $R_{i}$ the total return (i.e., sum of undiscounted rewards) of trajectory $i$. +For a batch of $G$ trajectories, we compute the length-weighted mean return $\mu=\frac{1}{L}\sum_{i=1}^{G}R_{i}\times L_{i}$, where $L_{i}=\sum_{t}M_{i,t}$ and $L=\sum_{i}L_{i}$ is the total number of agent-generated tokens. The advantage is then $\hat{A}_{i}=R_{i}-\mu$. +The PPO loss further requires the log probabilities of the trajectory under the behavior policy, often denoted $\pi_{\text{old}}$, in order to compute the importance ratio. +One complicating factor here is that the workers continue rollouts in parallel to model updates (see Section 6.2).
+At a given point in time, any number of the +G +G +rollouts in a batch may be in progress. Hence, the true behavior policy distribution is difficult to describe mathematically. Nevertheless, we use the notation +log +⁡ +π +old +​ +( +y +i +, +t +| +y +i +, +< +t +) +\log\pi_{\text{old}}(y_{i,t}|y_{i, +(a) +GRPO (5,549 tokens) +… +At pH 7.4, all functional groups are neutral +… +maintaining a balance between hydrophobic and hydrophilic character +… +[The] overall polarity +… +keeps logD from being very high +… +or very low +… +[typically falling] in the 2.0-3.0 range, with 2.61 (C) being a reasonable estimate +… + +C + +(b) +SDPO (764 tokens) +Figure 7: +Example responses from GRPO and SDPO after 50 training steps to the following question: “What is the correct octanol/water distribution coefficient logD under the circumstance of pH 7.4 for the molecule +O=C1O[C@@H](COc2ccon2)CN1c1ccc(C2=CCOCC2)c(F)c1 +?” The answer options are A: 1.32, B: 1.85, C: 2.61, D: 3.76. The correct answer is +C +. +GRPO’s answer contains +5 +× +5\times +“Hmm.”, +9 +× +9\times +“No.”, and +25 +× +25\times +“Wait”. Further, GRPO’s answer repeats calculations such as “ +10 +1.85 +≈ +69.3 +\smash{10^{1.85}\approx 69.3} +”, which appears four times, and the model even explicitly generates “Wait I’m going in circles”. +SDPO’s answer avoids any circular reasoning and is more than +7 +× +7\times +shorter. The base model is Qwen3-8B. +4 +Learning with Rich Environment Feedback +We next evaluate SDPO on coding tasks. +Coding is a canonical example of an RL environment that provides rich feedback, such as runtime errors and failed unit tests. +Learning to solve these coding problems requires strong credit assignment since the student must identify its precise mistakes to avoid repeating them in the future. +LiveCodeBench +(LCB; Jain et al., +2025 +) +provides a set of contest-style coding problems, ranging from simple to competition-level. 
+We restrict our evaluation to the most recent LCBv6 subset of LCB, which contains 131 questions released between February and May 2025. +We consider a setting with public and private unit tests, common for code contests and coding platforms like LeetCode, where the public tests are used for evaluation during training and the private tests are used for validation (Chen et al., 2022; Le et al., 2022; El-Kishky et al., 2025; Samadi et al., 2025). +(Footnote 5: We select public tests as a 50% random subset of private tests.) +We use the Qwen3 (Yang et al., 2025a) model family for our experiments, with Qwen3-8B as default unless otherwise specified. +We report the average accuracy over 4 rollouts and use the same GRPO baseline as outlined in Section 3.1. +Results. +Figure 1 compares the learning curves of SDPO and GRPO on LCBv6. +We find that SDPO achieves a substantially higher final accuracy (48.8%) than GRPO (41.2%) while also outperforming the strongest instruct models on the public LCBv6 leaderboard: +(Footnote 6: On the public leaderboard, the LCBv6 subset can be obtained by selecting February to May 2025.) +Claude Sonnet 4 (40.5%) and Claude Opus 4 (39.7%). +Furthermore, SDPO reaches the final accuracy of GRPO in $4\times$ fewer generations. +We include an extended comparison to other RLVR baselines that perform similarly to GRPO in Table 9 in the appendix. +Differentiating between the easy, medium, and hard questions of LCB, we find that SDPO particularly improves over GRPO in solving medium and hard questions (cf. Figure 15 in the appendix). +4.1 Self-distillation benefits from stronger models +Figure 8: SDPO improves with model size. +We compare the final LCBv6 validation accuracy of SDPO and GRPO at train step 80, across model sizes from Qwen3. +The ability of SDPO’s teacher to perform accurate retrospection appears to be an emergent phenomenon with scale.
+We include an additional scaling study with Qwen2.5-Instruct in the appendix (cf. Figure 17) which further supports this finding. +Error bars indicate the standard error across 3 seeds. +A central question for our work is whether SDPO is sensitive to the in-context learning ability of the base model. +Intuitively, we expect that SDPO benefits from a strong in-context learner, since this enables the teacher to perform more accurate retrospection. +To answer this question, we perform a scaling study with different model sizes from the Qwen3 (Yang et al., 2025a) family. +As shown by extensive prior work, the ability to learn in-context increases with model size (e.g., Brown et al., 2020). +As depicted in Figure 8, SDPO significantly outperforms GRPO on larger models while only slightly improving over GRPO on smaller models. +To determine whether SDPO can also underperform GRPO on a model weaker than Qwen3-0.6B, we performed an additional scaling study with Qwen2.5-Instruct (Qwen et al., 2024). +While outperforming GRPO with Qwen2.5-7B and performing similarly with Qwen2.5-3B, we find that SDPO underperforms GRPO on Qwen2.5-1.5B, as seen in Figure 17 in Appendix D. +Takeaway 2 +Our results suggest that the marginal improvement of SDPO over GRPO is tightly coupled with the strength of the base model, and motivate future study on models stronger than Qwen3-8B. +In the same way that in-context learning is an emergent phenomenon with scale, the self-teacher’s ability to perform accurate retrospection in SDPO appears to be emergent with scale. +4.2 Self-distillation performs dense credit assignment +Figure 9: Dense credit assignment in SDPO in the example from Figure 4. Shown in blue are tokens which become more likely under the self-teacher. The self-teacher identifies how the returned range has to be modified so that it does not contain n.
+Whereas GRPO assigns a constant advantage to each generated token, SDPO assigns an individual advantage to each possible next token along the generated sequence based on the agreement of student and teacher. +At each position $t$ in the generated sequence $y$, there are $|\mathcal{V}|$ possible next tokens where $\mathcal{V}$ is the vocabulary. +In distillation, this level is typically called the logit-level since it corresponds to the logits of the model. +In practice, we approximate the full next-token distribution by the top-$K$ tokens plus the tail, and as such, SDPO assigns $|y|\cdot(K+1)$ unique advantages per sequence. +This is illustrated in Figure 9 and allows SDPO to perform dense credit assignment. +A natural question is whether the performance gains of SDPO are due to leveraging rich feedback in RLRF or due to the dense credit assignment of SDPO. +To answer this question, we ablate the performance of SDPO in three configurations: +• Logit-level SDPO: credit assignment over the 100 most likely tokens (under the student) at each position. +• Token-level SDPO: credit assignment over the most likely token at each position. +• Sequence-level SDPO: We compute SDPO advantages for all generated tokens and average them to produce a single scalar advantage per sequence (as in GRPO). This does not perform denser credit assignment than GRPO but still leverages the rich feedback $f$. +As shown in Figure 10 (left), the dense credit assignment of logit-level SDPO leads to significant performance gains over token-level SDPO and sequence-level SDPO. +Nevertheless, even sequence-level SDPO outperforms GRPO, indicating that leveraging rich feedback in RLRF can lead to substantial gains over RLVR methods even without dense credit assignment. +4.3 The self-teacher improves during training +Figure 10: Left: Rich feedback in RLRF and dense credit assignment of SDPO are complementary.
+We compare logit-level, token-level, and sequence-level SDPO advantages to GRPO. While denser credit assignment in SDPO is beneficial (logit-level > token-level > sequence-level), even sequence-level SDPO significantly outperforms GRPO due to leveraging the rich feedback. Error bars indicate the standard error across 3 seeds. +Right: The self-teacher improves during training. +We display the generative accuracy of the self-teacher compared to student on the current training batch (with a rolling average over 5 steps). The final student score is taken at step 80. Notably, the performance of the student significantly surpasses the initial teacher’s accuracy. Error bars indicate the standard deviation across 3 seeds. +Teacher +Accuracy +Avg accuracy +q +θ +q_{\theta} +36.1 +± +1.6 +36.1\pm 1.6 +29.8 +± +1.3 +29.8\pm 1.3 +q +θ +ref +q_{\theta_{{\mathrm{ref}}}} +48.8 +± +0.7 +48.8\pm 0.7 +44.4 +± +0.2 +44.4\pm 0.2 +Trust-region +50.6 +± +0.9 +\mathbf{50.6}\pm 0.9 +45.6 +± +0.2 +\mathbf{45.6}\pm 0.2 +EMA +49.3 +± +0.3 +49.3\pm 0.3 +45.3 +± +0.2 +\mathbf{45.3}\pm 0.2 +Table 4: +Best/average accuracy until step 90 of various methods for teacher regularization. Trust-region and EMA teachers use +α += +0.01 +\alpha=0.01 +. Training of the +q +θ +q_{\theta} +eventually diverges. Error ranges indicate standard errors across 3 seeds. +Contrary to standard distillation, the self-teacher in SDPO is not frozen, but updated throughout training. +This is a critical component of SDPO, since it enables the teacher to improve over time, which means that the student can learn from a stronger target. +To investigate whether the self-teacher improves during training, we plot the average accuracy when +generating +using the self-teacher in Figure +10 +(right) +. +We find that the self-teacher improves significantly during training. +Most notably, the student’s accuracy surpasses the initial teacher’s accuracy in later stages of training. 
+This demonstrates that SDPO enables true bootstrapping of a weak model to a strong model, without the initial self-teacher’s performance limiting the final student. +As described in +Section +˜ +2.3 +, SDPO uses a regularized teacher to stabilize training. +As can be seen in +Table +˜ +4 +, a non-regularized teacher significantly underperforms the regularized teachers. +Furthermore, trust-region and EMA teachers outperform the teacher frozen at the initial teacher’s parameters, showing that the teacher improves through parameter sharing with the student. +Yet, SDPO performs well even with a frozen teacher. +4.4 +On-policy self-distillation avoids catastrophic forgetting +Prior work has shown that a key benefit of on-policy algorithms, such as GRPO, is that models tend not to forget previously obtained capabilities +(Shenfeld et al., +2026b +; Chen et al., +2025b +; Lu & Thinking Machines Lab, +2025 +) +. +This is practically desirable since it enables continual training pipelines where a model is trained sequentially on diverse tasks without the need to retrain from scratch. +To evaluate forgetting, we test the final checkpoints of GRPO and SDPO on diverse holdout tasks: IFEval +(Zhou et al., +2023 +) +, which tests the ability of a model to follow precise format instructions; ArenaHard-v2 +(Li et al., +2025a +) +, which is an LLM-judged benchmark of real-world instruction-following prompts derived from LMArena +(Chiang et al., +2024 +) +; and MMLU-Pro +(Wang et al., +2024b +) +, which tests broad multi-task knowledge and reasoning. +As displayed in +Table +˜ +5 +, SDPO learns the new task while mitigating degradation of initial capabilities, overall achieving a better performance–forgetting tradeoff than GRPO. +Off-policy self-distillation baseline. +As an additional baseline, we consider training the student via supervised fine-tuning (SFT) on successful generations from the self-teacher +(Scheurer et al., +2023 +; Dou et al., +2024 +; Zhou et al., +2025 +) +. 
+7 +7 +7 +SFT on a teacher’s predictions is a standard off-policy distillation approach +(Kim & Rush, +2016 +) +. +This requires +2 +× +2\times +the generations of SDPO for the same number of steps, since we have to generate from both the student and the teacher. +We report SFT on the successes of the self-teacher, which achieves a higher accuracy than also including initial successes from the student in the SFT data. +As shown in +Table +˜ +5 +, SFT on the self-teacher significantly underperforms SDPO on LCBv6, while leading to worse forgetting of prior capabilities. +This mirrors prior findings on the instability of off-policy imitation +(see, e.g., Agarwal et al., +2024 +) +. +Task: +Holdout tasks: +LCBv6 +IFEval +ArenaHard-v2 +(hard prompt) +ArenaHard-v2 +(creative writing) +MMLU-Pro +Avg. +(holdout) +Base +27.9 +27.9 +83.9 +{83.9} +14.0 +{14.0} +13.7 +{13.7} +62.5 +{62.5} +43.5 +{43.5} +SFT on self-teacher +42.7 +{42.7} +83.7 +11.2 +8.9 +61.9 +41.4 +GRPO +41.2 +41.2 +82.2 +82.2 +12.0 +12.0 +10.8 +10.8 +62.3 +62.3 +41.8 +41.8 +SDPO +48.8 +{48.8} +83.2 +{83.2} +12.3 +{12.3} +11.1 +{11.1} +62.9 +{62.9} +42.4 +{42.4} +Table 5: +On-policy methods do not suffer from catastrophic forgetting. +We compare the accuracy of the final checkpoint on the training task LCBv6 and on holdout tasks IFEval, ArenaHard-v2, and MMLU-Pro. We compare to a baseline that trains directly on responses generated by the initial self-teacher with SFT. Overall, SDPO achieves the best performance–forgetting tradeoff. We include additional baseline results in +Table +˜ +9 +in the appendix. +4.5 +Can GRPO and SDPO be combined? +GRPO utilizes Monte Carlo advantages, which are unbiased with respect to the objective of maximizing expected reward +J +​ +( +θ +) +:= +𝔼 +y +∼ +π +θ +( +⋅ +∣ +x +) +​ +[ +r +​ +( +y +∣ +x +) +] +J(\theta):=\smash{\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}{}\left[r(y\mid x)\right]} +. 
+In contrast, SDPO advantages are inherently biased with respect to $J(\theta)$ due to being computed from rich feedback and a self-teacher. +This dichotomy parallels the fundamental distinction between Monte Carlo and bootstrapped advantages in RL: while the latter are biased, they typically yield lower variance (Sutton & Barto, 1998; Schulman et al., 2016). +This motivates a hybrid approach that combines reward-derived GRPO advantages with feedback-derived SDPO advantages: +\[ A_{i,t}^{\mathrm{SDPO+GRPO}}(\hat{y}_{i,t}) := \lambda\, A_{i,t}^{\mathrm{GRPO}}(\hat{y}_{i,t}) + (1-\lambda)\, A_{i,t}^{\mathrm{SDPO}}(\hat{y}_{i,t}), \quad \lambda\in[0,1]. \] (3) +Figure 11: We compare the LCBv6 validation accuracy at step 80, across model sizes from Qwen3. +SDPO+GRPO significantly outperforms SDPO on the weaker Qwen3-0.6B, while slightly underperforming SDPO on stronger models. +We use $\lambda=0.9$. +Error bars indicate the standard error across 3 seeds. +As shown in Figure 11, SDPO+GRPO appears to be more robust to weaker models than SDPO. +Intuitively, in a weaker model such as Qwen3-0.6B, the SDPO advantages are less reliable, and hence including the GRPO advantage helps to stabilize training. +In contrast, we find that SDPO+GRPO slightly underperforms SDPO on stronger models such as Qwen3-8B. +This suggests that the signal of GRPO, only informed by a scalar reward, can be actively harmful with a strong initial model. +4.6 Which feedback is most informative?
+To understand which type of rich feedback is most informative, we ablate the three types of feedback present in a verifiable environment like code generation: the sample solution (if a successful rollout is available in the current rollout group), the environment output (such as runtime errors), and the student’s original attempt.
+Sample solutions.
+Including a sample solution from a failed attempt’s rollout group (if available) closely mirrors the group-relative advantages of GRPO.
+We emphasize that these sample solutions are always generated by the student, as in GRPO, and do not require an expert model.
+They allow for disincentivizing unsuccessful approaches if the model is already able to solve the question.
+However, unlike GRPO where all tokens receive the same negative advantage, the self-teacher can identify specific mistakes and provide feedback on how to fix them.
+Environment output.
+The environment output describes the state of the environment after the student’s attempt.
+This is complementary to sample solutions since it can provide useful signal even if the student has never solved the question before (a setting we explore extensively in Section 5).
+Leveraging environment output is a key differentiating factor between RLRF and RLVR settings.
+Student’s original attempt.
+The student’s original attempt $y$ does not have to be included in the reprompting template of the teacher.
+Indeed, we find that including it biases the teacher towards the student’s attempt (cf. Table 6).
+This reduces the entropy of the student’s distribution (particularly for initially uncertain tokens), thereby reducing exploration.
+| Feedback | Teacher before training: $\uparrow$ Acc. (%) | Teacher before training: $\downarrow$ Same output (%) | Student trained with SDPO: $\uparrow$ Acc. (%) | Student trained with SDPO: Avg. entropy |
+| --- | --- | --- | --- | --- |
+| $f=$ output | $32.5 \pm 0.5$ | $13.7 \pm 0.6$ | $39.9 \pm 1.1$ | $0.40 \pm 0.0$ |
+| $f=$ own solution | $\mathbf{42.4} \pm 1.0$ | $12.1 \pm 0.7$ | $42.6 \pm 1.3$ | $0.41 \pm 0.0$ |
+| $f=$ output + own solution | $\mathbf{42.5} \pm 1.2$ | $\mathbf{10.1} \pm 0.2$ | $\mathbf{48.3} \pm 1.4$ | $0.38 \pm 0.0$ |
+| $f=$ $y$ + output + own solution | $39.3 \pm 0.8$ | $30.0 \pm 0.9$ | $44.5 \pm 1.3$ | $\mathit{0.23} \pm 0.0$ |
+Table 6:
+Performance of varying kinds of feedback.
+We evaluate informativeness of feedback based on SDPO training (until step 60) as well as the direct impact on the self-teacher. “Same output” measures the percentage of cases where the teacher receives the same environment output as the student’s initial attempt (i.e., not exploring alternative approaches). We observe that environment output and sample solutions are complementary and each provide informative feedback. Naively including only solutions or initial attempts $y$ significantly reduces diversity in the teacher and student. We remark that the sample solutions are generated by the student, enabling similar group-relative advantage estimation to GRPO. Error bars indicate standard deviation across 3 seeds.
+We summarize results in Table 6 where we evaluate the effect on SDPO training as well as the direct impact on the self-teacher.
+We find that environment output & sample solutions are complementary, each providing informative feedback.
+Generally, we observe that performance is not sensitive to syntactic variations of the reprompting template from Table 2.
+5
+Solving Hard Questions via Test-Time Self-Distillation
+In Sections 3 and 4, we have demonstrated that SDPO can substantially improve over RLVR methods when performing “train-time RL” for reasoning tasks. 
+We now turn to a test-time setting where the model is given only a single hard (binary-reward) question $x$ and must discover a solution as quickly as possible:
+Definition 5.1 (Discovery time).
+The discovery time is the number of trials needed until a solution is found (i.e., the smallest $k$ with the $k$-th attempt $y_{k}$ receiving reward 1).
+Based on this notion, we can define a measure of the efficacy of discovery:
+$$\begin{split}\mathrm{discovery@}k :=&\ \mathbb{P}(\text{discovery time} \leq k)\\ =&\ \mathbb{P}\big(r(y_{1}\mid x)=1 \text{ or } r(y_{2}\mid x)=1 \text{ or } \ldots \text{ or } r(y_{k}\mid x)=1\big),\end{split} \tag{4}$$
+where the probability is over any randomness in the algorithm producing $y_{k}$ and the rewards.
+Thus, the discovery@$k$ metric quantifies the probability of discovering the solution within $k$ steps.
+[Footnote 8: Our proposed discovery@$k$ metric is a canonical metric in the study of runtime speedup (i.e., time until termination, Dolan & Moré (2002)).]
+While prior work has studied discovery with continuous rewards (e.g., Novikov et al., 2025; Yuksekgonul et al., 2026), discovery with language models in sparse or binary-reward settings does not allow “hill-climbing” a continuous reward and has remained less well understood.
+The most naive approach to discovery in binary-reward tasks is to sample repeatedly i.i.d. from the base model, also known as best-of-$k$.
+The canonical pass@$k$ metric for best-of-$k$ sampling is exactly the probability of discovering at least one solution within $k$ independent samples from a fixed model, coinciding with discovery@$k$.
+The discovery@$k$ metric generalizes pass@$k$ to algorithms that sample attempts sequentially. 
+A common sequential approach re-prompts the base model with additional context from previous attempts (Madaan et al., 2023; Shinn et al., 2023).
+We refer to this as multi-turn sampling.
+Here, the model itself does not change, only its context evolves over time.
+Performing RLVR on the question $x$ does not improve over best-of-$k$ sampling from the base model, since a binary reward provides no signal until the first solution has already been found.
+[Footnote 9: For this reason, several works consider explicitly constructing curricula of solvable questions (e.g., Zhao et al., 2025; Huang et al., 2026; Diaz-Bone et al., 2025; Hübotter et al., 2025b), which self-distillation avoids. Other work found that RLVR yields limited improvement on hard questions (Yue et al., 2025).]
+An RLRF method like SDPO does not face the same limitation, as it receives rich feedback from the environment after each attempt.
+This rich feedback enables the model to repeatedly “correct” its mistakes as it encounters them and receives feedback, even before ever discovering a solution.
+In contrast to multi-turn sampling, SDPO repeatedly compresses context $c=(y_{k},f_{k})$ by distilling $\pi_{\theta}(\cdot\mid x,c)$ into a model $\pi_{\theta^{\prime}}(\cdot\mid x)$ as we illustrate in Figure 12.
+This self-distillation enables SDPO to continually learn over long contexts, whereas the memory bottleneck of transformers inherently limits the context length of multi-turn sampling (Vaswani et al., 2017).
+In this section, we seek to answer the question:
+Can repeatedly compressing context into model weights via self-distillation accelerate discovery for hard questions?
+Figure 12:
+Compressing context into model weights via self-distillation.
+We illustrate the process of distilling the interaction history (context $c$) into the model parameters $\theta$. 
+The model $\pi_{\theta}$ repeatedly attempts a fixed hard question $x$, generating an answer $y$ and receiving feedback $f$.
+Rather than appending this history to the context window, the model updates its weights $\theta_{t}\to\theta_{t+1}$ with SDPO (batch size $1$) based on the feedback, effectively “fixing” mistakes by encoding $\pi_{\theta}(\cdot\mid x,c)$ directly into the policy $\pi_{\theta^{\prime}}(\cdot\mid x)$.
+5.1
+Experimental setting
+We consider a particularly challenging subset of questions from LCBv6 that are at Qwen3-8B’s performance ceiling and require significant test-time sampling to find any solution.
+Concretely, we define two groups using Qwen3-8B’s pass@$k$: Hard tasks with $\text{pass@}64<0.5$ and very hard tasks with $\text{pass@}64<0.03$.
+Among these, we retain questions for which any of best-of-$k$, multi-turn, or SDPO find at least one solution within $512$ steps across $5$ seeds.
+This results in 19 hard and 9 very hard questions.
+For best-of-$k$ sampling under the base model, we report the standard $\text{pass}@k$ estimate (Chen et al., 2021b) from 2944 independent rollouts.
+As multi-turn sampling, we sequentially reprompt the model in-context using the concatenated feedback from previous attempts. To remain within Qwen3-8B’s 40k-token context limit, we employ a first-in, first-out sliding window, discarding the earliest feedback once the maximum prompt length (32k tokens) is reached.
+We ablate the multi-turn reprompting strategy in Figure 19 in Appendix D and find that retaining only past feedback while forgetting earlier attempts significantly outperforms the baseline that additionally retains past attempts.
+We evaluate SDPO with a batch size of 16. 
We ablate this choice in Figure 19 in Appendix D and find that overall performance differences are marginal, yet smaller batch sizes are beneficial for improvements at low generation budgets, while larger batch sizes result in more stable updates that still learn to solve questions at later stages into the run.
+5.2
+Results
+Figure 13:
+Self-distillation at test-time solves LiveCodeBench questions that neither the base model nor multi-turn conversations can solve.
+Left: Very hard questions (9 total) from LCBv6 where the base model achieves $\text{pass}@64<0.03$, i.e., in less than 3% of cases, sampling 64 responses yields any success.
+Right: Hard questions (19 total) from LCBv6 where the base model achieves $\text{pass}@64<0.5$.
+We report the $\text{discovery}@k$ metric, representing the probability of discovering at least one solution within $k$ total generations.
+Across both difficulty levels, SDPO achieves higher $\text{discovery}@k$ rates at almost all generation budgets, compared to the base model and a multi-turn conversation baseline that receives the feedback in-context. We report the mean and bootstrapped 90% confidence intervals of the mean across 5 random seeds per question.
+Figure 13 compares $\text{discovery}@k$ for SDPO, multi-turn sampling, and best-of-$k$ sampling on very hard (left) and hard (right) questions from LCBv6. Across both difficulty levels, SDPO achieves substantially higher $\text{discovery}@k$ rates at almost all generation budgets.
+On very hard tasks, multi-turn and best-of-$k$ largely fail to solve questions within the available generation budget, achieving discovery@2750 of only $35.6\%$ and $41.5\%$, respectively, whereas SDPO discovers a solution in $53.2\%$ of cases. 
+SDPO not only solves more questions overall but also does so with substantially fewer attempts.
+Notably, to reach a $22\%$ discovery probability on very hard questions, SDPO requires approximately $3\times$ fewer generations than best-of-$k$ and multi-turn sampling.
+On hard tasks, SDPO reaches a $78\%$ discovery@2750 probability while achieving a $67\%$ discovery probability with roughly $2.4\times$ fewer generations than best-of-$k$ and multi-turn sampling. Overall, multi-turn and best-of-$k$ sampling solve only $68.4\%$ and $72.3\%$ of questions, respectively.
+The context window length for multi-turn sampling is reached after 837 ($\pm 466$) steps for hard questions and after 1007 ($\pm 349$) steps for very hard questions, offering a possible explanation for its diminishing gains at high generation budgets.
+Question 3 is only solved by SDPO.
+SDPO solves all questions that are solved by best-of-$k$ and multi-turn sampling. Beyond that, SDPO uniquely discovers a solution for Q3, which is neither solvable with multi-turn sampling nor with best-of-$k$ sampling within 2750 attempts. In contrast, SDPO first discovers a solution for Q3 after 321 attempts, which corresponds to 20 iteration steps of self-distillation based on feedback with a batch size of 16. We include detailed per-question results in Table 10 in Appendix D.
+The initial self-teacher does not solve hard questions.
+Notably, the self-teacher’s initial accuracy is $<1\%$ for almost all questions, and even exactly $0\%$ on $78\%$ of them (Table 11 in Appendix D).
+This shows that a single turn of in-context feedback is insufficient to solve the problem.
+Despite this, the self-teacher’s credit assignment is sufficiently effective for SDPO to iteratively refine the policy and eventually solve these questions. 
+Takeaway 3 +We demonstrate that rich environment feedback enables SDPO to significantly accelerate discovery for hard questions. +This is in contrast to RLVR methods, which only receive a binary reward signal, and therefore only begin learning once the first solution has already been found. +6 +Related Work +6.1 +Reinforcement Learning with LLMs +Recently, large-scale RL training on diverse tasks has significantly improved the performance of LLMs on general reasoning tasks +(Guo et al., +2025 +; Kimi et al., +2025 +; Olmo et al., +2025 +; Jaech et al., +2024 +; Lambert et al., +2025 +) +. +This progress is primarily enabled by RLVR methods that use Monte Carlo estimates of rewards, such as STaR or GRPO +(Zelikman et al., +2022 +; Shao et al., +2024 +) +, similar to the classical REINFORCE algorithm +(Williams, +1992 +) +. +While several traditional RLVR algorithms rely on learning separate value networks +(Schulman et al., +2017 +) +, they incur substantial memory costs and retain the information bottleneck of scalar rewards. +In the RLVR setting, it is common for an (outcome) reward to be given only at the end of a sequence. +To improve credit assignment, several works learn so-called process reward models (PRMs) that estimate rewards for each step in the sequence +(Lightman et al., +2023 +; Wang et al., +2024a +; Setlur et al., +2025 +) +. +Unlike our RLRF setting, PRMs are typically trained on scalar rewards, either on value estimates for intermediate states or on outcome rewards +(Cui et al., +2025 +) +. +Unlike the self-teacher in SDPO, PRMs are a distinct model from the student, introducing significant memory overhead. +Our work shows that +each language model is implicitly a PRM +through retrospection if given rich feedback. 
+Conceptually, our work is related to “bootstrapping your own latent” +(BYOL; Grill et al., +2020 +) +and “expert iteration” +(Anthony et al., +2017 +) +where a student is bootstrapped by repeatedly imitating an improved version of itself (called the “expert”). +Canonically, the expert combines the student with test-time search, such as tree search +(Anthony et al., +2017 +) +or majority voting +(Zuo et al., +2025 +) +. +In contrast, SDPO leverages the student’s ability to learn from rich feedback provided in-context, which is related to “augmented views” in BYOL. +6.2 +Learning from Rich Feedback and through Retrospection +Beyond scalar outcome rewards, recent works have leveraged rich execution or verbal feedback to guide generation +(Gehring et al., +2025 +; Feng et al., +2024b +; Yuksekgonul et al., +2025 +) +. +A primary line of research focuses on translating verbal feedback into reward functions for RL. +This is often achieved by mapping feedback to discrete token-level rewards using an external frozen model +(Wang et al., +2026 +) +, or by employing strong external LLMs to explicitly construct state-wise reward functions +(Goyal et al., +2019 +; Xie et al., +2024 +; Urcelay et al., +2026 +) +. +Alternatively, feedback can be utilized without explicit reward modeling. +Several approaches focus on in-context improvement without integrating the process into the RL optimization loop +(Chen et al., +2021a +; Madaan et al., +2023 +; Shinn et al., +2023 +; Yao et al., +2024 +; Yuksekgonul et al., +2025 +; Lee et al., +2025 +) +. +Others manually curate preference datasets by pairing responses before and after feedback to train with direct preference optimization +(Stephan et al., +2024 +; Lee et al., +2024 +) +, though this requires additional generation and lacks the direct credit assignment of SDPO. 
+Various recent works bootstrap thinking traces from known answers, using these answers as rich feedback +(Zhou et al., +2026 +; Hatamizadeh et al., +2026 +; Zhang et al., +2025 +) +. +A central object in several recent works is a feedback-conditioned policy +π +θ +​ +( +y +∣ +x +, +f +) +\pi_{\theta}(y\mid x,f) +, which learns answers +y +y +that lead to feedback +f +f +(Liu et al., +2023 +; Zhang et al., +2023 +; Luo et al., +2025 +) +, typically through supervised objectives. +The idea behind these approaches is to deploy a policy conditioned on desirable (i.e., positive) feedback for deployment. +This approach is conceptually related to goal-conditioned RL +(Schaul et al., +2015 +; Liu et al., +2025a +) +, where one can learn from negative examples through goal relabeling +(Andrychowicz et al., +2017 +) +. +Feedback-conditioned policies view feedback as a goal, whereas RLRF views feedback as a state that can be used to determine whether the goal +x +x +is achieved. +Unlike SDPO, these methods do not use feedback for credit assignment in negative trajectories, but rather as a data transformation for goal relabeling. +6.3 +Distillation +Distillation is frequently employed as an alternative to supervised fine-tuning (SFT) when a strong teacher model is available. +Distillation transfers capabilities by training a student to mimic the output distribution or intermediate representations of the teacher +(Hinton et al., +2015 +; Romero et al., +2015 +; Kim & Rush, +2016 +; Sanh et al., +2019 +; Xie et al., +2020 +) +. +While often performed on fixed off-policy datasets, to address the distribution shift between training and inference, recent works explore on-policy distillation, where the student learns from feedback on its own generations provided by an external teacher +(Agarwal et al., +2024 +; Gu et al., +2024 +; Yang et al., +2025a +; Lu & Thinking Machines Lab, +2025 +) +. 
+This mitigates the train-test mismatch, which relates closely to earlier work on online imitation learning +(Ross et al., +2011 +) +. +6.4 +Self-Distillation +The concept of self-distillation was first proposed by +Snell et al. ( +2022 +) +in a setting akin to supervised learning, introducing the idea of sampling from a model provided with extra context and training the same model to mimic these predictions without that context. +This mechanism has proven effective for compressing behavior +(Bai et al., +2022 +; Choi et al., +2022 +; Yang et al., +2024 +; +2025b +) +and factual information +(Eyuboglu et al., +2026 +; Kujanpää et al., +2025 +; Cao et al., +2025a +) +into model weights. +Beyond compressing a fixed context into model weights, recent works have used self-distillation to learn from environment feedback +(Scheurer et al., +2023 +; Dou et al., +2024 +; Zhou et al., +2025 +; Mitra & Ulukus, +2025 +; Song et al., +2026 +) +. +These approaches use an +off-policy +self-distillation objective, which we find to substantially underperform SDPO’s on-policy learning. +Off-policy self-distillation trains the student on generations from the teacher, whereas SDPO trains the student to avoid mistakes in its own generations. +In concurrent work, +Chen et al. ( +2025c +) +apply on-policy self-distillation to grid world settings where feedback is a scalar reward, and a reflection stage in the self-teacher diagnoses possible mistakes, showing improved credit assignment compared to learning value networks for advantage estimation. +Other concurrent work studies SDPO on a fixed dataset of expert demonstrations, without online environment interaction +(Shenfeld et al., +2026a +; Zhao et al., +2026 +) +. +7 +Conclusion, Limitations, and Future Work +We introduced +Reinforcement Learning with Rich Feedback +(RLRF), a paradigm where environments provide tokenized feedback beyond scalar rewards, and argued that this removes a key information bottleneck of RLVR. 
+We then proposed +Self-Distillation Policy Optimization +(SDPO), which uses the current policy as a feedback-conditioned +self-teacher +and distills its corrected log-probabilities into the student. +This leverages the model’s ability to learn from context for dense credit assignment. +We further demonstrated that SDPO can be implemented as a minimal, drop-in modification to standard RLVR pipelines. +Empirically, SDPO demonstrates superior sample efficiency and wall-clock convergence compared to GRPO on reasoning tasks, even when training in standard RLVR environments without rich feedback. +SDPO’s gains grow with model scale, suggesting that the capacity for self-correction scales with the model’s in-context learning capabilities. +Moreover, we show that performing SDPO at test time on individual hard binary-reward tasks accelerates the discovery of solutions compared to strong baselines. +SDPO enables learning from rich feedback in a way that is arguably closer to human cognition: utilizing precise outcomes rather than just binary rewards. +By allowing the model to determine retrospectively how it should have acted, we demonstrate that language models can convert diverse tokenized feedback into effective self-supervision. +Limitations. +Our findings show that SDPO’s performance depends on a model’s in-context learning ability, suggesting that SDPO is primarily applicable for RL-training stronger base models, while it can underperform GRPO on weaker models. +Moreover, performance depends on the quality of the environment feedback. If the environment provides uninformative or misleading feedback, a model may not be able to learn from it through SDPO. +Finally, SDPO adds a small computational overhead compared to GRPO for computing the log-probs of the retrospective model. +While often negligible, this may be a larger overhead for smaller models with shorter generation lengths, where generation time is comparatively small. +Future Work. 
+Our work highlights several exciting directions for future research: +• +Long-horizon and agentic settings. +RLRF is particularly appealing when trajectories are long or expose information about intermediate states. +Evaluating SDPO in agentic environments is a natural next step. +• +Training dynamics at scale. +Beyond our evaluation on LiveCodeBench, it would be particularly interesting to scale SDPO to large multi-task RL training runs and further study its scaling properties with frontier base models. +• +Beyond verifiable rewards. +While we focused on verifiable code generation, many tasks provide textual feedback without a ground-truth verifier. +Investigating whether SDPO’s retrospection mechanism can improve alignment in open-ended text generation or continuous-reward tasks remains an open empirical question. +• +Behavioral differences in reasoning. +We observed that SDPO induces qualitatively different reasoning patterns than GRPO, notably avoiding the latter’s tendency toward verbosity and superficial reasoning. +Future work should systematically study how individual aspects, such as the reprompt template, influence behavior. +Author Contributions +Jonas Hübotter +conceived of the project in summer 2025 and has been working on it full-time since then, leading the team. +Jonas proposed the conceptual framework of self-distillation for credit assignment with input from Lejs, implemented the algorithm with help from others, led the quantitative experiments on LCBv6, and led the writing of the paper. +Frederike Lübeck +led the design of the code environment, led the design and evaluation of the TTT setting in +Section +˜ +5 +with input from Jonas, contributed to the project direction in discussions, and contributed significantly to the writing of the paper. +Lejs Behric +noted the dense credit assignment of knowledge distillation with strong teacher models in discussions with Jonas, inspiring the idea of self-distillation. 
Further, Lejs led the evaluation of different teacher templates, co-led the development of a tool for qualitative analysis of runs with Marco and Daniel, helped implement parts of the algorithm, and contributed to the project direction in discussions. +Anton Baumann +joined in December 2025 and led the evaluation of SDPO without rich feedback in +Section +˜ +3 +with input from Jonas, and contributed to the writing of the paper. +Marco Bagatella and Daniel Marta +co-led the development of a tool for qualitative analysis of runs with Lejs, contributed to the training infrastructure, and contributed to the project direction in discussions. +Ido Hakimi +significantly contributed to the initial codebase and experimental setup, contributed early algorithmic ideas, and contributed to the project direction in discussions. +Idan Shenfeld, Thomas Kleine Buening, Carlos Guestrin, and Andreas Krause +supported this project, with Idan and Carlos joining in December 2025. They made significant contributions to the project direction in discussions and gave valuable advice on our presentation. Thomas and Idan, in particular, significantly contributed to the development of core algorithmic ideas and design of experiments. Thomas further evaluated checkpoints on holdout benchmarks. Carlos suggested the qualitative analysis of reasoning traces in +Figure +˜ +7 +and the presentation of TTT results in +Section +˜ +5 +. Andreas pointed out valuable connections to existing work in RL which shaped the direction of the project. +Acknowledgments +We would like to thank Akira Yoshiyama, Yassir Akram, Parnian Kassraie, Jonathan Thomm, Roman Vorushin, Afra Amini, Imanol Schlag, Yu Sun, and Moritz Hardt for helpful discussions. +We thank Eduard Durech for helpful conversations regarding the scaling of RL fine-tuning and for his technical guidance on distributed infrastructure and long-context optimization. 
+We are grateful to Ruixu Zhou from Tsinghua University & the Tencent Hunyuan Team for pointing out an error in the initially derived gradient estimator. +Furthermore, we would like to thank Leander Diaz-Bone for supporting dataset generation. +This project was supported through the Swiss AI compute grant a156 and, in part, compute grant infra01. +JH was supported by the Swiss National Science Foundation under NCCR Automation, grant agreement 51NF40 180545. +FL and MB were supported by the ETH-MPI Center for Learning Systems. +TKB and IH were supported by an ETH AI Center Postdoctoral Fellowship. +DM was supported by the Knut and Alice Wallenberg Foundation. +References +Agarwal et al. (2024) +Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos Garea, Matthieu Geist, and Olivier Bachem. +On-policy distillation of language models: Learning from self-generated mistakes. +In +ICLR +, 2024. +Akyürek et al. (2025) +Ekin Akyürek, Mehul Damani, Adam Zweiger, Linlu Qiu, Han Guo, Jyothish Pari, Yoon Kim, and Jacob Andreas. +The surprising effectiveness of test-time training for few-shot learning. +In +ICML +, 2025. +Amini et al. (2025) +Afra Amini, Tim Vieira, and Ryan Cotterell. +Better estimation of the kullback–leibler divergence between language models. +In +NeurIPS +, 2025. +Andrychowicz et al. (2017) +Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, and Wojciech Zaremba. +Hindsight experience replay. +In +NeurIPS +, 2017. +Anthony et al. (2017) +Thomas Anthony, Zheng Tian, and David Barber. +Thinking fast and slow with deep learning and tree search. +In +NeurIPS +, 2017. +Bai et al. (2022) +Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, et al. +Constitutional ai: Harmlessness from ai feedback. +arXiv preprint arXiv:2212.08073 +, 2022. +Behrouz et al. 
(2025) +Ali Behrouz, Peilin Zhong, and Vahab Mirrokni. +Titans: Learning to memorize at test time. +In +NeurIPS +, 2025. +Berner et al. (2019) +Christopher Berner, Greg Brockman, Brooke Chan, Vicki Cheung, Przemysław Debiak, Christy Dennison, David Farhi, Quirin Fischer, Shariq Hashme, Chris Hesse, et al. +Dota 2 with large scale deep reinforcement learning. +arXiv preprint arXiv:1912.06680 +, 2019. +Brown et al. (2020) +Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. +Language models are few-shot learners. +arXiv preprint ArXiv:2005.14165 +, 2020. +Cao et al. (2025a) +Bowen Cao, Deng Cai, and Wai Lam. +Infiniteicl: Breaking the limit of context window size via long short-term memory transformation. +In +ACL +, 2025a. +Cao et al. (2025b) +Meng Cao, Shuyuan Zhang, Xiao-Wen Chang, and Doina Precup. +Scar: Shapley credit assignment for more efficient rlhf. +arXiv preprint arXiv:2505.20417 +, 2025b. +Chan et al. (2024) +Alex J Chan, Hao Sun, Samuel Holt, and Mihaela Van Der Schaar. +Dense reward for free in reinforcement learning from human feedback. +In +ICML +, 2024. +Chen et al. (2025a) +Aili Chen, Aonian Li, Bangwei Gong, Binyang Jiang, Bo Fei, Bo Yang, Boji Shan, Changqing Yu, Chao Wang, Cheng Zhu, et al. +Minimax-m1: Scaling test-time compute efficiently with lightning attention. +arXiv preprint arXiv:2506.13585 +, 2025a. +Chen et al. (2022) +Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen. +Codet: Code generation with generated tests. +In +ICLR +, 2022. +Chen et al. (2025b) +Howard Chen, Noam Razin, Karthik Narasimhan, and Danqi Chen. +Retaining by doing: The role of on-policy data in mitigating forgetting. +arXiv preprint arXiv:2510.18874 +, 2025b. +Chen et al. (2021a) +Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Misha Laskin, Pieter Abbeel, Aravind Srinivas, and Igor Mordatch. 
+Decision transformer: Reinforcement learning via sequence modeling. +In +NeurIPS +, 2021a. +Chen et al. (2021b) +Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde De Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. +Evaluating large language models trained on code. +arXiv preprint arXiv:2107.03374 +, 2021b. +Chen et al. (2025c) +Wentse Chen, Jiayu Chen, Fahim Tajwar, Hao Zhu, Xintong Duan, Ruslan Salakhutdinov, and Jeff Schneider. +Retrospective in-context learning for temporal credit assignment with large language models. +In +NeurIPS +, 2025c. +Chiang et al. (2024) +Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Banghua Zhu, Hao Zhang, Michael Jordan, Joseph E Gonzalez, et al. +Chatbot arena: An open platform for evaluating llms by human preference. +In +ICML +, 2024. +Choi et al. (2022) +Eunbi Choi, Yongrae Jo, Joel Jang, and Minjoon Seo. +Prompt injection: Parameterization of fixed inputs. +arXiv preprint arXiv:2206.11349 +, 2022. +Cui et al. (2025) +Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, et al. +Process reinforcement through implicit rewards. +arXiv preprint arXiv:2502.01456 +, 2025. +Diaz-Bone et al. (2025) +Leander Diaz-Bone, Marco Bagatella, Jonas Hübotter, and Andreas Krause. +Discover: Automated curricula for sparse-reward reinforcement learning. +In +NeurIPS +, 2025. +Dolan & Moré (2002) +Elizabeth D Dolan and Jorge J Moré. +Benchmarking optimization software with performance profiles. +Mathematical programming +, 91(2), 2002. +Dou et al. (2024) +Zi-Yi Dou, Cheng-Fu Yang, Xueqing Wu, Kai-Wei Chang, and Nanyun Peng. +Re-rest: Reflection-reinforced self-training for language agents. +In +EMNLP +, 2024. +El-Kishky et al. 
(2025) +Ahmed El-Kishky, Alexander Wei, Andre Saraiva, Borys Minaiev, Daniel Selsam, David Dohan, Francis Song, Hunter Lightman, Ignasi Clavera, Jakub Pachocki, et al. +Competitive programming with large reasoning models. +arXiv preprint arXiv:2502.06807 +, 2025. +Eyuboglu et al. (2026) +Sabri Eyuboglu, Ryan Ehrlich, Simran Arora, Neel Guha, Dylan Zinsley, Emily Liu, Will Tennien, Atri Rudra, James Zou, Azalia Mirhoseini, et al. +Cartridges: Lightweight and general-purpose long context representations via self-study. +In +ICLR +, 2026. +Feng et al. (2024a) +Kehua Feng, Keyan Ding, Weijie Wang, Xiang Zhuang, Zeyuan Wang, Ming Qin, Yu Zhao, Jianhua Yao, Qiang Zhang, and Huajun Chen. +Sciknoweval: Evaluating multi-level scientific knowledge of large language models. +arXiv preprint arXiv:2406.09098 +, 2024a. +Feng et al. (2024b) +Xidong Feng, Bo Liu, Yan Song, Haotian Fu, Ziyu Wan, Girish A Koushik, Zhiyuan Hu, Mengyue Yang, Ying Wen, and Jun Wang. +Natural language reinforcement learning. +arXiv preprint arXiv:2411.14251 +, 2024b. +Gehring et al. (2025) +Jonas Gehring, Kunhao Zheng, Jade Copet, Vegard Mella, Quentin Carbonneaux, Taco Cohen, and Gabriel Synnaeve. +Rlef: Grounding code llms in execution feedback with reinforcement learning. +In +ICML +, 2025. +Goyal et al. (2019) +Prasoon Goyal, Scott Niekum, and Raymond J Mooney. +Using natural language for reward shaping in reinforcement learning. +In +IJCAI +, 2019. +Grill et al. (2020) +Jean-Bastien Grill, Florian Strub, Florent Altché, Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al. +Bootstrap your own latent-a new approach to self-supervised learning. +In +NeurIPS +, 2020. +Gu et al. (2024) +Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. +Minillm: Knowledge distillation of large language models. +2024. +Guha et al. 
(2026) +Etash Guha, Ryan Marten, Sedrick Keh, Negin Raoof, Georgios Smyrnis, Hritik Bansal, Marianna Nezhurina, Jean Mercat, Trung Vu, Zayne Sprague, et al. +Openthoughts: Data recipes for reasoning models. +In +ICLR +, 2026. +Guo et al. (2025) +Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. +Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. +arXiv preprint arXiv:2501.12948 +, 2025. +Haarnoja et al. (2018) +Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine. +Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor. +In +ICML +, 2018. +Hardt & Sun (2024) +Moritz Hardt and Yu Sun. +Test-time training on nearest neighbors for large language models. +In +ICLR +, 2024. +Hatamizadeh et al. (2026) +Ali Hatamizadeh, Syeda Nahida Akter, Shrimai Prabhumoye, Jan Kautz, Mostofa Patwary, Mohammad Shoeybi, Bryan Catanzaro, and Yejin Choi. +Rlp: Reinforcement as a pretraining objective. +In +ICLR +, 2026. +Hinton et al. (2015) +Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. +Distilling the knowledge in a neural network. +arXiv preprint arXiv:1503.02531 +, 2015. +Huang et al. (2026) +Chengsong Huang, Wenhao Yu, Xiaoyang Wang, Hongming Zhang, Zongxia Li, Ruosen Li, Jiaxin Huang, Haitao Mi, and Dong Yu. +R-zero: Self-evolving reasoning llm from zero data. +In +ICLR +, 2026. +Hübotter et al. (2026) +Jonas Hübotter, Patrik Wolf, Alexander Shevchenko, Dennis Jüni, Andreas Krause, and Gil Kur. +Specialization after generalization: Towards understanding test-time training in foundation models. +In +ICLR +, 2026. +Hübotter et al. (2025a) +Jonas Hübotter, Sascha Bongni, Ido Hakimi, and Andreas Krause. +Efficiently learning at test-time: Active fine-tuning of llms. +In +ICLR +, 2025a. +Hübotter et al. (2025b) +Jonas Hübotter, Leander Diaz-Bone, Ido Hakimi, Andreas Krause, and Moritz Hardt. 
+Learning on the job: Test-time curricula for targeted reinforcement learning. +arXiv preprint arXiv:2510.04786 +, 2025b. +Jaech et al. (2024) +Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, et al. +Openai o1 system card. +arXiv preprint arXiv:2412.16720 +, 2024. +Jain et al. (2025) +Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica. +Livecodebench: Holistic and contamination free evaluation of large language models for code. +In +ICLR +, 2025. +Kaelbling et al. (1998) +Leslie Pack Kaelbling, Michael L Littman, and Anthony R Cassandra. +Planning and acting in partially observable stochastic domains. +Artificial intelligence +, 101(1-2), 1998. +Kazemnejad et al. (2025) +Amirhossein Kazemnejad, Milad Aghajohari, Eva Portelance, Alessandro Sordoni, Siva Reddy, Aaron Courville, and Nicolas Le Roux. +Vineppo: Refining credit assignment in rl training of llms. +In +ICML +, 2025. +Khatri et al. (2026) +Devvrit Khatri, Lovish Madaan, Rishabh Tiwari, Rachit Bansal, Sai Surya Duvvuri, Manzil Zaheer, Inderjit S Dhillon, David Brandfonbrener, and Rishabh Agarwal. +The art of scaling reinforcement learning compute for llms. +In +ICLR +, 2026. +Kim & Rush (2016) +Yoon Kim and Alexander M Rush. +Sequence-level knowledge distillation. +In +EMNLP +, 2016. +Kimi et al. (2025) +Kimi, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al. +Kimi k1.5: Scaling reinforcement learning with llms. +arXiv preprint arXiv:2501.12599 +, 2025. +Kujanpää et al. (2025) +Kalle Kujanpää, Pekka Marttinen, Harri Valpola, and Alexander Ilin. +Efficient knowledge injection in LLMs via self-distillation. +TMLR +, 2025. +Kwon et al. (2023) +Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. Gonzalez, Hao Zhang, and Ion Stoica. 
+Efficient memory management for large language model serving with pagedattention. +In +PSIGOPS +, 2023. +Lambert et al. (2025) +Nathan Lambert, Jacob Morrison, Valentina Pyatkin, Shengyi Huang, Hamish Ivison, Faeze Brahman, Lester James V Miranda, Alisa Liu, Nouha Dziri, Shane Lyu, et al. +Tulu 3: Pushing frontiers in open language model post-training. +In +COLM +, 2025. +Le et al. (2022) +Hung Le, Yue Wang, Akhilesh Deepak Gotmare, Silvio Savarese, and Steven Chu Hong Hoi. +Coderl: Mastering code generation through pretrained models and deep reinforcement learning. +In +NeurIPS +, 2022. +Lee et al. (2024) +Kyungjae Lee, Dasol Hwang, Sunghyun Park, Youngsoo Jang, and Moontae Lee. +Reinforcement learning from reflective feedback (rlrf): Aligning and improving llms via fine-grained self-reflection. +arXiv preprint arXiv:2403.14238 +, 2024. +Lee et al. (2025) +Yoonho Lee, Joseph Boen, and Chelsea Finn. +Feedback descent: Open-ended text optimization via pairwise comparison. +arXiv preprint arXiv:2511.07919 +, 2025. +Levine (2018) +Sergey Levine. +Reinforcement learning and control as probabilistic inference: Tutorial and review. +arXiv preprint arXiv:1805.00909 +, 2018. +Li et al. (2025a) +Tianle Li, Wei-Lin Chiang, Evan Frick, Lisa Dunlap, Tianhao Wu, Banghua Zhu, Joseph E Gonzalez, and Ion Stoica. +From crowdsourced data to high-quality benchmarks: Arena-hard and benchbuilder pipeline. +In +ICML +, 2025a. +Li et al. (2025b) +Yi-Chen Li, Tian Xu, Yang Yu, Xuqin Zhang, Xiong-Hui Chen, Zhongxiang Ling, Ningjing Chao, Lei Yuan, and Zhi-Hua Zhou. +Generalist reward models: Found inside large language models. +arXiv preprint arXiv:2506.23235 +, 2025b. +Lightman et al. (2023) +Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. +Let’s verify step by step. +In +ICLR +, 2023. +Liu et al. (2025a) +Grace Liu, Michael Tang, and Benjamin Eysenbach. 
+A single goal is all you need: Skills and exploration emerge from contrastive rl without rewards, demonstrations, or subgoals. +In +ICLR +, 2025a. +Liu et al. (2023) +Hao Liu, Carmelo Sferrazza, and Pieter Abbeel. +Chain of hindsight aligns language models with feedback. +arXiv preprint arXiv:2302.02676 +, 2023. +Liu et al. (2025b) +Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin. +Understanding r1-zero-like training: A critical perspective. +In +COLM +, 2025b. +Lu & Thinking Machines Lab (2025) +Kevin Lu and Thinking Machines Lab. +On-policy distillation. +Thinking Machines Lab: Connectionism +, 2025. +URL +https://thinkingmachines.ai/blog/on-policy-distillation +. +Luo et al. (2025) +Renjie Luo, Zichen Liu, Xiangyan Liu, Chao Du, Min Lin, Wenhu Chen, Wei Lu, and Tianyu Pang. +Language models can learn from verbal feedback without scalar rewards. +arXiv preprint arXiv:2509.22638 +, 2025. +Madaan et al. (2023) +Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al. +Self-refine: Iterative refinement with self-feedback. +In +NeurIPS +, 2023. +Mitra & Ulukus (2025) +Purbesh Mitra and Sennur Ulukus. +Semantic soft bootstrapping: Long context reasoning in llms without reinforcement learning. +arXiv preprint arXiv:2512.05105 +, 2025. +Mnih et al. (2015) +Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Andrei A. Rusu, Joel Veness, Marc G. Bellemare, Alex Graves, Martin Riedmiller, Andreas K. Fidjeland, Georg Ostrovski, et al. +Human-level control through deep reinforcement learning. +Nature +, 518(7540), 2015. +Muennighoff et al. (2025) +Niklas Muennighoff, Zitong Yang, Weijia Shi, Xiang Lisa Li, Li Fei-Fei, Hannaneh Hajishirzi, Luke Zettlemoyer, Percy Liang, Emmanuel Candès, and Tatsunori B Hashimoto. +s1: Simple test-time scaling. +In +EMNLP +, 2025. +Ng et al. (2000) +Andrew Y Ng, Stuart Russell, et al. 
+Algorithms for inverse reinforcement learning. +In +ICML +, 2000. +Novikov et al. (2025) +Alexander Novikov, Ngân Vũ, Marvin Eisenberger, Emilien Dupont, Po-Sen Huang, Adam Zsolt Wagner, Sergey Shirobokov, Borislav Kozlovskii, Francisco JR Ruiz, Abbas Mehrabian, et al. +Alphaevolve: A coding agent for scientific and algorithmic discovery. +arXiv preprint arXiv:2506.13131 +, 2025. +Olmo et al. (2025) +Team Olmo, Allyson Ettinger, Amanda Bertsch, Bailey Kuehl, David Graham, David Heineman, Dirk Groeneveld, Faeze Brahman, Finbarr Timbers, Hamish Ivison, et al. +Olmo 3. +arXiv preprint arXiv:2512.13961 +, 2025. +Peng et al. (2019) +Xue Bin Peng, Aviral Kumar, Grace Zhang, and Sergey Levine. +Advantage-weighted regression: Simple and scalable off-policy reinforcement learning. +arXiv preprint arXiv:1910.00177 +, 2019. +Qwen et al. (2024) +Qwen, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, et al. +Qwen2.5 technical report. +arXiv preprint arXiv:2412.15115 +, 2024. +Rafailov et al. (2023) +Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. +Direct preference optimization: Your language model is secretly a reward model. +In +NeurIPS +, 2023. +Romero et al. (2015) +Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. +Fitnets: Hints for thin deep nets. +In +ICLR +, 2015. +Ross et al. (2011) +Stéphane Ross, Geoffrey Gordon, and Drew Bagnell. +A reduction of imitation learning and structured prediction to no-regret online learning. +In +AISTATS +, 2011. +Samadi et al. (2025) +Mehrzad Samadi, Aleksander Ficek, Sean Narenthiran, Siddhartha Jain, Wasi Uddin Ahmad, Somshubra Majumdar, Vahid Noroozi, and Boris Ginsburg. +Scaling test-time compute to achieve ioi gold medal with open-weight models. +arXiv preprint arXiv:2510.14232 +, 2025. +Sanh et al. 
(2019) +Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. +Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. +arXiv preprint arXiv:1910.01108 +, 2019. +Schaul et al. (2015) +Tom Schaul, Daniel Horgan, Karol Gregor, and David Silver. +Universal value function approximators. +In +ICML +, 2015. +Scheurer et al. (2023) +Jérémy Scheurer, Jon Ander Campos, Tomasz Korbak, Jun Shern Chan, Angelica Chen, Kyunghyun Cho, and Ethan Perez. +Training language models with language feedback at scale. +arXiv preprint arXiv:2303.16755 +, 2023. +Schulman et al. (2015) +John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp Moritz. +Trust region policy optimization. +In +ICML +, 2015. +Schulman et al. (2016) +John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel. +High-dimensional continuous control using generalized advantage estimation. +In +ICLR +, 2016. +Schulman et al. (2017) +John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. +Proximal policy optimization algorithms. +arXiv preprint arXiv:1707.06347 +, 2017. +Setlur et al. (2025) +Amrith Setlur, Chirag Nagpal, Adam Fisch, Xinyang Geng, Jacob Eisenstein, Rishabh Agarwal, Alekh Agarwal, Jonathan Berant, and Aviral Kumar. +Rewarding progress: Scaling automated process verifiers for llm reasoning. +In +ICLR +, 2025. +Shao et al. (2024) +Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Yang Wu, et al. +Deepseekmath: Pushing the limits of mathematical reasoning in open language models. +arXiv preprint arXiv:2402.03300 +, 2024. +Shenfeld et al. (2026a) +Idan Shenfeld, Mehul Damani, Jonas Hübotter, and Pulkit Agrawal. +Self-distillation enables continual learning. +arXiv preprint arXiv:2601.19897 +, 2026a. +Shenfeld et al. (2026b) +Idan Shenfeld, Jyothish Pari, and Pulkit Agrawal. +Rl’s razor: Why online reinforcement learning forgets less. +In +ICLR +, 2026b. 
+Sheng et al. (2025) +Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu. +Hybridflow: A flexible and efficient rlhf framework. +In +EuroSys +, 2025. +Shinn et al. (2023) +Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao. +Reflexion: Language agents with verbal reinforcement learning. +In +NeurIPS +, 2023. +Silver et al. (2016) +David Silver, Aja Huang, Chris J. Maddison, Arthur Guez, Laurent Sifre, George van den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al. +Mastering the game of go with deep neural networks and tree search. +Nature +, 529(7587), 2016. +Silver et al. (2017) +David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore Graepel, et al. +Mastering chess and shogi by self-play with a general reinforcement learning algorithm. +arXiv preprint arXiv:1712.01815 +, 2017. +Snell et al. (2022) +Charlie Snell, Dan Klein, and Ruiqi Zhong. +Learning by distilling context. +arXiv preprint arXiv:2209.15189 +, 2022. +Song et al. (2026) +Yuda Song, Lili Chen, Fahim Tajwar, Remi Munos, Deepak Pathak, J Andrew Bagnell, Aarti Singh, and Andrea Zanette. +Expanding the capabilities of reinforcement learning via text feedback. +arXiv preprint arXiv:2602.02482 +, 2026. +Stephan et al. (2024) +Moritz Stephan, Alexander Khazatsky, Eric Mitchell, Annie S Chen, Sheryl Hsu, Archit Sharma, and Chelsea Finn. +Rlvf: Learning from verbal feedback without overgeneralization. +In +ICML +, 2024. +Sun et al. (2020) +Yu Sun, Xiaolong Wang, Zhuang Liu, John Miller, Alexei Efros, and Moritz Hardt. +Test-time training with self-supervision for generalization under distribution shifts. +In +ICML +, 2020. +Sun et al. (2025) +Yu Sun, Xinhao Li, Karan Dalal, Jiarui Xu, Arjun Vikram, Genghan Zhang, Yann Dubois, Xinlei Chen, Xiaolong Wang, Sanmi Koyejo, et al. 
+Learning to (learn at test time): Rnns with expressive hidden states. +In +ICML +, 2025. +Sutton & Barto (1998) +Richard S Sutton and Andrew G Barto. +Reinforcement learning: An introduction +. +MIT press, 1998. +Tandon et al. (2025) +Arnuv Tandon, Karan Dalal, Xinhao Li, Daniel Koceja, Marcel Rød, Sam Buchanan, Xiaolong Wang, Jure Leskovec, Sanmi Koyejo, Tatsunori Hashimoto, et al. +End-to-end test-time training for long context. +arXiv preprint arXiv:2512.23675 +, 2025. +Tang et al. (2023) +Qiaoyu Tang, Ziliang Deng, Hongyu Lin, Xianpei Han, Qiao Liang, Boxi Cao, and Le Sun. +Toolalpaca: Generalized tool learning for language models with 3000 simulated cases. +arXiv preprint arXiv:2306.05301 +, 2023. +Urcelay et al. (2026) +Belen Martin Urcelay, Andreas Krause, and Giorgia Ramponi. +From words to rewards: Leveraging natural language for reinforcement learning. +In +TMLR +, 2026. +Vaswani et al. (2017) +Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. +Attention is all you need. +In +NeurIPS +, 2017. +Wang et al. (2026) +Hanyang Wang, Lu Wang, Chaoyun Zhang, Tianjun Mao, Si Qin, Qingwei Lin, Saravan Rajmohan, and Dongmei Zhang. +Text2grad: Reinforcement learning from natural language feedback. +In +ICLR +, 2026. +Wang et al. (2024a) +Peiyi Wang, Lei Li, Zhihong Shao, RX Xu, Damai Dai, Yifei Li, Deli Chen, Yu Wu, and Zhifang Sui. +Math-shepherd: Verify and reinforce llms step-by-step without human annotations. +In +ACL +, 2024a. +Wang et al. (2025) +Shenzhi Wang, Le Yu, Chang Gao, Chujie Zheng, Shixuan Liu, Rui Lu, Kai Dang, Xionghui Chen, Jianxin Yang, Zhenru Zhang, et al. +Beyond the 80/20 rule: High-entropy minority tokens drive effective reinforcement learning for llm reasoning. +In +NeurIPS +, 2025. +Wang et al. (2024b) +Yubo Wang, Xueguang Ma, Ge Zhang, Yuansheng Ni, Abhranil Chandra, Shiguang Guo, Weiming Ren, Aaran Arulraj, Xuan He, Ziyan Jiang, et al. 
+Mmlu-pro: A more robust and challenging multi-task language understanding benchmark. +In +NeurIPS +, 2024b. +Wei et al. (2022) +Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. +Chain-of-thought prompting elicits reasoning in large language models. +In +NeurIPS +, 2022. +Williams (1992) +Ronald J Williams. +Simple statistical gradient-following algorithms for connectionist reinforcement learning. +Machine learning +, 8(3), 1992. +Xie et al. (2020) +Qizhe Xie, Minh-Thang Luong, Eduard Hovy, and Quoc V Le. +Self-training with noisy student improves imagenet classification. +In +CVPR +, 2020. +Xie et al. (2024) +Tianbao Xie, Siheng Zhao, Chen Henry Wu, Yitao Liu, Qian Luo, Victor Zhong, Yanchao Yang, and Tao Yu. +Text2reward: Reward shaping with language models for reinforcement learning. +In +ICLR +, 2024. +Yang et al. (2025a) +An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. +Qwen3 technical report. +arXiv preprint arXiv:2505.09388 +, 2025a. +Yang et al. (2025b) +Wenkai Yang, Yankai Lin, Jie Zhou, and Ji-Rong Wen. +Distilling rule-based knowledge into large language models. +In +COLING +, 2025b. +Yang et al. (2024) +Zhaorui Yang, Tianyu Pang, Haozhe Feng, Han Wang, Wei Chen, Minfeng Zhu, and Qian Liu. +Self-distillation bridges distribution gap in language model fine-tuning. +In +ACL +, 2024. +Yao et al. (2025) +Feng Yao, Liyuan Liu, Dinghuai Zhang, Chengyu Dong, Jingbo Shang, and Jianfeng Gao. +Your efficient rl framework secretly brings you off-policy rl training, 2025. +URL +https://fengyao.notion.site/off-policy-rl +. +Yao et al. (2024) +Weiran Yao, Shelby Heinecke, Juan Carlos Niebles, Zhiwei Liu, Yihao Feng, Le Xue, Rithesh Murthy, Zeyuan Chen, Jianguo Zhang, Devansh Arpit, et al. +Retroformer: Retrospective large language agents with policy gradient optimization. +In +ICLR +, 2024. +Yu et al. 
(2025) +Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Weinan Dai, Tiantian Fan, Gaohong Liu, Lingjun Liu, et al. +Dapo: An open-source llm reinforcement learning system at scale. +In +NeurIPS +, 2025. +Yue et al. (2025) +Yang Yue, Zhiqi Chen, Rui Lu, Andrew Zhao, Zhaokai Wang, Shiji Song, and Gao Huang. +Does reinforcement learning really incentivize reasoning capacity in llms beyond the base model? +In +NeurIPS +, 2025. +Yuksekgonul et al. (2025) +Mert Yuksekgonul, Federico Bianchi, Joseph Boen, Sheng Liu, Pan Lu, Zhi Huang, Carlos Guestrin, and James Zou. +Optimizing generative ai by backpropagating language model feedback. +Nature +, 639:609–616, 2025. +Yuksekgonul et al. (2026) +Mert Yuksekgonul, Daniel Koceja, Xinhao Li, Federico Bianchi, Jed McCaleb, Xiaolong Wang, Jan Kautz, Yejin Choi, James Zou, Carlos Guestrin, et al. +Learning to discover at test time. +arXiv preprint arXiv:2601.16175 +, 2026. +Zelikman et al. (2022) +Eric Zelikman, Yuhuai Wu, Jesse Mu, and Noah D Goodman. +Star: Bootstrapping reasoning with reasoning. +In +NeurIPS +, 2022. +Zhang et al. (2025) +Kai Zhang, Xiangchao Chen, Bo Liu, Tianci Xue, Zeyi Liao, Zhihan Liu, Xiyao Wang, Yuting Ning, Zhaorun Chen, Xiaohan Fu, et al. +Agent learning via early experience. +arXiv preprint arXiv:2510.08558 +, 2025. +Zhang et al. (2023) +Tianjun Zhang, Fangchen Liu, Justin Wong, Pieter Abbeel, and Joseph E Gonzalez. +The wisdom of hindsight makes language models better instruction followers. +In +ICML +, 2023. +Zhao et al. (2025) +Andrew Zhao, Yiran Wu, Yang Yue, Tong Wu, Quentin Xu, Matthieu Lin, Shenzhi Wang, Qingyun Wu, Zilong Zheng, and Gao Huang. +Absolute zero: Reinforced self-play reasoning with zero data. +In +NeurIPS +, 2025. +Zhao et al. (2026) +Siyan Zhao, Zhihui Xie, Mengchen Liu, Jing Huang, Guan Pang, Feiyu Chen, and Aditya Grover. +Self-distilled reasoner: On-policy self-distillation for large language models. +arXiv preprint arXiv:2601.18734 +, 2026. +Zheng et al. 
(2025a) +Chujie Zheng, Shixuan Liu, Mingze Li, Xiong-Hui Chen, Bowen Yu, Chang Gao, Kai Dang, Yuqiong Liu, Rui Men, An Yang, et al. +Group sequence policy optimization. +arXiv preprint arXiv:2507.18071 +, 2025a. +Zheng et al. (2025b) +Tianyu Zheng, Tianshun Xing, Qingshui Gu, Taoran Liang, Xingwei Qu, Xin Zhou, Yizhi Li, Zhoufutu Wen, Chenghua Lin, Wenhao Huang, et al. +First return, entropy-eliciting explore. +arXiv preprint arXiv:2507.07017 +, 2025b. +Zhou et al. (2023) +Jeffrey Zhou, Tianjian Lu, Swaroop Mishra, Siddhartha Brahma, Sujoy Basu, Yi Luan, Denny Zhou, and Le Hou. +Instruction-following evaluation for large language models. +arXiv preprint arXiv:2311.07911 +, 2023. +Zhou et al. (2025) +Ruiyang Zhou, Shuozhe Li, Amy Zhang, and Liu Leqi. +Expo: Unlocking hard reasoning with self-explanation-guided reinforcement learning. +In +NeurIPS +, 2025. +Zhou et al. (2026) +Xiangxin Zhou, Zichen Liu, Anya Sims, Haonan Wang, Tianyu Pang, Chongxuan Li, Liang Wang, Min Lin, and Chao Du. +Reinforcing general reasoning without verifiers. +In +ICLR +, 2026. +Ziebart et al. (2008) +Brian D Ziebart, Andrew L Maas, J Andrew Bagnell, Anind K Dey, et al. +Maximum entropy inverse reinforcement learning. +In +AAAI +, 2008. +Zuo et al. (2025) +Yuxin Zuo, Kaiyan Zhang, Shang Qu, Li Sheng, Xuekai Zhu, Biqing Qi, Youbang Sun, Ganqu Cui, Ning Ding, and Bowen Zhou. +Ttrl: Test-time reinforcement learning. +In +NeurIPS +, 2025. 
+Contents +section.1table.caption.4section.2subsection.2.1subsection.2.2subsection.2.3section.3subsection.3.1subsection.3.1subsection.3.2subsection.3.3section.4section.4subsection.4.1subsection.4.2subsection.4.3subsection.4.4subsection.4.4subsection.4.5subsection.4.6subsection.4.6subsection.4.6subsection.4.6section.5subsection.5.1subsection.5.2figure.caption.20figure.caption.20section.6subsection.6.1subsection.6.2subsection.6.3subsection.6.4section.7section.7section.7appendix.Asubsection.A.1subsection.A.1equation.6subsection.A.2subsection.A.3subsection.A.4appendix.Bsubsection.B.1subsection.B.2appendix.Cappendix.Cappendix.Cappendix.Cappendix.Csubsection.C.1subsection.C.1equation.19appendix.Dsubsection.D.1subsection.D.2subsubsection.D.2.1subsubsection.D.2.2subsubsection.D.2.3subsection.D.3appendix.Esubsection.E.1subsection.E.2subsubsection.E.2.1subsection.E.3appendix.Fsubsection.F.1subsection.F.2subsection.F.3subsection.F.4 +Appendix A +Implementation of SDPO +The following pseudocode in Figure 14 outlines the implementation of SDPO: +⬇ +def compute_sdpo_loss(batch, teacher_context, loss_mask): +    """Computes probabilities of response y under the self-teacher and the per-logit SDPO loss.""" +    # Compute model probabilities for response y +    logprobs_student = compute_log_prob(batch)  # (T, V) +    probs_student = logprobs_student.exp()  # (T, V) + +    # Compute self-teacher probabilities for response y +    teacher_batch = reprompt(batch, teacher_context) +    logprobs_teacher = compute_log_prob(teacher_batch).detach()  # (T, V) + +    # Compute SDPO loss: per-token divergence +    per_token_loss = divergence(logprobs_student, logprobs_teacher)  # (T,) +    return agg_loss(per_token_loss, loss_mask, loss_agg_mode="token-mean") +Figure 14: The pseudo-code of SDPO within a standard RL training pipeline.
Omitted here is the filtering to top-$K$ logprobs for student and teacher (including a tail term) as described in Section A.3. Further, we omit here any importance sampling weights to correct for off-policy data. reprompt modifies the batch to incorporate teacher context (i.e., rich feedback). divergence implements any per-token divergence such as reverse-KL, forward-KL, or Jensen-Shannon. +In the following, we provide further details on: +• The gradient estimator used in our implementation (Section A.1) +• Teacher regularization (Section A.2) +• Approximating logit-distillation with the top-$K$ logits for saving GPU memory (Section A.3) +• Generalizing PPO-style policy gradient algorithms to logit-level advantages (Section A.4) +To disambiguate the notation of the self-teacher, we use $q_{\theta}(\cdot\mid x,f):=\pi_{\theta}(\cdot\mid\mathrm{reprompt}(x,f))$ in the following. Here, reprompt denotes the reprompt template of the self-teacher. +A.1 +Gradient Estimators +In this section, we discuss two possible gradient estimators for the KL divergence between the current policy $\pi_{\theta}(y\mid x)$ and the teacher policy $q_{\theta}(y\mid x,f)$. +Per-token estimator. +Deriving the gradient of the SDPO loss as defined in Equation 1: +$$\mathcal{L}_{\mathrm{token}}(\theta):=\mathbb{E}_{y\sim\mathrm{stopgrad}(\pi_{\theta}(\cdot\mid x))}\left[\sum_{t=1}^{T}\mathrm{KL}\left(\pi_{\theta}(\cdot\mid x,y_{< t})\,\|\,\mathrm{stopgrad}(\pi_{\theta}(\cdot\mid x,f,y_{< t}))\right)\right]$$ +[… intervening text lost in extraction …] $y_{>t}$ (due to the additional dependence on $\Pi_{\theta}$). +Amini et al.
(2025) show that the corresponding gradient estimator is given by +$$\boldsymbol{\nabla}\mathcal{L}_{\text{seq}}(\theta)=\boldsymbol{\nabla}\mathcal{L}_{\text{token}}(\theta)+\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t=1}^{T}\mathrm{KL}\left(\pi_{\theta}(\cdot\mid s_{t})\,\|\,q_{\theta}(\cdot\mid s_{t},f)\right)\boldsymbol{\nabla}_{\!\!\theta}\,\log\Pi_{\theta}(s_{t})\right].\tag{8}$$ +The additional term of the sequence-level gradient captures how prefixes influence the self-distillation divergence of future tokens. +We also experimented with this sequence-level gradient estimator but did not find measurable gains relative to its additional complexity. +A.2 +Regularized teacher +In contrast to standard distillation, the teacher in SDPO changes throughout training. This bootstrapping enables the teacher to improve, but it may also lead to training instability. +To stabilize training, we seek to prevent the teacher $q$ from quickly diverging from the initial teacher $q_{\theta_{\mathrm{ref}}}$. +We can achieve this by placing an explicit trust-region constraint on $q$ (Schulman et al., 2015; Peng et al., 2019), that is: +$$\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{< t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{< t})\right)\leq\epsilon,\qquad\epsilon>0.\tag{9}$$ +This trust-region can be implemented in two ways: +1. +Explicit trust-region: +We can define the teacher as the policy closest to $q_{\theta}$ while satisfying the trust-region constraint.
+This teacher can be expressed as +q +​ +( +y +t +∣ +x +, +f +, +y +< +t +) +∝ +exp +⁡ +( +( +1 +− +α +) +​ +log +⁡ +q +θ +ref +​ +( +y +t +∣ +x +, +f +, +y +< +t +) ++ +α +​ +log +⁡ +q +θ +​ +( +y +t +∣ +x +, +f +, +y +< +t +) +) +, +q(y_{t}\mid x,f,y_{ +0 +. +\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{0. +(16) +In the following, we derive a teacher +q +q +which satisfies the trust-region constraint while staying close to the target +q +θ +q_{\theta} +. +The following optimization problem characterizes such a +q +q +(Peng et al., +2019 +) +: +arg +​ +max +q +∈ +Δ +∑ +t +∑ +y +t +q +​ +( +y +t +∣ +x +, +f +, +y +< +t +) +​ +log +⁡ +q +θ +​ +( +y +t +∣ +x +, +f +, +y +< +t +) +q +θ +ref +​ +( +y +t +∣ +x +, +f +, +y +< +t +) +s.t. +∑ +t +KL +( +q +( +y +t +∣ +x +, +f +, +y +< +t +) +∥ +q +θ +ref +( +y +t +∣ +x +, +f +, +y +< +t +) +) +≤ +ϵ +, +\displaystyle\begin{split}\operatorname*{arg\,max}_{q\in\Delta}\ &\sum_{t}\sum_{y_{t}}q(y_{t}\mid x,f,y_{ +0 +\operatorname*{arg\,max}_{\theta}\ \mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}{}\left[\sum_{t}r(y_{t}\mid x,y_{0 +(19) +where +π +θ +​ +( +y +∣ +x +) += +∏ +t += +1 +T +π +θ +​ +( +y +t +∣ +x +, +y +< +t +) +\smash{\pi_{\theta}(y\mid x)=\prod_{t=1}^{T}\pi_{\theta}(y_{t}\mid x,y_{ +... + +< +answer +> +... + +For +the +answer +, +only +output +the +letter +corresponding +to +the +correct +option +( +A +, +B +, +C +, +or +D +), +and +nothing +else +. +Do +not +restate +the +answer +text +. +For +example +, +if +the +answer +is +" +A +", +just +output +: +< +answer +> +A + +Listing 1: +System prompt: Multiple Choice Questions +⬇ +{ +question +} +Please +reason +step +by +step +. +Listing 2: +User prompt: Multiple Choice Questions +⬇ +Your +task +is +to +answer +the +user +’ +s +question +using +available +tools +. 
+You +have +access +to +the +following +tools +: +Name +: +Axolotl +Description +: +Collection +of +axolotl +pictures +and +facts +Documentation +: +getRandomAxolotlImage +: +Retrieve +a +random +axolotl +image +with +information +on +the +image +source +. +Parameters +: +{} +Output +: +Successful +response +. +- +Format +: +application +/ +json +- +Structure +: +Object +{ +url +, +source +, +description +} +searchAxolotlImages +: +Search +for +axolotl +images +based +on +specific +criteria +such +as +color +, +gender +, +and +size +. +Parameters +: +{" +color +": +" +string +. +One +of +: +[ +wild +, +leucistic +, +albino +]. +The +color +of +the +axolotl +( +e +. +g +., +’ +wild +’, +’ +leucistic +’, +’ +albino +’, +etc +.).", +" +gender +": +" +string +. +One +of +: +[ +male +, +female +]. +The +gender +of +the +axolotl +(’ +male +’, +’ +female +’).", +" +size +": +" +string +. +One +of +: +[ +small +, +medium +, +large +]. +The +size +of +the +axolotl +(’ +small +’, +’ +medium +’, +’ +large +’).", +" +page +": +" +integer +. +The +page +number +for +pagination +purposes +."} +Output +: +Successful +response +. +- +Format +: +application +/ +json +- +Structure +: +Object +{ +results +: +Array +[ +Object +{ +url +, +source +, +description +}], +pagination +: +Object +{ +current_page +, +total_pages +, +total_results +}} +getAxolotlFacts +: +Retrieve +interesting +facts +about +axolotls +such +as +their +habits +, +habitats +, +and +physical +characteristics +. +Parameters +: +{" +category +": +" +string +. +One +of +: +[ +habits +, +habitat +, +physical +characteristics +]. +The +category +of +facts +to +retrieve +( +e +. +g +., +’ +habits +’, +’ +habitat +’, +’ +physical +characteristics +’).", +" +limit +": +" +integer +. +The +maximum +number +of +facts +to +return +."} +Output +: +Successful +response +. 
+- +Format +: +application +/ +json +- +Structure +: +Array +[ +Object +{ +fact +, +source +}] +Use +the +following +format +: +Thought +: +you +should +always +think +about +what +to +do +Action +: +the +action +to +take +, +should +be +one +of +the +tool +names +. +Action +Input +: +the +input +to +the +action +, +must +be +in +JSON +format +. +All +of +the +action +input +must +be +realistic +and +from +the +user +. +Begin +! +Question +: +Hey +, +can +you +show +me +a +random +picture +of +an +axolotl +? +Listing 3: +Example user prompt: Tool use +Appendix F +Qualitative Examples +F.1 +Visualization of Advantages +Figure +˜ +21 +compares the advantages of SDPO and GRPO in a representative example. +Figure 21: +Visualization of advantages in SDPO and GRPO with Olmo3-7B-Instruct in a batch from the Chemistry task of +Section +˜ +3 +. Each row corresponds to the beginning of a response. The color indicates the advantage value at that token position, with positive advantages shown in blue and negative advantages shown in red. +F.2 +Examples +Below, we show an example from training SDPO on LCBv6 using Qwen3-8B. +⬇ +[Prompt] +You +are +a +coding +expert. +You +will +be +given +a +coding +problem, +and +you +need +to +write +a +correct +Python +program +that +matches +the +specification +and +passes +all +tests. +The +time +limit +is +1 +second. +You +may +start +by +outlining +your +thought +process. +In +the +end, +please +provide +the +complete +code +in +a +code +block +enclosed +with +‘‘‘ +‘‘‘. +You +are +given +a +binary +string +s +of +length +n, +where: +’1’ +represents +an +active +section. +’0’ +represents +an +inactive +section. +You +can +perform +at +most +one +trade +to +maximize +the +number +of +active +sections +in +s. +In +a +trade, +you: +Convert +a +contiguous +block +of +’1’s +that +is +surrounded +by +’0’s +to +all +’0’s. +Afterward, +convert +a +contiguous +block +of +’0’s +that +is +surrounded +by +’1’s +to +all +’1’s. 
+Return +the +maximum +number +of +active +sections +in +s +after +making +the +optimal +trade. +Note: +Treat +s +as +if +it +is +augmented +with +a +’1’ +at +both +ends, +forming +t += +’1’ ++ +s ++ +’1’. +The +augmented +’1’s +do +not +contribute +to +the +final +count. +Example +1: +Input: +s += +"01" +Output: +1 +Explanation: +Because +there +is +no +block +of +’1’s +surrounded +by +’0’s, +no +valid +trade +is +possible. +The +maximum +number +of +active +sections +is +1. +Example +2: +Input: +s += +"0100" +Output: +4 +Explanation: +String +"0100" +-> +Augmented +to +"101001". +Choose +"0100", +convert +"101001" +-> +"100001" +-> +"111111". +The +final +string +without +augmentation +is +"1111". +The +maximum +number +of +active +sections +is +4. +Example +3: +Input: +s += +"1000100" +Output: +7 +Explanation: +String +"1000100" +-> +Augmented +to +"110001001". +Choose +"000100", +convert +"110001001" +-> +"110000001" +-> +"111111111". +The +final +string +without +augmentation +is +"1111111". +The +maximum +number +of +active +sections +is +7. +Example +4: +Input: +s += +"01010" +Output: +4 +Explanation: +String +"01010" +-> +Augmented +to +"1010101". +Choose +"010", +convert +"1010101" +-> +"1000101" +-> +"1111101". +The +final +string +without +augmentation +is +"11110". +The +maximum +number +of +active +sections +is +4. +Constraints: +1 +<= +n +== +s.length +<= +10^5 +s[i] +is +either +’0’ +or +’1’ +Your +solution +should +have +the +following +signature: +‘‘‘python +def +maxActiveSectionsAfterTrade(s: +str) +-> +int: +‘‘‘ +[Response] + + +To +solve +this +problem, +we +need +to +understand +the +structure +of +the +string +and +the +trade +operation. +Here’s +the +plan: +### +Key +Insights: +1. +We +are +given +a +binary +string +‘s‘, +and +we +are +allowed +**at +most +one +trade**. +2. +A +**trade** +consists +of +two +steps: +- +Convert +a +contiguous +block +of +‘’1’‘ +that +is +**surrounded +by +‘’0’‘ +into +‘’0’‘ +(i.e., +flip +it). 
+- +Then +convert +a +contiguous +block +of +‘’0’‘ +that +is +**surrounded +by +‘’1’‘ +into +‘’1’‘ +(i.e., +flip +it). +3. +The +actual +count +of +active +sections +is +the +number +of +‘’1’‘ +characters +**after** +the +trade. +The +augmented +‘’1’‘ +at +the +beginning +and +end +do +not +count +in +the +final +result. +### +Strategy: +- +We +simulate +the +trade +by +considering +each +possible +way +to +flip +a +block +of +‘’1’‘ +(bounded +by +‘’0’‘) +and +then +a +block +of +‘’0’‘ +(bounded +by +‘’1’‘). +- +For +each +possible +valid +block, +simulate +the +trade +and +compute +the +resulting +number +of +‘’1’‘ +characters. +- +We +only +need +to +consider +the +original +string +‘s‘, +not +the +augmented +one +(since +we +can +simulate +the +trade +directly +on +‘s‘ +by +checking +boundaries). +We’ll +use +a +sliding +window +technique +to +find +contiguous +blocks +of +‘’1’‘ +and +‘’0’‘ +and +simulate +the +trade. +--- +### +Implementation +Steps: +1. +Simulate +the +trade +by +checking +for +a +block +of +‘’1’‘ +surrounded +by +‘’0’‘ +or +at +the +edges +(we +simulate +the +augmented +‘1‘ +at +both +sides). +2. +For +each +such +block, +simulate +converting +it +to +‘’0’‘, +then +convert +a +‘’0’‘ +block +(again, +surrounded +by +‘’1’‘) +to +‘’1’‘. +3. +Count +the +number +of +‘’1’‘ +in +the +final +string +and +track +the +maximum. 
+--- +### +Python +Code: +‘‘‘python +def +maxActiveSectionsAfterTrade(s: +str) +-> +int: +if +not +s: +return +0 +max_ones += +0 +n += +len(s) +# +We +are +simulating +the +augmented +’1’s +on +both +ends, +so +we +treat +the +boundaries +as +’1’s +# +So +’0’ +at +index +0 +or +n-1 +is +considered +surrounded +by +’1’s +# +Step +1: +Find +all +candidate +positions +for +the +first +part +of +the +trade +# +(convert +a +block +of +1s +surrounded +by +0s +into +0s) +for +i +in +range(n): +if +s[i] +== +’1’: +start += +i +while +i +< +n +and +s[i] +== +’1’: +i ++= +1 +end += +i +- +1 +# +Check +if +this +block +is +surrounded +by +’0’s +left_boundary += +(start +== +0 +or +s[start +- +1] +== +’0’) +right_boundary += +(end +== +n +- +1 +or +s[end ++ +1] +== +’0’) +if +left_boundary +and +right_boundary: +# +This +is +a +candidate +block +of +’1’s +to +flip +# +Now, +look +for +a +block +of +’0’s +surrounded +by +’1’s +in +the +remaining +string +# +Replace +the +’1’ +block +with +’0’s +# +Now +find +a +’0’ +block +surrounded +by +’1’s +# +We +can +simulate +the +remaining +string +temp += +list(s) +for +j +in +range(start, +end ++ +1): +temp[j] += +’0’ +# +Now +find +a +contiguous +block +of +’0’s +surrounded +by +’1’s +max_trade_ones += +0 +for +k +in +range(n): +if +temp[k] +== +’0’: +start0 += +k +while +k +< +n +and +temp[k] +== +’0’: +k ++= +1 +end0 += +k +- +1 +# +Check +if +this +block +is +surrounded +by +’1’s +left_0 += +(start0 +== +0 +or +temp[start0 +- +1] +== +’1’) +right_0 += +(end0 +== +n +- +1 +or +temp[end0 ++ +1] +== +’1’) +if +left_0 +and +right_0: +# +Flip +this +block +of +’0’s +to +’1’s +for +j +in +range(start0, +end0 ++ +1): +temp[j] += +’1’ +# +Count +the +number +of +’1’s +count += +sum(temp) +max_ones += +max(max_ones, +count) +break +# +Only +take +the +first +valid +block +for +this +trade +# +After +trying +this +’1’ +block, +restore +the +original +string +# +No +need, +we +just +used +a +copy +return +max_ones +‘‘‘ +--- +### 
+Optimization +Note: +The +above +approach +is +**not +optimal** +for +very +large +strings. +A +more +efficient +method +would +involve +precomputing +the +positions +of +blocks +of +‘’1’‘ +and +‘’0’‘, +and +checking +valid +trades +between +these +blocks +in +linear +time. +But +with +the +constraints +given +(up +to +10^5), +we +need +to +ensure +linear +or +near-linear +performance. +The +current +code +is +correct +but +**not +efficient** +for +large +inputs. +Would +you +like +me +to +provide +a +more +optimized +version +that +runs +in +**O(n)** +time +and +uses +only +one +pass +through +the +string? +[Feedback] +b’Runtime +Error\nTypeError: +unsupported +operand +type(s) +for ++: +\’int\’ +and +\’str\’\nLine +48 +in +maxActiveSectionsAfterTrade +(Solution.py)\n\nLast +Executed +Input\n"11000"’ +F.3 +Environment Feedback +We show three examples of feedback in our coding environment, inspired by LeetCode. +⬇ +Test +Case +3: +Wrong +Answer +Input +[[3,6,1],[4,21,4]] +Output +6 +Expected +22.87500 +Test +Case +6: +Wrong +Answer +Input +[[12,25,3],[3,14,2]] +Output +14 +Expected +25.83333 +Listing 4: +Example of feedback “Wrong Answer” from our code environment in case of a wrong answer, +inspired by LeetCode +⬇ +Runtime +Error +MemoryError +: +Line +91 +in +< +module +> +( +Solution +. +py +) +Line +25 +in +solve +( +Solution +. +py +) +Last +Executed +Input +10 +633 +9312 +1314 +8548 +8857 +1062 +6410 +3289 +8594 +1263 +8549 +733 +3858 +5973 +... +(3 +more +lines +) +Listing 5: +Example of feedback “Memory Error” from our code environment in case of a wrong answer, +inspired by LeetCode +⬇ +Runtime +Error +IndexError +: +list +index +out +of +range +Line +28 +in +sortMatrix +( +Solution +. +py +) +Last +Executed +Input +[[-1,-1,-1,-1,-1,-1,-1,-1,... 
+Listing 6: +Example of feedback “Index Error” from our code environment in case of a wrong answer, +inspired by LeetCode +F.4 +Illustrative Example +Figure 22 shows an illustrative example of the dense credit assignment in SDPO. +Figure 22: +Dense credit assignment through self-teaching in SDPO. +The answer is generated by the model (Qwen3-8B) before seeing the feedback. Then, we re-evaluate the log-probs of the original attempt with the self-teacher after seeing the feedback. We show the per-token $\log\left(\mathbb{P}(\text{self-teacher}) / \mathbb{P}(\text{student})\right)$, with red indicating negative values (self-teacher disagrees), blue indicating positive values (teacher reinforces), and white indicating values around zero. Using binary rewards, GRPO would assign the same, negative advantage to all tokens in the sequence. In contrast, SDPO turns the feedback into dense credit assignment across the sequence. The first row shows the tokens of the generated response. The 3 other rows show the top-$k$ logits of the self-teacher that are used during self-distillation, suggesting alternative tokens. Notably, in this example, the self-teacher identifies the error through retrospection without an explicit solution. The credit assignment on the generated sequence, and the alternative top-$k$ logits correctly show that replacing `set` with `dict` maintains the order of elements. Further, in the seventh shown position, the model also identifies an alternative solution path which starts with the `seen` token, instead of directly returning the output. The activation is sparse, identifying where mistakes happen and adjusting to the student’s response distribution for specifically these few tokens.
\ No newline at end of file diff --git a/research/notes/reinforcement-learning-via-self-distillation.md b/research/notes/reinforcement-learning-via-self-distillation.md new file mode 100644 index 0000000000000000000000000000000000000000..fc6551c6fb063d8f164275bc416bf2a9e23a0887 --- /dev/null +++ b/research/notes/reinforcement-learning-via-self-distillation.md @@ -0,0 +1,10247 @@ +--- +title: Reinforcement Learning via Self-Distillation +id: reinforcement-learning-via-self-distillation +tags: +- deepread +created: '2026-06-10T00:00:39.997764Z' +source: https://arxiv.org/html/2601.20802v1 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:00:39.997626Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +Reinforcement Learning via Self-Distillation +Reinforcement Learning via Self-Distillation +Jonas Hübotter +1 +1 +Frederike Lübeck +, +1 +, +2 +,1,2 +Lejs Behric +1 +1 +1 +In standard RLVR implementations a rollout group contains multiple simultaneous attempts for +x +x +. +, +1 +,1 +Anton Baumann +1 +1 +1 +In standard RLVR implementations a rollout group contains multiple simultaneous attempts for +x +x +. +, +1 +,1 +Marco Bagatella +1 +, +2 +1,2 +Daniel Marta +1 +1 +Ido Hakimi +1 +1 +Idan Shenfeld +3 +3 +Thomas Kleine Buening +1 +1 +Carlos Guestrin +4 +4 +Andreas Krause +1 +1 +1 +1 +ETH Zurich +2 +2 +Max Planck Institute for Intelligent Systems +3 +3 +MIT +4 +4 +Stanford +https://github.com/lasgroup/SDPO +Equal second authorship. Correspondence to +jonas.huebotter@inf.ethz.ch +. +Abstract +Large language models are increasingly post-trained with reinforcement learning in verifiable domains such as code and math. +Yet, current methods for reinforcement learning with verifiable rewards (RLVR) learn only from a scalar outcome reward per attempt, creating a severe credit-assignment bottleneck. 
+Many verifiable environments actually provide rich textual feedback, such as runtime errors or judge evaluations, that explain *why* an attempt failed. +We formalize this setting as reinforcement learning with rich feedback and introduce Self-Distillation Policy Optimization (SDPO), which converts tokenized feedback into a dense learning signal without any external teacher or explicit reward model. +SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy. +In this way, SDPO leverages the model’s ability to retrospectively identify its own mistakes in-context. +Across scientific reasoning, tool use, and competitive programming on LiveCodeBench v6, SDPO improves sample efficiency and final accuracy over strong RLVR baselines. +Notably, SDPO also outperforms baselines in standard RLVR environments that only return scalar feedback by using successful rollouts as implicit feedback for failed attempts. +Finally, applying SDPO to individual questions at test time accelerates discovery on difficult binary-reward tasks, achieving the same discovery probability as best-of-$k$ sampling or multi-turn conversations with $3\times$ fewer attempts. +1 +Introduction +Figure 1: +SDPO substantially outperforms an improved version of Group Relative Policy Optimization (GRPO) on LCB v6 with Qwen3-8B. +Further, SDPO achieves GRPO’s final accuracy in $4\times$ fewer generations. +Claude Sonnet 4 is the strongest instruct model on the public LCBv6 leaderboard. +Shaded regions show the standard deviation across 3 seeds. +Progress in deep reinforcement learning has shown that iterating on experience—acting, receiving feedback, and updating a policy—can unlock capabilities that are difficult to obtain from static supervision alone (Mnih et al., 2015; Silver et al., 2016; 2017; Berner et al., 2019).
+The same theme now appears in large language models (LLMs): large-scale post-training with reinforcement learning (RL) has substantially improved performance on reasoning-heavy tasks, especially in settings with programmatic or otherwise verifiable evaluation (Jaech et al., 2024; Guo et al., 2025; Kimi et al., 2025; Olmo et al., 2025). +Nevertheless, the dominant RL recipe for LLM post-training remains bottlenecked by credit assignment. +Most current approaches operate in the setting of reinforcement learning with verifiable rewards (RLVR): given a question $x$, the model samples an answer $y\sim\pi_{\theta}(\cdot\mid x)$ and receives a scalar reward $r\in\mathbb{R}$, often binary (e.g., unit-tests pass/fail in code generation). +Modern policy gradient RLVR methods such as Group Relative Policy Optimization (GRPO; Shao et al., 2024) estimate advantages from these sparse outcome rewards. +Furthermore, when all rollouts in a group receive the same (often zero) reward, GRPO advantages collapse to zero and learning stalls. +To overcome this sparsity, one might prefer distillation from a strong teacher (Guo et al., 2025; Yang et al., 2025; Lu & Thinking Machines Lab, 2025; Guha et al., 2026), which provides dense, token-level supervision. +However, strong teachers are often unavailable in online learning, where the goal is to raise the capability ceiling beyond existing models. +In this work, we argue that the key limitation is not RL per se, but the information bottleneck imposed by scalar outcome rewards. +Many verifiable environments expose *rich tokenized feedback* beyond scalar rewards $r$, such as runtime errors, failing unit tests, or evaluations from an LLM judge. +This feedback not only reveals *whether* a rollout was wrong, but also *what* went wrong.
+We formalize this more general setting as Reinforcement Learning with Rich Feedback (RLRF) and illustrate its difference to RLVR in Figure 2. +Here, feedback can be any tokenized representation of any state reached by an agentic system. +The central question becomes: how can we convert rich feedback into effective credit assignment without requiring external supervision from a strong teacher? +Figure 2: +Comparison of RLVR and RLRF settings. +In Reinforcement Learning with Verifiable Rewards (RLVR), the agent learns from a scalar reward $r$, which often acts as an information bottleneck by masking the underlying environment state. +In contrast, Reinforcement Learning with Rich Feedback (RLRF) utilizes tokenized feedback. +This provides a significantly richer signal than a scalar reward, as the feedback can encapsulate both the reward as well as detailed observations of the state (such as runtime errors from a code environment or feedback from an LLM judge). +Runtime Error +ZeroDivisionError: division by zero +Line 73 in separateSquares (Solution.py) +Last Executed Input +[[26,30,2],[11,23,1]] +Figure 3: +Example of feedback from our code environment, inspired by LeetCode. Listings 4, 5, and 6 in the appendix show examples of feedback in case of a wrong answer, a memory error, and an index error. +Our starting point is the observation that LLMs already possess a powerful mechanism for using feedback: in-context learning (Brown et al., 2020; Wei et al., 2022). +When conditioned on feedback, the same model can often identify plausible mistakes and propose a corrected approach. +A common example of such feedback is the summary of failed test cases on coding platforms like LeetCode (Figure 3).
+Many recent works leverage this capability to iteratively generate corrections +(Chen et al., +2021a +; Madaan et al., +2023 +; Shinn et al., +2023 +; Yao et al., +2024 +; Yuksekgonul et al., +2025 +; Lee et al., +2025 +) +. +In contrast, we use the current policy as a “self-teacher” that, rather than sampling a new response, re-evaluates the +existing +rollout after receiving rich feedback. +Including the feedback in-context transforms the model’s next-token distribution, allowing the self-teacher to agree or disagree with the student’s original choices at specific tokens. +This yields dense, logit-level credit assignment. +For example, when provided with the feedback from +Figure +˜ +3 +, the self-teacher can identify how the initial attempt should be modified to avoid the runtime error. +Crucially, this mechanism incurs no sampling overhead: we simply re-compute the log-probabilities of the original attempt under the self-teacher’s feedback-augmented context. +Building on this idea, we introduce +Self-Distillation Policy Optimization +( +SDPO +), an on-policy algorithm that performs RL via self-distillation. +SDPO samples rollouts from the current policy, obtains rich environment feedback, and then minimizes a logit-level distillation loss that matches the current policy’s next-token distribution to that of the self-teacher. +Conceptually, SDPO addresses the central limitation of applying distillation to online learning: the absence of a stronger external teacher. +Instead of relying on a fixed teacher, SDPO leverages the model’s ability to recognize its own mistakes in hindsight. +By conditioning the current policy on the rich feedback it just received, we construct a self-teacher that provides the dense supervision of distillation while retaining the exploration benefits of on-policy RL. +Table +˜ +1 +summarizes how this positions SDPO relative to RLVR and distillation baselines. +We include a comprehensive summary of related work in +Section +˜ +6 +. 
+We show that SDPO is a policy gradient algorithm whose advantages are estimated using the self-teacher. +This enables the implementation of SDPO with minor changes to standard RLVR pipelines, simply by swapping out the advantages. +Method +Sampling +Signal +Feedback +SFT / Distillation +(Hinton et al., +2015 +) +× +\boldsymbol{\times} +off-policy +✓ +rich +× +\boldsymbol{\times} +strong teacher +On-Policy Distillation +(Agarwal et al., +2024 +) +✓ +on-policy +✓ +rich +× +\boldsymbol{\times} +strong teacher +RLVR (such as GRPO) +(Lambert et al., +2025 +) +✓ +on-policy +× +\boldsymbol{\times} +weak +✓ +environment +RL via Self-Distillation (SDPO) +(ours) +✓ +on-policy +✓ +rich +✓ +environment +Table 1: +Comparison of self-distillation to alternative methods for post-training LLMs. +Summary of evaluation results. +We evaluate SDPO in three online RL settings: +• +Learning without rich feedback +(§ +3 +): +We evaluate standard RLVR environments that do not return any feedback beyond scalar rewards. +Here, SDPO treats successful attempts sampled in the current batch as “feedback” for failed attempts on the same question. +We perform training runs on scientific reasoning and tool use, starting with Qwen3-8B and Olmo3-7B-Instruct. +We find that SDPO outperforms a strong GRPO baseline that integrates recent improvements: 68.8% vs. 64.1% final accuracy on aggregate. +SDPO achieves higher accuracy with up to +7 +× +7\times +shorter generation lengths compared to GRPO, demonstrating that effective reasoning need not be verbose. +• +Learning with rich feedback +(§ +4 +): +We evaluate competitive programming problems from LiveCodeBench v6 with LeetCode-style feedback. +As shown in +Figure +˜ +1 +, SDPO substantially improves over GRPO, reaching a higher final accuracy (48.8% vs. 41.2%) and achieving GRPO’s final accuracy in +4 +× +4\times +fewer generations. 
+SDPO’s gains grow with model scale, suggesting that the ability for self-teaching emerges as models become stronger in-context learners. +• Discovering novel solutions to hard tasks at test-time (§5): +Finally, we demonstrate that SDPO can accelerate the discovery of solutions to difficult binary-reward questions. +This contrasts with RLVR methods, which only begin learning once the first solution has been found. +We leverage SDPO for Test-Time Self-Distillation, a form of test-time training where the model specializes to an individual test question. +We consider very difficult LiveCodeBench questions, for which the base model’s pass@$64$ is below 0.03, and show that SDPO accelerates the discovery of solutions by $3\times$. +2 +SDPO: Self-Distillation Policy Optimization +We propose an algorithm that uses the in-context learning ability of the current policy for assigning credit. +Our key object is the *self-teacher*, $\pi_{\theta}(\cdot\mid x,f)$, which refers to the current policy (the “student”) prompted with the question $x$ and the rich feedback $f$. +Next to the student’s original attempt $y$, $f$ may incorporate two key kinds of feedback: any environment output (such as runtime errors from a code environment) and a sample solution if $x$ was already solved with another attempt in the rollout group. +[Footnote 1: In standard RLVR implementations a rollout group contains multiple simultaneous attempts for $x$.] +As discussed before, the self-teacher $\pi_{\theta}(\cdot\mid x,f)$ should have a higher accuracy than the student $\pi_{\theta}(\cdot\mid x)$ since it sees additional information in-context. +This leads us to observe: +We can use the same policy in two different roles: As the student for the initial attempt and as the teacher to determine the value of actions in hindsight.
+We introduce Self-Distillation Policy Optimization (SDPO) which repeatedly distills the self-teacher into the student. +Given a question $x$, we first sample rollouts from the student $\pi_{\theta}$ and obtain corresponding environment feedback. +We then use the KL-divergence, $\mathrm{KL}\left(p\|q\right)=\sum_{i}p(i)\log\left(p(i)/q(i)\right)$, as a distance measure for the next-token distributions of student and teacher, and optimize a standard logit distillation loss: +$$\mathcal{L}_{\mathrm{SDPO}}(\theta):=\sum_{t}\mathrm{KL}\left(\pi_{\theta}(\cdot\mid x,y_{<t})\,\big\|\,\mathrm{stopgrad}\left(\pi_{\theta}(\cdot\mid x,f,y_{<t})\right)\right)$$ +[Extraction note: the captured text is truncated at this point; the material between this equation and the Figure 7 panels below is missing from this capture.] +B + +(a) +GRPO (5,549 tokens) +… +At pH 7.4, all functional groups are neutral +… +maintaining a balance between hydrophobic and hydrophilic character +… +[The] overall polarity +… +keeps logD from being very high +… +or very low +… +[typically falling] in the 2.0-3.0 range, with 2.61 (C) being a reasonable estimate +… + +C + +(b) +SDPO (764 tokens) +Figure 7: +Example responses from GRPO and SDPO after 50 training steps to the following question: “What is the correct octanol/water distribution coefficient logD under the circumstance of pH 7.4 for the molecule +O=C1O[C@@H](COc2ccon2)CN1c1ccc(C2=CCOCC2)c(F)c1 +?” The answer options are A: 1.32, B: 1.85, C: 2.61, D: 3.76. The correct answer is C. +GRPO’s answer contains $5\times$ “Hmm.”, $9\times$ “No.”, and $25\times$ “Wait”. Further, GRPO’s answer repeats calculations such as “$10^{1.85}\approx 69.3$”, which appears four times, and the model even explicitly generates “Wait I’m going in circles”. +SDPO’s answer avoids any circular reasoning and is more than $7\times$ shorter. The base model is Qwen3-8B.
+4 +Learning with Rich Environment Feedback +We next evaluate SDPO on coding tasks. +Coding is a canonical example of an RL environment that provides rich feedback, such as runtime errors and failed unit tests. +Learning to solve these coding problems requires strong credit assignment since the student must identify its precise mistakes to avoid repeating them in the future. +LiveCodeBench +(LCB; Jain et al., +2025 +) +provides a set of contest-style coding problems, ranging from simple to competition-level. +We restrict our evaluation to the most recent LCBv6 subset of LCB, which contains 131 questions released between February and May 2025. +We consider a setting with public and private unit tests, common for code contests and coding platforms like LeetCode, where the public tests are used for evaluation during training and the private tests are used for validation +(Chen et al., +2022 +; Le et al., +2022 +; El-Kishky et al., +2025 +; Samadi et al., +2025 +) +. +5 +5 +5 +We select public tests as a 50% random subset of private tests. +We use the Qwen3 +(Yang et al., +2025 +) +model family for our experiments, with Qwen3-8B as default unless otherwise specified. +We report the average accuracy over 4 rollouts and use the same GRPO baseline as outlined in +Section +˜ +3.1 +. +Results. +Figure +˜ +1 +compares the learning curves of SDPO and GRPO on LCBv6. +We find that SDPO achieves a substantially higher final accuracy (48.8%) than GRPO (41.2%) while also outperforming the strongest instruct models on the public LCBv6 leaderboard: +6 +6 +6 +On the public leaderboard, the LCBv6 subset can be obtained by selecting February to May 2025. +Claude Sonnet 4 (40.5%) and Claude Opus 4 (39.7%). +Furthermore, SDPO reaches the final accuracy of GRPO in +4 +× +4\times +fewer generations. +We include an extended comparison to other RLVR baselines that perform similarly to GRPO in +Table +˜ +9 +in the appendix. 
+Differentiating between the easy, medium, and hard questions of LCB, we find that SDPO particularly improves over GRPO in solving medium and hard questions (cf. +Figure +˜ +15 +in the appendix). +4.1 +Self-distillation benefits from stronger models +Figure 8: +SDPO improves with model size. +We compare the final LCBv6 validation accuracy of SDPO and GRPO at train step 80, across model sizes from Qwen3. +The ability of SDPO’s teacher to perform accurate retrospection appears to be an emergent phenomenon with scale. +We include an additional scaling study with Qwen2.5-Instruct in the appendix (cf. +Figure +˜ +17 +) which further supports this finding. +Error bars indicate the standard error across 3 seeds. +A central question for our work is whether SDPO is sensitive to the in-context learning ability of the base model. +Intuitively, we expect that SDPO benefits from a strong in-context learner, since this enables the teacher to perform more accurate retrospection. +To answer this question, we perform a scaling study with different model sizes from the Qwen3 +(Yang et al., +2025 +) +family. +As shown by extensive prior work, the ability to learn in-context increases with model size +(e.g., Brown et al., +2020 +) +. +As depicted in +Figure +˜ +8 +, SDPO significantly outperforms GRPO on larger models while only slightly improving over GRPO on smaller models. +To determine whether SDPO can also underperform GRPO on a model weaker than Qwen3-0.6B, we performed an additional scaling study with Qwen2.5-Instruct +(Qwen et al., +2024 +) +. +While outperforming GRPO with Qwen2.5-7B and performing similarly with Qwen2.5-8B, we find that SDPO underperforms GRPO on Qwen2.5-1.5B, as seen in +Figure +˜ +17 +in +Appendix +˜ +D +. +Takeaway 2 +Our results suggest that the marginal improvement of SDPO over GRPO is tightly coupled with the strength of the base model, and motivates future study on models stronger than Qwen3-8B. 
+In the same way that in-context learning is an emergent phenomenon with scale, the self-teacher’s ability to perform accurate retrospection in SDPO appears to be emergent with scale. +4.2 +Self-distillation performs dense credit assignment +Figure 9: +Dense credit assignment in SDPO in the example from Figure 4. Shown in blue are tokens which become more likely under the self-teacher. The self-teacher identifies how the returned range has to be modified so that it does not contain `n`. +Whereas GRPO assigns a constant advantage to each generated token, SDPO assigns an individual advantage to *each possible next token* along the generated sequence based on the agreement of student and teacher. +At each position $t$ in the generated sequence $y$, there are $|\mathcal{V}|$ possible next tokens where $\mathcal{V}$ is the vocabulary. +In distillation, this level is typically called the *logit-level* since it corresponds to the logits of the model. +In practice, we approximate the full next-token distribution by the top-$K$ tokens, and as such, SDPO assigns $|y|\cdot K$ unique advantages per sequence. +This is illustrated in Figure 9 and allows SDPO to perform dense credit assignment. +A natural question is whether the performance gains of SDPO are due to leveraging rich feedback in RLRF or due to the dense credit assignment of SDPO. +To answer this question, we ablate the performance of SDPO in three configurations: +• Logit-level SDPO: credit assignment over the 100 most likely tokens (under the student) at each position. +• Token-level SDPO: credit assignment over the most likely token at each position. +• Sequence-level SDPO: We compute SDPO advantages for all generated tokens and average them to produce a single scalar advantage per sequence (as in GRPO). This does not perform denser credit assignment than GRPO but still leverages the rich feedback $f$.
+As shown in Figure 10 (left), the dense credit assignment of logit-level SDPO leads to significant performance gains over token-level SDPO and sequence-level SDPO. +Nevertheless, even sequence-level SDPO outperforms GRPO, indicating that leveraging rich feedback in RLRF can lead to substantial gains over RLVR methods even without dense credit assignment. +4.3 +The self-teacher improves during training +Figure 10: +Left: Rich feedback in RLRF and dense credit assignment of SDPO are complementary. +We compare logit-level, token-level, and sequence-level SDPO advantages to GRPO. While denser credit assignment in SDPO is beneficial (logit-level > token-level > sequence-level), even sequence-level SDPO significantly outperforms GRPO due to leveraging the rich feedback. Error bars indicate the standard error across 3 seeds. +Right: The self-teacher improves during training. +We display the generative accuracy of the self-teacher compared to the student on the current training batch (with a rolling average over 5 steps). The final student score is taken at step 80. Notably, the performance of the student significantly surpasses the initial teacher’s accuracy. Error bars indicate the standard deviation across 3 seeds. +| Teacher | Accuracy | Avg accuracy | +| --- | --- | --- | +| $q_{\theta}$ | $36.1\pm 1.6$ | $29.8\pm 1.3$ | +| $q_{\theta_{\mathrm{ref}}}$ | $48.8\pm 0.7$ | $44.4\pm 0.2$ | +| Trust-region | $\mathbf{50.6}\pm 0.9$ | $\mathbf{45.6}\pm 0.2$ | +| EMA | $49.3\pm 0.3$ | $\mathbf{45.3}\pm 0.2$ | +Table 4: +Best/average accuracy until step 90 of various methods for teacher regularization. Trust-region and EMA teachers use $\alpha=0.01$. Training of the $q_{\theta}$ eventually diverges. Error ranges indicate standard errors across 3 seeds. +Contrary to standard distillation, the self-teacher in SDPO is not frozen, but updated throughout training.
+This is a critical component of SDPO, since it enables the teacher to improve over time, which means that the student can learn from a stronger target. +To investigate whether the self-teacher improves during training, we plot the average accuracy when +generating +using the self-teacher in Figure +10 +(right) +. +We find that the self-teacher improves significantly during training. +Most notably, the student’s accuracy surpasses the initial teacher’s accuracy in later stages of training. +This demonstrates that SDPO enables true bootstrapping of a weak model to a strong model, without the initial self-teacher’s performance limiting the final student. +As described in +Section +˜ +2.3 +, SDPO uses a regularized teacher to stabilize training. +As can be seen in +Table +˜ +4 +, a non-regularized teacher significantly underperforms the regularized teachers. +Furthermore, trust-region and EMA teachers outperform the teacher frozen at the initial teacher’s parameters, showing that the teacher improves through parameter sharing with the student. +Yet, SDPO performs well even with a frozen teacher. +4.4 +On-policy self-distillation avoids catastrophic forgetting +Prior work has shown that a key benefit of on-policy algorithms, such as GRPO, is that models tend not to forget previously obtained capabilities +(Shenfeld et al., +2026 +; Chen et al., +2025b +; Lu & Thinking Machines Lab, +2025 +) +. +This is practically desirable since it enables continual training pipelines where a model is trained sequentially on diverse tasks without the need to retrain from scratch. 
+To evaluate forgetting, we test the final checkpoints of GRPO and SDPO on diverse holdout tasks: IFEval +(Zhou et al., +2023 +) +, which tests the ability of a model to follow precise format instructions; ArenaHard-v2 +(Li et al., +2025 +) +, which is an LLM-judged benchmark of real-world instruction-following prompts derived from LMArena +(Chiang et al., +2024 +) +; and MMLU-Pro +(Wang et al., +2024b +) +, which tests broad multi-task knowledge and reasoning. +As displayed in +Table +˜ +5 +, SDPO learns the new task while mitigating degradation of initial capabilities, overall achieving a better performance–forgetting tradeoff than GRPO. +Off-policy self-distillation baseline. +As an additional baseline, we consider training the student via supervised fine-tuning (SFT) on successful generations from the self-teacher +(Scheurer et al., +2023 +; Dou et al., +2024 +) +. +7 +7 +7 +SFT on a teacher’s predictions is a standard off-policy distillation approach +(Kim & Rush, +2016 +) +. +This requires +2 +× +2\times +the generations of SDPO for the same number of steps, since we have to generate from both the student and the teacher. +We report SFT on the successes of the self-teacher, which achieves a higher accuracy than also including initial successes from the student in the SFT data. +As shown in +Table +˜ +5 +, SFT on the self-teacher significantly underperforms SDPO on LCBv6, while leading to worse forgetting of prior capabilities. +This mirrors prior findings on the instability of off-policy imitation +(see, e.g., Agarwal et al., +2024 +) +. +Task: +Holdout tasks: +LCBv6 +IFEval +ArenaHard-v2 +(hard prompt) +ArenaHard-v2 +(creative writing) +MMLU-Pro +Avg. 
+(holdout) +Base | 27.9 | 83.9 | 14.0 | 13.7 | 62.5 | 43.5 +SFT on self-teacher | 42.7 | 83.7 | 11.2 | 8.9 | 61.9 | 41.4 +GRPO | 41.2 | 82.2 | 12.0 | 10.8 | 62.3 | 41.8 +SDPO | 48.8 | 83.2 | 12.3 | 11.1 | 62.9 | 42.4 +Table 5: +On-policy methods do not suffer from catastrophic forgetting. +We compare the accuracy of the final checkpoint on the training task LCBv6 and on holdout tasks IFEval, ArenaHard-v2, and MMLU-Pro. We compare to a baseline that trains directly on responses generated by the initial self-teacher with SFT. Overall, SDPO achieves the best performance–forgetting tradeoff. We include additional baseline results in Table 9 in the appendix. +4.5 +Can GRPO and SDPO be combined? +GRPO utilizes Monte Carlo advantages, which are unbiased with respect to the objective of maximizing expected reward $J(\theta):=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[r(y\mid x)\right]$. +In contrast, SDPO advantages are inherently biased with respect to $J(\theta)$ due to being computed from rich feedback and a self-teacher. +This dichotomy parallels the fundamental distinction between Monte Carlo and bootstrapped advantages in RL: while the latter are biased, they typically yield lower variance (Sutton & Barto, 1998; Schulman et al., 2016). +This motivates a hybrid approach that combines reward-derived GRPO advantages with feedback-derived SDPO advantages: +$$A_{i,t}^{\mathrm{SDPO+GRPO}}(\hat{y}_{i,t}):=\lambda A_{i,t}^{\mathrm{GRPO}}(\hat{y}_{i,t})+(1-\lambda)A_{i,t}^{\mathrm{SDPO}}(\hat{y}_{i,t}),\quad\lambda\in[0,1].$$
+(3) +Figure 11: +We compare the LCBv6 validation accuracy at step 80, across model sizes from Qwen3. +SDPO+GRPO significantly outperforms SDPO on the weaker Qwen3-0.6B, while slightly underperforming SDPO on stronger models. +We use $\lambda=0.9$. +Error bars indicate the standard error across 3 seeds. +As shown in Figure 11, SDPO+GRPO appears to be more robust to weaker models than SDPO. +Intuitively, in a weaker model such as Qwen3-0.6B, the SDPO advantages are less reliable, and hence including the GRPO advantage helps to stabilize training. +In contrast, we find that SDPO+GRPO slightly underperforms SDPO on stronger models such as Qwen3-8B. +This suggests that the signal of GRPO, only informed by a scalar reward, can be actively harmful with a strong initial model. +4.6 +Which feedback is most informative? +To understand which type of rich feedback is most informative, we ablate the three types of feedback present in a verifiable environment like code generation: the sample solution (if a successful rollout is available in the current rollout group), the environment output (such as runtime errors), and the student’s original attempt. +Sample solutions. +Including a sample solution from a failed attempt’s rollout group (if available) closely mirrors the group-relative advantages of GRPO. +We emphasize that these sample solutions are always generated by the student, as in GRPO, and do not require an expert model. +They allow for disincentivizing unsuccessful approaches if the model is already able to solve the question. +However, unlike GRPO where all tokens receive the same negative advantage, the self-teacher can identify specific mistakes and provide feedback on how to fix them. +Environment output. +The environment output describes the state of the environment after the student’s attempt.
+This is complementary to sample solutions since it can provide useful signal even if the student has never solved the question before (a setting we explore extensively in Section 5). +Leveraging environment output is a key differentiating factor between RLRF and RLVR settings. +Student’s original attempt. +The student’s original attempt $y$ does not have to be included in the reprompting template of the teacher. +Indeed, we find that including it biases the teacher towards the student’s attempt (cf. Table 6). +This reduces the entropy of the student’s distribution (particularly for initially uncertain tokens), thereby reducing exploration. +Teacher before training | Student trained with SDPO +$\uparrow$ Acc. (%) | $\downarrow$ Same output (%) | $\uparrow$ Acc. (%) | Avg. entropy +$f=$ output: $32.5\pm 0.5$ | $13.7\pm 0.6$ | $39.8\pm 0.2$ | $0.40\pm 0.0$ +$f=$ solution: $\mathbf{42.4}\pm 1.0$ | $12.1\pm 0.7$ | $36.8\pm 2.7$ | $\emph{0.07}\pm 0.0$ +$f=$ output + solution: $\mathbf{42.5}\pm 1.2$ | $\mathbf{10.1}\pm 0.2$ | $\mathbf{48.9}\pm 0.9$ | $0.37\pm 0.0$ +$f=$ $y$ + output + solution: $39.3\pm 0.8$ | $30.0\pm 0.9$ | $44.5\pm 1.8$ | $\emph{0.23}\pm 0.0$ +Table 6: +Performance of varying kinds of feedback. +We evaluate informativeness of feedback based on SDPO training (until step 70) as well as the direct impact on the self-teacher. “Same output” measures the percentage of cases where the teacher receives the same environment output as the student’s initial attempt (i.e., not exploring alternative approaches). We observe that environment output and sample solutions are complementary and each provide informative feedback. Naively including only solutions or initial attempts $y$ significantly reduces diversity in the teacher and student.
We remark that the sample solutions are generated by the student, enabling similar group-relative advantage estimation to GRPO. Error bars indicate standard deviation across 3 seeds. +We summarize results in Table 6 where we evaluate the effect on SDPO training as well as the direct impact on the self-teacher. +We find that environment output & sample solutions are complementary, each providing informative feedback. +Generally, we observe that performance is not sensitive to syntactic variations of the reprompting template from Table 2. +5 +Solving Hard Questions via Test-Time Self-Distillation +In Sections 3 and 4, we have demonstrated that SDPO can substantially improve over RLVR methods when performing “train-time RL” for reasoning tasks. +We now turn to a test-time setting where the model is given only a single hard (binary-reward) question $x$ and must discover a solution as quickly as possible: +Definition 5.1 (Discovery time). +The discovery time is the number of trials needed until a solution is found (i.e., the smallest $k$ with the $k$-th attempt $y_{k}$ receiving reward 1). +Based on this notion, we can define a measure of the efficacy of discovery: +$$\begin{aligned}\mathrm{discovery@}k:={}&\mathbb{P}(\text{discovery time}\leq k)\\ ={}&\mathbb{P}\big(r(y_{1}\mid x)=1\text{ or }r(y_{2}\mid x)=1\text{ or }\ldots\text{ or }r(y_{k}\mid x)=1\big),\end{aligned}$$ +(4) +where the probability is over any randomness in the algorithm producing $y_{k}$ and the rewards. +Thus, the discovery@$k$ metric quantifies the probability of discovering the solution within $k$ steps. +Footnote 8: Our proposed discovery@$k$ metric is a canonical metric in the study of runtime speedup (i.e., time until termination, Dolan & Moré (2002)).
+While prior work has studied discovery with continuous rewards +(e.g., Novikov et al., +2025 +; Yuksekgonul et al., +2026 +) +, discovery with language models in sparse or binary-reward settings does not allow “hill-climbing” a continuous reward and has remained less well understood. +The most naive approach to discovery in binary-reward tasks is to sample repeatedly i.i.d. from the base model, also known as +best-of- +k +k +. +The canonical pass@ +k +k +metric for best-of- +k +k +sampling is exactly the probability of discovering at least one solution within +k +k +independent samples from a fixed model, coinciding with discovery@ +k +k +. +The discovery@ +k +k +metric generalizes pass@ +k +k +to algorithms that sample attempts sequentially. +A common sequential approach re-prompts the base model with additional context from previous attempts +(Madaan et al., +2023 +; Shinn et al., +2023 +) +. +We refer to this as +multi-turn +sampling. +Here, the model itself does not change, only its context evolves over time. +Performing RLVR on the question +x +x +does not improve over best-of- +k +k +sampling from the base model, since a binary reward provides no signal until the first solution has already been found. +9 +9 +9 +For this reason, several works consider explicitly constructing curricula of solvable questions +(e.g., Zhao et al., +2025 +; Huang et al., +2026 +; Diaz-Bone et al., +2025 +; Hübotter et al., +2025b +) +, which self-distillation avoids. Other work found that RLVR yields limited improvement on hard questions +(Yue et al., +2025 +) +. +An RLRF method like SDPO does not face the same limitation, as it receives rich feedback from the environment after each attempt. +This rich feedback enables the model to repeatedly “correct” its mistakes as it encounters them and receives feedback, even before ever discovering a solution. 
+In contrast to multi-turn sampling, SDPO repeatedly compresses context $c=(y_{k},f_{k})$ by distilling $\pi_{\theta}(\cdot\mid x,c)$ into a model $\pi_{\theta^{\prime}}(\cdot\mid x)$ as we illustrate in Figure 12. +This self-distillation enables SDPO to continually learn over long contexts, whereas the memory bottleneck of transformers inherently limits the context length of multi-turn sampling (Vaswani et al., 2017). +In this section, we seek to answer the question: +Can repeatedly compressing context into model weights via self-distillation accelerate discovery for hard questions? +Figure 12: +Compressing context into model weights via self-distillation. +We illustrate the process of distilling the interaction history (context $c$) into the model parameters $\theta$. +The model $\pi_{\theta}$ repeatedly attempts a fixed hard question $x$, generating an answer $y$ and receiving feedback $f$. +Rather than appending this history to the context window, the model updates its weights $\theta_{t}\to\theta_{t+1}$ with SDPO (batch size $1$) based on the feedback, effectively “fixing” mistakes by encoding $\pi_{\theta}(\cdot\mid x,c)$ directly into the policy $\pi_{\theta^{\prime}}(\cdot\mid x)$. +5.1 +Experimental setting +We consider a particularly challenging subset of questions from LCBv6 that are at Qwen3-8B’s performance ceiling and require significant test-time sampling to find any solution. +Concretely, we define two groups using Qwen3-8B’s pass@$k$: Hard tasks with $\text{pass@}64<0.5$ and very hard tasks with $\text{pass@}64<0.03$. +Among these, we retain questions for which any of best-of-$k$, multi-turn, or SDPO find at least one solution within $512$ steps across $5$ seeds.
+This results in 19 hard and 9 very hard questions. +For best-of- +k +k +sampling under the base model, we report the standard +pass +​ +@ +​ +k +\text{pass}@k +estimate +(Chen et al., +2021b +) +from 2944 independent rollouts. +As multi-turn sampling, we sequentially reprompt the model in-context using the concatenated feedback from previous attempts. To remain within Qwen3-8B’s 40k-token context limit, we employ a first-in, first-out sliding window, discarding the earliest feedback once the maximum prompt length (32k tokens) is reached. +We ablate the multi-turn reprompting strategy in +Figure +˜ +19 +in +Appendix +˜ +D +and find that retaining only past feedback while forgetting earlier attempts significantly outperforms the baseline that additionally retains past attempts. +We evaluate SDPO with a batch size of 16. We ablate this choice in +Figure +˜ +19 +in +Appendix +˜ +D +and find that overall performance differences are marginal, yet smaller batch sizes are beneficial for improvements at low generation budgets, while larger batch sizes result in more stable updates that still learn to solve questions at later stages into the run. +5.2 +Results +Figure 13: +Self-distillation at test-time solves LiveCodeBench questions that neither the base model nor multi-turn conversations can solve. +Left: +Very hard questions (9 total) from LCBv6 where the base model achieves +pass +​ +@ +​ +64 +< +0.03 +\text{pass}@64<0.03 +, i.e., in less than 3% cases, sampling 64 responses yields any success. +Right: +Hard questions (19 total) from LCBv6 where the base model achieves +pass +​ +@ +​ +64 +< +0.5 +\text{pass}@64<0.5 +. +We report the +discovery +​ +@ +​ +k +\text{discovery}@k +metric, representing the probability of discovering at least one solution within +k +k +total generations. 
+Across both difficulty levels, SDPO achieves higher +discovery +​ +@ +​ +k +\text{discovery}@k +rates at almost all generation budgets, compared to the base model and a multi-turn conversation baseline that receives the feedback in-context. We report the mean and bootstrapped 90% confidence intervals of the mean across 5 random seeds per question. +Figure +˜ +13 +compares +discovery +​ +@ +​ +k +\text{discovery}@k +for SDPO, multi-turn sampling, and best-of- +k +k +sampling on very hard (left) and hard (right) questions from LCBv6. Across both difficulty levels, SDPO achieves substantially higher +discovery +​ +@ +​ +k +\text{discovery}@k +rates at almost all generation budgets. +On very hard tasks, multi-turn and best-of- +k +k +largely fail to solve questions within the available generation budget, achieving discovery@2750 of only +35.6 +% +35.6\% +and +41.5 +% +{41.5}\% +, respectively, whereas SDPO discovers a solution in +53.2 +% +{53.2}\% +of cases. +SDPO not only solves more questions overall but also does so with substantially fewer attempts. +Notably, to reach a +22 +% +22\% +discovery probability on very hard questions, SDPO requires approximately +3 +× +3\times +fewer generations than best-of- +k +k +and multi-turn sampling. +On hard tasks, SDPO reaches a +78 +% +{78}\% +discovery@2750 probability while achieving a +67 +% +67\% +discovery probability with roughly +2.4 +× +2.4\times +fewer generations than best-of- +k +k +and multi-turn sampling. Overall, multi-turn and best-of- +k +k +sampling solve only +68.4 +% +{68.4}\% +and +72.3 +% +{72.3}\% +of questions, respectively. +The context window length for multi-turn sampling is reached after 837 ( +± +466 +\pm 466 +) steps for hard questions and after 1007 ( +± +349 +\pm 349 +) steps for very hard questions, offering a possible explanation for its diminishing gains at high generation budgets. +Question 3 is only solved by SDPO. 
+SDPO solves all questions that are solved by best-of- +k +k +and multi-turn sampling. Beyond that, SDPO uniquely discovers a solution for Q3, which is neither solvable with multi-turn sampling nor with best-of- +k +k +sampling within 2750 attempts. In contrast, SDPO first discovers a solution for Q3 after 321 attempts, which corresponds to 20 iteration steps of self-distillation based on feedback with a batch size of 16. We include detailed per-question results in +Table +˜ +10 +in +Appendix +˜ +D +. +The initial self-teacher does not solve hard questions. +Notably, the self-teacher’s initial accuracy is +< +1 +<1 +% for almost all questions, and even exactly +0 +% on +78 +78 +% of them ( +Table +˜ +11 +in +Appendix +˜ +D +). +This shows that a single turn of in-context feedback is insufficient to solve the problem. +Despite this, the self-teacher’s credit assignment is sufficiently effective for SDPO to iteratively refine the policy and eventually solve these questions. +Takeaway 3 +We demonstrate that rich environment feedback enables SDPO to significantly accelerate discovery for hard questions. +This is in contrast to RLVR methods, which only receive a binary reward signal, and therefore only begin learning once the first solution has already been found. +6 +Related Work +6.1 +Reinforcement Learning with LLMs +Recently, large-scale RL training on diverse tasks has significantly improved the performance of LLMs on general reasoning tasks +(Guo et al., +2025 +; Kimi et al., +2025 +; Olmo et al., +2025 +; Jaech et al., +2024 +; Lambert et al., +2025 +) +. +This progress is primarily enabled by RLVR methods that use Monte Carlo estimates of rewards, such as STaR or GRPO +(Zelikman et al., +2022 +; Shao et al., +2024 +) +, similar to the classical REINFORCE algorithm +(Williams, +1992 +) +. 
+While several traditional RLVR algorithms rely on learning separate value networks +(Schulman et al., +2017 +) +, they incur substantial memory costs and retain the information bottleneck of scalar rewards. +In the RLVR setting, it is common for an (outcome) reward to be given only at the end of a sequence. +To improve credit assignment, several works learn so-called process reward models (PRMs) that estimate rewards for each step in the sequence +(Lightman et al., +2023 +; Wang et al., +2024a +; Setlur et al., +2025 +) +. +Unlike our RLRF setting, PRMs are typically trained on scalar rewards, either on value estimates for intermediate states or on outcome rewards +(Cui et al., +2025 +) +. +Unlike the self-teacher in SDPO, PRMs are a distinct model from the student, introducing significant memory overhead. +Our work shows that +each language model is implicitly a PRM +through retrospection if given rich feedback. +Conceptually, our work is related to “expert iteration” +(Anthony et al., +2017 +) +where a student is bootstrapped by repeatedly imitating an improved version of itself (called the “expert”). +Canonically, the expert combines the student with test-time search, such as tree search +(Anthony et al., +2017 +) +or majority voting +(Zuo et al., +2025 +) +. +In contrast, SDPO leverages the student’s ability to learn from rich feedback provided in-context. +6.2 +Learning from Rich Feedback and through Retrospection +Beyond scalar outcome rewards, recent works have leveraged rich execution or verbal feedback to guide generation +(Gehring et al., +2025 +; Yuksekgonul et al., +2025 +) +. +A primary line of research focuses on translating verbal feedback into reward functions for RL. 
+This is often achieved by mapping feedback to discrete token-level rewards using an external frozen model +(Wang et al., +2026 +) +, or by employing strong external LLMs to explicitly construct state-wise reward functions +(Goyal et al., +2019 +; Xie et al., +2024 +; Urcelay et al., +2026 +) +. +Alternatively, feedback can be utilized without explicit reward modeling. +Several approaches focus on in-context improvement without integrating the process into the RL optimization loop +(Chen et al., +2021a +; Madaan et al., +2023 +; Shinn et al., +2023 +; Yao et al., +2024 +; Yuksekgonul et al., +2025 +; Lee et al., +2025 +) +. +Others manually curate preference datasets by pairing responses before and after feedback to train with direct preference optimization +(Stephan et al., +2024 +; Lee et al., +2024 +) +, though this requires additional generation and lacks the direct credit assignment of SDPO. +Various recent works bootstrap thinking traces from known answers, using these answers as rich feedback +(Zhou et al., +2026 +; Hatamizadeh et al., +2026 +; Zhang et al., +2025 +) +. +A central object in several recent works is a feedback-conditioned policy +π +θ +​ +( +y +∣ +x +, +f +) +\pi_{\theta}(y\mid x,f) +, which learns answers +y +y +that lead to feedback +f +f +(Liu et al., +2023 +; Zhang et al., +2023 +; Luo et al., +2025 +) +, typically through supervised objectives. +The idea behind these approaches is to deploy a policy conditioned on desirable (i.e., positive) feedback for deployment. +This approach is conceptually related to goal-conditioned RL +(Schaul et al., +2015 +; Liu et al., +2025a +) +, where one can learn from negative examples through goal relabeling +(Andrychowicz et al., +2017 +) +. +Feedback-conditioned policies view feedback as a goal, whereas RLRF views feedback as a state that can be used to determine whether the goal +x +x +is achieved. 
+Unlike SDPO, these methods do not use feedback for credit assignment in negative trajectories, but rather as a data transformation for goal relabeling. +6.3 +Distillation +Distillation is frequently employed as an alternative to supervised fine-tuning (SFT) when a strong teacher model is available. +This approach transfers capabilities by training a student to mimic the output distribution or intermediate representations of the teacher +(Hinton et al., +2015 +; Romero et al., +2015 +; Kim & Rush, +2016 +; Sanh et al., +2019 +; Xie et al., +2020 +) +. +Distillation is typically performed on fixed off-policy datasets. +To address the distribution shift between training and inference, recent works explore on-policy distillation, where the student learns from feedback of an external teacher on its own generations +(Agarwal et al., +2024 +; Gu et al., +2024 +; Yang et al., +2025 +; Lu & Thinking Machines Lab, +2025 +) +. +This mitigates the train-test mismatch, which relates closely to earlier work on online imitation learning +(Ross et al., +2011 +) +. +6.4 +Self-Distillation +The concept of self-distillation was first proposed by +Snell et al. ( +2022 +) +in a setting akin to supervised learning, introducing the idea of sampling from a model provided with extra context and training the same model to mimic these predictions without that context. +This mechanism has proven effective for compressing behavior +(Bai et al., +2022 +; Choi et al., +2022 +) +and factual information +(Eyuboglu et al., +2026 +; Kujanpää et al., +2025 +) +into model weights. +Beyond compressing a fixed context into model weights, recent works have used self-distillation to learn from environment feedback +(Scheurer et al., +2023 +; Dou et al., +2024 +; Mitra & Ulukus, +2025 +) +. +These approaches use an +off-policy +self-distillation objective, which substantially underperforms SDPO’s on-policy learning. 
+Off-policy self-distillation trains the student on generations from the teacher, whereas SDPO trains the student to avoid mistakes in its own generations. +In concurrent work, +Chen et al. ( +2025c +) +apply on-policy self-distillation to grid world settings where feedback is a scalar reward, and a reflection stage in the self-teacher diagnoses possible mistakes, showing improved credit assignment compared to learning value networks for advantage estimation. +7 +Conclusion, Limitations, and Future Work +We introduced +Reinforcement Learning with Rich Feedback +(RLRF), a paradigm where environments provide tokenized feedback beyond scalar rewards, and argued that this removes a key information bottleneck of RLVR. +We then proposed +Self-Distillation Policy Optimization +(SDPO), which uses the current policy as a feedback-conditioned +self-teacher +and distills its corrected log-probabilities into the student. +This leverages the model’s ability to learn from context for dense credit assignment. +We further demonstrated that SDPO can be implemented as a minimal, drop-in modification to standard RLVR pipelines. +Empirically, SDPO demonstrates superior sample efficiency and wall-clock convergence compared to GRPO on reasoning tasks, even when training in standard RLVR environments without rich feedback. +SDPO’s gains grow with model scale, suggesting that the capacity for self-correction scales with the model’s in-context learning capabilities. +Moreover, we show that performing SDPO at test-time on individual hard binary-reward tasks accelerates the discovery of solutions compared to strong baselines. +SDPO enables learning from rich feedback in a way that parallels human cognition: utilizing precise outcomes rather than just binary rewards. +By allowing the model to determine retrospectively how it should have acted, we demonstrate that language models can convert diverse tokenized feedback into effective self-supervision. +Limitations. 
+Our findings show that SDPO’s performance depends on a model’s in-context learning ability, suggesting that SDPO is primarily applicable for RL-training stronger base models, while it can underperform GRPO on weaker models. +Moreover, performance depends on the quality of the environment feedback. If the environment provides uninformative or misleading feedback, a model may not be able to learn from it through SDPO. +Finally, SDPO adds a small computational overhead compared to GRPO for computing the log-probs of the retrospective model. +While often negligible, this may be a larger overhead for smaller models with shorter generation lengths, where generation time is comparatively small. +Future Work. +Our work highlights several exciting directions for future research: +• +Long-horizon and agentic settings. +RLRF is particularly appealing when trajectories are long or expose information about intermediate states. +Evaluating SDPO in agentic environments is a natural next step. +• +Training dynamics at scale. +Beyond our evaluation on LiveCodeBench, it would be particularly interesting to scale SDPO to large multi-task RL training runs and further study its scaling properties with frontier base models. +• +Beyond verifiable rewards. +While we focused on verifiable code generation, many tasks provide textual feedback without a ground-truth verifier. +Investigating whether SDPO’s retrospection mechanism can improve alignment in open-ended text generation or continuous-reward tasks remains an open empirical question. +• +Behavioral differences in reasoning. +We observed that SDPO induces qualitatively different reasoning patterns than GRPO, notably avoiding the latter’s tendency toward verbosity and superficial reasoning. +Future work should systematically study how individual aspects, such as the reprompt template, influence behavior. 
+Author Contributions +Jonas Hübotter +conceived of the project in summer 2025 and has been working on it full-time since then, leading the team. +Jonas proposed the conceptual framework of self-distillation for credit assignment with input from Lejs, implemented the algorithm with help from others, led the quantitative experiments on LCBv6, and led the writing of the paper. +Frederike Lübeck +led the design of the code environment, led the design and evaluation of the TTT setting in +Section +˜ +5 +with input from Jonas, contributed to the project direction in discussions, and contributed significantly to the writing of the paper. +Lejs Behric +noted the dense credit assignment of knowledge distillation with strong teacher models in discussions with Jonas, inspiring the idea of self-distillation. Further, Lejs led the evaluation of different teacher templates, co-led the development of a tool for qualitative analysis of runs with Marco and Daniel, helped implement parts of the algorithm, and contributed to the project direction in discussions. +Anton Baumann +joined in December 2025 and led the evaluation of SDPO without rich feedback in +Section +˜ +3 +with input from Jonas, and contributed to the writing of the paper. +Marco Bagatella and Daniel Marta +co-led the development of a tool for qualitative analysis of runs with Lejs, contributed to the training infrastructure, and contributed to the project direction in discussions. +Ido Hakimi +significantly contributed to the initial codebase and experimental setup, contributed early algorithmic ideas, and contributed to the project direction in discussions. +Idan Shenfeld, Thomas Kleine Buening, Carlos Guestrin, and Andreas Krause +supported this project, with Idan and Carlos joining in December 2025. They made significant contributions to the project direction in discussions and gave valuable advice on our presentation. 
Thomas and Idan, in particular, significantly contributed to the development of core algorithmic ideas and design of experiments. Thomas further evaluated checkpoints on holdout benchmarks. Carlos suggested the qualitative analysis of reasoning traces in +Figure +˜ +7 +and the presentation of TTT results in +Section +˜ +5 +. Andreas pointed out valuable connections to existing work in RL which shaped the direction of the project. +Acknowledgments +We would like to thank Akira Yoshiyama, Yassir Akram, Parnian Kassraie, Jonathan Thomm, Roman Vorushin, Afra Amini, Imanol Schlag, Yu Sun, and Moritz Hardt for helpful discussions. +We thank Eduard Durech for helpful conversations regarding the scaling of RL fine-tuning and for his technical guidance on distributed infrastructure and long-context optimization. +Furthermore, we would like to thank Leander Diaz-Bone for supporting dataset generation. +This project was supported through the Swiss AI compute grant a156 and, in part, compute grant infra01. +JH was supported by the Swiss National Science Foundation under NCCR Automation, grant agreement 51NF40 180545. +FL and MB were supported by the ETH-MPI Center for Learning Systems. +TKB and IH were supported by an ETH AI Center Postdoctoral Fellowship. +DM was supported by the Knut and Alice Wallenberg Foundation. +References +Agarwal et al. (2024) +Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos Garea, Matthieu Geist, and Olivier Bachem. +On-policy distillation of language models: Learning from self-generated mistakes. +In +ICLR +, 2024. +Akyürek et al. (2025) +Ekin Akyürek, Mehul Damani, Adam Zweiger, Linlu Qiu, Han Guo, Jyothish Pari, Yoon Kim, and Jacob Andreas. +The surprising effectiveness of test-time training for few-shot learning. +In +ICML +, 2025. +Andrychowicz et al. (2017) +Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, and Wojciech Zaremba. 
+Hindsight experience replay. +In +NeurIPS +, 2017. +Anthony et al. (2017) +Thomas Anthony, Zheng Tian, and David Barber. +Thinking fast and slow with deep learning and tree search. +In +NeurIPS +, 2017. +Bai et al. (2022) +Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, et al. +Constitutional ai: Harmlessness from ai feedback. +arXiv preprint arXiv:2212.08073 +, 2022. +Behrouz et al. (2025) +Ali Behrouz, Peilin Zhong, and Vahab Mirrokni. +Titans: Learning to memorize at test time. +In +NeurIPS +, 2025. +Berner et al. (2019) +Christopher Berner, Greg Brockman, Brooke Chan, Vicki Cheung, Przemysław Debiak, Christy Dennison, David Farhi, Quirin Fischer, Shariq Hashme, Chris Hesse, et al. +Dota 2 with large scale deep reinforcement learning. +arXiv preprint arXiv:1912.06680 +, 2019. +Boyd & Vandenberghe (2004) +Stephen Boyd and Lieven Vandenberghe. +Convex optimization. +Cambridge University Press, 2004. +Brown et al. (2020) +Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. +Language models are few-shot learners. +arXiv preprint arXiv:2005.14165 +, 2020. +Bubeck (2015) +Sébastien Bubeck. +Convex optimization: Algorithms and complexity. +Foundations and Trends® in Machine Learning +, 2015. +Cao et al. (2025) +Meng Cao, Shuyuan Zhang, Xiao-Wen Chang, and Doina Precup. +Scar: Shapley credit assignment for more efficient rlhf. +arXiv preprint arXiv:2505.20417 +, 2025. +Chan et al. (2024) +Alex J Chan, Hao Sun, Samuel Holt, and Mihaela Van Der Schaar. +Dense reward for free in reinforcement learning from human feedback. +In +ICML +, 2024. +Chen et al. (2025a) +Aili Chen, Aonian Li, Bangwei Gong, Binyang Jiang, Bo Fei, Bo Yang, Boji Shan, Changqing Yu, Chao Wang, Cheng Zhu, et al. +Minimax-m1: Scaling test-time compute efficiently with lightning attention. 
+arXiv preprint arXiv:2506.13585 +, 2025a. +Chen et al. (2022) +Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen. +Codet: Code generation with generated tests. +In +ICLR +, 2022. +Chen et al. (2025b) +Howard Chen, Noam Razin, Karthik Narasimhan, and Danqi Chen. +Retaining by doing: The role of on-policy data in mitigating forgetting. +arXiv preprint arXiv:2510.18874 +, 2025b. +Chen et al. (2021a) +Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Misha Laskin, Pieter Abbeel, Aravind Srinivas, and Igor Mordatch. +Decision transformer: Reinforcement learning via sequence modeling. +In +NeurIPS +, 2021a. +Chen et al. (2021b) +Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde De Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. +Evaluating large language models trained on code. +arXiv preprint arXiv:2107.03374 +, 2021b. +Chen et al. (2025c) +Wentse Chen, Jiayu Chen, Fahim Tajwar, Hao Zhu, Xintong Duan, Ruslan Salakhutdinov, and Jeff Schneider. +Retrospective in-context learning for temporal credit assignment with large language models. +In +NeurIPS +, 2025c. +Chiang et al. (2024) +Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Banghua Zhu, Hao Zhang, Michael Jordan, Joseph E Gonzalez, et al. +Chatbot arena: An open platform for evaluating llms by human preference. +In +ICML +, 2024. +Choi et al. (2022) +Eunbi Choi, Yongrae Jo, Joel Jang, and Minjoon Seo. +Prompt injection: Parameterization of fixed inputs. +arXiv preprint arXiv:2206.11349 +, 2022. +Cui et al. (2025) +Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, et al. +Process reinforcement through implicit rewards. +arXiv preprint arXiv:2502.01456 +, 2025. +Diaz-Bone et al. (2025) +Leander Diaz-Bone, Marco Bagatella, Jonas Hübotter, and Andreas Krause. 
+Discover: Automated curricula for sparse-reward reinforcement learning. +In +NeurIPS +, 2025. +Dolan & Moré (2002) +Elizabeth D Dolan and Jorge J Moré. +Benchmarking optimization software with performance profiles. +Mathematical programming +, 91(2), 2002. +Dou et al. (2024) +Zi-Yi Dou, Cheng-Fu Yang, Xueqing Wu, Kai-Wei Chang, and Nanyun Peng. +Re-rest: Reflection-reinforced self-training for language agents. +In +EMNLP +, 2024. +El-Kishky et al. (2025) +Ahmed El-Kishky, Alexander Wei, Andre Saraiva, Borys Minaiev, Daniel Selsam, David Dohan, Francis Song, Hunter Lightman, Ignasi Clavera, Jakub Pachocki, et al. +Competitive programming with large reasoning models. +arXiv preprint arXiv:2502.06807 +, 2025. +Eyuboglu et al. (2026) +Sabri Eyuboglu, Ryan Ehrlich, Simran Arora, Neel Guha, Dylan Zinsley, Emily Liu, Will Tennien, Atri Rudra, James Zou, Azalia Mirhoseini, et al. +Cartridges: Lightweight and general-purpose long context representations via self-study. +In +ICLR +, 2026. +Feng et al. (2024) +Kehua Feng, Keyan Ding, Weijie Wang, Xiang Zhuang, Zeyuan Wang, Ming Qin, Yu Zhao, Jianhua Yao, Qiang Zhang, and Huajun Chen. +Sciknoweval: Evaluating multi-level scientific knowledge of large language models. +arXiv preprint arXiv:2406.09098 +, 2024. +Gehring et al. (2025) +Jonas Gehring, Kunhao Zheng, Jade Copet, Vegard Mella, Quentin Carbonneaux, Taco Cohen, and Gabriel Synnaeve. +Rlef: Grounding code llms in execution feedback with reinforcement learning. +In +ICML +, 2025. +Goyal et al. (2019) +Prasoon Goyal, Scott Niekum, and Raymond J Mooney. +Using natural language for reward shaping in reinforcement learning. +In +IJCAI +, 2019. +Gu et al. (2024) +Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. +Minillm: Knowledge distillation of large language models. +2024. +Guha et al. (2026) +Etash Guha, Ryan Marten, Sedrick Keh, Negin Raoof, Georgios Smyrnis, Hritik Bansal, Marianna Nezhurina, Jean Mercat, Trung Vu, Zayne Sprague, et al. 
+Openthoughts: Data recipes for reasoning models. +In +ICLR +, 2026. +Guo et al. (2025) +Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. +Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. +arXiv preprint arXiv:2501.12948 +, 2025. +Haarnoja et al. (2018) +Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine. +Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor. +In +ICML +, 2018. +Hardt & Sun (2024) +Moritz Hardt and Yu Sun. +Test-time training on nearest neighbors for large language models. +In +ICLR +, 2024. +Hatamizadeh et al. (2026) +Ali Hatamizadeh, Syeda Nahida Akter, Shrimai Prabhumoye, Jan Kautz, Mostofa Patwary, Mohammad Shoeybi, Bryan Catanzaro, and Yejin Choi. +Rlp: Reinforcement as a pretraining objective. +In +ICLR +, 2026. +Hinton et al. (2015) +Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. +Distilling the knowledge in a neural network. +arXiv preprint arXiv:1503.02531 +, 2015. +Huang et al. (2026) +Chengsong Huang, Wenhao Yu, Xiaoyang Wang, Hongming Zhang, Zongxia Li, Ruosen Li, Jiaxin Huang, Haitao Mi, and Dong Yu. +R-zero: Self-evolving reasoning llm from zero data. +In +ICLR +, 2026. +Hübotter et al. (2026) +Jonas Hübotter, Patrik Wolf, Alexander Shevchenko, Dennis Jüni, Andreas Krause, and Gil Kur. +Specialization after generalization: Towards understanding test-time training in foundation models. +In +ICLR +, 2026. +Hübotter et al. (2025a) +Jonas Hübotter, Sascha Bongni, Ido Hakimi, and Andreas Krause. +Efficiently learning at test-time: Active fine-tuning of llms. +In +ICLR +, 2025a. +Hübotter et al. (2025b) +Jonas Hübotter, Leander Diaz-Bone, Ido Hakimi, Andreas Krause, and Moritz Hardt. +Learning on the job: Test-time curricula for targeted reinforcement learning. +arXiv preprint arXiv:2510.04786 +, 2025b. +Jaech et al. 
(2024) +Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, et al. +Openai o1 system card. +arXiv preprint arXiv:2412.16720 +, 2024. +Jain et al. (2025) +Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica. +Livecodebench: Holistic and contamination free evaluation of large language models for code. +In +ICLR +, 2025. +Kaelbling et al. (1998) +Leslie Pack Kaelbling, Michael L Littman, and Anthony R Cassandra. +Planning and acting in partially observable stochastic domains. +Artificial intelligence +, 101(1-2), 1998. +Kazemnejad et al. (2025) +Amirhossein Kazemnejad, Milad Aghajohari, Eva Portelance, Alessandro Sordoni, Siva Reddy, Aaron Courville, and Nicolas Le Roux. +Vineppo: Refining credit assignment in rl training of llms. +In +ICML +, 2025. +Khatri et al. (2026) +Devvrit Khatri, Lovish Madaan, Rishabh Tiwari, Rachit Bansal, Sai Surya Duvvuri, Manzil Zaheer, Inderjit S Dhillon, David Brandfonbrener, and Rishabh Agarwal. +The art of scaling reinforcement learning compute for llms. +In +ICLR +, 2026. +Kim & Rush (2016) +Yoon Kim and Alexander M Rush. +Sequence-level knowledge distillation. +In +EMNLP +, 2016. +Kimi et al. (2025) +Kimi, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al. +Kimi k1.5: Scaling reinforcement learning with llms. +arXiv preprint arXiv:2501.12599 +, 2025. +Kujanpää et al. (2025) +Kalle Kujanpää, Pekka Marttinen, Harri Valpola, and Alexander Ilin. +Efficient knowledge injection in LLMs via self-distillation. +TMLR +, 2025. +Kwon et al. (2023) +Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. Gonzalez, Hao Zhang, and Ion Stoica. +Efficient memory management for large language model serving with pagedattention. +In +SOSP +, 2023. +Lambert et al. 
(2025) +Nathan Lambert, Jacob Morrison, Valentina Pyatkin, Shengyi Huang, Hamish Ivison, Faeze Brahman, Lester James V Miranda, Alisa Liu, Nouha Dziri, Shane Lyu, et al. +Tulu 3: Pushing frontiers in open language model post-training. +In +COLM +, 2025. +Le et al. (2022) +Hung Le, Yue Wang, Akhilesh Deepak Gotmare, Silvio Savarese, and Steven Chu Hong Hoi. +Coderl: Mastering code generation through pretrained models and deep reinforcement learning. +In +NeurIPS +, 2022. +Lee et al. (2024) +Kyungjae Lee, Dasol Hwang, Sunghyun Park, Youngsoo Jang, and Moontae Lee. +Reinforcement learning from reflective feedback (rlrf): Aligning and improving llms via fine-grained self-reflection. +arXiv preprint arXiv:2403.14238 +, 2024. +Lee et al. (2025) +Yoonho Lee, Joseph Boen, and Chelsea Finn. +Feedback descent: Open-ended text optimization via pairwise comparison. +arXiv preprint arXiv:2511.07919 +, 2025. +Levine (2018) +Sergey Levine. +Reinforcement learning and control as probabilistic inference: Tutorial and review. +arXiv preprint arXiv:1805.00909 +, 2018. +Li et al. (2025) +Tianle Li, Wei-Lin Chiang, Evan Frick, Lisa Dunlap, Tianhao Wu, Banghua Zhu, Joseph E Gonzalez, and Ion Stoica. +From crowdsourced data to high-quality benchmarks: Arena-hard and benchbuilder pipeline. +In +ICML +, 2025. +Lightman et al. (2023) +Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. +Let’s verify step by step. +In +ICLR +, 2023. +Liu et al. (2025a) +Grace Liu, Michael Tang, and Benjamin Eysenbach. +A single goal is all you need: Skills and exploration emerge from contrastive rl without rewards, demonstrations, or subgoals. +In +ICLR +, 2025a. +Liu et al. (2023) +Hao Liu, Carmelo Sferrazza, and Pieter Abbeel. +Chain of hindsight aligns language models with feedback. +arXiv preprint arXiv:2302.02676 +, 2023. +Liu et al. 
(2025b) +Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin. +Understanding r1-zero-like training: A critical perspective. +In +COLM +, 2025b. +Lu & Thinking Machines Lab (2025) +Kevin Lu and Thinking Machines Lab. +On-policy distillation. +Thinking Machines Lab: Connectionism +, 2025. +URL +https://thinkingmachines.ai/blog/on-policy-distillation +. +Luo et al. (2025) +Renjie Luo, Zichen Liu, Xiangyan Liu, Chao Du, Min Lin, Wenhu Chen, Wei Lu, and Tianyu Pang. +Language models can learn from verbal feedback without scalar rewards. +arXiv preprint arXiv:2509.22638 +, 2025. +Madaan et al. (2023) +Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al. +Self-refine: Iterative refinement with self-feedback. +In +NeurIPS +, 2023. +Mitra & Ulukus (2025) +Purbesh Mitra and Sennur Ulukus. +Semantic soft bootstrapping: Long context reasoning in llms without reinforcement learning. +arXiv preprint arXiv:2512.05105 +, 2025. +Mnih et al. (2015) +Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Andrei A. Rusu, Joel Veness, Marc G. Bellemare, Alex Graves, Martin Riedmiller, Andreas K. Fidjeland, Georg Ostrovski, et al. +Human-level control through deep reinforcement learning. +Nature +, 518(7540), 2015. +Muennighoff et al. (2025) +Niklas Muennighoff, Zitong Yang, Weijia Shi, Xiang Lisa Li, Li Fei-Fei, Hannaneh Hajishirzi, Luke Zettlemoyer, Percy Liang, Emmanuel Candès, and Tatsunori B Hashimoto. +s1: Simple test-time scaling. +In +EMNLP +, 2025. +Ng et al. (2000) +Andrew Y Ng, Stuart Russell, et al. +Algorithms for inverse reinforcement learning. +In +ICML +, 2000. +Novikov et al. (2025) +Alexander Novikov, Ngân Vũ, Marvin Eisenberger, Emilien Dupont, Po-Sen Huang, Adam Zsolt Wagner, Sergey Shirobokov, Borislav Kozlovskii, Francisco JR Ruiz, Abbas Mehrabian, et al. +Alphaevolve: A coding agent for scientific and algorithmic discovery. 
+arXiv preprint arXiv:2506.13131 +, 2025. +Olmo et al. (2025) +Team Olmo, Allyson Ettinger, Amanda Bertsch, Bailey Kuehl, David Graham, David Heineman, Dirk Groeneveld, Faeze Brahman, Finbarr Timbers, Hamish Ivison, et al. +Olmo 3. +arXiv preprint arXiv:2512.13961 +, 2025. +Peng et al. (2019) +Xue Bin Peng, Aviral Kumar, Grace Zhang, and Sergey Levine. +Advantage-weighted regression: Simple and scalable off-policy reinforcement learning. +arXiv preprint arXiv:1910.00177 +, 2019. +Qwen et al. (2024) +Qwen, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, et al. +Qwen2.5 technical report. +arXiv preprint arXiv:2412.15115 +, 2024. +Rafailov et al. (2023) +Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. +Direct preference optimization: Your language model is secretly a reward model. +In +NeurIPS +, 2023. +Romero et al. (2015) +Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. +Fitnets: Hints for thin deep nets. +In +ICLR +, 2015. +Ross et al. (2011) +Stéphane Ross, Geoffrey Gordon, and Drew Bagnell. +A reduction of imitation learning and structured prediction to no-regret online learning. +In +AISTATS +, 2011. +Samadi et al. (2025) +Mehrzad Samadi, Aleksander Ficek, Sean Narenthiran, Siddhartha Jain, Wasi Uddin Ahmad, Somshubra Majumdar, Vahid Noroozi, and Boris Ginsburg. +Scaling test-time compute to achieve ioi gold medal with open-weight models. +arXiv preprint arXiv:2510.14232 +, 2025. +Sanh et al. (2019) +Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. +Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. +arXiv preprint arXiv:1910.01108 +, 2019. +Schaul et al. (2015) +Tom Schaul, Daniel Horgan, Karol Gregor, and David Silver. +Universal value function approximators. +In +ICML +, 2015. +Scheurer et al. 
(2023) +Jérémy Scheurer, Jon Ander Campos, Tomasz Korbak, Jun Shern Chan, Angelica Chen, Kyunghyun Cho, and Ethan Perez. +Training language models with language feedback at scale. +arXiv preprint arXiv:2303.16755 +, 2023. +Schulman et al. (2015) +John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp Moritz. +Trust region policy optimization. +In +ICML +, 2015. +Schulman et al. (2016) +John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel. +High-dimensional continuous control using generalized advantage estimation. +In +ICLR +, 2016. +Schulman et al. (2017) +John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. +Proximal policy optimization algorithms. +arXiv preprint arXiv:1707.06347 +, 2017. +Setlur et al. (2025) +Amrith Setlur, Chirag Nagpal, Adam Fisch, Xinyang Geng, Jacob Eisenstein, Rishabh Agarwal, Alekh Agarwal, Jonathan Berant, and Aviral Kumar. +Rewarding progress: Scaling automated process verifiers for llm reasoning. +In +ICLR +, 2025. +Shao et al. (2024) +Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Yang Wu, et al. +Deepseekmath: Pushing the limits of mathematical reasoning in open language models. +arXiv preprint arXiv:2402.03300 +, 2024. +Shenfeld et al. (2026) +Idan Shenfeld, Jyothish Pari, and Pulkit Agrawal. +Rl’s razor: Why online reinforcement learning forgets less. +In +ICLR +, 2026. +Sheng et al. (2025) +Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu. +Hybridflow: A flexible and efficient rlhf framework. +In +EuroSys +, 2025. +Shinn et al. (2023) +Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao. +Reflexion: Language agents with verbal reinforcement learning. +In +NeurIPS +, 2023. +Silver et al. (2016) +David Silver, Aja Huang, Chris J. 
Maddison, Arthur Guez, Laurent Sifre, George van den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al. +Mastering the game of go with deep neural networks and tree search. +Nature +, 529(7587), 2016. +Silver et al. (2017) +David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore Graepel, et al. +Mastering chess and shogi by self-play with a general reinforcement learning algorithm. +arXiv preprint arXiv:1712.01815 +, 2017. +Snell et al. (2022) +Charlie Snell, Dan Klein, and Ruiqi Zhong. +Learning by distilling context. +arXiv preprint arXiv:2209.15189 +, 2022. +Stephan et al. (2024) +Moritz Stephan, Alexander Khazatsky, Eric Mitchell, Annie S Chen, Sheryl Hsu, Archit Sharma, and Chelsea Finn. +Rlvf: Learning from verbal feedback without overgeneralization. +In +ICML +, 2024. +Sun et al. (2020) +Yu Sun, Xiaolong Wang, Zhuang Liu, John Miller, Alexei Efros, and Moritz Hardt. +Test-time training with self-supervision for generalization under distribution shifts. +In +ICML +, 2020. +Sun et al. (2025) +Yu Sun, Xinhao Li, Karan Dalal, Jiarui Xu, Arjun Vikram, Genghan Zhang, Yann Dubois, Xinlei Chen, Xiaolong Wang, Sanmi Koyejo, et al. +Learning to (learn at test time): Rnns with expressive hidden states. +In +ICML +, 2025. +Sutton & Barto (1998) +Richard S Sutton and Andrew G Barto. +Reinforcement learning: An introduction +. +MIT press, 1998. +Tandon et al. (2025) +Arnuv Tandon, Karan Dalal, Xinhao Li, Daniel Koceja, Marcel Rød, Sam Buchanan, Xiaolong Wang, Jure Leskovec, Sanmi Koyejo, Tatsunori Hashimoto, et al. +End-to-end test-time training for long context. +arXiv preprint arXiv:2512.23675 +, 2025. +Tang et al. (2023) +Qiaoyu Tang, Ziliang Deng, Hongyu Lin, Xianpei Han, Qiao Liang, Boxi Cao, and Le Sun. +Toolalpaca: Generalized tool learning for language models with 3000 simulated cases. +arXiv preprint arXiv:2306.05301 +, 2023. 
+Urcelay et al. (2026) +Belen Martin Urcelay, Andreas Krause, and Giorgia Ramponi. +From words to rewards: Leveraging natural language for reinforcement learning. +In +TMLR +, 2026. +Vaswani et al. (2017) +Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. +Attention is all you need. +In +NeurIPS +, 2017. +Wainwright & Jordan (2008) +Martin J Wainwright and Michael I Jordan. +Graphical models, exponential families, and variational inference. +Foundations and Trends® in Machine Learning +, 2008. +Wang et al. (2026) +Hanyang Wang, Lu Wang, Chaoyun Zhang, Tianjun Mao, Si Qin, Qingwei Lin, Saravan Rajmohan, and Dongmei Zhang. +Text2grad: Reinforcement learning from natural language feedback. +In +ICLR +, 2026. +Wang et al. (2024a) +Peiyi Wang, Lei Li, Zhihong Shao, RX Xu, Damai Dai, Yifei Li, Deli Chen, Yu Wu, and Zhifang Sui. +Math-shepherd: Verify and reinforce llms step-by-step without human annotations. +In +ACL +, 2024a. +Wang et al. (2025) +Shenzhi Wang, Le Yu, Chang Gao, Chujie Zheng, Shixuan Liu, Rui Lu, Kai Dang, Xionghui Chen, Jianxin Yang, Zhenru Zhang, et al. +Beyond the 80/20 rule: High-entropy minority tokens drive effective reinforcement learning for llm reasoning. +In +NeurIPS +, 2025. +Wang et al. (2024b) +Yubo Wang, Xueguang Ma, Ge Zhang, Yuansheng Ni, Abhranil Chandra, Shiguang Guo, Weiming Ren, Aaran Arulraj, Xuan He, Ziyan Jiang, et al. +Mmlu-pro: A more robust and challenging multi-task language understanding benchmark. +In +NeurIPS +, 2024b. +Wei et al. (2022) +Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. +Chain-of-thought prompting elicits reasoning in large language models. +In +NeurIPS +, 2022. +Williams (1992) +Ronald J Williams. +Simple statistical gradient-following algorithms for connectionist reinforcement learning. +Machine learning +, 8(3), 1992. +Xie et al. 
(2020) +Qizhe Xie, Minh-Thang Luong, Eduard Hovy, and Quoc V Le. +Self-training with noisy student improves imagenet classification. +In +CVPR +, 2020. +Xie et al. (2024) +Tianbao Xie, Siheng Zhao, Chen Henry Wu, Yitao Liu, Qian Luo, Victor Zhong, Yanchao Yang, and Tao Yu. +Text2reward: Reward shaping with language models for reinforcement learning. +In +ICLR +, 2024. +Yang et al. (2025) +An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. +Qwen3 technical report. +arXiv preprint arXiv:2505.09388 +, 2025. +Yao et al. (2025) +Feng Yao, Liyuan Liu, Dinghuai Zhang, Chengyu Dong, Jingbo Shang, and Jianfeng Gao. +Your efficient rl framework secretly brings you off-policy rl training, 2025. +URL +https://fengyao.notion.site/off-policy-rl +. +Yao et al. (2024) +Weiran Yao, Shelby Heinecke, Juan Carlos Niebles, Zhiwei Liu, Yihao Feng, Le Xue, Rithesh Murthy, Zeyuan Chen, Jianguo Zhang, Devansh Arpit, et al. +Retroformer: Retrospective large language agents with policy gradient optimization. +In +ICLR +, 2024. +Yu et al. (2025) +Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Weinan Dai, Tiantian Fan, Gaohong Liu, Lingjun Liu, et al. +Dapo: An open-source llm reinforcement learning system at scale. +In +NeurIPS +, 2025. +Yue et al. (2025) +Yang Yue, Zhiqi Chen, Rui Lu, Andrew Zhao, Zhaokai Wang, Shiji Song, and Gao Huang. +Does reinforcement learning really incentivize reasoning capacity in llms beyond the base model? +In +NeurIPS +, 2025. +Yuksekgonul et al. (2025) +Mert Yuksekgonul, Federico Bianchi, Joseph Boen, Sheng Liu, Pan Lu, Zhi Huang, Carlos Guestrin, and James Zou. +Optimizing generative ai by backpropagating language model feedback. +Nature +, 639:609–616, 2025. +Yuksekgonul et al. (2026) +Mert Yuksekgonul, Daniel Koceja, Xinhao Li, Federico Bianchi, Jed McCaleb, Xiaolong Wang, Jan Kautz, Yejin Choi, James Zou, Carlos Guestrin, et al. 
+Learning to discover at test time. +arXiv preprint arXiv:2601.16175 +, 2026. +Zelikman et al. (2022) +Eric Zelikman, Yuhuai Wu, Jesse Mu, and Noah D Goodman. +Star: Bootstrapping reasoning with reasoning. +In +NeurIPS +, 2022. +Zhang et al. (2025) +Kai Zhang, Xiangchao Chen, Bo Liu, Tianci Xue, Zeyi Liao, Zhihan Liu, Xiyao Wang, Yuting Ning, Zhaorun Chen, Xiaohan Fu, et al. +Agent learning via early experience. +arXiv preprint arXiv:2510.08558 +, 2025. +Zhang et al. (2023) +Tianjun Zhang, Fangchen Liu, Justin Wong, Pieter Abbeel, and Joseph E Gonzalez. +The wisdom of hindsight makes language models better instruction followers. +In +ICML +, 2023. +Zhao et al. (2025) +Andrew Zhao, Yiran Wu, Yang Yue, Tong Wu, Quentin Xu, Matthieu Lin, Shenzhi Wang, Qingyun Wu, Zilong Zheng, and Gao Huang. +Absolute zero: Reinforced self-play reasoning with zero data. +In +NeurIPS +, 2025. +Zheng et al. (2025a) +Chujie Zheng, Shixuan Liu, Mingze Li, Xiong-Hui Chen, Bowen Yu, Chang Gao, Kai Dang, Yuqiong Liu, Rui Men, An Yang, et al. +Group sequence policy optimization. +arXiv preprint arXiv:2507.18071 +, 2025a. +Zheng et al. (2025b) +Tianyu Zheng, Tianshun Xing, Qingshui Gu, Taoran Liang, Xingwei Qu, Xin Zhou, Yizhi Li, Zhoufutu Wen, Chenghua Lin, Wenhao Huang, et al. +First return, entropy-eliciting explore. +arXiv preprint arXiv:2507.07017 +, 2025b. +Zhou et al. (2023) +Jeffrey Zhou, Tianjian Lu, Swaroop Mishra, Siddhartha Brahma, Sujoy Basu, Yi Luan, Denny Zhou, and Le Hou. +Instruction-following evaluation for large language models. +arXiv preprint arXiv:2311.07911 +, 2023. +Zhou et al. (2026) +Xiangxin Zhou, Zichen Liu, Anya Sims, Haonan Wang, Tianyu Pang, Chongxuan Li, Liang Wang, Min Lin, and Chao Du. +Reinforcing general reasoning without verifiers. +In +ICLR +, 2026. +Ziebart et al. (2008) +Brian D Ziebart, Andrew L Maas, J Andrew Bagnell, Anind K Dey, et al. +Maximum entropy inverse reinforcement learning. +In +AAAI +, 2008. +Zuo et al. 
(2025) +Yuxin Zuo, Kaiyan Zhang, Shang Qu, Li Sheng, Xuekai Zhu, Biqing Qi, Youbang Sun, Ganqu Cui, Ning Ding, and Bowen Zhou. +Ttrl: Test-time reinforcement learning. +In +NeurIPS +, 2025. +Contents +section.1table.caption.4section.2subsection.2.1subsection.2.2subsection.2.3section.3subsection.3.1subsection.3.1subsection.3.2subsection.3.3section.4section.4subsection.4.1subsection.4.2subsection.4.3subsection.4.4subsection.4.4subsection.4.5subsection.4.6subsection.4.6subsection.4.6subsection.4.6section.5subsection.5.1subsection.5.2figure.caption.20figure.caption.20section.6subsection.6.1subsection.6.2subsection.6.3subsection.6.4section.7section.7section.7appendix.Asubsection.A.1subsection.A.2subsection.A.3appendix.Bsubsection.B.1subsection.B.2subsection.B.3appendix.Cappendix.Cappendix.Cappendix.Cappendix.Csubsection.C.1subsection.C.1equation.22appendix.Dsubsection.D.1subsection.D.2subsubsection.D.2.1subsubsection.D.2.2subsubsection.D.2.3subsection.D.3appendix.Esubsection.E.1subsection.E.2subsubsection.E.2.1subsection.E.3appendix.Fsubsection.F.1subsection.F.2subsection.F.3subsection.F.4 +Appendix A +Implementation of SDPO +The following pseudocode in Figure 14 outlines the implementation of SDPO: +⬇ +def compute_sdpo_loss(batch, teacher_context, loss_mask): +    """Computes probabilities of response y under the self-teacher and the per-logit SDPO loss.""" +    # Compute model probabilities for response y +    logprobs_student = compute_log_prob(batch)  # (T, V) +    probs_student = logprobs_student.exp()  # (T, V) + +    # Compute self-teacher probabilities for response y +    teacher_batch = reprompt(batch, teacher_context) +    logprobs_teacher = compute_log_prob(teacher_batch).detach()  # (T, V)
+ +    # Compute SDPO loss: per-token divergence +    per_token_loss = divergence(logprobs_student, logprobs_teacher)  # (T,) +    return agg_loss(per_token_loss, loss_mask, loss_agg_mode="token-mean") +Figure 14: The pseudo-code of SDPO within a standard RL training pipeline. Omitted here is the filtering to top-$K$ logprobs for student and teacher (including a tail term) as described in Section A.2. Further, we omit here any importance sampling weights to correct for off-policy data. `reprompt` modifies the batch to incorporate teacher context (i.e., rich feedback). `divergence` implements any per-token divergence such as reverse-KL, forward-KL, or Jensen-Shannon. +In the following, we provide further details on: +• Teacher regularization (Section A.1) +• Approximating logit-distillation with the top-$K$ logits for saving GPU memory (Section A.2) +• Generalizing PPO-style policy gradient algorithms to logit-level advantages (Section A.3) +To disambiguate the notation of the self-teacher, we use $q_{\theta}(\cdot\mid x,f):=\pi_{\theta}(\cdot\mid\mathrm{reprompt}(x,f))$ in the following. Here, `reprompt` denotes the reprompt template of the self-teacher. +A.1 Regularized teacher +In contrast to standard distillation, the teacher in SDPO changes throughout training. This bootstrapping enables the teacher to improve, but it may also lead to training instability. +To stabilize training, we seek to prevent the teacher $q$ from quickly diverging from the initial teacher $q_{\theta_{\mathrm{ref}}}$. We can achieve this by placing an explicit trust-region constraint on $q$ (Schulman et al., 2015; Peng et al., 2019), that is: +$$\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,\qquad\epsilon>0.$$ 
+(5) +This trust-region can be implemented in two ways: +1. Explicit trust-region: We can define the teacher as the policy closest to $q_{\theta}$ while satisfying the trust-region constraint. This teacher can be expressed as +$$q(y_{t}\mid x,f,y_{<t})\propto\exp\left((1-\alpha)\log q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})+\alpha\log q_{\theta}(y_{t}\mid x,f,y_{<t})\right),$$ +[… text lost in extraction …] +$$\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,\qquad\epsilon>0.$$ +(12) +In the following, we derive a teacher $q$ which satisfies the trust-region constraint while staying close to the target $q_{\theta}$. The following optimization problem characterizes such a $q$ (Peng et al., 2019): +$$\begin{split}\operatorname*{arg\,max}_{q\in\Delta}\ &\sum_{t}\sum_{y_{t}}q(y_{t}\mid x,f,y_{<t})\log\frac{q_{\theta}(y_{t}\mid x,f,y_{<t})}{q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})}\\ \text{s.t.}\ &\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,\end{split}$$ +[… text lost in extraction …] +$$\operatorname*{arg\,max}_{\theta}\ \mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\Big[\sum_{t}r(y_{t}\mid x,y_{<t})\ \cdots$$ [… remainder of this equation lost in extraction; it ends with a positivity condition "… > 0" …] +(22) +where $\pi_{\theta}(y\mid x)=\prod_{t=1}^{T}\pi_{\theta}(y_{t}\mid x,y_{<t})$ +[… text lost in extraction; the end of the multiple-choice system prompt follows …] +... <answer> ... </answer> +For the answer, only output the letter corresponding to the correct option (A, B, C, or D), and nothing else. Do not restate the answer text. For example, if the answer is "A", just output: <answer>A</answer> +Listing 1: System prompt: Multiple Choice Questions +⬇ +{question} Please reason step by step. 
+Listing 2: User prompt: Multiple Choice Questions +⬇ +You are a helpful function-calling AI assistant. You are provided with function signatures within <functions> XML tags. You may call one or more functions to assist with the user query. Output any function calls within <function_calls> XML tags. Do not make assumptions about what values to plug into functions. +Listing 3: System prompt: Tool use +⬇ +Your task is to answer the user’s question using available tools. +You have access to the following tools: +Name: Axolotl +Description: Collection of axolotl pictures and facts +Documentation: +getRandomAxolotlImage: Retrieve a random axolotl image with information on the image source. +Parameters: {} +Output: Successful response. +- Format: application/json +- Structure: Object{url, source, description} +searchAxolotlImages: Search for axolotl images based on specific criteria such as color, gender, and size. +Parameters: {"color": "string. One of: [wild, leucistic, albino]. The color of the axolotl (e.g., ’wild’, ’leucistic’, ’albino’, etc.).", "gender": "string. One of: [male, female]. The gender of the axolotl (’male’, ’female’).", "size": "string. One of: [small, medium, large]. The size of the axolotl (’small’, ’medium’, ’large’).", "page": "integer. The page number for pagination purposes."} +Output: Successful response. 
+- +Format +: +application +/ +json +- +Structure +: +Object +{ +results +: +Array +[ +Object +{ +url +, +source +, +description +}], +pagination +: +Object +{ +current_page +, +total_pages +, +total_results +}} +getAxolotlFacts +: +Retrieve +interesting +facts +about +axolotls +such +as +their +habits +, +habitats +, +and +physical +characteristics +. +Parameters +: +{" +category +": +" +string +. +One +of +: +[ +habits +, +habitat +, +physical +characteristics +]. +The +category +of +facts +to +retrieve +( +e +. +g +., +’ +habits +’, +’ +habitat +’, +’ +physical +characteristics +’).", +" +limit +": +" +integer +. +The +maximum +number +of +facts +to +return +."} +Output +: +Successful +response +. +- +Format +: +application +/ +json +- +Structure +: +Array +[ +Object +{ +fact +, +source +}] +Use +the +following +format +: +Thought +: +you +should +always +think +about +what +to +do +Action +: +the +action +to +take +, +should +be +one +of +the +tool +names +. +Action +Input +: +the +input +to +the +action +, +must +be +in +JSON +format +. +All +of +the +action +input +must +be +realistic +and +from +the +user +. +Begin +! +Question +: +Hey +, +can +you +show +me +a +random +picture +of +an +axolotl +? +Listing 4: +Example user prompt: Tool use +Appendix F +Qualitative Examples +F.1 +Visualization of Advantages +Figure +˜ +21 +compares the advantages of SDPO and GRPO in a representative example. +Figure 21: +Visualization of advantages in SDPO and GRPO with Olmo3-7B-Instruct in a batch from the Chemistry task of +Section +˜ +3 +. Each row corresponds to the beginning of a response. The color indicates the advantage value at that token position, with positive advantages shown in blue and negative advantages shown in red. +F.2 +Examples +Below, we show an example from training SDPO on LCBv6 using Qwen3-8B. +⬇ +[Prompt] +You +are +a +coding +expert. 
+You +will +be +given +a +coding +problem, +and +you +need +to +write +a +correct +Python +program +that +matches +the +specification +and +passes +all +tests. +The +time +limit +is +1 +second. +You +may +start +by +outlining +your +thought +process. +In +the +end, +please +provide +the +complete +code +in +a +code +block +enclosed +with +‘‘‘ +‘‘‘. +You +are +given +a +binary +string +s +of +length +n, +where: +’1’ +represents +an +active +section. +’0’ +represents +an +inactive +section. +You +can +perform +at +most +one +trade +to +maximize +the +number +of +active +sections +in +s. +In +a +trade, +you: +Convert +a +contiguous +block +of +’1’s +that +is +surrounded +by +’0’s +to +all +’0’s. +Afterward, +convert +a +contiguous +block +of +’0’s +that +is +surrounded +by +’1’s +to +all +’1’s. +Return +the +maximum +number +of +active +sections +in +s +after +making +the +optimal +trade. +Note: +Treat +s +as +if +it +is +augmented +with +a +’1’ +at +both +ends, +forming +t += +’1’ ++ +s ++ +’1’. +The +augmented +’1’s +do +not +contribute +to +the +final +count. +Example +1: +Input: +s += +"01" +Output: +1 +Explanation: +Because +there +is +no +block +of +’1’s +surrounded +by +’0’s, +no +valid +trade +is +possible. +The +maximum +number +of +active +sections +is +1. +Example +2: +Input: +s += +"0100" +Output: +4 +Explanation: +String +"0100" +-> +Augmented +to +"101001". +Choose +"0100", +convert +"101001" +-> +"100001" +-> +"111111". +The +final +string +without +augmentation +is +"1111". +The +maximum +number +of +active +sections +is +4. +Example +3: +Input: +s += +"1000100" +Output: +7 +Explanation: +String +"1000100" +-> +Augmented +to +"110001001". +Choose +"000100", +convert +"110001001" +-> +"110000001" +-> +"111111111". +The +final +string +without +augmentation +is +"1111111". +The +maximum +number +of +active +sections +is +7. +Example +4: +Input: +s += +"01010" +Output: +4 +Explanation: +String +"01010" +-> +Augmented +to +"1010101". 
+Choose +"010", +convert +"1010101" +-> +"1000101" +-> +"1111101". +The +final +string +without +augmentation +is +"11110". +The +maximum +number +of +active +sections +is +4. +Constraints: +1 +<= +n +== +s.length +<= +10^5 +s[i] +is +either +’0’ +or +’1’ +Your +solution +should +have +the +following +signature: +‘‘‘python +def +maxActiveSectionsAfterTrade(s: +str) +-> +int: +‘‘‘ +[Response] + + +To +solve +this +problem, +we +need +to +understand +the +structure +of +the +string +and +the +trade +operation. +Here’s +the +plan: +### +Key +Insights: +1. +We +are +given +a +binary +string +‘s‘, +and +we +are +allowed +**at +most +one +trade**. +2. +A +**trade** +consists +of +two +steps: +- +Convert +a +contiguous +block +of +‘’1’‘ +that +is +**surrounded +by +‘’0’‘ +into +‘’0’‘ +(i.e., +flip +it). +- +Then +convert +a +contiguous +block +of +‘’0’‘ +that +is +**surrounded +by +‘’1’‘ +into +‘’1’‘ +(i.e., +flip +it). +3. +The +actual +count +of +active +sections +is +the +number +of +‘’1’‘ +characters +**after** +the +trade. +The +augmented +‘’1’‘ +at +the +beginning +and +end +do +not +count +in +the +final +result. +### +Strategy: +- +We +simulate +the +trade +by +considering +each +possible +way +to +flip +a +block +of +‘’1’‘ +(bounded +by +‘’0’‘) +and +then +a +block +of +‘’0’‘ +(bounded +by +‘’1’‘). +- +For +each +possible +valid +block, +simulate +the +trade +and +compute +the +resulting +number +of +‘’1’‘ +characters. +- +We +only +need +to +consider +the +original +string +‘s‘, +not +the +augmented +one +(since +we +can +simulate +the +trade +directly +on +‘s‘ +by +checking +boundaries). +We’ll +use +a +sliding +window +technique +to +find +contiguous +blocks +of +‘’1’‘ +and +‘’0’‘ +and +simulate +the +trade. +--- +### +Implementation +Steps: +1. +Simulate +the +trade +by +checking +for +a +block +of +‘’1’‘ +surrounded +by +‘’0’‘ +or +at +the +edges +(we +simulate +the +augmented +‘1‘ +at +both +sides). +2. 
+For +each +such +block, +simulate +converting +it +to +‘’0’‘, +then +convert +a +‘’0’‘ +block +(again, +surrounded +by +‘’1’‘) +to +‘’1’‘. +3. +Count +the +number +of +‘’1’‘ +in +the +final +string +and +track +the +maximum. +--- +### +Python +Code: +‘‘‘python +def +maxActiveSectionsAfterTrade(s: +str) +-> +int: +if +not +s: +return +0 +max_ones += +0 +n += +len(s) +# +We +are +simulating +the +augmented +’1’s +on +both +ends, +so +we +treat +the +boundaries +as +’1’s +# +So +’0’ +at +index +0 +or +n-1 +is +considered +surrounded +by +’1’s +# +Step +1: +Find +all +candidate +positions +for +the +first +part +of +the +trade +# +(convert +a +block +of +1s +surrounded +by +0s +into +0s) +for +i +in +range(n): +if +s[i] +== +’1’: +start += +i +while +i +< +n +and +s[i] +== +’1’: +i ++= +1 +end += +i +- +1 +# +Check +if +this +block +is +surrounded +by +’0’s +left_boundary += +(start +== +0 +or +s[start +- +1] +== +’0’) +right_boundary += +(end +== +n +- +1 +or +s[end ++ +1] +== +’0’) +if +left_boundary +and +right_boundary: +# +This +is +a +candidate +block +of +’1’s +to +flip +# +Now, +look +for +a +block +of +’0’s +surrounded +by +’1’s +in +the +remaining +string +# +Replace +the +’1’ +block +with +’0’s +# +Now +find +a +’0’ +block +surrounded +by +’1’s +# +We +can +simulate +the +remaining +string +temp += +list(s) +for +j +in +range(start, +end ++ +1): +temp[j] += +’0’ +# +Now +find +a +contiguous +block +of +’0’s +surrounded +by +’1’s +max_trade_ones += +0 +for +k +in +range(n): +if +temp[k] +== +’0’: +start0 += +k +while +k +< +n +and +temp[k] +== +’0’: +k ++= +1 +end0 += +k +- +1 +# +Check +if +this +block +is +surrounded +by +’1’s +left_0 += +(start0 +== +0 +or +temp[start0 +- +1] +== +’1’) +right_0 += +(end0 +== +n +- +1 +or +temp[end0 ++ +1] +== +’1’) +if +left_0 +and +right_0: +# +Flip +this +block +of +’0’s +to +’1’s +for +j +in +range(start0, +end0 ++ +1): +temp[j] += +’1’ +# +Count +the +number +of +’1’s +count += +sum(temp) +max_ones += +max(max_ones, 
+count) +break +# +Only +take +the +first +valid +block +for +this +trade +# +After +trying +this +’1’ +block, +restore +the +original +string +# +No +need, +we +just +used +a +copy +return +max_ones +‘‘‘ +--- +### +Optimization +Note: +The +above +approach +is +**not +optimal** +for +very +large +strings. +A +more +efficient +method +would +involve +precomputing +the +positions +of +blocks +of +‘’1’‘ +and +‘’0’‘, +and +checking +valid +trades +between +these +blocks +in +linear +time. +But +with +the +constraints +given +(up +to +10^5), +we +need +to +ensure +linear +or +near-linear +performance. +The +current +code +is +correct +but +**not +efficient** +for +large +inputs. +Would +you +like +me +to +provide +a +more +optimized +version +that +runs +in +**O(n)** +time +and +uses +only +one +pass +through +the +string? +[Feedback] +b’Runtime +Error\nTypeError: +unsupported +operand +type(s) +for ++: +\’int\’ +and +\’str\’\nLine +48 +in +maxActiveSectionsAfterTrade +(Solution.py)\n\nLast +Executed +Input\n"11000"’ +F.3 +Environment Feedback +We show three examples of feedback in our coding environment, inspired by LeetCode. +⬇ +Test +Case +3: +Wrong +Answer +Input +[[3,6,1],[4,21,4]] +Output +6 +Expected +22.87500 +Test +Case +6: +Wrong +Answer +Input +[[12,25,3],[3,14,2]] +Output +14 +Expected +25.83333 +Listing 5: +Example of feedback “Wrong Answer” from our code environment in case of a wrong answer, +inspired by LeetCode +⬇ +Runtime +Error +MemoryError +: +Line +91 +in +< +module +> +( +Solution +. +py +) +Line +25 +in +solve +( +Solution +. +py +) +Last +Executed +Input +10 +633 +9312 +1314 +8548 +8857 +1062 +6410 +3289 +8594 +1263 +8549 +733 +3858 +5973 +... +(3 +more +lines +) +Listing 6: +Example of feedback “Memory Error” from our code environment in case of a wrong answer, +inspired by LeetCode +⬇ +Runtime +Error +IndexError +: +list +index +out +of +range +Line +28 +in +sortMatrix +( +Solution +. 
+py +) +Last +Executed +Input +[[-1,-1,-1,-1,-1,-1,-1,-1,... +Listing 7: +Example of feedback “Index Error” from our code environment in case of a wrong answer, +inspired by LeetCode +F.4 +Illustrative Example +Figure +22 +shows an illustrative example of the dense credit assignment in SDPO. +Figure 22: +Dense credit assignment through self-teaching in SDPO. +The answer is generated by the model (Qwen3-8B) before seeing the feedback. Then, we re-evaluate the log-probs of the original attempt with the self-teacher after seeing the feedback. We show the per-token +$\log\left(\mathbb{P}(\text{self-teacher})/\mathbb{P}(\text{student})\right)$ +, with red indicating negative values ( +self-teacher disagrees +), blue indicating positive values ( +teacher reinforces +), and white indicating values around zero. Using binary rewards, GRPO would assign the same, negative advantage to all tokens in the sequence. In contrast, SDPO turns the feedback into dense credit assignment across the sequence. The first row shows the tokens of the generated response. The 3 other rows show the top-$k$ logits of the self-teacher that are used during self-distillation, suggesting alternative tokens. Notably, in this example, the self-teacher identifies the error through retrospection without an explicit solution. The credit assignment on the generated sequence, and the alternative top-$k$ logits correctly show that replacing +set +with +dict +maintains the order of elements. Further, in the seventh shown position, the model also identifies an alternative solution path which starts with the +seen +token, instead of directly returning the output. The activation is sparse, identifying where mistakes happen and adjusting to the students’ response distribution for specifically these few tokens.
\ No newline at end of file diff --git a/research/notes/search-arxiv-e-print-repository.md b/research/notes/search-arxiv-e-print-repository.md new file mode 100644 index 0000000000000000000000000000000000000000..39db4f43946ed342c9cf478c265e93fddd34a95e --- /dev/null +++ b/research/notes/search-arxiv-e-print-repository.md @@ -0,0 +1,145 @@ +--- +title: Search | arXiv e-print repository +id: search-arxiv-e-print-repository +tags: +- deepread +created: '2026-06-10T00:36:40.541465Z' +source: https://arxiv.org/search/?searchtype=all&query=DiLoCo+scaling+laws&start=0 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:36:40.541307Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +Search | arXiv e-print repository +Showing 1–1 of 1 results for all: +DiLoCo scaling laws +Search v0.5.6 released 2020-02-24 +Search term or terms +Field +All fields +Title +Author(s) +Abstract +Comments +Journal reference +ACM classification +MSC classification +Report number +arXiv identifier +DOI +ORCID +License (URI) +arXiv author ID +Help pages +Full text +Search +Show abstracts +Hide abstracts +Advanced Search +All fields +Title +Author(s) +Abstract +Comments +Journal reference +ACM classification +MSC classification +Report number +arXiv identifier +DOI +ORCID +License (URI) +arXiv author ID +Help pages +Full text +Show abstracts +Hide abstracts +25 +50 +100 +200 +results per page +. 
+Sort results by +Announcement date (newest first) +Announcement date (oldest first) +Submission date (newest first) +Submission date (oldest first) +Relevance +Go +arXiv:2503.09799 +[ +pdf +, +other +] +cs.LG +cs.CL +cs.DC +Communication-Efficient Language Model Training +Scales +Reliably and Robustly: +Scaling +Laws +for +DiLoCo +Authors: +Zachary Charles +, +Gabriel Teston +, +Lucio Dery +, +Keith Rush +, +Nova Fallen +, +Zachary Garrett +, +Arthur Szlam +, +Arthur Douillard +Abstract +: +As we +scale +to more massive machine learning models, the frequent synchronization demands inherent in data-parallel approaches create significant slowdowns, posing a critical challenge to further… +▽ More +As we +scale +to more massive machine learning models, the frequent synchronization demands inherent in data-parallel approaches create significant slowdowns, posing a critical challenge to further +scaling +. Recent work develops an approach ( +DiLoCo +) that relaxes synchronization demands without compromising model quality. However, these works do not carefully analyze how +DiLoCo's +behavior changes with model size. In this work, we study the +scaling +law +behavior of +DiLoCo +when training LLMs under a fixed compute budget. We focus on how algorithmic factors, including number of model replicas, hyperparameters, and token budget affect training in ways that can be accurately predicted via +scaling +laws +. We find that +DiLoCo +scales +both predictably and robustly with model size. When well-tuned, +DiLoCo +scales +better than data-parallel training with model size, and can outperform data-parallel training even at small model sizes. Our results showcase a more general set of benefits of +DiLoCo +than previously documented, including increased optimal batch sizes, improved downstream generalization with +scale +, and improved evaluation loss for a fixed token budget. +△ Less +Submitted +12 March, 2025; +originally announced +March 2025. 
+Search v0.5.6 released 2020-02-24 \ No newline at end of file diff --git a/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models-2.md b/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models-2.md new file mode 100644 index 0000000000000000000000000000000000000000..271ca988c6b17e28c1365a005b52b7eef3b0dbd4 --- /dev/null +++ b/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models-2.md @@ -0,0 +1,3185 @@ +--- +title: 'Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models' +id: self-distilled-reasoner-on-policy-self-distillation-for-large-language-models-2 +tags: +- deepread +created: '2026-06-10T00:23:45.929035Z' +source: https://arxiv.org/html/2601.18734v3 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:23:45.928794Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models +Title: +Content selection saved. Describe the issue below: +Description: +License: CC BY 4.0 +arXiv:2601.18734v3 [cs.LG] 20 Mar 2026 +Self-Distilled Reasoner: +On-Policy Self-Distillation for Large Language Models +Siyan Zhao +† +Zhihui Xie +Mengchen Liu +Jing Huang +Guan Pang +Feiyu Chen +∗,‡ +Aditya Grover +∗ +Abstract +Knowledge distillation improves large language model (LLM) reasoning by compressing the knowledge of a teacher LLM to train smaller LLMs. On-policy distillation advances this approach by having the student sample its own trajectories while a teacher LLM provides dense token-level supervision, addressing the distribution mismatch between training and inference in off-policy distillation methods. However, on-policy distillation typically requires a separate, often larger, teacher LLM and does not explicitly leverage ground-truth solutions available in reasoning datasets. 
Inspired by the intuition that a sufficiently capable LLM can rationalize external privileged reasoning traces and teach its weaker self, we introduce +On-Policy Self-Distillation +(OPSD), a learning algorithm where a single LLM acts as both teacher and student with different contexts. The teacher policy conditions on privileged information (e.g., verified reasoning traces) while the student policy sees only the question; training minimizes the per-token divergence between these distributions over the student’s own rollouts. We demonstrate the efficacy of our method on multiple mathematical reasoning benchmarks, achieving superior token efficiency compared to reinforcement learning methods and better performance over off-policy distillation methods. Code repo: +https://github.com/siyan-zhao/OPSD +. +Machine Learning, ICML +1 +Introduction +Figure 1 +: +Overview of On-Policy Self-Distillation (OPSD): +Given a reasoning dataset +𝒮 += +{ +( +x +i +, +y +i +⋆ +) +} +i += +1 +N +\mathcal{S}=\{(x_{i},y_{i}^{\star})\}_{i=1}^{N} +, we instantiate two policies from the same LLM: a +student policy +p +S +( +⋅ +∣ +x +) +p_{S}(\cdot\mid x) +and a +teacher policy +p +T +( +⋅ +∣ +x +, +y +⋆ +) +p_{T}(\cdot\mid x,y^{\star}) +. The student generates an on-policy response +y +^ +∼ +p +S +( +⋅ +∣ +x +) +\hat{y}\sim p_{S}(\cdot\mid x) +. Both policies then evaluate this trajectory to produce next-token distributions +p +S +( +⋅ +∣ +x +, +y +^ +< +n +) +p_{S}(\cdot\mid x,\hat{y}_{ Create new bug generation techinques. +> Expand to non-Python repositories. +> Train better SWE-agents! +Read our +documentation +or +code +for more. +Authors +John Yang +, +Kilian Lieret +, +Carlos E. 
Jimenez +, +Alexander Wettig +, +Kabir Khandpur +, +Yanzhe Zhang +, +Binyuan Hui +, +Ofir Press +, +Ludwig Schmidt +, +Diyi Yang +Affiliations +Stanford University +, +Stanford SALT Lab +, +Princeton Language & Intelligence +, +Alibaba Qwen +Citation +@misc{yang2025swesmith, + title={SWE-smith: Scaling Data for Software Engineering Agents}, + author={John Yang and Kilian Lieret and Carlos E. Jimenez and Alexander Wettig and Kabir Khandpur and Yanzhe Zhang and Binyuan Hui and Ofir Press and Ludwig Schmidt and Diyi Yang}, + year={2025}, + eprint={2504.21798}, + archivePrefix={arXiv}, + primaryClass={cs.SE}, + url={https://arxiv.org/abs/2504.21798}, +} +© 2025 +Our projects +SWE-bench +SWE-agent +Mini-SWE-Agent +SWE-ReX +sb-cli \ No newline at end of file diff --git a/research/notes/training-software-engineering-agents-and-verifiers-with-swe-gym.md b/research/notes/training-software-engineering-agents-and-verifiers-with-swe-gym.md new file mode 100644 index 0000000000000000000000000000000000000000..6107569530f3d9ffca6fac05b3ee25a0a751cc29 --- /dev/null +++ b/research/notes/training-software-engineering-agents-and-verifiers-with-swe-gym.md @@ -0,0 +1,2776 @@ +--- +title: Training Software Engineering Agents and Verifiers with SWE-Gym +id: training-software-engineering-agents-and-verifiers-with-swe-gym +tags: +- deepread +created: '2026-06-10T00:23:36.811308Z' +source: https://arxiv.org/html/2412.21139 +source_domain: arxiv.org +fetched_at: '2026-06-10T00:23:36.810162Z' +fetch_provider: builtin +status: draft +type: note +tier: institutional +content_type: paper +deprecated: false +--- + +Training Software Engineering Agents and Verifiers with SWE-Gym +Training Software Engineering Agents and Verifiers with SWE-Gym +Jiayi Pan +Xingyao Wang +Graham Neubig +Navdeep Jaitly +Heng Ji +Alane Suhr +Yizhe Zhang +Abstract +We present SWE-Gym, the first environment for training software engineering (SWE) agents. 
+SWE-Gym contains 2,438 real-world task instances, each comprising a Python codebase with an executable runtime environment, unit tests, and a task specified in natural language. +We use SWE-Gym to train language-model-based SWE agents, and achieve up to 19% absolute gains in resolution rate on the popular SWE-Bench Verified and Lite test sets. +We also experiment with inference-time scaling through verifiers trained on agent trajectories sampled from SWE-Gym. +When combined with our fine-tuned SWE agents, we achieve 32.0% and 26.0% on SWE-Bench Verified and Lite, respectively, reflecting a new state-of-the-art for open-weight SWE agents. To facilitate further research, we publicly release SWE-Gym, +models, and agent trajectories. +Machine Learning, ICML +\addauthor +gnred +\NewDocumentCommand +\heng +mO +Heng +[#1] +1 +Introduction +Figure 1 +: +SWE-Gym enables scalable improvements for software engineering agents. +Top +: Scaling the amount of training data shows consistent performance improvements as we obtain more training trajectories, with no signs of saturation at 491 trajectories. +We use temperature +$t=0$ +for evaluation. +Bottom +: For inference time scaling, we generate a number of candidate trajectories per task and select the best using a verifier trained on SWE-Gym. This approach demonstrates roughly log-linear gains with the number of sampled solutions. +$t=0$ +(excluded from regression) is used as the first hypothesis to be consistent with the top figure; later rollouts use +$t=0.5$ +. +Language models (LMs) have remarkable promise in automating software engineering (SWE) tasks, as most clearly measured by recent progress on benchmarks like SWE-Bench +(Jimenez et al., +2024 +) +and Commit0 +(Zhao et al., +2024 +) +.
+While LM-based SWE agents have shown significant performance gains through improving agent-computer interfaces +(Yang et al., +2024 +) +and prompting strategies +(Wang et al., +2024c +) +, advances in SWE agents have been limited by a reliance on proprietary models, with limited research to improve the underlying LM itself. +Unlike other domains where supervised fine-tuning and reinforcement learning have significantly improved LM capabilities, such as chat +(Ouyang et al., +2022 +) +, math reasoning +(Shao et al., +2024 +; Yuan et al., +2024 +) +, and web navigation +(Pan et al., +2024 +) +, software engineering currently lacks suitable training environments, and creating environments is uniquely challenging. +Real-world software engineering requires interaction with an executable runtime that has been prepared with the appropriate software dependencies and reproducible test suites, among other requirements. +These challenges are reflected in the existing resources (Tab. +1 +). +For example, the SWE-Bench +(Jimenez et al., +2024 +) +training split contains only solutions (git patches that solve the task), missing the step-by-step actions taken by the developer to create each solution, and executable environments and reward signals. R2E +(Jain et al., +2024 +) +uses synthetic tasks that are very far from real-world problems, while datasets such as APPS +(Hendrycks et al., +2021a +) +focus only on isolated tasks rather than realistic repository-level coding problems. +Table 1 +: +SWE-Gym is the first publicly available training environment combining real-world SWE tasks from GitHub issues with pre-installed dependencies and executable test verification. +Repository-level +: whether each task is situated in a sophisticated repository; +Executable Environment +: whether each task instance comes with an executable environment with all relevant dependencies pre-installed; +Real task +: whether task instruction is collected from human developers. 
+Dataset (split) +Repository-Level +Executable Environment +Real task +# Instances (total) +# Instances (train) +CodeFeedback +( +Zheng et al. +, +2024b +) +✗ +✗ +✓ +66,383 +66,383 +APPS +( +Hendrycks et al. +, +2021a +) +✗ +✓ +✓ +10,000 +5,000 +HumanEval +( +Chen et al. +, +2021 +) +✗ +✓ +✓ +164 +0 +MBPP +( +Tao et al. +, +2024 +) +✗ +✓ +✓ +974 +374 +R2E +( +Jain et al. +, +2024 +) +✓ +✓ +✗ +246 +0 +SWE-Bench (train) +( +Jimenez et al. +, +2024 +) +✓ +✗ +✓ +19,008 +19,008 +SWE-Gym Raw +✓ +✗ +✓ +64,689 +64,689 +SWE-Bench (test) +( +Jimenez et al. +, +2024 +) +✓ +✓ +✓ +2,294 +0 +SWE-Gym +✓ +✓ +✓ +2,438 +2,438 +To bridge this gap, we present SWE-Gym, the +first training environment +combining real-world software engineering tasks from GitHub issues with pre-installed dependencies and executable test verification. +SWE-Gym contains 2,438 Python tasks sourced from 11 popular open-source repositories (Tab. +2 +), providing useful environments for training LMs as agents and verifiers. +SWE-Gym supports training state-of-the-art open-weight SWE agents +. +Based on the OpenHands +(Wang et al., +2024c +) +agent scaffold for general-purpose software development (§ +2 +), we fine-tune a 32B Qwen-2.5 coder model +(Hui et al., +2024b +) +using only 491 agent-environment interaction trajectories sampled using SWE-Gym, and achieve substantial absolute improvements of +12.3% (to 15.3%) and +13.6% (to 20.6%) in resolution rate on SWE-Bench Lite and SWE-Bench Verified respectively (§ +4.2 +). +SWE-Gym is effective across agent scaffolds +. +In another agent scaffold based on a specialized workflow (MoatlessTools; +Örwall +2024 +; § +2 +), we experiment with self-improvement, where the LM interacts with SWE-Gym, receives reward from it, and learns to improve itself through rejection sampling fine-tuning. +This self-improvement boosts performance up to 19.7% on SWE-Bench Lite. +SWE-Gym supports training verifier models to enable inference-time scaling +. 
+We use test suites included in SWE-Gym to determine whether sampled agent trajectories are successful or not. +Given these samples, we train a verifier model +(i.e., an outcome-supervised reward model; Cobbe et al., +2021 +) +that estimates a trajectory’s probability of success. +This enables inference-time scaling, where we sample multiple agent trajectories and select the one with the highest estimated reward according to the verifier. This further improves the resolution rate to 32.0% (+11.4% absolute improvement) on SWE-Bench Verified (§ +5.1.1 +; Fig. +1 +bottom) and 26.0% on SWE-Bench Lite (§ +5.1.2 +), establishing a new state-of-the-art among systems with publicly accessible weights (Tab. +9 +). +Our baseline training and inference-time scaling methods on SWE-Gym yield continuously improved results with increasing compute +(Fig. +1 +). +In the training phase, performance scales with the number of sampled trajectories up to our current limit of 491 trajectories, suggesting that performance is currently limited by the compute budget for sampling rather than the number of tasks in SWE-Gym. +Similarly, using the agent and verifier trained by SWE-Gym, the bottom panel shows that using more compute during inference time steadily improves the performance. +2 +Related Work +Agents that solve GitHub issues. +We focus on software engineering agents designed to automatically resolve GitHub issues within the SWE-Bench framework +(Jimenez et al., +2024 +) +. These agents take a GitHub issue and its associated code repository as input and generate a valid code modification (i.e., a git diff patch) to address the issue. The correctness of these modifications is verified using a human-written test suite. 
+Existing agent designs are categorized by the extent of human priors integrated into their workflows: +Specialized workflows +(Xia et al., +2024 +; Örwall, +2024 +; Zhang et al., +2024b +; Chen et al., +2024 +) +involve human-defined stages (e.g., localization, code editing, patch re-ranking), where an LM is iteratively prompted for each stage to produce the final result. This approach reduces the task horizon and minimizes the need for long-term planning. However, specialized workflows require significant human engineering, may not generalize to novel issue types, and can fail if intermediate steps encounter problems. +In contrast, +general-purpose prompting +(Yang et al., +2024 +; Wang et al., +2024c +) +relies on the LM’s ability to plan over long horizons and generate actions based on a history of interactions without heavily pre-defined workflows. While more flexible, general approaches demand higher capabilities from the underlying LM and can be computationally expensive due to multiple interaction rounds. +The most successful existing SWE agents are built on proprietary language models like GPT-4 or Claude and utilize specialized workflows to overcome these models’ limitations. This contrasts with other sequential decision-making domains +(Silver et al., +2017 +; Akkaya et al., +2019 +) +, where learning-based approaches, such as reinforcement learning, drive success by enabling systems to learn from interactions and rewards to develop task competence. A key barrier in the SWE agent domain is the lack of appropriate training environments. Our experiments show that SWE-Gym can be used to build strong learning-based agents, accelerating research in this area. +Environments for training software agents. +There is no existing dataset suitable for training software engineering agents.
+SWE-Bench +(Jimenez et al., +2024 +) +is widely used for evaluating software engineering performance, but its training split lacks executable environments and success signals present in the evaluation split, making it useful only for imitation learning approaches. +HumanEval +(Chen et al., +2021 +) +is designed for standalone code generation tasks, akin to coding competitions. Therefore, it falls short of addressing the complex challenges inherent in real-world, repository-level software engineering tasks, which involve thousands of files, millions of lines of code, and tasks such as bug fixing, feature development, and system optimization. +Similarly, R2E +(Jain et al., +2024 +) +is a small evaluation dataset with 246 instances and, due to its synthetic nature, lacks the realism and complexity of real-world software engineering scenarios. +Our proposed SWE-Gym instead uses real-world GitHub issues as tasks and the associated executable unit tests for evaluation. This results in realistic and complex task formulations, aligning closely with real-world challenges. +Post-training: From chatbots and reasoners to agents. +Post-training, which fine-tunes pre-trained language models using supervised or reinforcement learning, significantly improves model performance across various domains. Techniques like RLHF +(Ouyang et al., +2022 +) +have become standard for adapting language models into chatbots, improving both performance and alignment +(Qwen Team, +2024 +) +. In math reasoning, datasets such as MATH +(Hendrycks et al., +2021b +) +and GSM-8K +(Cobbe et al., +2021 +) +facilitate the training and evaluation of policy and verifier models +(Cobbe et al., +2021 +; Wang et al., +2024a +) +. +Earlier works +(Wang et al., +2024b +; Chen et al., +2023 +; Zeng et al., +2023 +; Wu et al., +2024 +) +demonstrate that distilling agent trajectories from stronger models improves weaker models.
Recent studies +(Xi et al., +2024 +; Zhai et al., +2024 +; Bai et al., +2024 +) +explore self-improving methods, showing that reinforcement learning or rejection sampling fine-tuning guided by reward enables LMs to enhance themselves without more capable teachers. +However, post-training typically depends on expert demonstration data or training environments with reliable reward signals, which are largely absent in the software engineering domain. This has led to a reliance on prompting-based methods with proprietary language models. Our work addresses this gap with SWE-Gym, a training environment based on real-world software engineering tasks that uses expert-written tests as reward signals. Our experiments demonstrate that SWE-Gym can build strong SWE agents without prompt engineering. +Category +Metric +SWE-Gym +SWE-Gym Lite +Size +# Instances +2,438 (2,294) +230 (300) +# Repos +11 (12) +11 (12) +Issue Text +Length by Words +239.8 (195.1) +186.2 (175.9) +Codebase +# Non-test Files +971.2 (2944.2) +818.8 (2988.5) +# Non-test Lines +340675.0 (363728.4) +340626.2 (377562.4) +Gold Patch +# Lines edited +69.8 (32.8) +10.6 (10.1) +# Files edited +2.5 (1.7) +1.0 (1.0) +# Func. edited +4.1 (3.0) +1.4 (1.34) +Tests +# Fail to Pass +10.0 (9.0) +2.04 (3.5) +# Total +760.8 (132.5) +99.9 (85.2) +Table 2 +: +Statistics comparing SWE-Gym with the SWE-Bench test split (in parenthesis). +Except for size metrics, we report the average value across instances. +Figure 2 +: +Repository distribution of SWE-Gym instances. +3 +SWE-Gym Environment +SWE-Gym comprises 2,438 real-world software engineering tasks sourced from pull requests in 11 popular Python repositories, with pre-configured executable environments and expert-validated test cases, constructed in close alignment with SWE-Bench +(Jimenez et al., +2024 +) +. +These repositories are separate from those used in SWE-Bench to avoid contamination. 
+These tasks require SWE agents to develop test-passing solutions for real-world GitHub issues using provided codebases and executable environments. +Such agents must map from natural language descriptions of the issue, as well as the initial state of the repository, to a pull request represented as a git patch. +We also identify a subset of 230 tasks, SWE-Gym Lite, which contains generally easier and more self-contained tasks that are suitable for rapid prototyping, in alignment with SWE-Bench Lite +(Jimenez et al., +2024 +) +. +To support future research in SWE agent development and automatic dataset synthesis, we also release SWE-Gym Raw, a large set of Python GitHub issues without executable environments (64,689 instances spanning 358 Python repositories). +3.1 +Dataset Construction +Identify Repositories. +We first use SEART GitHub search +1 +1 +1 +https://seart-ghs.si.usi.ch/ +to filter a list of initial repositories. Unlike SWE-Bench, which focuses on the top 5k most downloaded PyPI libraries +(Jimenez et al., +2024 +) +, we select Python repositories that were created before July 1, 2022 and have more than 500 stars, with at least 300 lines of code, more than 500 pull requests (PRs) and 100 contributors. This results in 358 repositories. +Extracting Training Instances from Repositories. +We use SWE-Bench’s instance extraction script to convert these repositories into task instances, each corresponding to a GitHub issue including the natural language description of the issue, a snapshot of the repository in which the issue was created, and a set of unit tests. +Over the 358 repositories, we extract 64,689 task instances. We refer to this dataset as SWE-Gym Raw, which is over three times larger than the 19k instances gathered in previous work +(Jimenez et al., +2024 +) +and includes nearly ten times as many repositories. 
+While SWE-Gym Raw instances contain code, issue descriptions, and the solution, they do not contain executable environments or a guarantee that its unit tests are effective in evaluating the correctness of a solution. +Thus, we focus on 11 repositories with numerous instances and semi-manually create executable environments for them. +Version Training Instances. +Associating instances with their respective version numbers (e.g. +1.2.3 +) and setting up environments version-by-version makes the environment collection process more practical by avoiding redundant setup work. +We generalize SWE-Bench’s versioning script to support versioning via script execution, and semi-automatically collect versions for each instance based on information available in the repository (e.g., +pyproject.toml +, git tag, etc). +Setup Executable Environments and Verify Instances. +Creating executable environments with pre-installed dependencies is crucial for developing software engineering agents, as it mirrors deployment settings and allows for incremental unit test feedback. Configuring dependencies for specific codebase versions is challenging due to the lack of a universal Python package installation method and backward compatibility issues, especially for older GitHub issues. Ignoring these environments could introduce distribution bias, diminishing SWE-Gym’s utility. To address this, we manually configure dependencies for each task instance using relevant configuration files (e.g., +requirements.txt +), CI scripts, or documentation from the repository snapshot at the time of issue creation. +We then use SWE-Bench’s execution-based validation script to ensure that the gold patch (the human-submitted code diff) passes more unit tests than the original code. This process required approximately 200 human annotation hours +2 +2 +2 +Annotations are done by a subset of the authors. +and 10,000 CPU core hours. 
After validation and filtering out failed instances, we obtained 2,438 unit-test-validated instances from 11 repositories. For full reproducibility, we publicly release pre-built Docker images for each instance, totaling 6 TB. +3.2 +SWE-Gym Lite +Solving software engineering tasks is computationally intensive, costing usually $1 or more per task with frontier models +(Wang et al., +2024c +) +. +To improve research efficiency via faster agent evaluation, +Jimenez et al. ( +2024 +) +introduce SWE-Bench Lite, a canonical subset of 300 instances from SWE-Bench. +Following the SWE-Bench Lite filtering pipeline, +3 +3 +3 +For details on its construction process, see +https://www.swebench.com/lite.html +. +we delineate the +SWE-Gym Lite +split, comprising 230 instances. Similar to SWE-Bench Lite, this subset excludes tasks that require editing more than one file, tasks with poorly described problem statements, those with excessively complex ground-truth code diffs, and tests focused on error message validation. +3.3 +Dataset Statistics +Fig. +2 +illustrates that the task distribution across repositories exhibits a long-tail pattern. Notably, tasks associated with +pandas +comprise nearly one-third of the total, whereas tasks related to +bokeh +represent a mere one percent. +Our analysis suggests that tasks in SWE-Gym are on average harder than those included in SWE-Bench. +Tab. +2 +shows that SWE-Gym has statistics similar to SWE-Bench, with several key differences. +Codebases in SWE-Gym, on average, have relatively fewer files than SWE-Bench, but a similar number of total lines of code. +However, gold patches in SWE-Gym have significantly more lines and files edited when compared to SWE-Bench’s gold patches. +Additionally, we find models have consistently lower performance on SWE-Gym compared to SWE-Bench. +4 +4 +4 +§ +B.4 +contains details of these experiments. 
+Beyond models and scaffolds overfitting to SWE-Bench, the decreased performance on SWE-Gym may also be due to our inclusion of sophisticated repositories like +pandas +and +MONAI +. +4 +Training LMs as Agents with SWE-Gym +We experiment with training language model agents using SWE-Gym. +We use two agent scaffolds (OpenHands, +Wang et al. +2024c +, § +4.2 +; Moatless Tools, +Örwall +2024 +, § +4.3 +). +4.1 +Setting +Agent Scaffolds. +Recent LM-based SWE agents comprise a base language model, and a set of tools and prompts this base model has access to. +This set of tools and prompting strategies is referred to as an agent scaffold, and recent work has developed numerous scaffolds for different purposes (refer to § +2 +for examples). +We experiment with two types of agent scaffolds: one for general-purpose prompting (OpenHands CodeAct; +Wang et al. +2024c +) and one for specialized workflows (MoatlessTools; +Örwall +2024 +), which +allows us to +measure the efficacy of SWE-Gym across diverse deployment settings. +Policy Improvement Algorithm. +We use SWE-Gym to improve the underlying LM for a given SWE agent. +As a baseline, we employ a simple policy improvement algorithm: rejection sampling fine-tuning (a.k.a. filtered behavior cloning), where we fine-tune the base LM on +success +trajectories sampled from SWE-Gym. +Evaluation Metrics. +We use the standard SWE agent benchmarks SWE-Bench Lite and Verified +(Jimenez et al., +2024 +) +for evaluation. +We report (1) +resolution rate (%) +, the proportion of resolved task instances, and (2) +Empty Patch (%) +, the proportion of trajectories where none of the code in the repository is edited. +We use OpenHands remote runtime +(Neubig & Wang, +2024 +) +to parallelize evaluation (e.g., execute unit tests). +Technical Details. +For base LMs, we use +Qwen-2.5-Coder-Instruct +(Hui et al., +2024a +) +7B, 14B, and 32B. § +B.2 +contains training run details. 
+4.2 +Training General-Purpose Prompting Agents +Table 3 +: +Model performance (fine-tuned on 491 SWE-Gym-sampled trajectories) on SWE-Bench +(Jimenez et al., +2024 +) +using OpenHands +(Wang et al., +2024c +) +as agent scaffold. We use +Qwen-2.5-Coder-Instruct +as the base model. +Model +Empty Patch (%, +↓ +↓ +\downarrow +↓ +) +Stuck in Loop (%, +↓ +↓ +\downarrow +↓ +) +Avg. Turn(s) +Resolve Rate (%, +↑ +↑ +\uparrow +↑ +) +Size +zero-shot +fine-tuned +Δ +Δ +\Delta +roman_Δ +zero-shot +fine-tuned +Δ +Δ +\Delta +roman_Δ +zero-shot +fine-tuned +Δ +Δ +\Delta +roman_Δ +zero-shot +fine-tuned +Δ +Δ +\Delta +roman_Δ +SWE-Bench Lite (300 instances) +7B +40.3 +29.7 +-10.7 +47.0 +31.0 +-16.0 +20.3 +22.2 ++1.9 +1.0 ( +± +plus-or-minus +\pm +± +1.0) +10.0 ( +± +plus-or-minus +\pm +± +2.4) ++9.0 +14B +49.7 +18.1 +-31.6 +31.7 +27.1 +-4.6 +23.2 +21.4 +-1.8 +2.7 ( +± +plus-or-minus +\pm +± +1.9) +12.7 ( +± +plus-or-minus +\pm +± +2.3) ++10.0 +32B +27.0 +18.1 +-8.9 +16.7 +18.1 ++1.5 +15.5 +29.3 ++13.9 +3.0 +( +± +plus-or-minus +\pm +± +1.4) +15.3 +( +± +plus-or-minus +\pm +± +2.5) ++12.3 +SWE-Bench Verified (500 instances) +7B +45.8 +33.8 +-12.0 +39.6 +21.0 +-18.6 +21.9 +35.3 ++13.4 +1.8 ( +± +plus-or-minus +\pm +± +1.1) +10.6 ( +± +plus-or-minus +\pm +± +2.1) ++8.8 +14B +44.9 +14.5 +-30.4 +32.1 +21.3 +-10.7 +25.5 +30.1 ++4.6 +4.0 ( +± +plus-or-minus +\pm +± +1.6) +16.4 ( +± +plus-or-minus +\pm +± +2.0) ++12.4 +32B +9.5 +13.8 ++4.3 +29.4 +23.8 +-5.6 +24.6 +31.6 ++7.0 +7.0 +( +± +plus-or-minus +\pm +± +1.3) +20.6 +( +± +plus-or-minus +\pm +± +2.1) ++13.6 +In this section, we use OpenHands (version CodeActAgent 2.1, +Wang et al. +2024b +, +c +) as our agent scaffold, which is based on general-purpose ReAct-style prompting +(Yao et al., +2023 +) +. +In contrast to specialized-workflows-agents (§ +2 +), it relies on the LM to generate actions and do planning. It equips the base LM with a bash terminal and a file editor. We disable the browser feature of OpenHands in this work. 
+Trajectory Collection. +By rejection sampling, we obtain 491 successful trajectories from SWE-Gym. +These trajectories are sampled from +gpt-4o-2024-08-06 +and +claude-3-5-sonnet-20241022 +with different temperature settings. +Each successful trajectory, on average, has roughly 19 turns and approximately 19,000 tokens. +5 +5 +5 +Tab. +8 +contains more statistics of the sampled trajectories. +Although SWE-Gym offers many more tasks and allows repeated sampling, our 491 trajectories are limited primarily by computational budget. +Training on SWE-Gym trajectories turns LMs into effective agents to fix issues. +As shown in Tab. +3 +, the pre-trained base model achieves resolution rates of 3.0% and 7.0% on SWE-Bench Lite and Verified, respectively. After fine-tuning on 491 trajectories +6 +6 +6 +We use a sampling temperature of 0 unless otherwise specified. +, it improves by up to 12.3% (3.0% → 15.3%) and 13.6% (7.0% → 20.6%). +Training reduces stuck-in-loop behavior. +For agent tasks, open-weight LMs +often get stuck in loops, where the model perpetually generates the same action for multiple turns, especially when prompted with general-purpose prompts (§ +2 +). Thus, we report +Stuck in Loop (%) +, the percentage of trajectories where the agent repeats the same action three times consecutively. +As shown in Tab. +3 +, zero-shot pre-trained models often get stuck in loops; even the largest 32B model is trapped in 29.4% of SWE-Bench Verified tasks. Fine-tuning on trajectories from SWE-Gym consistently reduces the stuck-in-loop rate by 4.6–18.6% across both SWE-Bench Lite and Verified tasks, except for the 32B model on SWE-Bench Lite, which increases by 1.5% due to its already low loop rate. This coincides with a decrease in the empty patch rate, likely enabling the agent to perform more code edits. +Performance scales with model size. +Rather unsurprisingly, larger base models consistently improve the resolution rate, empty patch rate, and stuck-in-loop rate (Tab.
+3 +). +Self-improvement remains ineffective. +In addition to fine-tuning on trajectories sampled from strong teacher models, we also experiment with fine-tuning on trajectories sampled directly from the policy being updated. +We use the fine-tuned 32B model to sample 6 trajectories per SWE-Gym instance (using temperature +$t=0.5$ +), obtaining 868 successful trajectories (i.e., on-policy trajectories). We further fine-tune the base 32B model on a mixture of 868 on-policy trajectories and the previously collected 491 off-policy trajectories. +When evaluating this fine-tuned model on SWE-Bench Lite, we observe the resolution rate drop from 15.3% to 8.7%, suggesting that self-improvement is not yet working. We hypothesize that we could achieve improved results using more advanced policy optimization methods, such as proximal policy optimization (PPO) +(Schulman et al., +2017 +) +, or with a stronger base model. These directions remain promising avenues for future investigation. +4.3 +Self-Improvement with Specialized Workflow +Unlike OpenHands, which offers freedom in long-horizon planning, MoatlessTools constrains the language model’s action space to pre-defined specialized workflows, reducing task horizons. +Specialized workflows outperform general-purpose prompting for open-weight LMs. In Tab. +3 +and Tab. +4 +, the 7B and 32B LMs achieve zero-shot resolution rates of 7% and 19% with MoatlessTools, compared to 1.0% and 3.0% with OpenHands on SWE-Bench Lite. +Given MoatlessTools’ improved zero-shot performance and shorter task horizon, we hypothesize that self-improvement without a strong teacher is achievable using this scaffold and training on SWE-Gym. +With a limited compute budget, we conduct this experiment with only 7B and 32B models, using LoRA +(Hu et al., +2022 +) +for the 32B model for improved efficiency. +We use the 7B model for ablation experiments. +We use iterative rejection sampling fine-tuning for policy improvement.
Each iteration involves (a) performing 30 high-temperature (1.0) rollouts per task on SWE-Gym-Lite and adding successful trajectories to the fine-tuning dataset, and (b) fine-tuning the policy on these filtered trajectories. After two iterations, further improvements are negligible. +Data Bias Impacts Performance. +Repeated sampling, as in +Brown et al. ( +2024 +) +, shows that task success probability follows a long-tail distribution (Fig. +6 +), where more samples increase solved instances. While broader task coverage benefits training, it introduces a bias toward easier tasks, making it suboptimal to train on all successful trajectories, as first observed in math reasoning by +Tong et al. ( +2024 +) +. +Mitigating Bias with Per-Instance Capping. +We introduce per-instance capping—a method that limits the maximum number of selected samples per task. As illustrated in Fig. +6 +, this balances dataset bias and size. A low cap reduces dataset size and performance (§ +5.2 +), while a high cap skews the distribution toward easier tasks. Empirically, a threshold of 2 achieves a good balance, slightly outperforming the full dataset and improving training speed (Tab. +6 +). We rank trajectories by the number of model response rounds required, preferring fewer. +Results. +After two policy improvement iterations (Tab. +4 +), the 7B model’s resolution rate increased from 7.0% to 9.0% after the first iteration and to 10.0% after the second. In contrast, the 32B model improved from 19.0% to 19.7% after the first iteration with no further gains. +We attribute the limited gains in the 32B model to the scaffold’s restricted action space and the rejection sampling fine-tuning method. +Table 4: +Resolution rate (RR) and empty patch rate (EP) on SWE-Bench Lite with the MoatlessTools Scaffold after online rejection sampling fine-tuning (temperature +$t=0$ +).
+Setting +7B Model +32B Model +EP( +% +, +↓ +\%,\downarrow +% , ↓ +) +RR( +% +, +↑ +\%,\uparrow +% , ↑ +) +EP( +% +, +↓ +\%,\downarrow +% , ↓ +) +RR( +% +, +↑ +\%,\uparrow +% , ↑ +) +Zero-Shot +56.3% +7.0% +24.3% +19.0% +Iteration 1 +29.0% +9.0% +18.3% +19.7% +Iteration 2 +23.3 +% +10.0% +9.7% +19.7% +5 +Scaling Agent Performance with SWE-Gym +We explore two scaling directions enabled by SWE-Gym to enhance agent performance: inference-time scaling (§ +5.1 +) and training-time data scaling (§ +5.2 +). +5.1 +Inference-Time Scaling with Verifiers +Trajectories sampled from SWE-Gym can be used not only for training a policy, but also for training a verifier (i.e., reward) model. +We train an outcome-supervised reward model (ORM) +(Cobbe et al., +2021 +) +that, given the relevant context of the task execution (including the problem statement, agent trajectory, and current git diff), generates a score that estimates the probability that the agent has solved the problem. +We experiment with using this model to rerank candidate trajectories sampled from a SWE agent policy, and show that such learned verifiers enable effective inference-time scaling for further performance improvement. 
+5.1.1 +Verifier for General-Purpose Prompting +For OpenHands agents +(Wang et al., +2024b +, +c +) +with general-purpose prompting (§ +2 +), we train a verifier (ORM) that takes as input the trajectory +$\tau=[o_{1},a_{1},o_{2},a_{2},\dots,o_{n},a_{n}]$ +, represented as an interleaved sequence of observations and actions, and generates a scalar reward +$r\in[0,1]$ +. +Observations +$o_{k}$ +include the task problem statement, command execution output, error messages, etc.; actions +$a_{k}$ +can be bash commands or file operations (e.g., edit, view) from the agent. +Training and Inference. +We fine-tune 32B +Qwen2.5-Coder-Instruct +to label trajectories as successful or unsuccessful using output tokens +<YES> +and +<NO> +respectively. +7 +7 +7 +§ +B.6 +includes the verifier prompt template. +For training data, we re-use two sets of trajectories we sampled on SWE-Gym for agent training in § +4.2 +: (1) +off-policy trajectories +which contain 443 successful trajectories; (2) +on-policy trajectories +which contain 875 successful trajectories sampled from the fine-tuned +Qwen2.5-Coder-Instruct-32B +. +8 +8 +8 +We keep only trajectories within 32k-token length for training, which may reduce their number compared to Section +4.2 +.
+We combine both on-policy and off-policy trajectories, randomly sample the same number of unsuccessful trajectories from each subset (1,318 each), and combine them as our dataset for verifier training (total 2,636 trajectories). +We fine-tune the model to predict +<YES> +for successful trajectories and +<NO> +for unsuccessful ones. +At inference time, conditioned on the prompt and the agent trajectory +$\tau$ +, we use SGLang +(Zheng et al., +2024a +) +to obtain the log probability of the next token being +<YES> +( +$l_{y}$ +) or +<NO> +( +$l_{n}$ +). +We then calculate the reward as the probability of success by normalizing the log probabilities: +$r=\exp(l_{y})/(\exp(l_{y})+\exp(l_{n}))$ +. +Metrics. +We report two metrics: +(1) Pass@ +$k$ +, the proportion of tasks with at least one successful solution among +$k$ +samples, and +(2) Best@ +$k$ +, the success rate of the highest-reward trajectories selected by the verifier from +$k$ +samples per task. Pass@ +$k$ +measures solution discovery (upper bound for Best@ +$k$ +); Best@ +$k$ +evaluates verifier accuracy. Mean and variance calculations are detailed in § +B.1 +, following +Lightman et al. ( +2023 +) +. +Figure 3 +: +Increasing inference-time compute improves performance on SWE-Bench Verified with a learnt verifier.
+Both the agent and the verifier are +Qwen2.5-Coder-Instruct-32B +models fine-tuned on the corresponding dataset (§ +5.1.1 +). OpenHands is used as the agent scaffold. +Results. +Fig. +3 +shows how Pass@ +$k$ +and Best@ +$k$ +scale with the number of sampled agent trajectories using the fine-tuned 32B model as the agent model. Pass@ +$k$ +demonstrates strong improvement, rising from 20.6 to 37.8% resolution rate as +$k$ +increases from 1 to 8, and up to 42.8@ +$k$ +=16. +The Best@ +$k$ +metric, which relies on our verifier’s ability to select the best trajectory, demonstrates more modest but steady progress, improving from a resolution rate of 20.6@1 to 29.8@8, and up to 32.0@16. +The gap between Pass@ +$k$ +and Best@ +$k$ +, due to the imperfect performance of our trained verifier, indicates there is room for improvements in reward modeling for coding agents. +Surprisingly, we found that fine-tuning the verifier model using LoRA +(Hu et al., +2022 +) +(29.8@8) with Unsloth +(Unsloth Team, +2024 +) +performs better than full-parameter fine-tuning (27.2@8), potentially due to regularization. Furthermore, as shown in Fig. +1 +(bottom), the Best@ +$k$ +curve exhibits strong linearity on a logarithmic scale, indicating a promising scaling behavior. +Training data matters for verifier. +We experiment with variations on the choice of training data for our verifier model. +Using full-parameter fine-tuning on +Qwen-2.5-Coder-Instruct-32B +, we use different mixtures of on- and off-policy trajectories, as well as different distributions of successful and unsuccessful trajectories. +As shown in Fig. +8 +, our ablation study demonstrates that the choice of training data can significantly impact verifier performance. +Training with a mixture of off-policy and on-policy data yields the best results (our default setting), reaching a resolution rate of 27@8.
+In contrast, using only on-policy data from the fine-tuned model shows moderate but limited improvement, while training exclusively on off-policy data from Claude and GPT leads to early performance plateaus around 22% resolution rate. +Our findings indicate that verifier training benefits most from a diverse dataset combining both off-policy and on-policy examples. +5.1.2 +Verifier for Specialized Workflow +Figure 4 +: +Scaling inference-time compute for MoatlessTools Agents (32B) with learned verifiers on SWE-Bench Lite. Temperature +$t=0.5$ +. +For MoatlessTools agents with specialized workflows, given that they do not have a turn-taking action-observation trajectory like OpenHands CodeActAgent, + +we prepare verifier inputs through a parsing process adopted from +Zhang et al. ( +2024a +) +, which combines task descriptions, relevant agent context, and generated patches. +9 +9 +9 +We provide the prompt template in § +B.5 +. +We train the verifier to map from this input to a single token indicating task success. +Following the training procedure described in § +5.1.1 +, we train 7B and 32B verifiers using on-policy trajectories from the last (2nd) round of sampling, applying LoRA +(Hu et al., +2022 +) +. To address the easy-data bias in the training dataset, we cap the number of successful trajectories per instance at two and balance the data by subsampling failure cases to match the same number of successful ones. +Results. +We evaluate the verifiers by sampling from an agent policy with +$k=8$ +at temperature 0.5. As shown in Fig. +4 +and Fig. +7 +, these verifiers enable effective scaling across verifier and policy sizes: the 7B verifier improves from 10 to 13.3% resolution rate on SWE-Bench Lite when paired with a 7B policy, while the 32B verifier improves from 19.7 to 26.3% when paired with a 32B policy.
The 7B verifier plateaus after +$k=4$ +samples when ranking trajectories from both 7B and 32B agents. In contrast, the 32B verifier continues improving even at +$k=8$ +, suggesting that verifier size significantly affects scaling behavior. +5.2 +Training-Time Scaling with Data +We then examine how scaling the amount of training data affects agent performance using 491 sampled trajectories from § +4.2 +. We simulate three scaling methods through subsampling: (1) +Scaling trajectories +, where trajectories are randomly dropped (Fig. +5 +); (2) +Scaling unique task instances +, where only one successful trajectory per task instance is selected (Fig. +9 +); and (3) +Scaling repositories +, which sequentially includes all instances from each repository to assess repository-level diversity. +Setup. +Using OpenHands +(Wang et al., +2024c +) +and the fine-tuning approach described in § +4.2 +, we evaluate these scaling approaches on SWE-Bench Verified: scaling the number of trajectories, by subsampling from the full trajectory dataset from § +4.2 +(at most 491 trajectories); unique instance scaling on these trajectories deduplicated by instance ID (at most 294 trajectories); and repository-based scaling where we sort repositories alphabetically and include all trajectories from each repository in order (e.g., first 25% contains complete trajectories from the first N repositories). +We compare models trained on 25%, 50%, and 100% of the full dataset for each approach, sampling training subsets using the methods described above for each scaling approach. +10 +10 +10 +Tab. +7 +contains detailed statistics of these datasets. +Scaling trends suggest instance and repository diversity is not yet a bottleneck. +Fig. +5 +demonstrates substantial scaling behavior, with consistent improvements in resolution rate as the number of randomly sampled training trajectories increases, particularly for the 32B model.
+These results suggest that SWE-Gym’s current size and repository diversity are likely not performance bottlenecks — further improvements could likely be achieved by allocating more computing resources to sampling more training trajectories. +Figure 5 +: +Scaling effects of increasing the number of randomly sampled trajectories for training. +Fig. +9 +reveals comparable overall performance between different scaling approaches up to where deduplication takes effect. While Random Scaling (No Dedup.) achieves higher final performance, this is likely due to having more trajectories (491 vs 294) rather than better scaling efficiency. +Among deduplicated approaches, Repository Scaling shows stronger initial performance at 25% data, suggesting that complete repository coverage may provide more coherent learning signals early in training. +These results suggest that the repository and instance diversity of SWE-Gym is not yet a bottleneck — further improvements could likely be achieved by simply sampling more agent trajectory data for training, regardless of duplication or repository distribution. +6 +Conclusions, Limitations, and Future Work +In this paper, we introduce SWE-Gym, the first training environment that addresses critical gaps in enabling scalable learning for software engineering agents. By combining real-world Python tasks with repository-level context, pre-configured execution environments, and test verifications, SWE-Gym will be a foundation for advancing LM agent training research. +Through extensive experiments, we demonstrate that SWE-Gym enables both agent and verifier models to achieve significant improvements in resolving complex software tasks. Our findings highlight the scalability of these approaches, revealing potential for continuous performance gains with increased compute. +We see many research directions that we are excited to explore in the future: +1.
+Automatic Environment Synthesis +SWE-Gym, while effective, is limited by its environment diversity, including the number of repositories, types of tasks, and programming languages. We view environment synthesis—via automated environment creation, test-case generation, or task generation—as a critical next step. +2. +Self-Improvement with Reinforcement Learning +Despite notable progress, our self-improvement results are modest. Training language model agents with large-scale online reinforcement learning is a promising direction for further improvements. +3. +Human-Agent Interaction +Current SWE settings focus solely on task completion, neglecting human-in-the-loop collaboration, which is essential for real-world software engineering. Methods like user simulation or learning from offline human-agent interaction data might offer ways for developing collaborative agents that align with humans. +Impact Statement +This work presents SWE-Gym, an environment for training software engineering agents, with strong empirical results on its effectiveness. We discuss a few important societal implications to consider. +First, improving automated software engineering capabilities could increase developers’ productivity and accessibility across industries. +Although current models are primarily research artifacts and not yet production-ready, they can support critical open-source infrastructure and potentially make software development more accessible. +Secondly, as these agents become more capable, they may impact software engineering jobs and require careful consideration around code ownership, licensing, and attribution. +Additionally, while we focus on legitimate software engineering tasks, similar techniques could potentially be misused to automate the creation of malicious code. +We encourage future work to further explore frameworks for responsible deployment of software engineering agents, including considerations around security, safety, and economic impacts.
+Acknowledgments +We thank John Yang and Ofir Press for helpful discussions, and John Yang for assistance in reproducing data analysis results from SWE-Bench. We thank Modal Labs +11 +11 +11 +https://modal.com/ +for the GPU compute support through its Academic Credits Program. +XW and HJ are partially supported by U.S. DARPA ITM Program No. FA8650-23-C-7316. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies, either expressed or implied, of DARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for governmental purposes notwithstanding any copyright annotation therein. +References +Akkaya et al. (2019) +Akkaya, I., Andrychowicz, M., Chociej, M., Litwin, M., McGrew, B., Petron, A., Paino, A., Plappert, M., Powell, G., Ribas, R., et al. +Solving rubik’s cube with a robot hand. +arXiv preprint arXiv:1910.07113 +, 2019. +Badertdinov et al. (2024) +Badertdinov, I., Trofimova, M., Anapolskiy, Y., Abramov, S., Zainullina, K., Golubev, A., Polezhaev, S., Litvintseva, D., Karasik, S., Fisin, F., Skvortsov, S., Nekrashevich, M., Shevtsov, A., and Yangel, B. +Scaling data collection for training software engineering agents. +Nebius blog +, 2024. +Bai et al. (2024) +Bai, H., Zhou, Y., Cemri, M., Pan, J., Suhr, A., Levine, S., and Kumar, A. +Digirl: Training in-the-wild device-control agents with autonomous reinforcement learning. +ArXiv +, abs/2406.11896, 2024. +URL +https://api.semanticscholar.org/CorpusID:270562229 +. +Brown et al. (2024) +Brown, B., Juravsky, J., Ehrlich, R., Clark, R., Le, Q. V., R’e, C., and Mirhoseini, A. +Large language monkeys: Scaling inference compute with repeated sampling. +ArXiv +, abs/2407.21787, 2024. +URL +https://api.semanticscholar.org/CorpusID:271571035 +. +Chen et al. (2023) +Chen, B., Shu, C., Shareghi, E., Collier, N., Narasimhan, K., and Yao, S. +Fireact: Toward language agent fine-tuning. 
+ArXiv +, abs/2310.05915, 2023. +URL +https://api.semanticscholar.org/CorpusID:263829338 +. +Chen et al. (2024) +Chen, D., Lin, S., Zeng, M., Zan, D., Wang, J.-G., Cheshkov, A., Sun, J., Yu, H., Dong, G., Aliev, A., Wang, J., Cheng, X., Liang, G., Ma, Y., Bian, P., Xie, T., and Wang, Q. +Coder: Issue resolving with multi-agent and task graphs. +CoRR in ArXiv +, abs/2406.01304, 2024. +Chen et al. (2021) +Chen, M., Tworek, J., Jun, H., Yuan, Q., Pondé, H., Kaplan, J., Edwards, H., Burda, Y., Joseph, N., Brockman, G., Ray, A., Puri, R., Krueger, G., Petrov, M., Khlaaf, H., Sastry, G., Mishkin, P., Chan, B., Gray, S., Ryder, N., Pavlov, M., Power, A., Kaiser, L., Bavarian, M., Winter, C., Tillet, P., Such, F. P., Cummings, D. W., Plappert, M., Chantzis, F., Barnes, E., Herbert-Voss, A., Guss, W. H., Nichol, A., Babuschkin, I., Balaji, S., Jain, S., Carr, A., Leike, J., Achiam, J., Misra, V., Morikawa, E., Radford, A., Knight, M. M., Brundage, M., Murati, M., Mayer, K., Welinder, P., McGrew, B., Amodei, D., McCandlish, S., Sutskever, I., and Zaremba, W. +Evaluating large language models trained on code. +ArXiv +, abs/2107.03374, 2021. +URL +https://api.semanticscholar.org/CorpusID:235755472 +. +Cobbe et al. (2021) +Cobbe, K., Kosaraju, V., Bavarian, M., Chen, M., Jun, H., Kaiser, L., Plappert, M., Tworek, J., Hilton, J., Nakano, R., Hesse, C., and Schulman, J. +Training verifiers to solve math word problems. +ArXiv +, abs/2110.14168, 2021. +URL +https://api.semanticscholar.org/CorpusID:239998651 +. +Golubev et al. (2024) +Golubev, A., Polezhaev, S., Zainullina, K., Trofimova, M., Badertdinov, I., Anapolskiy, Y., Litvintseva, D., Karasik, S., Fisin, F., Skvortsov, S., Nekrashevich, M., Shevtsov, A., Abramov, S., and Yangel, B. +Leveraging training and search for better software engineering agents. +Nebius blog +, 2024. +https://nebius.com/blog/posts/training-and-search-for-software-engineering-agents. +Hendrycks et al. 
(2021a) +Hendrycks, D., Basart, S., Kadavath, S., Mazeika, M., Arora, A., Guo, E., Burns, C., Puranik, S., He, H., Song, D., and Steinhardt, J. +Measuring coding challenge competence with APPS. +In Vanschoren, J. and Yeung, S. (eds.), +Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2021, December 2021, virtual +, 2021a. +Hendrycks et al. (2021b) +Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D. X., and Steinhardt, J. +Measuring mathematical problem solving with the math dataset. +ArXiv +, abs/2103.03874, 2021b. +URL +https://api.semanticscholar.org/CorpusID:232134851 +. +Hu et al. (2022) +Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., and Chen, W. +Lora: Low-rank adaptation of large language models. +In +The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022 +. OpenReview.net, 2022. +URL +https://openreview.net/forum?id=nZeVKeeFYf9 +. +Hui et al. (2024a) +Hui, B., Yang, J., Cui, Z., Yang, J., Liu, D., Zhang, L., Liu, T., Zhang, J., Yu, B., Dang, K., et al. +Qwen2. 5-coder technical report. +arXiv preprint arXiv:2409.12186 +, 2024a. +Hui et al. (2024b) +Hui, B., Yang, J., Cui, Z., Yang, J., Liu, D., Zhang, L., Liu, T., Zhang, J., Yu, B., Dang, K., et al. +Qwen2. 5-coder technical report. +arXiv preprint arXiv:2409.12186 +, 2024b. +Jain et al. (2024) +Jain, N., Shetty, M., Zhang, T., Han, K., Sen, K., and Stoica, I. +R2E: turning any github repository into a programming agent environment. +In +Forty-first International Conference on Machine Learning, ICML 2024, Vienna, Austria, July 21-27, 2024 +. OpenReview.net, 2024. +URL +https://openreview.net/forum?id=kXHgEYFyf3 +. +Jimenez et al. (2024) +Jimenez, C. E., Yang, J., Wettig, A., Yao, S., Pei, K., Press, O., and Narasimhan, K. R. +Swe-bench: Can language models resolve real-world github issues? 
+In +The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024 +. OpenReview.net, 2024. +URL +https://openreview.net/forum?id=VTF8yNQM66 +. +Lightman et al. (2023) +Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., Leike, J., Schulman, J., Sutskever, I., and Cobbe, K. +Let’s verify step by step. +ArXiv +, abs/2305.20050, 2023. +URL +https://api.semanticscholar.org/CorpusID:258987659 +. +Ma et al. (2024) +Ma, Y., Cao, R., Cao, Y., Zhang, Y., Chen, J., Liu, Y., Liu, Y., Li, B., Huang, F., and Li, Y. +Lingma swe-gpt: An open development-process-centric language model for automated software improvement. +arXiv preprint arXiv:2411.00622 +, 2024. +Modal (2024) +Modal. +Modal: High-performance AI infrastructure. +https://modal.com/ +, 2024. +Accessed: 2024-12-18. +Neubig & Wang (2024) +Neubig, G. and Wang, X. +Evaluation of LLMs as Coding Agents on SWE-Bench (at 30x Speed!). +All Hands AI blog +, 2024. +Ouyang et al. (2022) +Ouyang, L., Wu, J., Jiang, X., Almeida, D., Wainwright, C., Mishkin, P., Zhang, C., Agarwal, S., Slama, K., Ray, A., et al. +Training language models to follow instructions with human feedback. +Advances in neural information processing systems +, 35:27730–27744, 2022. +Pan et al. (2024) +Pan, J., Zhang, Y., Tomlin, N., Zhou, Y., Levine, S., and Suhr, A. +Autonomous evaluation and refinement of digital agents. +ArXiv +, abs/2404.06474, 2024. +URL +https://api.semanticscholar.org/CorpusID:269009430 +. +PyTorch Team (2024) +PyTorch Team. +torchtune: PyTorch native post-training library. +https://github.com/pytorch/torchtune +, 2024. +Qwen Team (2024) +Qwen Team. +Qwen2.5: A party of foundation models, September 2024. +URL +https://qwenlm.github.io/blog/qwen2.5/ +. +Schulman et al. (2017) +Schulman, J., Wolski, F., Dhariwal, P., Radford, A., and Klimov, O. +Proximal policy optimization algorithms. +ArXiv +, abs/1707.06347, 2017. 
+URL +https://api.semanticscholar.org/CorpusID:28695052 +. +Shao et al. (2024) +Shao, Z., Wang, P., Zhu, Q., Xu, R., Song, J., Bi, X., Zhang, H., Zhang, M., Li, Y., Wu, Y., et al. +Deepseekmath: Pushing the limits of mathematical reasoning in open language models. +arXiv preprint arXiv:2402.03300 +, 2024. +Silver et al. (2017) +Silver, D., Hubert, T., Schrittwieser, J., Antonoglou, I., Lai, M., Guez, A., Lanctot, M., Sifre, L., Kumaran, D., Graepel, T., Lillicrap, T. P., Simonyan, K., and Hassabis, D. +Mastering chess and shogi by self-play with a general reinforcement learning algorithm. +ArXiv +, abs/1712.01815, 2017. +URL +https://api.semanticscholar.org/CorpusID:33081038 +. +Tao et al. (2024) +Tao, N., Ventresque, A., Nallur, V., and Saber, T. +Enhancing program synthesis with large language models using many-objective grammar-guided genetic programming. +Algorithms +, 17(7):287, 2024. +doi: +10.3390/A17070287 +. +URL +https://doi.org/10.3390/a17070287 +. +Tong et al. (2024) +Tong, Y., Zhang, X., Wang, R., Wu, R. M., and He, J. +Dart-math: Difficulty-aware rejection tuning for mathematical problem-solving. +ArXiv +, abs/2407.13690, 2024. +URL +https://api.semanticscholar.org/CorpusID:271270574 +. +Unsloth Team (2024) +Unsloth Team. +Easily finetune and train LLMs. Get faster with unsloth. +https://unsloth.ai/ +, 2024. +Wang et al. (2024a) +Wang, P., Li, L., Shao, Z., Xu, R., Dai, D., Li, Y., Chen, D., Wu, Y., and Sui, Z. +Math-shepherd: Verify and reinforce LLMs step-by-step without human annotations. +In Ku, L.-W., Martins, A., and Srikumar, V. (eds.), +Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) +, pp.  9426–9439, Bangkok, Thailand, August 2024a. Association for Computational Linguistics. +doi: +10.18653/v1/2024.acl-long.510 +. +URL +https://aclanthology.org/2024.acl-long.510 +. +Wang et al. (2024b) +Wang, X., Chen, Y., Yuan, L., Zhang, Y., Li, Y., Peng, H., and Ji, H. 
+Executable code actions elicit better LLM agents. +In +Forty-first International Conference on Machine Learning, ICML 2024, Vienna, Austria, July 21-27, 2024 +. OpenReview.net, 2024b. +URL +https://openreview.net/forum?id=jJ9BoXAfFa +. +Wang et al. (2024c) +Wang, X., Li, B., Song, Y., Xu, F. F., Tang, X., Zhuge, M., Pan, J., Song, Y., Li, B., Singh, J., Tran, H. H., Li, F., Ma, R., Zheng, M., Qian, B., Shao, Y., Muennighoff, N., Zhang, Y., Hui, B., Lin, J., Brennan, R., Peng, H., Ji, H., and Neubig, G. +OpenHands: An Open Platform for AI Software Developers as Generalist Agents. +CoRR in ArXiv +, abs/2407.16741, 2024c. +Wu et al. (2024) +Wu, Z., Bai, H., Zhang, A., Gu, J., Vinod Vydiswaran, V., Jaitly, N., and Zhang, Y. +Divide-or-conquer? which part should you distill your llm? +ArXiv +, 2024. +Xi et al. (2024) +Xi, Z., Ding, Y., Chen, W., Hong, B., Guo, H., Wang, J., Yang, D., Liao, C., Guo, X., He, W., Gao, S., Chen, L., Zheng, R., Zou, Y., Gui, T., Zhang, Q., Qiu, X., Huang, X., Wu, Z., and Jiang, Y.-G. +Agentgym: Evolving large language model-based agents across diverse environments. +ArXiv +, abs/2406.04151, 2024. +URL +https://api.semanticscholar.org/CorpusID:270285866 +. +Xia et al. (2024) +Xia, C. S., Deng, Y., Dunn, S., and Zhang, L. +Agentless: Demystifying llm-based software engineering agents. +CoRR +, abs/2407.01489, 2024. +doi: +10.48550/ARXIV.2407.01489 +. +URL +https://doi.org/10.48550/arXiv.2407.01489 +. +Yang et al. (2024) +Yang, J., Jimenez, C. E., Wettig, A., Lieret, K., Yao, S., Narasimhan, K., and Press, O. +Swe-agent: Agent-computer interfaces enable automated software engineering. +CoRR +, abs/2405.15793, 2024. +doi: +10.48550/ARXIV.2405.15793 +. +URL +https://doi.org/10.48550/arXiv.2405.15793 +. +Yao et al. (2023) +Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K. R., and Cao, Y. +React: Synergizing reasoning and acting in language models. 
+In +The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1-5, 2023 +. OpenReview.net, 2023. +URL +https://openreview.net/forum?id=WE_vluYUL-X +. +Yuan et al. (2024) +Yuan, L., Cui, G., Wang, H., Ding, N., Wang, X., Deng, J., Shan, B., Chen, H., Xie, R., Lin, Y., Liu, Z., Zhou, B., Peng, H., Liu, Z., and Sun, M. +Advancing LLM reasoning generalists with preference trees. +CoRR +, abs/2404.02078, 2024. +doi: +10.48550/ARXIV.2404.02078 +. +URL +https://doi.org/10.48550/arXiv.2404.02078 +. +Zeng et al. (2023) +Zeng, A., Liu, M., Lu, R., Wang, B., Liu, X., Dong, Y., and Tang, J. +Agenttuning: Enabling generalized agent abilities for llms. +In +Annual Meeting of the Association for Computational Linguistics +, 2023. +URL +https://api.semanticscholar.org/CorpusID:264306101 +. +Zhai et al. (2024) +Zhai, Y., Bai, H., Lin, Z., Pan, J., Tong, S., Zhou, Y., Suhr, A., Xie, S., LeCun, Y., Ma, Y., and Levine, S. +Fine-tuning large vision-language models as decision-making agents via reinforcement learning. +ArXiv +, abs/2405.10292, 2024. +URL +https://api.semanticscholar.org/CorpusID:269790773 +. +Zhang et al. (2024a) +Zhang, K., Yao, W., Liu, Z., Feng, Y., Liu, Z., Murthy, R., Lan, T., Li, L., Lou, R., Xu, J., Pang, B., Zhou, Y., Heinecke, S., Savarese, S., Wang, H., and Xiong, C. +Diversity empowers intelligence: Integrating expertise of software engineering agents. +ArXiv +, abs/2408.07060, 2024a. +URL +https://api.semanticscholar.org/CorpusID:271860093 +. +Zhang et al. (2024b) +Zhang, Y., Ruan, H., Fan, Z., and Roychoudhury, A. +Autocoderover: Autonomous program improvement. +In +ISSTA +, 2024b. +Zhao et al. (2024) +Zhao, W., Jiang, N., Lee, C., Chiu, J. T., Cardie, C., Gallé, M., and Rush, A. M. +Commit0: Library generation from scratch, 2024. +URL +https://arxiv.org/abs/2412.01769 +. +Zheng et al. (2024a) +Zheng, L., Yin, L., Xie, Z., Sun, C., Huang, J., Yu, C. H., Cao, S., Kozyrakis, C., Stoica, I., Gonzalez, J. 
E., Barrett, C., and Sheng, Y. +Sglang: Efficient execution of structured language model programs, 2024a. +URL +https://arxiv.org/abs/2312.07104 +. +Zheng et al. (2024b) +Zheng, T., Zhang, G., Shen, T., Liu, X., Lin, B. Y., Fu, J., Chen, W., and Yue, X. +Opencodeinterpreter: Integrating code generation with execution and refinement. +ArXiv +, abs/2402.14658, 2024b. +URL +https://api.semanticscholar.org/CorpusID:267782452 +. +Örwall (2024) +Örwall, A. +Moatless Tool. +https://github.com/aorwall/moatless-tools +, 2024. +Accessed: 2024-10-22. +Appendix A +Comparison with Concurrent Works +Ma et al. ( +2024 +) +trains an LM agent, Lingma SWE-GPT, using a method similar to our rejection sampling fine-tuning baseline, with a dataset comparable to our SWE-Gym Raw splits. Without executable unit test feedback, they rely on manually defined heuristics to filter out low-quality trajectories, such as comparing similarity between submitted patches and edit locations with gold patches. +The model weights are publicly accessible but not the training pipeline or the dataset. +Most relevant to our work are two consecutive blog posts by +Golubev et al. ( +2024 +) +and +Badertdinov et al. ( +2024 +) +, who also construct an executable training environment with real-world tasks from GitHub. +Instead of manual configuration, they employ a general environment setup script and simply discard instances that fail the setup process. +This approach leads to key differences in dataset size and distribution: while it biases the environment away from tasks with complex dependencies, they successfully collect 6,415 instances, about 1.5 times larger than our dataset. +In +Golubev et al. ( +2024 +) +, they also study training agents and verifiers with the environment. +Additionally, they explore a lookahead setting where a trained verifier ranks and selects the best next action. 
+With a substantially larger collection of agent trajectories (80,036 compared to thousands in our experiments) and model size (72B compared to 32B), +their best system achieves 40% accuracy on SWE-Bench Verified. +While their dataset and agent trajectories are publicly accessible, the model is not. +In comparison, with a comparable dataset size, our SWE-Gym has executable feedback, avoids potential dataset bias through manual configuration of environments, while providing comprehensive analysis of agent and verifier training, their scaling behaviors, and positive results on agent self-improvement. Our system achieves competitive results with significantly lower compute and a smaller model size (32B vs 72B). Lastly, we open source all artifacts of the project, including dataset, model weights, agent trajectory data and the training pipeline. +Model +SWE-Bench +Openness +Name, Model Size +Lite +Verified +Model +Environment +Ma et al. ( +2024 +) +, 72B +22.0 +30.2 +✓ +✗ +Golubev et al. ( +2024 +) +Agent and Verifier, 72B +- +40.6 +✗ +✓ +Our SWE-Gym Agent and Verifier, 32B +26.0 +32.0 +✓ +✓ +Table 5 +: +Comparison of model performance on the SWE-Bench benchmark and whether the model weights and environments are publicly accessible (openness). +Cap +# Traj +Empty Patch ($\%,\downarrow$) +Resolution rate ($\%,\uparrow$) +0 (Zero-shot) +0 +56.3 +7.0 +1 +36 +37.3 +9.0 +2 +62 +29.0 +9.7 +3 +82 +43.7 +7.7 +No Cap (All) +172 +30.7 +9.3 +Table 6: +Resolution rate and empty patch rate on SWE-Bench Lite with a 7B model trained using different instance capping strategies (Cap). +Figure 6 +: +Success distribution over 30 rounds on SWE-Gym Lite with 7B model in zero-shot. The distribution is naturally biased toward easy tasks. Per instance capping reduces this bias but lowers the total trajectory count for training. We set temperature +$t=1$ +during sampling. 
+Figure 7 +: +Scaling inference-time compute for MoatlessTools Agents (7B and 32B) with their corresponding learned verifiers. Temperature +$t=0.5$ +. +Figure 8 +: +Ablation study for verifier training (§ +5.1.1 +). Performances are evaluated on SWE-Bench Verified. +Both the agent and the verifier are +Qwen2.5-Coder-Instruct-32B +models fine-tuned on the corresponding dataset. +OpenHands +(Wang et al., +2024c +) +is used as the agent scaffold. +Figure 9 +: +Comparison of three data sampling approaches using 32B LM: +scaling trajectories (dedup.), scaling unique task instances, and scaling repositories (§ +5.2 +). +Appendix B +Experiment Details +Original +Dedup. +Sorted by Random (Dedup.) +Sorted by Repo (Dedup.) +First 25% +First 50% +First 25% +First 50% +getmoto/moto +155 +72 +12 +33 +0 +46 +Project-MONAI/MONAI +95 +53 +17 +25 +53 +53 +pandas-dev/pandas +70 +61 +14 +30 +0 +0 +python/mypy +46 +27 +7 +12 +0 +0 +dask/dask +45 +29 +8 +17 +6 +29 +iterative/dvc +36 +24 +8 +12 +0 +0 +conan-io/conan +20 +12 +1 +7 +12 +12 +pydantic/pydantic +11 +7 +2 +4 +0 +0 +facebookresearch/hydra +7 +5 +2 +5 +0 +5 +bokeh/bokeh +3 +2 +1 +1 +2 +2 +modin-project/modin +3 +2 +1 +1 +0 +0 +Total +491 +294 +73 +147 +73 +147 +Table 7 +: +Distribution of success trajectories used in training-time scaling experiments (§ +5.2 +). +Dedup. +denotes that the trajectories are deduplicated by randomly selecting ONE success trajectory per instance ID; +Sorted by random (repo) X% (Dedup.) +denotes a subset of trajectories taken from the first X% of dedup. instances that are sorted randomly (by repository name). +Percentiles +Resolved +Count +Mean +Std +Min +Max +5% +10% +25% +50% +75% +90% +95% +Num. 
of Messages +✗ +5,557.0 +39.2 +31.9 +7.0 +101.0 +9.0 +9.0 +9.0 +29.0 +61.0 +100.0 +101.0 +✓ +491.0 +39.9 +19.9 +13.0 +101.0 +19.0 +21.0 +25.0 +33.0 +47.5 +65.0 +87.0 +Num. of Tokens +✗ +5,557.0 +17,218.3 +17,761.6 +1,615.0 +167,834.0 +1,833.0 +1,907.0 +2,268.0 +12,305.0 +26,434.0 +41,182.2 +51,780.6 +✓ +491.0 +18,578.5 +11,361.4 +2,560.0 +81,245.0 +5,813.0 +8,357.0 +11,559.5 +15,999.0 +22,040.5 +31,632.0 +39,512.5 +Table 8 +: +Statistics of SWE-Gym-sampled trajectories. We use the tokenizer from +Qwen-2.5-Coder-Instruct-7B +to estimate the number of tokens. 
+Agent +Model +Model Size +Training Data +Resolved +( +% +) +(\%) +( % ) +SWE-Bench Verified (500 instances) +RAG +SWE-Llama +(Jimenez et al., +2024 +) +7B +10K instances +1.4 +RAG +SWE-Llama +(Jimenez et al., +2024 +) +13B +10K instances +1.2 +Lingma Agent +(Ma et al., +2024 +) +Lingma SWE-GPT (v0925) +7B +90K PRs from 4K repos +18.2 +Lingma Agent +(Ma et al., +2024 +) +Lingma SWE-GPT (v0925) +72B +90K PRs from 4K repos +28.8 +OpenHands +(Wang et al., +2024c +) +(Ours) +fine-tuned Qwen2.5-Coder-Instruct +32B +491 agent trajectories from 11 repos +20.6 +OpenHands w/ Verifier +(Wang et al., +2024c +) +(Ours) +fine-tuned Qwen2.5-Coder-Instruct +32B (Agent & Verifier) +491 +491 +491 +491 +agent trajectories from 11 repos for agent + +1318 +× +2 +1318 +2 +1318\times 2 +1318 × 2 +success/failure agent trajectories for verifier +32.0 +Table 9 +: +Performance comparison with SWE-Bench +(Jimenez et al., +2024 +) +baselines +with publicly accessible weights +. +Data source: +https://www.swebench.com/ +, Accessed on Dec 21, 2024. +B.1 +Mean and Variance for Pass@N and Best@N. +We mostly follow +(Lightman et al., +2023 +) +for obtaining the mean and variance for the Pass@N and Best@N curve. Given a total of M rounds of rollouts, for +N +< +M +𝑁 +𝑀 +N +> +< +< +/think +> +> +and answer is enclosed within +< +< +answer +> +> +< +< +/answer +> +> +tags, respectively, i.e., +< +< +think +> +> +reasoning process here +< +< +/think +> +> +< +< +answer +> +> +answer here +< +< +/answer +> +> +.\nUser: +{question} +\nAssistant: +< +< +think +> +> +Template 2 +( +Qwen-Math template +) +. +< +< +|im_start| +> +> +system\nPlease reason step by step, and put your final answer within \\boxed{}. +< +< +|im_end| +> +> +\n +< +< +|im_start | +> +> +user\n +{question} +< +< +|im_end| +> +> +\n +< +< +|im_start| +> +> +assistant\n +Template 3 +( +No template +) +. +{question} +Experimental settings +. 
We include Qwen2.5-Math-1.5B, Qwen2.5-Math-7B, Qwen2.5-7B, Llama-3.1-8B, DeepSeek-Math-7B and DeepSeek-V3-Base-685B for experiments. For each model, we first apply +No template +to get the model responses, then let GPT-4o-mini judge whether the model responses are in an answering format (regardless of quality) or in a sentence-completion pattern. We record the percentage of responses that tend to answer the question as the metric. We then apply both +R1 template +and +Qwen-Math template +to obtain model responses, and determine the most suitable template for each model based on the metric. Finally, we evaluate the pass@8 accuracy of each model with the corresponding template to assess whether the base policies can explore rewarding trajectories for RL improvement. +Figure 3: +Model attributes across three aspects. +Question-Answering Ability +: the extent to which a pretrained language model provides a direct answer to a question rather than continuing or expanding upon it; +Exploration Ability +: pass@8 measures how well base models explore; +Self-Reflection +: counts are obtained through cross-validation between keyword-based detection and LLM-based detection, as detailed in Appendix +D +. +Results +. The left plot of +Fig. +3 +shows how well base models (with or without templates) answer the provided questions. We observe that Llama and DeepSeek models all improve the answering ability by employing the proper template (R1 template). However, Qwen2.5 models work best (with +$100\%$ +answering rate) when no template is used. This intriguing property motivates further investigation which will be discussed in +Sec. +2.2 +. Meanwhile, the lowest answering rate with no template suggests that DeepSeek-V3-Base is a nearly pure base model. This observation motivates us to explore whether a pure base model like DeepSeek-V3-Base demonstrates the Aha moment ( +Sec. +2.3 +). +The middle plot of +Fig. 
+3 +shows the pass@8 accuracy of different base models (with template) at different sampling temperatures. This metric can serve as an indicator of base policy’s exploration ability. For example, if a base policy cannot even sample a single trajectory that leads to the correct final answer, it is impossible for RL to improve the policy because there is no reward signal. Our results demonstrate that all tested models are exploratory (thus ready for RL), with Qwen2.5 models performing the best (even surpassing DeepSeek-V3-Base). This might partially explain why most R1-Zero projects +(Zeng et al., +2025 +; Hu et al., +2025 +) +are based on Qwen2.5 models. +2.2 +Qwen-2.5 Models Unlock the Best Performance When Discarding Template +We next dig into the intriguing observation (cf. +Fig. +3 +(Left)) that all Qwen2.5 base models readily serve as chat models even without any template. We take a step further to evaluate the reasoning ability of Qwen2.5-Math models on five standard benchmarks: AIME 2024 +(Li et al., +2024a +) +, AMC +(Li et al., +2024a +) +, MATH500 +(Hendrycks et al., +2021 +) +, Minerva Math +(Lewkowycz et al., +2022 +) +, and OlympiadBench +(He et al., +2024 +) +. Following common practice, we use greedy decoding and limit the sampling budget to 3000 tokens. +Base model + Template +AIME24 +AMC +MATH500 +Minerva +OlympiadBench +Avg. +Qwen2.5-Math-1.5B +(4-shot prompting) +0.0 +20.0 +50.4 +12.1 +15.9 +19.7 +R1 template +0.0 +9.6 +21.2 +6.6 +2.2 +7.9 +Qwen template +20.0 +32.5 +33.0 +12.5 +22.8 +24.2 +No template +16.7 +43.4 +61.8 +15.1 +28.4 +33.1 +Qwen2.5-Math-7B +(4-shot prompting) +3.3 +22.5 +61.6 +10.7 +20.9 +23.8 +R1 template +0.0 +0.0 +0.0 +0.0 +0.1 +0.0 +Qwen template +16.7 +38.6 +50.6 +9.9 +16.6 +26.5 +No template +0.2 +45.8 +69.0 +21.3 +34.7 +38.2 +Table 1: +Qwen2.5-Math models might be pretrained on concatenated question-answer text, resulting in peak performance when +no template +is applied. 
+As shown in +Table +1 +, not using any template can drastically boost the average performance, resulting in an improvement of about +$60\%$ +compared to the traditional 4-shot prompting. Since Qwen2.5-Math +(Yang et al., +2024b +) +uses chat model’s data (question-answer pairs) during the pretraining stage, we hypothesize that they might pretrain on the concatenated text to maximize +$\log p_{\theta}({\mathbf{q}};{\mathbf{o}})$ +directly. If our hypothesis turns out to be true, we shall be more careful about using Qwen2.5 models to reproduce DeepSeek-R1-Zero, since the base models are already SFT-like without templates. +2.3 +Aha Moment Already Appears in Base Models Including DeepSeek-V3-Base +One of the most inspiring results of DeepSeek-R1-Zero is the emergence of self-reflection behaviors, a.k.a., Aha moment, through pure RL training. A few prior studies +(Liu et al., +2025b +; Yeo et al., +2025 +) +have suggested that there may not be Aha moment in open-source R1 replications because the base models they use already exhibit self-reflection keywords. However, they have not tested DeepSeek-V3-Base, on which the real R1-Zero model was RL-tuned. We complete this missing piece by hosting DeepSeek-V3-Base-685B ourselves and investigating its responses to the +$500$ +MATH questions with the R1 template. From the right plot of +Fig. +3 +, we can observe that DeepSeek-V3-Base also generates a decent amount of self-reflections, further validating the claims of +Liu et al. ( +2025b +) +. We also show examples in +App. +E +( +Fig. +13 +) where DeepSeek-V3-Base generates keywords such as “Aha” and “wait”. +An additional important question is whether self-reflection behaviors are associated with improved model performance after RL training. To investigate this, we host DeepSeek-R1-Zero and analyze its responses to the same questions from the MATH dataset. 
Although self-reflection behaviors occur more frequently in R1-Zero, we observe that these behaviors are not positively correlated with higher accuracy. Detailed analysis can be found in +App. +F +. +3 +Analysis on Reinforcement Learning +Language model generation can be formulated as a token-level Markov Decision Process (MDP) +ℳ += +( +𝒮 +, +𝒜 +, +r +, +p +𝒬 +) +{\mathcal{M}}=({\mathcal{S}},{\mathcal{A}},r,p_{{\mathcal{Q}}}) +. +At each generation step +t +t +, the state +s +t +∈ +𝒮 +s_{t}\in{\mathcal{S}} +is the concatenation of the input question and the output response generated so far: +s +t += +𝐪 +; +𝐨 +< +t += +[ +q +1 +, +… +, +q +M +, +o +1 +, +… +, +o +t +− +1 +] +s_{t}={\mathbf{q}};{\mathbf{o}}_{ +0 +\beta>0 +) for reinforcement learning from human feedback +(Christiano et al., +2017 +) +, where +r +r +is a +reward model +learned from data collected by +π +ref +\pi_{\text{ref}} +. In this case, regularization helps prevent +π +θ +\pi_{\theta} +from deviating too far from the distribution where the reward model is accurate +(Jaques et al., +2019 +; Stiennon et al., +2020 +) +. +However, RL-tuning reasoning models typically employs +rule-based verifiers +as +r +r +(Lambert et al., +2024 +) +, eliminating the concerns of distributional shift. This allows us to remove the KL term, which not only saves the memory and computation required by +π +ref +\pi_{\text{ref}} +during training, but also potentially leads to better performance for R1-Zero-like training +(Hu et al., +2025 +) +. We will assume +β += +0 +\beta=0 +throughout this paper. +Policy optimization algorithms +. To optimize +π +θ +\pi_{\theta} +with the above objective ( +Eq. 
+1 +with +$\beta=0$ +), Proximal Policy Optimization (PPO) +(Schulman et al., +2017b +) +maximizes the following surrogate objective: +$$\begin{split}{\mathcal{J}}_{PPO}(\pi_{\theta})&=\mathbb{E}_{{\mathbf{q}}\sim p_{{\mathcal{Q}}},{\mathbf{o}}\sim\pi_{\theta_{\text{old}}}(\cdot|{\mathbf{q}})}\\&\sum_{t=1}^{|{\mathbf{o}}|}\left\{\min\left[\frac{\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{< t})}{\pi_{\theta_{\text{old}}}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{< t})}\hat{A}_{t},\operatorname{clip}\left(\frac{\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{< t})}{\pi_{\theta_{\text{old}}}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{< t})},1-\epsilon,1+\epsilon\right)\hat{A}_{t}\right]\right\},\end{split}$$ +[… text lost in extraction …] +( +$\hat{A}_{i,t}>0$ +, indicating a correct response), this bias results in greater gradient updates for shorter responses, leading the policy to favor brevity in correct answers. Conversely, for negative advantages ( +$\hat{A}_{i,t}<0$ +, indicating an incorrect response), longer responses are penalized less due to their larger +$|{\mathbf{o}}_{i}|$ +, causing the policy to prefer lengthier responses among incorrect ones. +• +Question-level difficulty bias +: This is caused by dividing the centered outcome reward by +$\operatorname{std}(\{R({\mathbf{q}},{\mathbf{o}}_{1}),\dots,R({\mathbf{q}},{\mathbf{o}}_{G})\})$ +. Questions with lower standard deviations (e.g., those that are too easy or too hard, with the outcome rewards being almost all 1 or 0) are given higher weights during policy updates. While advantage normalization is a common trick in RL +(Andrychowicz et al., +2021 +) +, it is typically computed across an entire batch. In contrast, question-level normalization results in varying weights in the objective for different questions, leading to a difficulty bias in optimization. 
+Figure 4: +Illustration of the biases in GRPO. Note that the effective advantage of GRPO +$a_{i,t}$ +is equivalent to a reweighted version of the unbiased advantage +$\tilde{A}_{i,t}=R({\mathbf{q}},{\mathbf{o}}_{i})-\operatorname{mean}(\mathbf{R})$ +. The terms +$\operatorname{std}(\mathbf{R})$ +and +$|{\mathbf{o}}_{i}|$ +could bias the optimization by assigning different weights to different questions and responses, as denoted by the sizes of the blue circles and the lengths of the orange arrows. Upward arrows indicate positive advantages, and vice versa. +Length Bias Also Exists in Open-Source PPO Implementations +. We also examined several popular open-source implementations of vanilla PPO algorithms for LLM post-training. To our surprise, all of these implementations normalize the loss by response length (see +Listing 1 +and +Table +2 +), which +misaligns +with the PPO objective as defined in +Eq. +2 +. +This formulation-implementation misalignment was present even before the publication of GRPO. We speculate that the misalignment might originate from the +pretraining stage +(Shoeybi et al., +2019 +) +, where all tokens are packed into a fixed-length context and normalizing the loss by the context length (i.e., computing +loss.mean(-1) +) improves the numerical stability. However, in the +RL-tuning stage +, typical implementations +(von Werra et al., +2020 +) +normalize the loss by the response length, which is +not +a constant, introducing an unintended length bias. +Listing 1: +Comparison between a typical open-source PPO loss implementation that is biased (red) and our implementation (green). +MAX_TOKENS +is a global constant during the entire training (unless budget curriculum is enabled), which specifies the maximum number of generation tokens. Other constants also work with differences in gradient norm. 
+⬇ +1 +def +masked_mean +( +tensor +, +mask +, +dim +): +2 +- +return +(tensor +* +mask).sum(axis=dim) +/ +mask.sum(axis=dim) +3 ++ +return +(tensor +* +mask).sum(axis=-1) +/ +MAX_TOKENS +4 +5 +ppo_loss += +... +# +compute +per-token +ppo +loss +6 +response_mask += +... +# +per-token +response +mask +7 +# +per-response +length +normalization +(e.g., +OpenRLHF) +8 +loss_variant1 += +masked_mean(ppo_loss, +response_mask, +dim=-1).mean() +9 +# +OR +per-batch +length +normalization +(e.g., +trl, +verl) +10 +loss_variant2 += +masked_mean(ppo_loss, +response_mask, +dim=None).mean() +Repository +Code Link +Unbiased? +trl +(von Werra et al., +2020 +) +PPO Loss +OpenRLHF +(Hu et al., +2024 +) +PPO Loss +verl +(Sheng et al., +2024 +) +PPO Loss +SimpleRL-Zero +(Zeng et al., +2025 +) +PPO Loss +Open-Reasoner-Zero +(Hu et al., +2025 +) +PPO Loss +Table 2: +Many open-sourced PPO implementations contain length bias. +3.2 +Dr. GRPO: Group Relative Policy Optimization Done Right +To avoid the aforementioned optimization bias in GRPO, we propose to simply remove the +1 +| +𝐨 +i +| +{\color[rgb]{1,0,0}\definecolor[named]{pgfstrokecolor}{rgb}{1,0,0}\frac{1}{|{\mathbf{o}}_{i}|}} +and +std +⁡ +( +{ +R +​ +( +𝐪 +, +𝐨 +1 +) +, +… +, +R +​ +( +𝐪 +, +𝐨 +G +) +} +) +{\color[rgb]{1,0,0}\definecolor[named]{pgfstrokecolor}{rgb}{1,0,0}\operatorname{std}({\{R({\mathbf{q}},{\mathbf{o}}_{1}),\dots,R({\mathbf{q}},{\mathbf{o}}_{G})\}})} +normalization terms. Meanwhile, to faithfully implement the unbiased optimization objective, we could replace the +mask.sum(axis=dim) +with a constant value (e.g., generation budget) in the +masked_mean +function in +LABEL:lst:ppo_impl +, as highlighted by the line in green. +Notably, these simple modifications recover the PPO objective in +Eq. +2 +, with the advantage estimated by Monte Carlo return with an unbiased baseline +(Sutton & Barto, +2018 +) +. We give detailed derivations in +App. +A +. We refer to our new optimization algorithm as +Dr. GRPO +. 
We next experimentally validate its effectiveness. +Experimental settings +. +We implement our algorithm using Oat +(Liu et al., +2025a +) +, a modular, research-friendly and efficient LLM RL framework. We adopt the Qwen2.5-1.5B base model and the R1 template ( +Template +1 +) for online RL-tuning. We implement the verification-based reward function using Math-Verify +2 +2 +2 +https://github.com/huggingface/Math-Verify +. +, with the following minimalistic rule: +$$R(\mathbf{q},\mathbf{o})=\begin{cases}1&\text{if $\mathbf{o}$ contains the correct final answer to $\mathbf{q}$}\\ 0&\text{otherwise}\end{cases}$$ +We run RL on questions sampled from the MATH +(Hendrycks et al., +2021 +) +training dataset, and compare the vanilla GRPO with the proposed Dr. GRPO. We evaluate the online model on five benchmarks: AIME2024, AMC, MATH500, Minerva Math and OlympiadBench. +More experimental details including hyperparameters can be found in +App. +G +. +Figure 5: +Comparison of Dr. GRPO and GRPO in terms of training dynamics (Top) and evaluation results (Bottom). +Results +. We report various metrics in +Fig. +5 +to demonstrate that Dr. GRPO can effectively mitigate the optimization bias and lead to +better token efficiency +. In particular, we first note that both GRPO and Dr. GRPO exhibit a similar trend to DeepSeek-R1-Zero +(Guo et al., +2025 +) +, namely their response length increases along with training reward (Plots 1 & 2). However, we observe that GRPO tends to continually generate longer +responses even when the reward improvement slows down (Plot 2). Although such a phenomenon is often referred to as the “emergence” of long-CoT through RL +(Zeng et al., +2025 +; Hu et al., +2025 +) +, we argue that it is also confounded by the response-level length bias ( +Sec. +3.1 +) during optimization +3 +3 +3 +We note that both +Zeng et al. ( +2025 +) +and +Hu et al.
( +2025 +) +employ PPO, which is unbiased by formulation. However, their loss implementations still introduce the length bias (see +LABEL:lst:ppo_impl +). +. In contrast, by computing the unbiased policy gradients, Dr. GRPO prevents the response length from growing wildly during training (Plot 2). Moreover, on evaluation benchmarks, the length of incorrect responses is substantially reduced by Dr. GRPO compared to the baseline (Plot 4), suggesting that an unbiased optimizer also +mitigates overthinking +(Chen et al., +2024 +) +. +Figure 6: +The average benchmark accuracy of different {template, question set} combinations during RL training. +3.3 +A Duet of Template and Question Set Coverage in RL dynamics +Recall that the Qwen2.5-Math base models can readily answer questions with high accuracy without any prompt template ( +Sec. +2.2 +). Based on this intriguing observation, we are interested in how different templates affect the RL training. Furthermore, given the general belief that larger question set coverage leads to better performance +(Luo et al., +2025 +; Hu et al., +2025 +) +, we also study the interaction between different templates and different levels of question coverage. +Experimental settings +. Starting from the Qwen2.5-Math-1.5B base model, we apply R1 template, Qwen-Math template and No template respectively to run RL using Dr. GRPO. All experiments are repeated for different question sets that are detailed in +Table +3 +. +Question set +# +Description +ORZ +57k +Combining AIME, Numina-Math, Tulu3 MATH; diverse and large amount +MATH +12k +High-school math competition questions +GSM +8k +Simpler grade-school math questions +ASDiv +2k +Basic algebra ( ++ +− +× +÷ +) ++-\times\div) +questions +Table 3: +Different question sets that have different levels of difficulty and coverage. +Results +. +Fig. 
+6 +shows the RL curves of different runs, from which we can make several interesting observations: +1) +Templates determine the performance of the initial policies, but RL can improve all policies to a comparable performance of +∼ +40 +% +\sim 40\% +(given a proper question set); +2) +When using the R1 template, question sets have a significant impact on the dynamics of RL, with too narrow coverage leading to lower plateau performance. However, when using the Qwen-Math template, the best final performance is attained by RL on GSM-8K, demonstrating that training on much simpler (and o.o.d.) questions can largely improve (nearly double) the test accuracy on harder questions. From these observations, we draw the following insights: +• +The Qwen2.5-Math-1.5B base model already possesses strong math-solving capabilities (see the starting point in the right plot of +Fig. +6 +). +Applying templates in fact destroys +the capability before RL reconstructs it. This implies that we should be more conservative in claiming the huge gains brought about by pure RL. +• +When there is a large +mismatch +between base models and templates (e.g., R1 template mismatches Qwen2.5-Math-1.5B), the policy improvement mainly comes from RL-tuning, thus requiring question set to have good coverage (left plot of +Fig. +6 +). +Otherwise +, even a small and completely o.o.d. question set could induce the reasoning ability equally well, by +reinforcing useful reasoning behaviors instead of infusing new knowledge +. +3.4 +Domain-Specific Pretraining Improves RL Ceiling +Recent successful R1-Zero-like replications of math reasoners mostly employ Qwen2.5 base models as the initial policies +(Zeng et al., +2025 +; Cui et al., +2025 +; Hu et al., +2025 +) +, which are already strong math solvers and exhibit self-reflection patterns ( +Sec. +2.2 +and +2.3 +). 
+In this section we hope to explore the other side: +can R1-Zero-like training succeed on originally weak (in terms of math reasoning) base models? +We answer this question affirmatively, with the observation that +math pretraining would improve the ceiling of RL +. +Figure 7: +Left +: The average benchmark performance curves of different base models. +Right +: The comparison between Dr. GRPO and GRPO with respect to reasoning accuracy (solid lines) and model response length (dashed lines). +Experimental settings +. We adopt the Llama-3.2-3B base model as our starting point, and use the unbiased Dr. GRPO algorithm for RL-tuning with the R1 template. We hypothesize that domain-specific pretraining would help RL, hence we adopt the +Llama-3.2-3B-FineMath +4 +4 +4 +https://huggingface.co/HuggingFaceTB/FineMath-Llama-3B +. +, which is continually pretrained on the FineMath dataset +(Allal et al., +2025 +) +. Moreover, as we hypothesize that Qwen2.5 models are likely to be pretrained on concatenated question-response texts ( +Sec. +2.2 +), we similarly prepare a concatenated dataset from NuminaMath-1.5 +(Li et al., +2024b +) +, and continually pretrain Llama-3.2-3B-FineMath for 2 epochs with learning rate 1e-5. We refer to the concatenated continually pretrained model as +Llama-3.2-3B-NuminaQA +. +Results +. We present the RL curves of different base models in the left plot of +Fig. +7 +. We observe that RL can even improve the vanilla Llama base model, but the gain is minimal. After continual pretraining (and concatenated continual pretraining) to embed math domain knowledge, Llama models can show much stronger RL performance, validating our hypothesis. We also revisit the GRPO’s optimization bias with the Llama base model. The right plot of +Fig. +7 +compares the model performance and response length trained with GRPO and Dr. GRPO.
We can clearly see that GRPO can produce the “double-increase” phenomenon, potentially leading to a +misperception +that long-CoT can also emerge on Llama models after math pretraining. Unfortunately, the increase of length might be due to the optimization bias ( +Sec. +3.1 +), which can be effectively mitigated by the proposed Dr. GRPO ( +Sec. +3.2 +& right plot of +Fig. +7 +). +4 +Closing Remarks +We have taken a critical perspective to examine base models used for R1-Zero-like training, as well as algorithms used for RL. Through the analysis, we demystified how pretraining biases influence RL outcomes and how optimization choices, like GRPO, can unintentionally shape model behavior. With the proposed Dr. GRPO, we offer a simple fix that improves token efficiency while preserving reasoning performance. Our results show that scaling RL can be both effective and efficient—sometimes, less really is more. +References +Ahmadian et al. (2024) +Arash Ahmadian, Chris Cremer, Matthias Gallé, Marzieh Fadaee, Julia Kreutzer, Olivier Pietquin, Ahmet Üstün, and Sara Hooker. +Back to basics: Revisiting reinforce style optimization for learning from human feedback in llms. +arXiv preprint arXiv:2402.14740 +, 2024. +Allal et al. (2025) +Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, Gabriel Martín Blázquez, Guilherme Penedo, Lewis Tunstall, Andrés Marafioti, Hynek Kydlíček, Agustín Piqueres Lajarín, Vaibhav Srivastav, et al. +Smollm2: When smol goes big–data-centric training of a small language model. +arXiv preprint arXiv:2502.02737 +, 2025. +Andrychowicz et al. (2021) +Marcin Andrychowicz, Anton Raichuk, Piotr Stańczyk, Manu Orsini, Sertan Girgin, Raphaël Marinier, Leonard Hussenot, Matthieu Geist, Olivier Pietquin, Marcin Michalski, et al. +What matters for on-policy deep actor-critic methods? a large-scale study. +In +International conference on learning representations +, 2021. +Chen et al. 
(2024) +Xingyu Chen, Jiahao Xu, Tian Liang, Zhiwei He, Jianhui Pang, Dian Yu, Linfeng Song, Qiuzhi Liu, Mengfei Zhou, Zhuosheng Zhang, et al. +Do not think that much for 2+ 3=? on the overthinking of o1-like llms. +arXiv preprint arXiv:2412.21187 +, 2024. +Christiano et al. (2017) +Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei. +Deep reinforcement learning from human preferences. +Advances in neural information processing systems +, 30, 2017. +Cui et al. (2025) +Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, et al. +Process reinforcement through implicit rewards. +arXiv preprint arXiv:2502.01456 +, 2025. +Grattafiori et al. (2024) +Aaron Grattafiori, Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Alex Vaughan, et al. +The llama 3 herd of models. +arXiv preprint arXiv:2407.21783 +, 2024. +Guo et al. (2025) +Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. +Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. +arXiv preprint arXiv:2501.12948 +, 2025. +He et al. (2024) +Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, et al. +Olympiadbench: A challenging benchmark for promoting agi with olympiad-level bilingual multimodal scientific problems. +arXiv preprint arXiv:2402.14008 +, 2024. +Hendrycks et al. (2021) +Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. +Measuring mathematical problem solving with the math dataset. +arXiv preprint arXiv:2103.03874 +, 2021. +Hu et al. (2024) +Jian Hu, Xibin Wu, Zilin Zhu, Xianyu, Weixun Wang, Dehao Zhang, and Yu Cao. +Openrlhf: An easy-to-use, scalable and high-performance rlhf framework. 
+arXiv preprint arXiv:2405.11143 +, 2024. +Hu et al. (2025) +Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, and Heung-Yeung Shum Xiangyu Zhang. +Open-reasoner-zero: An open source approach to scaling reinforcement learning on the base model. +https://github.com/Open-Reasoner-Zero/Open-Reasoner-Zero +, 2025. +Jaques et al. (2019) +Natasha Jaques, Asma Ghandeharioun, Judy Hanwen Shen, Craig Ferguson, Agata Lapedriza, Noah Jones, Shixiang Gu, and Rosalind Picard. +Way off-policy batch deep reinforcement learning of implicit human preferences in dialog. +arXiv preprint arXiv:1907.00456 +, 2019. +Kool et al. (2019) +Wouter Kool, Herke van Hoof, and Max Welling. +Buy 4 reinforce samples, get a baseline for free!, 2019. +Lambert et al. (2024) +Nathan Lambert, Jacob Morrison, Valentina Pyatkin, Shengyi Huang, Hamish Ivison, Faeze Brahman, Lester James V Miranda, Alisa Liu, Nouha Dziri, Shane Lyu, et al. +Tülu 3: Pushing frontiers in open language model post-training. +arXiv preprint arXiv:2411.15124 +, 2024. +Lewkowycz et al. (2022) +Aitor Lewkowycz, Anders Andreassen, David Dohan, Ethan Dyer, Henryk Michalewski, Vinay Ramasesh, Ambrose Slone, Cem Anil, Imanol Schlag, Theo Gutman-Solo, et al. +Solving quantitative reasoning problems with language models. +Advances in Neural Information Processing Systems +, 35:3843–3857, 2022. +Li et al. (2024a) +Jia Li, Edward Beeching, Lewis Tunstall, Ben Lipkin, Roman Soletskyi, Shengyi Huang, Kashif Rasul, Longhui Yu, Albert Q Jiang, Ziju Shen, et al. +Numinamath: The largest public dataset in ai4maths with 860k pairs of competition math problems and solutions. +Hugging Face repository +, 13:9, 2024a. +Li et al. (2024b) +Jia Li, Edward Beeching, Lewis Tunstall, Ben Lipkin, Roman Soletskyi, Shengyi Costa Huang, Kashif Rasul, Longhui Yu, Albert Jiang, Ziju Shen, Zihan Qin, Bin Dong, Li Zhou, Yann Fleureau, Guillaume Lample, and Stanislas Polu. +Numinamath, 2024b. +Liu et al.
(2024) +Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, et al. +Deepseek-v3 technical report. +arXiv preprint arXiv:2412.19437 +, 2024. +Liu et al. (2025a) +Zichen Liu, Changyu Chen, Chao Du, Wee Sun Lee, and Min Lin. +Oat: A research-friendly framework for llm online alignment. +https://github.com/sail-sg/oat +, 2025a. +Liu et al. (2025b) +Zichen Liu, Changyu Chen, Wenjun Li, Tianyu Pang, Chao Du, and Min Lin. +There may not be aha moment in r1-zero-like training — a pilot study. +https://oatllm.notion.site/oat-zero +, 2025b. +Notion Blog. +Luo et al. (2025) +Michael Luo, Sijun Tan, Justin Wong, Xiaoxiang Shi, William Y. Tang, Manan Roongta, Colin Cai, Jeffrey Luo, Tianjun Zhang, Li Erran Li, Raluca Ada Popa, and Ion Stoica. +Deepscaler: Surpassing o1-preview with a 1.5b model by scaling rl. +https://github.com/agentica-project/deepscaler +, 2025. +Pan et al. (2025) +Jiayi Pan, Junjie Zhang, Xingyao Wang, Lifan Yuan, Hao Peng, and Alane Suhr. +Tinyzero. +https://github.com/Jiayi-Pan/TinyZero, 2025. +Accessed: 2025-01-24. +Schulman et al. (2015) +John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel. +High-dimensional continuous control using generalized advantage estimation. +arXiv preprint arXiv:1506.02438 +, 2015. +Schulman et al. (2017a) +John Schulman, Xi Chen, and Pieter Abbeel. +Equivalence between policy gradients and soft q-learning. +arXiv preprint arXiv:1704.06440 +, 2017a. +Schulman et al. (2017b) +John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. +Proximal policy optimization algorithms. +arXiv preprint arXiv:1707.06347 +, 2017b. +Shao et al. (2024) +Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Y Wu, et al. +Deepseekmath: Pushing the limits of mathematical reasoning in open language models. +arXiv preprint arXiv:2402.03300 +, 2024. +Sheng et al. 
(2024) +Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu. +Hybridflow: A flexible and efficient rlhf framework. +arXiv preprint arXiv:2409.19256 +, 2024. +Shoeybi et al. (2019) +Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. +Megatron-lm: Training multi-billion parameter language models using model parallelism. +arXiv preprint arXiv:1909.08053 +, 2019. +Stiennon et al. (2020) +Nisan Stiennon, Long Ouyang, Jeffrey Wu, Daniel Ziegler, Ryan Lowe, Chelsea Voss, Alec Radford, Dario Amodei, and Paul F Christiano. +Learning to summarize with human feedback. +Advances in neural information processing systems +, 33:3008–3021, 2020. +Sutton & Barto (2018) +Richard S. Sutton and Andrew G. Barto. +Reinforcement Learning: An Introduction +. +The MIT Press, second edition, 2018. +von Werra et al. (2020) +Leandro von Werra, Younes Belkada, Lewis Tunstall, Edward Beeching, Tristan Thrush, Nathan Lambert, Shengyi Huang, Kashif Rasul, and Quentin Gallouédec. +Trl: Transformer reinforcement learning. +https://github.com/huggingface/trl +, 2020. +Yang et al. (2024a) +An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, et al. +Qwen2.5 technical report. +arXiv preprint arXiv:2412.15115 +, 2024a. +Yang et al. (2024b) +An Yang, Beichen Zhang, Binyuan Hui, Bofei Gao, Bowen Yu, Chengpeng Li, Dayiheng Liu, Jianhong Tu, Jingren Zhou, Junyang Lin, et al. +Qwen2.5-math technical report: Toward mathematical expert model via self-improvement. +arXiv preprint arXiv:2409.12122 +, 2024b. +Yeo et al. (2025) +Edward Yeo, Yuxuan Tong, Morry Niu, Graham Neubig, and Xiang Yue. +Demystifying long chain-of-thought reasoning in llms. +arXiv preprint arXiv:2502.03373 +, 2025. +Zeng et al. (2025) +Weihao Zeng, Yuzhen Huang, Wei Liu, Keqing He, Qian Liu, Zejun Ma, and Junxian He. 
+7b model and 8k examples: Emerging reasoning with reinforcement learning is both effective and efficient. +https://hkust-nlp.notion.site/simplerl-reason +, 2025. +Notion Blog. +Appendix A +Policy Gradient Derivations +In the context of RL for LLM post-training, we typically maximize the value of +𝒥 +​ +( +π +θ +) += +𝔼 +𝐪 +∼ +p +𝒬 +​ +[ +𝔼 +𝐨 +∼ +π +θ +( +⋅ +| +𝐪 +) +​ +[ +R +​ +( +𝐪 +, +𝐨 +) +] +] +, +\mathcal{J}(\pi_{\theta})=\underset{{{\mathbf{q}}\sim p_{\mathcal{Q}}}}{\mathbb{E}}\left[\underset{{\mathbf{o}}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}[R({\mathbf{q}},{\mathbf{o}})]\right], +(4) +where +R +​ +( +𝐪 +, +𝐨 +) += +∑ +t += +1 +| +𝐨 +| +r +​ +( +𝐪 +, +𝐨 +≤ +t +) +R({\mathbf{q}},{\mathbf{o}})=\sum_{t=1}^{|{\mathbf{o}}|}r({\mathbf{q}},{\mathbf{o}}_{\leq t}) +is the return +(Sutton & Barto, +2018 +) +of the trajectory +𝐪 +; +𝐨 +{\mathbf{q}};{\mathbf{o}} +, and +r +​ +( +𝐪 +, +𝐨 +≤ +t +) +r({\mathbf{q}},{\mathbf{o}}_{\leq t}) +represents the token-level reward for +t +t +-th token in response +𝐨 +{\mathbf{o}} +. +The Monte Carlo policy gradient +(Sutton & Barto, +2018 +) +of +Eq. 
+4 +is +∇ +θ +𝒥 +​ +( +π +θ +) += +𝔼 +𝐪 +∼ +p +𝒬 +​ +[ +𝔼 +𝐨 +∼ +π +θ +( +⋅ +| +𝐪 +) +​ +[ +∇ +θ +log +⁡ +π +θ +​ +( +𝐨 +| +𝐪 +) +​ +R +​ +( +𝐪 +, +𝐨 +) +] +] += +𝔼 +𝐪 +∼ +p +𝒬 +​ +[ +𝔼 +𝐨 +∼ +π +θ +( +⋅ +| +𝐪 +) +​ +[ +∇ +θ +​ +∑ +t += +1 +| +𝐨 +| +log +⁡ +π +θ +​ +( +o +t +| +𝐪 +, +𝐨 +< +t +) +​ +R +​ +( +𝐪 +, +𝐨 +) +] +] += +𝔼 +𝐪 +∼ +p +𝒬 +​ +[ +𝔼 +𝐨 +∼ +π +θ +( +⋅ +| +𝐪 +) +​ +[ +∑ +t += +1 +| +𝐨 +| +∇ +θ +log +⁡ +π +θ +​ +( +o +t +| +𝐪 +, +𝐨 +< +t +) +​ +∑ +t +′ += +t +| +𝐨 +| +r +​ +( +𝐪 +, +𝐨 +≤ +t +′ +) +] +] += +𝔼 +𝐪 +∼ +p +𝒬 +​ +[ +𝔼 +𝐨 +∼ +π +θ +( +⋅ +| +𝐪 +) +​ +[ +∑ +t += +1 +| +𝐨 +| +∇ +θ +log +⁡ +π +θ +​ +( +o +t +| +𝐪 +, +𝐨 +< +t +) +​ +( +∑ +t +′ += +t +| +𝐨 +| +r +​ +( +𝐪 +, +𝐨 +≤ +t +′ +) +− +B +​ +( +𝐪 +, +𝐨 +< +t +) +) +] +] +, +\begin{split}\nabla_{\theta}\mathcal{J}(\pi_{\theta})&=\underset{{{\mathbf{q}}\sim p_{\mathcal{Q}}}}{\mathbb{E}}\left[\underset{{\mathbf{o}}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}[\nabla_{\theta}\log\pi_{\theta}({\mathbf{o}}|{\mathbf{q}})R({\mathbf{q}},{\mathbf{o}})]\right]\\ +&=\underset{{{\mathbf{q}}\sim p_{\mathcal{Q}}}}{\mathbb{E}}\left[\underset{{\mathbf{o}}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}[\nabla_{\theta}\sum_{t=1}^{|{\mathbf{o}}|}\log\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{ +without guessing. See +recipe/README.md +for the full index and links. 
+For code explanation and advanced usage (extension): +PPO Trainer and Workers +PPO Ray Trainer +Model Engine +Engine Workers (FSDP / Megatron-LM / Automodel / VeOmni / TorchTitan) +Advanced Usage and Extension +Add Models with the FSDP Backend +Add Models with the Megatron-LM Backend +Multi-turn Rollout Support +Search Tool Integration +Sandbox Fusion Integration +Extend to Other RL(HF) algorithms +Ray API design tutorial +Blogs from the community +When Reasoning Models Break Tokenization: The Hidden Complexity of Multiturn Training +verl deployment on AWS SageMaker +verl x SGLang Multi-turn Code Walkthrough +Optimizing SGLang Memory Usage in verl +SGLang, verl, OpenBMB and Tsinghua University: Pioneering End-to-End Multi-Turn RLHF +Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration +veMLP x verl :玩转强化学习训练 +使用 verl 进行 GRPO 分布式强化学习训练最佳实践 +HybridFlow verl 原文浅析 +最高提升 20 倍吞吐量!豆包大模型团队发布全新 RLHF 框架,现已开源! +Performance Tuning Guide +Performance is essential for on-policy RL algorithms. We have written a detailed +performance tuning guide +to help you optimize performance. +Upgrade to vLLM >= v0.8.2 +verl now supports vLLM>=0.8.2 when using FSDP as the training backend. Please refer to +this document +for the installation guide and more information. Please avoid vllm 0.7.x, which contains bugs that may lead to OOMs and unexpected errors. +Use Latest SGLang +SGLang is fully supported with verl, and SGLang RL Group is working extensively on building unique features, including multi-turn agentic RL, VLM RLHF, server-based RL, and partial rollout. Please refer to +this document +for the installation guide and more information. +Upgrade to FSDP2 +verl is fully embracing FSDP2! FSDP2 is recommended by the torch distributed team, providing better throughput and memory usage, and is composable with other features (e.g. torch.compile).
To enable FSDP2, simply use verl main and set the following options: +actor_rollout_ref.ref.strategy=fsdp2 +actor_rollout_ref.actor.strategy=fsdp2 +critic.strategy=fsdp2 +Furthermore, FSDP2 cpu offloading is compatible with gradient accumulation. You can turn it on to save memory with +actor_rollout_ref.actor.fsdp_config.offload_policy=True +. For more details, see +#1026 +AMD Support (ROCm Kernel) +verl runs on AMD ROCm GPUs (MI300X / MI325X / MI355X) with FSDP, FSDP2, and Megatron trainer backends, and vLLM as the validated inference engine (SGLang support is in progress). See the +AMD ROCm quick-start guide +for container bring-up, environment verification, and training examples. +Citation and acknowledgement +If you find the project helpful, please cite: +HybridFlow: A Flexible and Efficient RLHF Framework +A Framework for Training Large Language Models for Code Generation via Proximal Policy Optimization +@article +{ +sheng2024hybridflow +, +title += +{ +HybridFlow: A Flexible and Efficient RLHF Framework +} +, +author += +{ +Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu +} +, +year += +{ +2024 +} +, +journal += +{ +arXiv preprint arXiv: 2409.19256 +} +} +verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The project is adopted and contributed to by Bytedance, Anyscale, LMSys.org, +Alibaba Qwen team +, Shanghai AI Lab, Tsinghua University, UC Berkeley, UCLA, UIUC, University of Hong Kong, ke.com, +All Hands AI +, +ModelBest +, JD AI Lab, Microsoft Research, +StepFun +, Amazon, LinkedIn, Meituan, +Camel-AI +, +OpenManus +, Xiaomi, NVIDIA research, +Baichuan +, +RedNote +, +SwissAI +, +Moonshot AI (Kimi) +, Baidu, Snowflake, Skywork.ai, JetBrains, +IceSword Lab +, and many more. +Awesome Projects Built with +verl +Welcome to register your awesome project built with +verl +for other developers' reference!
+TinyZero +: a reproduction of +DeepSeek R1 Zero +recipe for reasoning tasks +SkyThought +: RL training for Sky-T1-7B by NovaSky AI team. +simpleRL-reason +: SimpleRL-Zoo: Investigating and Taming Zero Reinforcement Learning for Open Base Models in the Wild +Easy-R1 +: +Multi-modal +RL training framework +RandOpt +: Neural Thickets: Diverse Task Experts Are Dense Around Pretrained Weights +OpenManus-RL +: LLM Agents RL tuning framework for multiple agent environments. +rllm +: async RL training with +verl-pipeline +RAGEN +: a general-purpose reasoning +agent +training framework +Search-R1 +: RL with reasoning and +searching (tool-call) +interleaved LLMs +ReSearch +: Learning to +Re +ason with +Search +for LLMs via Reinforcement Learning +Skywork-OR1 +: Skywork open reasoner series +ToRL +: Scaling tool-integrated RL +Absolute Zero Reasoner +: +A no human curated data self-play framework for reasoning +verl-agent +: A scalable training framework for +long-horizon LLM/VLM agents +, along with a new algorithm +GiGPO +RL-Factory +: An easy and efficient RL post-training framework for Agentic Learning +ReTool +: ReTool: reinforcement learning for strategic tool use in LLMs. Code release is in progress... +verl-tool +: A unified and easy-to-extend tool-agent training framework based on verl +PRIME +: Process reinforcement through implicit rewards +MemAgent +: MemAgent: Reshaping Long-Context LLM with Multi-Conv RL based Memory Agent +POLARIS +: A Post-training recipe for scaling RL on Advanced Reasoning models +GUI-R1 +: +GUI-R1 +: A Generalist R1-style Vision-Language Action Model For +GUI Agents +DeepRetrieval +: RL Training of +Search Agent +with +Search/Retrieval Outcome +Code-R1 +: Reproducing R1 for +Code +with Reliable Rewards +DeepResearcher +: Scaling deep research via reinforcement learning in real-world environments +VAGEN +: Training VLM agents with multi-turn reinforcement learning +RM-R1 +: RL training of reasoning reward models +Dr.
MAS +: Stable +end-to-end RL +post-training for +multi-agent LLM systems +LUFFY +: Learning to Reason under Off-Policy Guidance +DeepMath +: DeepMath-103K data and series models for math reasoning +PACS +: Implicit Actor Critic Coupling via a Supervised Learning Framework for RLVR +Entropy Mechanism of RL +: The Entropy Mechanism of Reinforcement Learning for Large Language Model Reasoning +LLaSA-TTS-GRPO +: TTS fine-tuning with GRPO optimization based on LLASA models +PF-PPO +: Policy Filtration for PPO based on the reliability of reward signals for more efficient and robust RLHF. +RACRO +: Build multi-modal reasoning models via decoupling it into query-conditioned captioning and text-only reasoning +Agent Lightning +: A flexible and extensible framework that enables seamless agent optimization for any existing agent framework. +VTool-R1 +: VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use. +Kimina-Prover-RL +: Training pipeline for formal theorem proving, based on a paradigm inspired by DeepSeek-R1. +RL-PLUS +: Countering Capability Boundary Collapse of LLMs in Reinforcement Learning with Hybrid-policy Optimization. 
+rStar2-Agent +: Using reinforcement learning with multi-step tool-calling for math tasks, rStar2-Agent-14B reaches frontier-level math reasoning in just 510 RL training steps +Vision-SR1 +: Self-Rewarding Vision-Language Model via Reasoning Decomposition +SimpleVLA-RL +: SimpleVLA-RL: A Simple yet Effective Vision-Language Action Model for Reinforcement Learning +Table-R1 +: Table-R1: Inference-Time Scaling for Table Reasoning +Revisual-R1 +: Revisual-R1: Advancing Multimodal Reasoning From Optimized Cold Start to Staged Reinforcement Learning +ARES +: ARES: Multimodal Adaptive Reasoning via Difficulty-Aware Token-Level Entropy Shaping +Meta-Bandit-LLM +: Meta-Bandit-LLM: Long-horizon multiturn interactive training for meta-bandit agents +PokeeResearch +: PokeeResearch: State-of-the-art 7B DeepResearch Agent that leverages web search and content reading capabilities to answer complex questions using the most up-to-date information available online. +Search Self-play +: Pushing the Frontier of Agent Capability without Supervision +OneThinker +: All-in-one Reasoning Model for Image and Video +OpenTinker +: Democratizing Agentic Reinforcement Learning as a Service +FlowRL +: Matching reward distributions via +flow balance +for diverse exploration and generalizable reasoning +Logic-RL +: a reproduction of DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset. 
+Seed-Coder +: RL training of Seed-Coder boosts performance on competitive programming +all-hands/openhands-lm-32b-v0.1 +: A strong, open coding agent model, trained with +multi-turn fine-tuning +s3 +Efficient Yet Effective +Search Agent Training via RL +Rec-R1 +: Bridging Generative Large Language Models and Recommendation Systems via Reinforcement Learning +Explore RL Data Scaling +: Exploring Data Scaling Trends and Effects in Reinforcement Learning from Human Feedback +FIRE +: Flaming-hot initiation with regular execution sampling for large language models +DQO +: Enhancing multi-Step reasoning abilities of language models through direct Q-function optimization +ProRL +: Prolonged Reinforcement Learning Expands Reasoning Boundaries in Large Language Models +cognition-engineering +: Test time scaling drives cognition engineering. +Trust Region Preference Approximation +: A simple and stable +reinforcement learning algorithm +for LLM reasoning. +AdaRFT +: Efficient Reinforcement Finetuning via +Adaptive Curriculum Learning +critic-rl +: LLM critics for code generation +self-rewarding-reasoning-LLM +: self-rewarding and correction with +generative reward models +DeepEnlighten +: Reproduce R1 with +social reasoning +tasks and analyze key findings +MetaSpatial +: Reinforcing +3D Spatial Reasoning +in +VLMs +for the +Metaverse +PURE +: +Credit assignment +is the key to successful reinforcement fine-tuning using +process reward model +cognitive-behaviors +: Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs +deepscaler +: iterative context scaling with GRPO +DAPO +: the fully open source SOTA RL algorithm that beats DeepSeek-R1-zero-32B +NoisyRollout +: Reinforcing Visual Reasoning with Data Augmentation +SPEAR +: +Self-imitation +with +Progressive Exploration +for Agentic Reinforcement Learning (ICLR 2026) +RuleReasoner +: +RuleReasoner: +Reinforced Rule-based Reasoning via +Domain-aware Dynamic Sampling +(ICLR 2026) 
+MetaphorStar +: +Image Metaphor +Understanding and Reasoning with End-to-End +Visual Reinforcement Learning +DART-GUI +: a decoupled agentic RL framework for Computer Use Agents, achieving ~2× training speedup and ~5× environment utilization! +Rethinking OPD +: Rethinking On-Policy Distillation of Large Language Models: Phenomenology, Mechanism, and Recipe +Contribution Guide +See +contributions guide +About +ByteDance Seed Team +Founded in 2023, ByteDance Seed Team is dedicated to crafting the industry's most advanced AI foundation models. The team aspires to become a world-class research team and make significant contributions to the advancement of science and society. You can get to know ByteDance Seed better through the following channels👇 +We are HIRING! Send us an +email +if you are interested in internship/FTE opportunities in RL for agents. +You can’t perform that action at this time. \ No newline at end of file diff --git a/research/notes/welcome-to-verls-documentation-verl-documentation-2.md b/research/notes/welcome-to-verls-documentation-verl-documentation-2.md new file mode 100644 index 0000000000000000000000000000000000000000..a232ca52a93c8e60291604bc75521d2530282aa3 --- /dev/null +++ b/research/notes/welcome-to-verls-documentation-verl-documentation-2.md @@ -0,0 +1,227 @@ +--- +title: Welcome to verl’s documentation! — verl documentation +id: welcome-to-verls-documentation-verl-documentation-2 +tags: +- deepread +created: '2026-06-10T00:41:00.572353Z' +source: https://verl.readthedocs.io/en/latest/ +source_domain: verl.readthedocs.io +fetched_at: '2026-06-10T00:41:00.572215Z' +fetch_provider: builtin +status: draft +type: note +tier: ground_truth +content_type: docs +deprecated: false +--- + +Welcome to verl’s documentation! — verl documentation +Welcome to verl’s documentation! +View page source +Welcome to verl’s documentation!
+ +verl is a flexible, efficient and production-ready RL training framework designed for large language models (LLMs) post-training. It is an open source implementation of the +HybridFlow +paper. +verl is flexible and easy to use with: +Easy extension of diverse RL algorithms +: The hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex Post-Training dataflows, allowing users to build RL dataflows in a few lines of code. +Seamless integration of existing LLM infra with modular APIs +: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM, vLLM and SGLang. Moreover, users can easily extend to other LLM training and inference frameworks. +Flexible device mapping and parallelism +: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes. +Ready integration with popular HuggingFace models +verl is fast with: +State-of-the-art throughput +: By seamlessly integrating existing SOTA LLM training and inference frameworks, verl achieves high generation and training throughput. +Efficient actor model resharding with 3D-HybridEngine +: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
+Quickstart +Installation +Requirements +Choices of Backend Engines +Install from docker image +Install from custom environment +Install with AMD GPUs - ROCM kernel support +Quickstart: PPO training on GSM8K dataset +Introduction +Dataset Introduction +Step 1: Prepare the dataset +Step 2: Download a model for post-training +Step 3: Perform PPO training with the instruct model +Multinode Training +Option 1: Launch Manually +Option 2: Launch via SkyPilot on Kubernetes or clouds +Option 3: Launch via Slurm +Option 4: Launch via dstack +How to debug? +Multi-node training on AMD clusters +Ray Debug Tutorial +How to debug? +More Resources +Agentic RL Training +Overview +Server-based Asynchronous Rollout +Multi-turn Conversations and Tool Calls +Agent Framework +Programming guide +HybridFlow Programming Guide +Motivation and Design +Codebase walkthrough (PPO) +Repository organization +The Design of +verl.single_controller +Preface +Origin +A Running Example: +generate_sequences +Beyond RL Post-Training: Generalizing +verl.single_controller +Data Preparation +Prepare Data for Post-Training +Implement Reward Function for Dataset +Configurations +Config Explanation +ppo_trainer.yaml for RL FSDP Backend +evaluation.yaml +sft_trainer.yaml for SFT FSDP Backend +PPO Example +PPO Example Architecture +GSM8K Example +Megatron-FSDP Example +Multi-Modal Example Architecture +SkyPilot Examples +Algorithms +Proximal Policy Optimization (PPO) +Group Relative Policy Optimization (GRPO) +Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO) +Recipe: Self-Play Fine-Tuning (SPIN) +Recipe: Self-Play Preference Optimization (SPPO) +Recipe: Entropy Mechanism +On-Policy RL with Optimal Reward Baseline (OPO) +Algorithm Baselines +GPG: Group Policy Gradient +Rollout Correction +Mathematical Formulations of Rollout Correction Methods in +verl +Optimal Token Baseline (OTB) +Divergence Proximal Policy Optimization (DPPO) +On-Policy Distillation (OPD) +PPO Trainer and Workers +PPO 
Ray Trainer +Model Engine +Engine Workers +Automodel Backend +SGLang Backend +TensorRT-LLM Backend +Performance Tuning Guide +Training DeepSeek 671b +Verl LLM Best Practices (DAPO + Qwen3-235B) +Performance Tuning Guide +Rollout KV Cache Offload via Mooncake-Store +Upgrading to vLLM >= 0.8 +Hardware Resource Needed for RL +verl Profiler System +NVIDIA Nsight Systems profiling in verl +PyTorch Profiling in verl +Adding new models +Add models with the FSDP backend +Add models with the Megatron-LM backend +Async Training +Recipe: One Step Off Policy Async Trainer +Recipe: Fully Async Policy Trainer +Recipe: Async On-Policy Knowledge Distillation Trainer +Low Precision +FP8 RL in verl +NVFP4 QAT (Quantization-Aware Training) in verl +Advanced Features +Using Checkpoints to Support Fault Tolerance Training +RoPE Scaling override +Attention Implementation Override +RL(HF) algorithms with LoRA Support +Multi-turn Rollout Support +Ray API Design Tutorial +Extend to other RL(HF) algorithms +Sandbox Fusion Example +Trace Function Usage Instructions +SkipManager: Skip everything in the RL pipeline. +Agent Loop +Reward Loop +TransferQueue Data System +Use Prometheus and Grafana to Monitor Rollout +Guide to Using MTP in SFT/RL Training and Inference +Hardware Support +Multi-Chip Support +AMD (ROCm) Tutorial +Ascend (NPU) Tutorial +API References +Data interface +Single Controller interface +Trainer Interface +Utilities +Blog +verl 0.7 release blog +FAQ +Frequently Asked Questions +Ray related +Distributed training +Install related +Illegal memory access +Checkpoints +Triton +compile_module_from_src +error +What is the meaning of train batch size, mini batch size, and micro batch size? +How to generate ray timeline to analyse performance of a training job? +How to set proxy only for wandb? 
+Mismatch between inference and training sequence (high actor/grad_norm) +Contributing +Editing Agent Instructions +Development Notes +Sandbox Fusion Tool Integration +Contribution + +verl is free software; you can redistribute it and/or modify it under the terms +of the Apache License 2.0. We welcome contributions. +Join us on +GitHub +, +Slack +and +Wechat +for discussions. +Contributions from the community are welcome! Please check out our +project roadmap +and +good first issues +to see where you can contribute. +Code Linting and Formatting + +We use pre-commit to help improve code quality. To initialize pre-commit, run: +pip +install +pre-commit +pre-commit +install +To resolve CI errors locally, you can also manually run pre-commit by: +pre-commit +run +Adding CI tests + +If possible, please add CI test(s) for your new feature: +Find the most relevant workflow yml file, which usually corresponds to a +hydra +default config (e.g. +ppo_trainer +, +ppo_megatron_trainer +, +sft_trainer +, etc). +Add related path patterns to the +paths +section if not already included. +Minimize the workload of the test script(s) (see existing scripts for examples). +We are HIRING! Send us an +email +if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment. \ No newline at end of file diff --git a/research/notes/welcome-to-verls-documentation-verl-documentation.md b/research/notes/welcome-to-verls-documentation-verl-documentation.md new file mode 100644 index 0000000000000000000000000000000000000000..9c26db008016c3435cb11c6ed44bc1223c33ea83 --- /dev/null +++ b/research/notes/welcome-to-verls-documentation-verl-documentation.md @@ -0,0 +1,227 @@ +--- +title: Welcome to verl’s documentation!
— verl documentation +id: welcome-to-verls-documentation-verl-documentation +tags: +- deepread +created: '2026-06-10T00:40:49.227131Z' +source: https://verl.readthedocs.io/en/latest/index.html +source_domain: verl.readthedocs.io +fetched_at: '2026-06-10T00:40:49.226989Z' +fetch_provider: builtin +status: draft +type: note +tier: ground_truth +content_type: docs +deprecated: false +--- + +Welcome to verl’s documentation! — verl documentation +Welcome to verl’s documentation! +View page source +Welcome to verl’s documentation! + +verl is a flexible, efficient and production-ready RL training framework designed for large language models (LLMs) post-training. It is an open source implementation of the +HybridFlow +paper. +verl is flexible and easy to use with: +Easy extension of diverse RL algorithms +: The hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex Post-Training dataflows, allowing users to build RL dataflows in a few lines of code. +Seamless integration of existing LLM infra with modular APIs +: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM, vLLM and SGLang. Moreover, users can easily extend to other LLM training and inference frameworks. +Flexible device mapping and parallelism +: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes. +Ready integration with popular HuggingFace models +verl is fast with: +State-of-the-art throughput +: By seamlessly integrating existing SOTA LLM training and inference frameworks, verl achieves high generation and training throughput. +Efficient actor model resharding with 3D-HybridEngine +: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
+Quickstart +Installation +Requirements +Choices of Backend Engines +Install from docker image +Install from custom environment +Install with AMD GPUs - ROCM kernel support +Quickstart: PPO training on GSM8K dataset +Introduction +Dataset Introduction +Step 1: Prepare the dataset +Step 2: Download a model for post-training +Step 3: Perform PPO training with the instruct model +Multinode Training +Option 1: Launch Manually +Option 2: Launch via SkyPilot on Kubernetes or clouds +Option 3: Launch via Slurm +Option 4: Launch via dstack +How to debug? +Multi-node training on AMD clusters +Ray Debug Tutorial +How to debug? +More Resources +Agentic RL Training +Overview +Server-based Asynchronous Rollout +Multi-turn Conversations and Tool Calls +Agent Framework +Programming guide +HybridFlow Programming Guide +Motivation and Design +Codebase walkthrough (PPO) +Repository organization +The Design of +verl.single_controller +Preface +Origin +A Running Example: +generate_sequences +Beyond RL Post-Training: Generalizing +verl.single_controller +Data Preparation +Prepare Data for Post-Training +Implement Reward Function for Dataset +Configurations +Config Explanation +ppo_trainer.yaml for RL FSDP Backend +evaluation.yaml +sft_trainer.yaml for SFT FSDP Backend +PPO Example +PPO Example Architecture +GSM8K Example +Megatron-FSDP Example +Multi-Modal Example Architecture +SkyPilot Examples +Algorithms +Proximal Policy Optimization (PPO) +Group Relative Policy Optimization (GRPO) +Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO) +Recipe: Self-Play Fine-Tuning (SPIN) +Recipe: Self-Play Preference Optimization (SPPO) +Recipe: Entropy Mechanism +On-Policy RL with Optimal Reward Baseline (OPO) +Algorithm Baselines +GPG: Group Policy Gradient +Rollout Correction +Mathematical Formulations of Rollout Correction Methods in +verl +Optimal Token Baseline (OTB) +Divergence Proximal Policy Optimization (DPPO) +On-Policy Distillation (OPD) +PPO Trainer and Workers +PPO 
Ray Trainer +Model Engine +Engine Workers +Automodel Backend +SGLang Backend +TensorRT-LLM Backend +Performance Tuning Guide +Training DeepSeek 671b +Verl LLM Best Practices (DAPO + Qwen3-235B) +Performance Tuning Guide +Rollout KV Cache Offload via Mooncake-Store +Upgrading to vLLM >= 0.8 +Hardware Resource Needed for RL +verl Profiler System +NVIDIA Nsight Systems profiling in verl +PyTorch Profiling in verl +Adding new models +Add models with the FSDP backend +Add models with the Megatron-LM backend +Async Training +Recipe: One Step Off Policy Async Trainer +Recipe: Fully Async Policy Trainer +Recipe: Async On-Policy Knowledge Distillation Trainer +Low Precision +FP8 RL in verl +NVFP4 QAT (Quantization-Aware Training) in verl +Advanced Features +Using Checkpoints to Support Fault Tolerance Training +RoPE Scaling override +Attention Implementation Override +RL(HF) algorithms with LoRA Support +Multi-turn Rollout Support +Ray API Design Tutorial +Extend to other RL(HF) algorithms +Sandbox Fusion Example +Trace Function Usage Instructions +SkipManager: Skip everything in the RL pipeline. +Agent Loop +Reward Loop +TransferQueue Data System +Use Prometheus and Grafana to Monitor Rollout +Guide to Using MTP in SFT/RL Training and Inference +Hardware Support +Multi-Chip Support +AMD (ROCm) Tutorial +Ascend (NPU) Tutorial +API References +Data interface +Single Controller interface +Trainer Interface +Utilities +Blog +verl 0.7 release blog +FAQ +Frequently Asked Questions +Ray related +Distributed training +Install related +Illegal memory access +Checkpoints +Triton +compile_module_from_src +error +What is the meaning of train batch size, mini batch size, and micro batch size? +How to generate ray timeline to analyse performance of a training job? +How to set proxy only for wandb? 
+Mismatch between inference and training sequence (high actor/grad_norm) +Contributing +Editing Agent Instructions +Development Notes +Sandbox Fusion Tool Integration +Contribution + +verl is free software; you can redistribute it and/or modify it under the terms +of the Apache License 2.0. We welcome contributions. +Join us on +GitHub +, +Slack +and +Wechat +for discussions. +Contributions from the community are welcome! Please check out our +project roadmap +and +good first issues +to see where you can contribute. +Code Linting and Formatting + +We use pre-commit to help improve code quality. To initialize pre-commit, run: +pip +install +pre-commit +pre-commit +install +To resolve CI errors locally, you can also manually run pre-commit by: +pre-commit +run +Adding CI tests + +If possible, please add CI test(s) for your new feature: +Find the most relevant workflow yml file, which usually corresponds to a +hydra +default config (e.g. +ppo_trainer +, +ppo_megatron_trainer +, +sft_trainer +, etc). +Add related path patterns to the +paths +section if not already included. +Minimize the workload of the test script(s) (see existing scripts for examples). +We are HIRING! Send us an +email +if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment. \ No newline at end of file