diff --git a/composer_replication/datagen/repo_gate.py b/composer_replication/datagen/repo_gate.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b02474d6a9faf34d94cca58e3a08f42d576a4e4
--- /dev/null
+++ b/composer_replication/datagen/repo_gate.py
@@ -0,0 +1,361 @@
+"""repo_gate.py — Stage-0 ingest gate: license tiers + benchmark decontamination.
+
+Architecture step 1 of the dataset pipeline (research/deepread/
+13-synthesis-architecture.md Part B). Closes two verified findings:
+
+  * V3 / D-5  — ZERO benchmark decontamination existed anywhere in code or
+    designs, while the pipeline trains on SWE-bench-family substrates and is
+    scored on SWE-bench Verified. ``is_eval_contaminated`` is the hard wall:
+    a repo on the eval list is NEVER admitted, regardless of license.
+  * V9 / D-13 — the only license filter was a lowercase substring match on a
+    task field (``substrates.py`` ``is_redistributable``), with no SPDX
+    detection at the repo-ingest path and no trainable-vs-redistributable
+    split. ``detect_license`` + ``license_tier`` replace the boolean with a
+    three-tier verdict.
+
+Why tiers, not a boolean (D-13): weak-copyleft repos (MPL/LGPL) are fine to
+*train on* but we must not *redistribute* derivative diffs from them — a
+boolean "redistributable?" gate either over-excludes them or leaks them into
+published corpora. The tier travels with the verdict so downstream corpus
+steps (step 6) can route TRAINABLE_ONLY rows away from any published split.
+
+Why title-anchored matching for the GNU family: GPL-3.0 §13 mentions the
+"GNU Affero General Public License" by name and AGPL-3.0 §13 mentions the
+"GNU General Public License" — naive full-body substring matching
+misclassifies one as the other. We therefore classify the GNU/MPL/Apache
+family from the document HEADER (first ~400 normalized chars, where the
+license title lives) and only use full-body phrases for the short permissive
+licenses whose titles are not distinctive (MIT/ISC/BSD/Unlicense).
+
+Stdlib-only on purpose: the gate must run before anything heavy is installed.
+"""
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# License detection (V9 / D-13)
+# ---------------------------------------------------------------------------
+
+#: License files checked in order; first match wins (case-insensitive on name).
+_LICENSE_FILENAMES: tuple[str, ...] = ("LICENSE", "LICENSE.txt", "LICENSE.md", "COPYING")
+
+#: Trove classifier / PEP 639 expression fragments → SPDX id. Secondary signal
+#: only — the classifier cannot distinguish BSD-2 from BSD-3, so it maps to
+#: BSD-3-Clause (the common case) and the LICENSE file is preferred when present.
+_CLASSIFIER_MAP: tuple[tuple[str, str], ...] = (
+    ("gnu affero general public license", "AGPL-3.0"),
+    ("gnu lesser general public license v3", "LGPL-3.0"),
+    ("gnu lesser general public license v2.1", "LGPL-2.1"),
+    ("gnu lesser general public license", "LGPL-3.0"),
+    ("gnu general public license v3", "GPL-3.0"),
+    ("gnu general public license v2", "GPL-2.0"),
+    ("mozilla public license 2.0", "MPL-2.0"),
+    ("apache software license", "Apache-2.0"),
+    ("mit license", "MIT"),
+    ("bsd license", "BSD-3-Clause"),
+    ("isc license", "ISC"),
+    ("the unlicense", "Unlicense"),
+)
+
+#: Bare SPDX ids accepted from PEP 639 ``license = "<expr>"`` in pyproject.
+_SPDX_IDS: frozenset[str] = frozenset(
+    {
+        "MIT", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause", "ISC",
+        "GPL-2.0", "GPL-3.0", "AGPL-3.0", "LGPL-2.1", "LGPL-3.0",
+        "MPL-2.0", "Unlicense",
+    }
+)
+_SPDX_LOOKUP: dict[str, str] = {s.lower(): s for s in _SPDX_IDS}
+# Common -only/-or-later suffixed forms normalize to the base id we tier on.
+for _base in ("GPL-2.0", "GPL-3.0", "AGPL-3.0", "LGPL-2.1", "LGPL-3.0"):
+    _SPDX_LOOKUP[f"{_base.lower()}-only"] = _base
+    _SPDX_LOOKUP[f"{_base.lower()}-or-later"] = _base
+
+
+@dataclass(frozen=True)
+class LicenseInfo:
+    """Immutable result of license detection: an SPDX-style id plus the signal that produced it."""
+
+    spdx_id: str  # a member of _SPDX_IDS, or "unknown" when nothing classified
+    signal: str  # "license_file" | "classifier" | "none"
+    source: str = ""  # name of the file that supplied the signal; "" when there was none
+
+
+def _normalize_text(text: str) -> str:
+    return re.compile(r"\s+").sub(" ", text).lower().strip()  # collapse whitespace runs, fold case
+
+
+#: Title strings for the families that cross-cite each other. NOTE: "gnu
+#: affero general public license" does NOT contain "gnu general public
+#: license" as a substring ("affero" splits it), so the titles are disjoint.
+_HEADER_TITLES: tuple[tuple[str, str], ...] = (
+    ("gnu affero general public license", "agpl"),
+    ("gnu lesser general public license", "lgpl"),
+    ("gnu general public license", "gpl"),
+    ("mozilla public license", "mpl"),
+    ("apache license", "apache"),
+)
+
+
+def _classify_header(header: str) -> str | None:
+    """Classify the cross-citing families (GNU/MPL/Apache) from the header.
+    The title found at the LOWEST offset decides: a license's own title
+    precedes any citation of a sibling license (GPL-3 §13 names the AGPL and
+    AGPL-3 §13 names the GPL), so a bare presence test confuses them (V9)."""
+    offsets = [(header.find(title), fam) for title, fam in _HEADER_TITLES]
+    found = [hit for hit in offsets if hit[0] >= 0]
+    if not found:
+        return None
+    winner = min(found)[1]
+    verdicts = {
+        "agpl": "AGPL-3.0",
+        "lgpl": "LGPL-2.1" if "version 2.1" in header else "LGPL-3.0",
+        "gpl": "GPL-2.0" if "version 2" in header and "version 3" not in header else "GPL-3.0",
+        "mpl": "MPL-2.0" if "2.0" in header else None,
+        "apache": "Apache-2.0" if "version 2.0" in header else None,
+    }
+    return verdicts[winner]
+
+
+def _classify_body(body: str) -> str | None:
+    """Match the short permissive licenses on their distinctive phrases.
+    Checks run in a fixed order and the first hit wins; ISC is keyed on its
+    "with or without fee" wording, which MIT's "permission is hereby granted"
+    grant does not contain, so the two cannot be confused."""
+    def has(*phrases: str) -> bool:
+        return all(phrase in body for phrase in phrases)
+
+    # Apache precedes MIT: notice files ("Licensed under the Apache License,
+    # Version 2.0") carry the title mid-body, and only Apache names a version.
+    if has("free and unencumbered software released into the public domain"):
+        return "Unlicense"
+    if has("apache license", "version 2.0"):
+        return "Apache-2.0"
+    if has("permission is hereby granted, free of charge, to any person obtaining a copy"):
+        return "MIT"
+    if has("with or without fee", "permission to use, copy, modify"):
+        return "ISC"
+    if not has("redistribution and use in source and binary forms"):
+        return None
+    return "BSD-3-Clause" if has("neither the name of") else "BSD-2-Clause"  # 3rd clause decides
+
+
+def _classify_license_text(text: str) -> str:
+    flat = _normalize_text(text)  # title-anchored header check first, then full-body phrases
+    return (_classify_header(flat[:400]) or _classify_body(flat)) or "unknown"
+
+
+def _classifier_signal(repo_root: Path) -> tuple[str, str] | None:
+    """Fallback signal from packaging metadata: a PEP 639 license expression
+    or a trove classifier in pyproject.toml, then setup.py. The files are
+    regex-scanned rather than parsed so the gate needs no packaging libs."""
+    spdx_expr = re.compile(r'license\s*=\s*["\']([A-Za-z0-9.+-]+)["\']')
+    for name in ("pyproject.toml", "setup.py"):
+        candidate = repo_root / name
+        if not candidate.is_file():
+            continue
+        try:
+            raw = candidate.read_text(encoding="utf-8", errors="replace")
+        except OSError:
+            continue
+        hit = spdx_expr.search(raw)  # PEP 639: license = "Apache-2.0"
+        declared = hit.group(1).lower() if hit else ""
+        if declared in _SPDX_LOOKUP:
+            return _SPDX_LOOKUP[declared], name
+        flat = _normalize_text(raw)
+        has_trove = "license ::" in flat
+        for fragment, spdx in _CLASSIFIER_MAP:
+            if f"license :: osi approved :: {fragment}" in flat or (has_trove and fragment in flat):
+                return spdx, name
+    return None
+
+
+def detect_license(repo_root: Path) -> LicenseInfo:
+    """Detect the repo license. LICENSE-file text is the primary signal and
+    EVERY candidate file is tried before the packaging classifiers, so an
+    unclassifiable LICENSE.md cannot hide a classifiable COPYING (fail closed).
+    The winning signal is recorded so manifests show tier provenance (V9)."""
+    first_unclassified: str | None = None
+    for name in _LICENSE_FILENAMES:
+        path = repo_root / name
+        if not path.is_file():
+            # Exact-name case variants only ("License.md"); sorted so the pick is deterministic.
+            variants = sorted(p for p in repo_root.glob("*") if p.is_file() and p.name.lower() == name.lower())
+            if not variants:
+                continue
+            path = variants[0]
+        try:
+            text = path.read_text(encoding="utf-8", errors="replace")
+        except OSError:
+            continue
+        spdx = _classify_license_text(text)
+        if spdx != "unknown":
+            return LicenseInfo(spdx_id=spdx, signal="license_file", source=path.name)
+        if first_unclassified is None:
+            first_unclassified = path.name  # remembered for the "unknown" verdict below
+    fallback = _classifier_signal(repo_root)
+    if fallback is not None:
+        return LicenseInfo(spdx_id=fallback[0], signal="classifier", source=fallback[1])
+    if first_unclassified is not None:
+        return LicenseInfo(spdx_id="unknown", signal="license_file", source=first_unclassified)
+    return LicenseInfo(spdx_id="unknown", signal="none")
+
+
+# ---------------------------------------------------------------------------
+# License tiers (D-13: tiers, not a boolean)
+# ---------------------------------------------------------------------------
+
+
+class Tier(Enum):
+    """Three-way license verdict. TRAINABLE_ONLY exists because weak copyleft
+    (MPL/LGPL) permits training, while redistributing derivative diffs would
+    trigger copyleft obligations — a boolean would either discard that
+    training data or leak copyleft material into published corpora (D-13)."""
+
+    REDISTRIBUTABLE = "redistributable"  # permissive licenses (see _TIER_BY_SPDX)
+    TRAINABLE_ONLY = "trainable_only"  # MPL/LGPL: train on, never publish
+    EXCLUDED = "excluded"  # GPL/AGPL and anything unrecognized
+
+
+_TIER_BY_SPDX: dict[str, Tier] = {
+    "MIT": Tier.REDISTRIBUTABLE,
+    "Apache-2.0": Tier.REDISTRIBUTABLE,
+    "BSD-2-Clause": Tier.REDISTRIBUTABLE,
+    "BSD-3-Clause": Tier.REDISTRIBUTABLE,
+    "ISC": Tier.REDISTRIBUTABLE,
+    "Unlicense": Tier.REDISTRIBUTABLE,
+    "MPL-2.0": Tier.TRAINABLE_ONLY,
+    "LGPL-2.1": Tier.TRAINABLE_ONLY,
+    "LGPL-3.0": Tier.TRAINABLE_ONLY,
+    # GPL/AGPL and unknown are EXCLUDED: strong copyleft would bind the model
+    # outputs' redistribution story, and "unknown" defaults closed (V9).
+    "GPL-2.0": Tier.EXCLUDED,
+    "GPL-3.0": Tier.EXCLUDED,
+    "AGPL-3.0": Tier.EXCLUDED,
+}
+
+
+def license_tier(info: LicenseInfo) -> Tier:
+    """Map a detected license to its tier; ids missing from the table fail closed to EXCLUDED (V9)."""
+    known = _TIER_BY_SPDX.get(info.spdx_id)
+    return Tier.EXCLUDED if known is None else known
+
+
+# ---------------------------------------------------------------------------
+# Benchmark decontamination (V3 / D-5)
+# ---------------------------------------------------------------------------
+
+#: The canonical 12 SWE-bench test repos (SWE-bench / -Lite / -Verified /
+#: -Multimodal all draw eval instances from these). Training on ANY of them
+#: contaminates every SWE-bench-family score we report (V3). Lowercase
+#: "org/repo" form. Extend via a JSON file (list of "org/repo" strings) read
+#: by load_decontamination_list() and passed as extra_list=… (e.g. SWE-Gym eval).
+DECONTAMINATION_LIST: frozenset[str] = frozenset(
+    {
+        "astropy/astropy",
+        "django/django",
+        "matplotlib/matplotlib",
+        "mwaskom/seaborn",
+        "pallets/flask",
+        "psf/requests",
+        "pydata/xarray",
+        "pylint-dev/pylint",
+        "pytest-dev/pytest",
+        "scikit-learn/scikit-learn",
+        "sphinx-doc/sphinx",
+        "sympy/sympy",
+    }
+)
+
+
+def load_decontamination_list(path: Path) -> frozenset[str]:
+    """Load an extension list from a JSON file: ``["org/repo", ...]`` — THE
+    documented way to add eval repos without editing code. Raises ValueError
+    unless the JSON is a list of strings (a coerced entry would never match)."""
+    entries = json.loads(path.read_text(encoding="utf-8"))
+    if not isinstance(entries, list) or not all(isinstance(e, str) for e in entries):
+        raise ValueError(f"{path}: decontamination JSON must be a list of 'org/repo' strings")
+    return frozenset(normalize_repo(e) for e in entries)
+
+
+def normalize_repo(repo: str) -> str:
+    """Reduce any repo spelling — https/ssh/git URL (any ``scheme://``, or
+    scp-style ``git@host:``), ``?query``/``#fragment``, trailing ``.git``,
+    mixed case — to lowercase ``org/repo``. Decontamination must hit no matter
+    how the driver spells the repo (V3: a miss here is silent contamination)."""
+    r = re.split(r"[?#]", repo.strip().lower(), maxsplit=1)[0]
+    r = re.sub(r"^(?:[a-z][a-z0-9+.-]*://|git@)", "", r)  # ssh://, git://, git+https://, …
+    r = re.sub(r"^[^/]*github\.com[:/]", "", r)  # host, incl. user@ / www. prefixes
+    r = r.rstrip("/").removesuffix(".git")
+    parts = [p for p in r.split("/") if p]
+    return "/".join(parts[:2]).removesuffix(".git") if len(parts) >= 2 else r
+
+
+def is_eval_contaminated(repo: str, extra_list: frozenset[str] | None = None) -> bool:
+    """True if ``repo`` is in the SWE-bench-family eval set or in ``extra_list``.
+    Both sides are normalized, so URLs/mixed case in a hand-built list still hit."""
+    key = normalize_repo(repo)
+    return key in DECONTAMINATION_LIST or any(normalize_repo(e) == key for e in extra_list or ())
+
+
+# ---------------------------------------------------------------------------
+# The gate verdict — single entry point for the pipeline driver
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class GateVerdict:
+    """Everything the driver needs to admit/reject a repo, with reasons kept
+    for the run manifest (step 6's lineage record)."""
+
+    repo: str  # the repo spelling exactly as the caller passed it (not normalized)
+    license_info: LicenseInfo  # detected license plus the signal that decided it
+    tier: Tier  # license tier derived from license_info
+    contaminated: bool  # True when the repo is on the decontamination list
+    admitted: bool  # True only when not contaminated and tier is not EXCLUDED
+    reasons: list[str] = field(default_factory=list)  # human-readable audit trail
+
+
+def gate_repo(repo: str, repo_root: Path | None, extra_decontamination: frozenset[str] | None = None) -> GateVerdict:
+    """Admit or reject one repo — the pipeline driver's single per-repo call.
+
+    Decision rules, highest priority first:
+      1. On the eval list (V3) → rejected, whatever the license says.
+      2. License tier EXCLUDED (GPL/AGPL/unknown) → rejected (V9: fail closed).
+      3. License tier TRAINABLE_ONLY → admitted; the do-not-redistribute
+         constraint is written into ``reasons`` so step 6 can route the rows.
+    """
+    is_eval = is_eval_contaminated(repo, extra_decontamination)
+    if repo_root is None:
+        license_info = LicenseInfo("unknown", "none")
+    else:
+        license_info = detect_license(repo_root)
+    verdict_tier = license_tier(license_info)
+    excluded = verdict_tier is Tier.EXCLUDED
+
+    notes: list[str] = []
+    if is_eval:
+        notes.append(
+            f"benchmark decontamination: {normalize_repo(repo)} is a SWE-bench-family eval repo (V3/D-5)"
+        )
+    if repo_root is None:
+        notes.append("no repo_root provided: license undetectable, failing closed (V9)")
+    if excluded and not is_eval:
+        notes.append(f"license tier EXCLUDED: spdx={license_info.spdx_id} (signal={license_info.signal})")
+    if verdict_tier is Tier.TRAINABLE_ONLY:
+        notes.append(
+            f"license tier TRAINABLE_ONLY: spdx={license_info.spdx_id} — usable for training, "
+            "derivative diffs must NOT be redistributed (D-13)"
+        )
+
+    return GateVerdict(
+        repo=repo, license_info=license_info, tier=verdict_tier,
+        contaminated=is_eval, admitted=not (is_eval or excluded),
+        reasons=notes,
+    )
diff --git a/composer_replication/datagen/rollout_harness.py b/composer_replication/datagen/rollout_harness.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e394e320098eb985002b1fa3cde45c3dd81b95b
--- /dev/null
+++ b/composer_replication/datagen/rollout_harness.py
@@ -0,0 +1,214 @@
+"""rollout_harness.py — the agent loop over FeatureDeletionEnv (finding V2).
+
+THE critical missing component the design critic identified: nothing in the
+repo ran an agent episode against `FeatureDeletionEnv` to completion, so the
+SFT corpus had NO producer and the tree-of-work had no env-grounded seeds.
+`collect_trajectory` is that producer: prompt → policy.act → env.step → … →
+submit → `_grade()`, emitting a `CanonicalTrajectory` whose steps are real
+executed environment transitions (the seeds the tree needs, fixing the
+seed-trace/oracle disjointness of finding D-1 as a free byproduct).
+
+The policy is pluggable (`RolloutPolicy` protocol): a scripted fake for tests,
+a frontier API model for expert-trajectory collection (SWE-Gym/SWE-smith both
+validated this recipe — 491 and 5,016 expert trajectories respectively), or a
+local model later.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Protocol, runtime_checkable
+
+from composer_replication.datagen.env import FeatureDeletionEnv, StepResult
+from composer_replication.datagen.schema import FeatureDeletionTask
+from composer_replication.datagen.trajectory import (
+    CanonicalTrajectory,
+    ToolCall,
+    TrajectoryStep,
+)
+
+
+@runtime_checkable
+class RolloutPolicy(Protocol):
+    """Structural interface for anything that maps (observation, history) → next action.
+
+    Returning a `ToolCall` continues the episode (it becomes an env action
+    dict); returning a plain `str` is the final message and the harness submits.
+    """
+
+    def act(self, observation: str, history: list[TrajectoryStep]) -> ToolCall | str: ...
+
+
+@dataclass
+class ScriptedPolicy:
+    """Test double: plays back ``actions`` in order; once exhausted it submits."""
+
+    actions: list[ToolCall | str]
+    _i: int = 0
+
+    def act(self, observation: str, history: list[TrajectoryStep]) -> ToolCall | str:
+        cursor = self._i
+        if cursor < len(self.actions):
+            self._i = cursor + 1
+            return self.actions[cursor]
+        return "done"  # a plain str ends the episode (the harness submits)
+
+
+class OpenRouterPolicy:
+    """Frontier-API policy for expert-trajectory collection (thin stub).
+
+    Mirrors `teacher_replay._call_teacher`'s payload shape (one chat call,
+    temperature 0.2); httpx is imported lazily so the module loads without it.
+    `act` sends only the current observation (`history` is ignored) and always
+    returns `str`, so the harness submits on the first turn. Real collection
+    should evaluate mini-swe-agent/SWE-agent as scaffold (deepread 11 finding 2).
+    """
+
+    def __init__(self, model_slug: str, api_key: str | None = None,
+                 max_tokens: int = 512) -> None:
+        try:
+            import httpx  # noqa: F401, PLC0415 — lazy heavy dep
+        except ImportError as e:
+            raise ImportError(
+                "OpenRouterPolicy requires httpx (`pip install httpx` or the "
+                "[serverless] extra). For tests use ScriptedPolicy. Got: " + repr(e)
+            ) from e
+        from composer_replication.teacher_replay import _load_api_key
+        self.model_slug = model_slug
+        self.api_key = api_key or _load_api_key()
+        self.max_tokens = max_tokens
+
+    def act(self, observation: str, history: list[TrajectoryStep]) -> ToolCall | str:
+        import httpx  # noqa: PLC0415
+
+        from composer_replication.teacher_replay import OPENROUTER_URL
+        messages = [{"role": "user", "content": observation}]
+        r = httpx.post(
+            OPENROUTER_URL,
+            json={"model": self.model_slug, "messages": messages,
+                  "max_tokens": self.max_tokens, "temperature": 0.2},
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            timeout=120.0,
+        )
+        r.raise_for_status()
+        return str(r.json()["choices"][0]["message"]["content"])
+
+
+def _to_env_action(call: ToolCall) -> dict:
+    """ToolCall → FeatureDeletionEnv action dict.
+
+    CONVENTION (documented here, the single translation point): the env's
+    `step()` consumes ``{"type": <tool name>, **args}``; ``type=="submit"``
+    triggers grading (env.py:67). ``type`` is written LAST so an argument that
+    happens to be called "type" can never re-route (or submit) the action.
+    """
+    return {**(call.args or {}), "type": call.name}
+
+
+def collect_trajectory(
+    env: FeatureDeletionEnv,
+    task: FeatureDeletionTask,
+    policy: RolloutPolicy,
+    *,
+    max_turns: int = 40,
+    budget_usd: float | None = None,
+    provenance: dict | None = None,
+) -> CanonicalTrajectory:
+    """Run one episode and return the graded CanonicalTrajectory.
+
+    The episode ends when the policy emits a plain string (final message →
+    submit), a ToolCall named "submit", the env reports ``done``, or
+    `max_turns` policy calls are used up (a submit is then forced so the run
+    is still graded). NOTE(review): `budget_usd` is accepted but never read.
+    """
+    obs = env.reset(task)
+    steps: list[TrajectoryStep] = []
+    final: StepResult | None = None
+
+    for _ in range(max_turns):
+        action = policy.act(obs, steps)
+        if isinstance(action, str) or action.name == "submit":
+            final = env.step({"type": "submit"})
+            steps.append(TrajectoryStep(
+                observation=obs, action=action, result=final.observation,
+                tool_error=False,
+            ))
+            break
+        res = env.step(_to_env_action(action))
+        tool_error = "error" in (res.observation or "").lower()[:200]  # heuristic: first 200 chars only
+        steps.append(TrajectoryStep(
+            observation=obs, action=action, result=res.observation,
+            tool_error=tool_error,
+        ))
+        if res.done:  # env hit its own turn limit and graded
+            final = res
+            break
+        obs = res.observation
+
+    if final is None:
+        # max_turns exhausted without submit — grade what exists.
+        final = env.step({"type": "submit"})
+
+    info = final.info or {}
+    return CanonicalTrajectory(
+        task_id=task.task_id,
+        steps=steps,
+        grade=float(final.reward) if final.reward is not None else None,
+        guard_ok=bool(info.get("guard_ok", True)),  # a missing key counts as guard intact
+        hacked=bool(info.get("hacked", False)),  # a missing key counts as not hacked
+        provenance={"source": "rollout_harness",
+                    "policy": type(policy).__name__,
+                    **(provenance or {})},  # caller-supplied keys override the two defaults
+    )
+
+
+# ---------------------------------------------------------------------
+# Admission — type the signal and route it (final report §4)
+# ---------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class AdmissionVerdict:
+    """Where a trajectory may go. Routing per the typed-train-on-all verdict:
+    clean full passes → SFT; clean near-misses → DPO-candidate (contrastive
+    rejected vs a winner, never raw negative gradient); everything else →
+    rejected (quarantine-side, full provenance kept for audit)."""
+
+    sft_admitted: bool  # clean trajectory with grade == 1.0
+    dpo_candidate: bool  # clean trajectory with 0.0 < grade < 1.0
+    rejected: bool  # neither of the two routes above
+    reasons: tuple[str, ...]  # audit notes accumulated by admit()
+
+
+def admit(traj: CanonicalTrajectory) -> AdmissionVerdict:
+    """Route one graded trajectory: clean full pass → SFT, clean partial pass →
+    DPO candidate, anything else → rejected. Every rejection carries at least
+    one reason, including a clean grade that falls outside [0, 1] (or is NaN)."""
+    reasons: list[str] = []
+    clean = traj.guard_ok and not traj.hacked
+    if not traj.guard_ok:
+        reasons.append("pass_to_pass guard broken")
+    if traj.hacked:
+        reasons.append("hack monitor flagged")
+    graded = traj.grade is not None
+    grade = traj.grade if graded else 0.0
+    if not graded:
+        reasons.append("ungraded (no execution oracle)")
+    sft = clean and graded and grade == 1.0
+    dpo = clean and graded and 0.0 < grade < 1.0
+    if sft:
+        reasons.append("clean full pass")
+    elif dpo:
+        reasons.append(f"clean near-miss (grade={grade:.2f})")
+    elif clean and graded:
+        reasons.append("clean zero — no partial signal" if grade == 0.0 else f"grade outside [0, 1] (grade={grade})")
+    return AdmissionVerdict(sft_admitted=sft, dpo_candidate=dpo, rejected=not (sft or dpo), reasons=tuple(reasons))
+
+
+__all__ = [
+    "RolloutPolicy",
+    "ScriptedPolicy",
+    "OpenRouterPolicy",
+    "collect_trajectory",
+    "AdmissionVerdict",
+    "admit",
+]
diff --git a/composer_replication/datagen/swesmith_adapter.py b/composer_replication/datagen/swesmith_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..6010a866b6e41545f0c5f9131c55460316c69f35
--- /dev/null
+++ b/composer_replication/datagen/swesmith_adapter.py
@@ -0,0 +1,269 @@
+"""swesmith_adapter.py — adapt SWE-smith instances into Feature-Deletion tasks.
+
+THE BUY-VS-BUILD VERDICT (deepread finding V4 / D-6): `pip install swesmith`
+(MIT) already ships what ADR-010's "Option B greenfield generator" would have
+hand-built — env construction from arbitrary GitHub repos (ONE Docker image per
+repo, ~500x more storage-efficient than per-task images), five bug-synthesis
+strategies, issue-text generation, and validation-by-test-execution, at a
+verified $1,360 + ~20 human-hours for 50k tasks. Its **PR Mirror strategy is
+exactly this repo's gold-patch-reversion mechanic** and SWE-smith's own ablation
+(Table 5, arXiv:2504.21798) shows PR-Mirror trajectories train the BEST models
+of its five strategies — independent validation of ADR-010's core approach.
+So SWE-smith is the synthesis ENGINE for "point at a repo"; this module is the
+schema bridge into the existing `FeatureDeletionTask` world.
+
+THE SEMANTIC INVERSION (load-bearing — easy to get backwards):
+  * SWE-bench-shaped instances: `patch` is the GOLD FIX. broken = HEAD with the
+    fix reverted (`git apply -R patch`). `SweBenchAdapter` stores `patch` as
+    `golden_diff` directly.
+  * SWE-smith instances: `patch` INTRODUCES THE BUG. broken = HEAD with the
+    patch APPLIED. The fix — what the agent must produce, the validator's gate-4
+    restoration diff — is the REVERSE of the bug patch.
+This adapter therefore stores `golden_diff = reverse_unified_diff(bug_patch)`.
+When mechanical reversal fails (exotic diff features), it falls back to the
+original patch tagged with a provenance marker so downstream gate-4 validation
+knows to use `git apply -R` instead of `git apply`.
+
+The adapter itself needs nothing beyond core deps. Live synthesis (building new
+repo profiles / generating new bugs) needs the `swesmith` toolkit + Docker on
+Linux — see the `[swesmith]` extra in pyproject.
+"""
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+
+from composer_replication.datagen.schema import FeatureDeletionTask
+from composer_replication.datagen.substrates import _as_tuple
+
+#: Marker prefixed to golden_diff when reverse_unified_diff could not invert the
+#: bug patch mechanically. Consumers (gate-4 validation) must then apply the
+#: remainder with `git apply -R` (it is the FORWARD bug patch, not the fix).
+UNREVERSED_MARKER = "### UNREVERSED-BUG-PATCH (apply with -R) ###\n"
+
+#: instance_id substring patterns → synthesis strategy (SWE-smith §2.1 / §B).
+#: Patterns follow the toolkit's naming: e.g.
+#:   pandas-dev__pandas.abc123.lm_modify__xyz
+#:   ...func_pm_ctrl_invert_if__..., ...combine_file__..., ...pr_1234
+_STRATEGY_PATTERNS: tuple[tuple[str, str], ...] = (
+    ("lm_modify", "lm_modify"),
+    ("lm_rewrite", "lm_rewrite"),
+    ("func_pm", "procedural"),     # procedural AST modifications (13 transform types)
+    ("func_basic", "procedural"),
+    ("combine_file", "combine"),
+    ("combine_module", "combine"),
+    ("combine", "combine"),
+    ("pr_", "pr_mirror"),
+)
+
+
+def parse_strategy(instance_id: str) -> str:
+    """Classify a SWE-smith instance_id by its bug-synthesis strategy.
+
+    The lowercased id is scanned for each `_STRATEGY_PATTERNS` substring in
+    table order and the first hit decides; the result is one of {lm_modify,
+    lm_rewrite, procedural, combine, pr_mirror, unknown}. An empty or None id
+    yields "unknown". Strategy is carried as provenance because SWE-smith's
+    Table 5 ablation found trajectory quality differs sharply by strategy (PR
+    Mirror best, LM Modify steep drop-off), so corpus builds can weight or
+    filter on it.
+    """
+    lowered = (instance_id or "").lower()
+    hits = (strategy for pattern, strategy in _STRATEGY_PATTERNS if pattern in lowered)
+    return next(hits, "unknown")
+
+
+#: Heuristic cold-start difficulty priors per strategy, motivated by SWE-smith
+#: Table 1 medians (PR Mirror: 3 median F2P but 14 lines edited; Combine: 15
+#: F2P / 11 lines = multi-site; procedural: 7 F2P / 5 lines, mechanical).
+#: These only seed DifficultyCurriculum's p-hat before real rollouts exist.
+_DIFFICULTY_PRIOR: dict[str, float] = {
+    "pr_mirror": 0.4,
+    "combine": 0.4,
+    "lm_rewrite": 0.45,
+    "lm_modify": 0.55,
+    "procedural": 0.6,
+    "unknown": 0.5,
+}
+
+
+_HUNK_RE = re.compile(
+    r"^@@ -(?P<old_start>\d+)(?:,(?P<old_count>\d+))? "
+    r"\+(?P<new_start>\d+)(?:,(?P<new_count>\d+))? @@(?P<tail>.*)$"
+)
+
+
+def reverse_unified_diff(patch: str) -> str | None:
+    """Mechanically invert a unified diff (swap additions and deletions).
+
+    Handles ``diff --git`` headers, ``---``/``+++`` file lines (emitted in
+    canonical ``---``-first order), ``new file mode``/``deleted file mode``
+    (swapped), ``@@`` hunk headers (ranges swapped), ``+``/``-`` body lines and
+    ``\\ No newline at end of file`` markers (kept in place). Hunk bodies are
+    delimited by the ``@@`` counts, so a removed line reading ``--- text`` is
+    swapped as body instead of being mistaken for a file header.
+
+    Returns None (caller falls back to UNREVERSED_MARKER) for empty/hunk-less
+    input, malformed ``@@`` headers, and mode changes, renames/copies or binary
+    patches — those need git plumbing. ``index`` lines pass through unchanged;
+    paired -/+ runs keep the naive swapped order, not `git diff`'s.
+    """
+    if not patch or "@@" not in patch:
+        return None
+    unsupported = ("old mode ", "new mode ", "rename from ", "rename to ",
+                   "copy from ", "copy to ", "GIT binary patch")
+    if any(marker in patch for marker in unsupported):
+        return None
+    flip = {"+": "-", "-": "+"}
+    old_left = new_left = 0  # body lines the current hunk header still promises
+    out: list[str] = []
+    # "\n" only: splitlines() also breaks on \f/\x85/\u2028 and drops CRLF's "\r".
+    for line in (patch[:-1] if patch.endswith("\n") else patch).split("\n"):
+        sign = line[:1]
+        if (old_left > 0 or new_left > 0) and sign in ("", " ", "+", "-", "\\"):
+            old_left -= sign not in ("+", "\\")  # "\ No newline…" uses no budget
+            new_left -= sign not in ("-", "\\")
+            out.append(flip.get(sign, sign) + line[1:])
+            continue
+        old_left = new_left = 0  # header line (or truncated hunk): leave body mode
+        if line.startswith("diff --git "):
+            m = re.match(r"^diff --git a/(?P<a>.+) b/(?P<b>.+)$", line)
+            out.append(f"diff --git a/{m.group('b')} b/{m.group('a')}" if m else line)
+        elif line.startswith("--- "):
+            out.append("+++ " + ("b/" + line[6:] if line[4:].startswith("a/") else line[4:]))
+        elif line.startswith("+++ "):
+            fixed = "--- " + ("a/" + line[6:] if line[4:].startswith("b/") else line[4:])
+            at = len(out) - (1 if out and out[-1].startswith("+++ ") else 0)
+            out.insert(at, fixed)  # canonical order: "---" precedes "+++"
+        elif line.startswith("@@"):
+            m = _HUNK_RE.match(line)
+            if not m:
+                return None
+            g = m.groupdict()
+            old_left, new_left = int(g["old_count"] or 1), int(g["new_count"] or 1)
+            old_rng = g["old_start"] + ("," + g["old_count"] if g["old_count"] is not None else "")
+            new_rng = g["new_start"] + ("," + g["new_count"] if g["new_count"] is not None else "")
+            out.append(f"@@ -{new_rng} +{old_rng} @@{g['tail']}")
+        elif line.startswith(("new file mode ", "deleted file mode ")):
+            out.append("deleted" + line[3:] if line[0] == "n" else "new" + line[7:])
+        else:  # stray +/- after an under-counted hunk is still swapped; the rest passes through
+            out.append(flip.get(sign, sign) + line[1:])
+    return "\n".join(out) + ("\n" if patch.endswith("\n") else "")
+
+
+@dataclass(frozen=True)
+class SwesmithMeta:
+    """Immutable provenance record that rides next to a SWE-smith task.
+
+    It is intentionally NOT folded into the frozen `FeatureDeletionTask`
+    schema: that schema is shared with SweBenchAdapter and the trainer, while
+    strategy provenance belongs to corpus construction. It travels alongside
+    the task (e.g. into the run manifest), never in the policy-visible row.
+    """
+
+    strategy: str  # one of: lm_modify | lm_rewrite | procedural | combine | pr_mirror | unknown
+    diff_reversed: bool  # True when golden_diff is the mechanical reverse of the bug patch
+    source: str = "swesmith"  # origin tag; defaults to "swesmith"
+
+
+@dataclass
+class SwesmithAdapter:
+    """Convert a SWE-smith instance dict into a FeatureDeletionTask.
+
+    Mirrors `SweBenchAdapter`'s shape; differs in the patch semantics (see the
+    module docstring INVERSION note) and the per-REPO image convention.
+    A field that is present but null is treated like a missing field rather
+    than being stringified to the literal ``"None"``.
+    """
+
+    default_test_command: str = "python -m pytest -q"
+
+    @staticmethod
+    def _text(instance: dict, key: str, default: str) -> str:
+        """Return ``str(instance[key])``, or `default` if missing or None."""
+        value = instance.get(key)
+        return default if value is None else str(value)
+
+    def image_for(self, instance: dict) -> str:
+        # SWE-smith publishes ONE image per repo (not per task). Rows carry
+        # `image_name`; some exports use `docker_image`. Fall back to the
+        # toolkit's naming convention derived from the repo slug.
+        for key in ("image_name", "docker_image"):
+            if instance.get(key):
+                return str(instance[key])
+        repo = self._text(instance, "repo", "unknown").replace("/", "__").lower()
+        return f"swesmith.x86_64.{repo}:latest"
+
+    def to_task(self, instance: dict) -> FeatureDeletionTask:
+        """Return only the task, dropping the provenance sidecar."""
+        return self.to_task_with_meta(instance)[0]
+
+    def to_task_with_meta(self, instance: dict) -> tuple[FeatureDeletionTask, SwesmithMeta]:
+        """Build the task and its `SwesmithMeta` sidecar from one instance dict."""
+        iid = str(instance.get("instance_id") or instance.get("task_id") or "unknown")
+        strategy = parse_strategy(iid)
+        # The stored `patch` introduces the bug; golden_diff is its reverse (or the marked original).
+        bug_patch = self._text(instance, "patch", "")
+        fix = reverse_unified_diff(bug_patch)
+        golden_diff = fix if fix is not None else UNREVERSED_MARKER + bug_patch
+        task = FeatureDeletionTask(
+            task_id=iid,
+            repo=self._text(instance, "repo", "unknown"),
+            base_commit=self._text(instance, "base_commit", ""),
+            broken_image=self.image_for(instance),
+            test_command=str(instance.get("test_command") or self.default_test_command),
+            fail_to_pass=_as_tuple(instance.get("FAIL_TO_PASS")),
+            pass_to_pass=_as_tuple(instance.get("PASS_TO_PASS")),
+            golden_diff=golden_diff,
+            granularity="feature",
+            # SWE-smith rows don't carry per-instance licenses; repo-level
+            # licensing is the repo_gate's job (deepread finding V9/D-13).
+            upstream_license=self._text(instance, "license_name", "unknown"),
+            difficulty_prior=_DIFFICULTY_PRIOR.get(strategy, 0.5),
+        )
+        return task, SwesmithMeta(strategy=strategy, diff_reversed=fix is not None)
+
+
+def load_swesmith_instances(
+    path_or_hf_id: str,
+    *,
+    limit: int | None = None,
+) -> list[dict]:
+    """Load SWE-smith instances from a local JSONL file or the HF dataset.
+
+    Local ``.jsonl`` paths need no extra deps (used by tests/fixtures). HF ids
+    (e.g. ``SWE-bench/SWE-smith``) lazy-import `datasets` from the `[datagen]`
+    extra. ``limit`` caps the number of rows returned (``None`` = no cap,
+    ``0`` or negative = empty list); reading stops once the cap is reached.
+
+    Raises RuntimeError if an HF id is given but `datasets` is not installed.
+    """
+    from itertools import islice  # noqa: PLC0415 — keeps the block self-contained
+
+    cap = None if limit is None else max(limit, 0)
+    if path_or_hf_id.endswith(".jsonl"):
+        with open(path_or_hf_id, encoding="utf-8") as f:
+            # Blank lines are dropped BEFORE counting toward the cap.
+            stripped = (raw.strip() for raw in f)
+            return [json.loads(s) for s in islice(filter(None, stripped), cap)]
+    try:
+        from datasets import load_dataset  # noqa: PLC0415 — lazy heavy dep
+    except ImportError as e:
+        raise RuntimeError(
+            "Loading SWE-smith from the HF Hub requires `datasets`; install "
+            "with `pip install -e .[datagen]`. Got: " + repr(e)
+        ) from e
+    split = load_dataset(path_or_hf_id, split="train")
+    # islice stops at the cap instead of walking the whole split.
+    return [dict(r) for r in islice(split, cap)]
+
+
+__all__ = [  # names exported as this module's public API
+    "SwesmithAdapter",
+    "SwesmithMeta",
+    "UNREVERSED_MARKER",
+    "load_swesmith_instances",
+    "parse_strategy",
+    "reverse_unified_diff",
+]
diff --git a/composer_replication/datagen/tests/test_repo_gate.py b/composer_replication/datagen/tests/test_repo_gate.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c913bf44c934e4184e5069d81aac9282f306201
--- /dev/null
+++ b/composer_replication/datagen/tests/test_repo_gate.py
@@ -0,0 +1,419 @@
+"""Tests for the Stage-0 ingest gate (repo_gate.py) — architecture step 1.
+
+Coverage targets the two findings the module closes:
+  * V9/D-13 — SPDX detection from real license fixture texts (incl. the
+    tricky GNU-family cross-citation and Apache-vs-MIT phrasing) and the
+    three-tier mapping that replaces the old boolean substring filter.
+  * V3/D-5  — decontamination hits in exact, URL, and mixed-case forms, and
+    the hard never-admit rule in the composed verdict.
+
+CPU-only, stdlib + tmp_path fixtures — no network, no Docker.
+"""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from composer_replication.datagen.repo_gate import (
+    DECONTAMINATION_LIST,
+    GateVerdict,
+    LicenseInfo,
+    Tier,
+    detect_license,
+    gate_repo,
+    is_eval_contaminated,
+    license_tier,
+    load_decontamination_list,
+    normalize_repo,
+)
+
+# ---------------------------------------------------------------------
+# License fixture texts — distinctive excerpts of the real license texts.
+# ---------------------------------------------------------------------
+
+MIT_TEXT = """\
+MIT License
+
+Copyright (c) 2026 Example Org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction...
+"""
+
+# The tricky Apache case: the words "permission" and "license" appear in both
+# MIT and Apache; only Apache names itself with a version.
+APACHE_TEXT = """\
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+"""
+
+BSD3_TEXT = """\
+BSD 3-Clause License
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice...
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+"""
+
+BSD2_TEXT = """\
+BSD 2-Clause License
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice.
+2. Redistributions in binary form must reproduce the above copyright notice.
+"""
+
+ISC_TEXT = """\
+ISC License
+
+Copyright (c) 2026, Example Org
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+"""
+
+# GPL-3.0 §13 cross-cites the AGPL by full name — the classic trap for
+# full-body substring matchers. Header anchoring must win.
+GPL3_TEXT = """\
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc.
+
+  13. Use with the GNU Affero General Public License.
+  Notwithstanding any other provision of this License, you have
+  permission to link or combine any covered work with a work licensed
+  under version 3 of the GNU Affero General Public License...
+"""
+
+GPL2_TEXT = """\
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+"""
+
+# AGPL-3.0 §13 reciprocally cites the plain GPL by name.
+AGPL3_TEXT = """\
+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007
+
+  13. Remote Network Interaction; Use with the GNU General Public License.
+  Notwithstanding any other provision of this License...
+"""
+
+LGPL21_TEXT = """\
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+"""
+
+MPL2_TEXT = """\
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+1.1. "Contributor"
+    means each individual or legal entity that creates, contributes to
+    the creation of, or owns Covered Software.
+"""
+
+UNLICENSE_TEXT = """\
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any means.
+"""
+
+
+def _repo_with_license(tmp_path: Path, text: str, filename: str = "LICENSE") -> Path:
+    tmp_path.joinpath(filename).write_text(text, encoding="utf-8")  # drop the license file in place
+    return tmp_path  # the directory itself doubles as the fake repo root
+
+
+# ---------------------------------------------------------------------
+# detect_license — SPDX classification from file text
+# ---------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        pytest.param(MIT_TEXT, "MIT", id="mit"),
+        pytest.param(APACHE_TEXT, "Apache-2.0", id="apache2"),
+        pytest.param(BSD3_TEXT, "BSD-3-Clause", id="bsd3"),
+        pytest.param(BSD2_TEXT, "BSD-2-Clause", id="bsd2"),
+        pytest.param(ISC_TEXT, "ISC", id="isc"),
+        pytest.param(GPL3_TEXT, "GPL-3.0", id="gpl3"),
+        pytest.param(GPL2_TEXT, "GPL-2.0", id="gpl2"),
+        pytest.param(AGPL3_TEXT, "AGPL-3.0", id="agpl3"),
+        pytest.param(LGPL21_TEXT, "LGPL-2.1", id="lgpl21"),
+        pytest.param(MPL2_TEXT, "MPL-2.0", id="mpl2"),
+        pytest.param(UNLICENSE_TEXT, "Unlicense", id="unlicense"),
+    ],
+)
+def test_detect_license_spdx_ids(tmp_path: Path, text: str, expected: str):
+    """Each fixture text, written as ``LICENSE``, must map to its SPDX id."""
+    detected = detect_license(_repo_with_license(tmp_path, text))
+    assert detected.spdx_id == expected
+    assert detected.signal == "license_file"
+    assert detected.source == "LICENSE"
+
+
+def test_gpl3_not_misread_as_agpl(tmp_path: Path):
+    """The GPL-3.0 fixture cites the AGPL by name in its §13 body text; the
+    header-anchored match has to keep the verdict at GPL-3.0 regardless."""
+    repo_root = _repo_with_license(tmp_path, GPL3_TEXT)
+    assert detect_license(repo_root).spdx_id == "GPL-3.0"
+
+
+def test_apache_notice_without_header_still_apache(tmp_path: Path):
+    """A bare 'Licensed under the Apache License, Version 2.0' notice lacks the
+    canonical title block, so detection has to come from the body fallback —
+    and must not drift to MIT because of vocabulary the two licenses share."""
+    boilerplate = "".join([
+        "Copyright 2026 Example Org\n\n",
+        "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "you may not use this file except in compliance with the License.\n",
+    ])
+    detected = detect_license(_repo_with_license(tmp_path, boilerplate))
+    assert detected.spdx_id == "Apache-2.0"
+
+
+def test_detect_license_alternate_filenames(tmp_path: Path):
+    copying = detect_license(_repo_with_license(tmp_path, GPL2_TEXT, filename="COPYING"))
+    assert copying.spdx_id == "GPL-2.0"
+    assert copying.source == "COPYING"
+    # The second write reuses tmp_path, so COPYING and LICENSE.md now coexist;
+    # the LICENSE* names (LICENSE/LICENSE.txt/LICENSE.md) outrank COPYING.
+    both = detect_license(_repo_with_license(tmp_path, MIT_TEXT, filename="LICENSE.md"))
+    assert both.spdx_id == "MIT"
+    assert both.source == "LICENSE.md"
+
+
+def test_detect_license_unknown_text(tmp_path: Path):
+    repo_root = _repo_with_license(tmp_path, "All rights reserved. Ask legal.")
+    assert detect_license(repo_root).spdx_id == "unknown"  # unrecognized text must not be guessed
+
+
+def test_detect_license_no_files(tmp_path: Path):
+    empty_repo_result = detect_license(tmp_path)  # nothing has been written into tmp_path
+    assert empty_repo_result == LicenseInfo(spdx_id="unknown", signal="none")
+
+
+def test_classifier_secondary_signal(tmp_path: Path):
+    """With no LICENSE file present, the trove classifier in pyproject.toml is
+    the deciding signal and must be reported as the ``classifier`` signal."""
+    pyproject_body = (
+        '[project]\nname = "x"\nclassifiers = [\n'
+        '    "License :: OSI Approved :: MIT License",\n]\n'
+    )
+    (tmp_path / "pyproject.toml").write_text(pyproject_body, encoding="utf-8")
+    detected = detect_license(tmp_path)
+    assert detected.spdx_id == "MIT"
+    assert detected.signal == "classifier"
+    assert detected.source == "pyproject.toml"
+
+
+def test_classifier_pep639_expression(tmp_path: Path):
+    """A plain ``license = "Apache-2.0"`` string in pyproject.toml is honoured."""
+    manifest = tmp_path / "pyproject.toml"
+    manifest.write_text('[project]\nname = "x"\nlicense = "Apache-2.0"\n', encoding="utf-8")
+    detected = detect_license(tmp_path)
+    assert detected.spdx_id == "Apache-2.0"
+    assert detected.signal == "classifier"
+
+
+def test_license_file_beats_classifier(tmp_path: Path):
+    """A classifiable LICENSE file outranks a conflicting pyproject classifier:
+    GPL-3.0 text plus an MIT classifier must still come back as GPL-3.0."""
+    _repo_with_license(tmp_path, GPL3_TEXT)
+    classifier_line = 'classifiers = ["License :: OSI Approved :: MIT License"]\n'
+    (tmp_path / "pyproject.toml").write_text(classifier_line, encoding="utf-8")
+    detected = detect_license(tmp_path)
+    # The file signal has to be the one that is recorded, not the classifier.
+    assert detected.spdx_id == "GPL-3.0"
+    assert detected.signal == "license_file"
+
+
+def test_unclassifiable_file_falls_back_to_classifier(tmp_path: Path):
+    """An unrecognizable LICENSE text defers to the pyproject classifier."""
+    _repo_with_license(tmp_path, "Custom corporate license, see legal dept.")
+    classifier_line = 'classifiers = ["License :: OSI Approved :: ISC License"]\n'
+    (tmp_path / "pyproject.toml").write_text(classifier_line, encoding="utf-8")
+    detected = detect_license(tmp_path)
+    assert detected.spdx_id == "ISC"
+    assert detected.signal == "classifier"
+
+
+# ---------------------------------------------------------------------
+# license_tier — tiers, not a boolean (D-13)
+# ---------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(("spdx", "tier"), [
+    # ids expected in the REDISTRIBUTABLE tier
+    ("MIT", Tier.REDISTRIBUTABLE),
+    ("Apache-2.0", Tier.REDISTRIBUTABLE),
+    ("BSD-2-Clause", Tier.REDISTRIBUTABLE),
+    ("BSD-3-Clause", Tier.REDISTRIBUTABLE),
+    ("ISC", Tier.REDISTRIBUTABLE),
+    ("Unlicense", Tier.REDISTRIBUTABLE),
+    # weak copyleft: train on it, do not redistribute
+    ("MPL-2.0", Tier.TRAINABLE_ONLY),
+    ("LGPL-2.1", Tier.TRAINABLE_ONLY),
+    ("LGPL-3.0", Tier.TRAINABLE_ONLY),
+    # GPL family, undetected, or unrecognized ids
+    ("GPL-2.0", Tier.EXCLUDED),
+    ("GPL-3.0", Tier.EXCLUDED),
+    ("AGPL-3.0", Tier.EXCLUDED),
+    ("unknown", Tier.EXCLUDED),
+    ("WTFPL", Tier.EXCLUDED),  # unrecognized id → fail closed
+])
+def test_license_tier_mapping(spdx: str, tier: Tier):
+    assert license_tier(LicenseInfo(signal="license_file", spdx_id=spdx)) is tier
+
+
+# ---------------------------------------------------------------------
+# Decontamination (V3 / D-5)
+# ---------------------------------------------------------------------
+
+
+def test_decontamination_list_has_the_canonical_12():
+    assert len(DECONTAMINATION_LIST) == 12
+    for eval_repo in ("django/django", "sympy/sympy"):  # spot-check two entries
+        assert eval_repo in DECONTAMINATION_LIST
+
+
+@pytest.mark.parametrize("repo", [
+    "django/django",  # exact
+    "Django/Django",  # case
+    "https://github.com/django/django",  # https URL
+    "https://github.com/django/django.git",  # URL + .git
+    "git@github.com:django/django.git",  # ssh URL
+    "https://github.com/django/django/",  # trailing slash
+])
+def test_is_eval_contaminated_hits(repo: str):
+    """Every spelling of the same eval repo must be flagged, and the result
+    must be the literal ``True`` (identity check), not merely truthy."""
+    flagged = is_eval_contaminated(repo)
+    assert flagged is True
+
+
+@pytest.mark.parametrize("repo", [
+    "pandas-dev/pandas",
+    "https://github.com/torvalds/linux",
+    "someuser/django",  # fork-org differs: NOT the eval repo
+])
+def test_is_eval_contaminated_misses(repo: str):
+    """Repos that are not on the list — including a same-named repo under a
+    different owner — must come back as the literal ``False``."""
+    flagged = is_eval_contaminated(repo)
+    assert flagged is False
+
+
+def test_normalize_repo_forms():
+    expected = {"git@github.com:PSF/Requests.git": "psf/requests", "https://github.com/pydata/xarray/tree/main": "pydata/xarray"}
+    assert {raw: normalize_repo(raw) for raw in expected} == expected  # ssh form and deep URL both reduce to owner/name
+
+
+def test_extension_list_from_json(tmp_path: Path):
+    """Extra eval repos loaded from a JSON file are matched through the same
+    normalization as the built-in list (URL form and case differences hit)."""
+    json_file = tmp_path / "extra.json"
+    json_file.write_text(json.dumps(["SWE-Gym/Extra-Repo"]), encoding="utf-8")
+    loaded = load_decontamination_list(json_file)
+    assert is_eval_contaminated("https://github.com/swe-gym/extra-repo", extra_list=loaded)
+    assert not is_eval_contaminated("swe-gym/other-repo", extra_list=loaded)
+
+
+def test_extension_list_rejects_non_list(tmp_path: Path):
+    """A JSON object (rather than a JSON array) must raise ValueError."""
+    not_a_list = tmp_path / "bad.json"
+    not_a_list.write_text('{"repo": "a/b"}', encoding="utf-8")
+    pytest.raises(ValueError, load_decontamination_list, not_a_list)
+
+
+# ---------------------------------------------------------------------
+# gate_repo — verdict composition
+# ---------------------------------------------------------------------
+
+
+def test_gate_admits_permissive_clean_repo(tmp_path: Path):
+    verdict = gate_repo("example/clean", _repo_with_license(tmp_path, MIT_TEXT))
+    assert isinstance(verdict, GateVerdict)
+    # MIT text and a repo name off the eval list: admitted, nothing to explain.
+    assert verdict.admitted is True
+    assert verdict.tier is Tier.REDISTRIBUTABLE and verdict.contaminated is False
+    assert verdict.reasons == []
+
+
+def test_gate_contaminated_never_admitted_even_if_permissive(tmp_path: Path):
+    """Decontamination outranks licensing (V3 hard rule): a repo on the eval
+    list is rejected even when its license text is MIT."""
+    repo_root = _repo_with_license(tmp_path, MIT_TEXT)
+    verdict = gate_repo("https://github.com/pallets/flask", repo_root)
+    assert verdict.contaminated is True and verdict.admitted is False
+    assert any("decontamination" in reason for reason in verdict.reasons)
+    # The license is still detected and kept on the verdict for the manifest.
+    assert verdict.license_info.spdx_id == "MIT"
+
+
+def test_gate_excluded_tier_never_admitted(tmp_path: Path):
+    """An AGPL-3.0 repo lands in the EXCLUDED tier and is turned away."""
+    verdict = gate_repo("example/agpl-repo", _repo_with_license(tmp_path, AGPL3_TEXT))
+    assert verdict.tier is Tier.EXCLUDED and verdict.admitted is False
+    assert any("EXCLUDED" in reason for reason in verdict.reasons)
+
+
+def test_gate_trainable_only_admitted_with_reason(tmp_path: Path):
+    """D-13: an MPL-2.0 repo is admitted for training, and the verdict's
+    reasons must spell out the tier and the no-redistribution constraint."""
+    verdict = gate_repo("example/mpl-repo", _repo_with_license(tmp_path, MPL2_TEXT))
+    assert verdict.tier is Tier.TRAINABLE_ONLY
+    assert verdict.admitted is True
+    for needle in ("TRAINABLE_ONLY", "redistributed"):
+        assert any(needle in reason for reason in verdict.reasons)
+
+
+def test_gate_no_repo_root_fails_closed():
+    """With ``repo_root=None`` no license can be detected, so the verdict is
+    unknown → EXCLUDED → not admitted (V9: default closed, never open)."""
+    verdict = gate_repo("example/unfetched", None)
+    assert (verdict.license_info.spdx_id, verdict.tier) == ("unknown", Tier.EXCLUDED)
+    assert verdict.tier is Tier.EXCLUDED
+    assert verdict.admitted is False
+    assert any("failing closed" in reason for reason in verdict.reasons)
+
+
+def test_gate_extra_decontamination_list(tmp_path: Path):
+    """A caller-supplied extra list is honoured, with URL/case normalization."""
+    verdict = gate_repo(
+        "https://github.com/My-Eval/Secret-Benchmark.git",
+        _repo_with_license(tmp_path, MIT_TEXT),
+        extra_decontamination=frozenset({"my-eval/secret-benchmark"}),
+    )
+    assert verdict.contaminated is True
+    assert verdict.admitted is False
diff --git a/composer_replication/datagen/tests/test_rollout_harness.py b/composer_replication/datagen/tests/test_rollout_harness.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f4f3d329c11b2e0b0c4dc2ffa4e10f35036c8bc
--- /dev/null
+++ b/composer_replication/datagen/tests/test_rollout_harness.py
@@ -0,0 +1,103 @@
+"""Tests for the rollout harness (deepread finding V2 — the SFT-corpus producer)."""
+from __future__ import annotations
+
+from composer_replication.datagen.env import FeatureDeletionEnv
+from composer_replication.datagen.rollout_harness import (
+    ScriptedPolicy,
+    admit,
+    collect_trajectory,
+)
+from composer_replication.datagen.sandbox import FakeSandbox
+from composer_replication.datagen.schema import FeatureDeletionTask
+from composer_replication.datagen.trajectory import CanonicalTrajectory, ToolCall
+
+
+def _task() -> FeatureDeletionTask:
+    """Minimal task fixture: two fail-to-pass tests and one pass-to-pass guard."""
+    fields = dict(task_id="t1", repo="org/repo", base_commit="abc",
+                  broken_image="img:1", test_command="pytest -q",
+                  fail_to_pass=("t/a.py::t1", "t/a.py::t2"),
+                  pass_to_pass=("t/a.py::keep",))
+    return FeatureDeletionTask(**fields)
+
+
+def _env(outcomes: dict[str, bool]) -> FeatureDeletionEnv:
+    return FeatureDeletionEnv(FakeSandbox(test_outcomes=outcomes))  # env over a FakeSandbox seeded with `outcomes`
+
+
+def test_collect_trajectory_full_pass():
+    """The scripted policy flips both fail-to-pass tests via the FakeSandbox
+    ``set_outcome`` pseudo-action and then submits: grade 1.0, two steps."""
+    env = _env({"t/a.py::keep": True})
+    fix_everything = ToolCall("set_outcome", {"outcomes": {"t/a.py::t1": True, "t/a.py::t2": True}})
+    policy = ScriptedPolicy(actions=[fix_everything, "final answer: implemented the feature"])
+    traj = collect_trajectory(env, _task(), policy)
+    assert isinstance(traj, CanonicalTrajectory)
+    assert traj.grade == 1.0
+    assert traj.guard_ok is True and traj.hacked is False
+    assert len(traj.steps) == 2
+    # The first step is the tool call; its result is the env.step observation.
+    first_step = traj.steps[0]
+    assert isinstance(first_step.action, ToolCall)
+    assert first_step.result == "ok"
+    assert traj.provenance["source"] == "rollout_harness"
+
+
+def test_collect_trajectory_guard_broken_zeroes_reward():
+    """Both fail-to-pass tests are set to pass, but the pass-to-pass guard
+    is failing: the grade must be 0.0 and guard_ok must be False."""
+    env = _env({"t/a.py::keep": False})  # functional guard broken
+    outcomes = {"t/a.py::t1": True, "t/a.py::t2": True, "t/a.py::keep": False}
+    breaking_call = ToolCall("set_outcome", {"outcomes": outcomes})
+    policy = ScriptedPolicy(actions=[breaking_call, "done"])
+    traj = collect_trajectory(env, _task(), policy)
+    assert traj.grade == 0.0
+    assert traj.guard_ok is False
+
+
+def test_collect_trajectory_near_miss():
+    """Only one of the two fail-to-pass tests is made to pass: grade 0.5 with
+    the guard still intact."""
+    env = _env({"t/a.py::keep": True})
+    partial_fix = ToolCall("set_outcome", {"outcomes": {"t/a.py::t1": True}})  # 1 of 2
+    policy = ScriptedPolicy(actions=[partial_fix, "done"])
+    traj = collect_trajectory(env, _task(), policy)
+    assert traj.grade == 0.5
+    assert traj.guard_ok is True
+
+
+def test_collect_trajectory_max_turns_grades_anyway():
+    """A policy that only ever runs ``ls`` is still graded (max_turns=3)."""
+    never_submits = ScriptedPolicy(actions=[ToolCall("bash", {"command": "ls"})] * 50)
+    traj = collect_trajectory(_env({"t/a.py::keep": True}), _task(), never_submits, max_turns=3)
+    assert traj.grade is not None  # graded despite never submitting
+
+
+# ---------------------------------------------------------------------
+# Admission routing (typed train-on-all, final report §4)
+# ---------------------------------------------------------------------
+
+
+def _t(grade, guard_ok=True, hacked=False) -> CanonicalTrajectory:
+    return CanonicalTrajectory(task_id="x", grade=grade, guard_ok=guard_ok, hacked=hacked)  # step-less trajectory for admit() tests
+
+
+def test_admit_routes_clean_pass_to_sft():
+    verdict = admit(_t(1.0))  # full grade, guard intact, not hacked
+    assert verdict.sft_admitted and not (verdict.dpo_candidate or verdict.rejected)
+
+
+def test_admit_routes_near_miss_to_dpo():
+    verdict = admit(_t(0.5))  # partial grade, guard intact, not hacked
+    assert verdict.dpo_candidate and not (verdict.sft_admitted or verdict.rejected)
+
+
+def test_admit_rejects_hacked_even_at_full_grade():
+    flagged = admit(_t(1.0, hacked=True))  # grade is perfect, but hacked=True
+    assert flagged.rejected and ("hack monitor flagged" in flagged.reasons)
+
+
+def test_admit_rejects_guard_broken_and_ungraded():
+    # guard broken at full grade, no grade at all, and a zero grade
+    doomed = (_t(1.0, guard_ok=False), _t(None), _t(0.0))
+    assert all(admit(traj).rejected for traj in doomed)
diff --git a/composer_replication/datagen/tests/test_swesmith_adapter.py b/composer_replication/datagen/tests/test_swesmith_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..2068bfe97eb457bd046ea6c9f159ce7ff7a19078
--- /dev/null
+++ b/composer_replication/datagen/tests/test_swesmith_adapter.py
@@ -0,0 +1,165 @@
+"""Tests for the SWE-smith adapter (deepread finding V4 — buy-vs-build).
+
+The load-bearing coverage: the PATCH-SEMANTICS INVERSION (SWE-smith's patch
+introduces the bug; golden_diff must be its reverse) and the mechanical
+reverse_unified_diff round-trip.
+"""
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from composer_replication.datagen.schema import FeatureDeletionTask
+from composer_replication.datagen.substrates import SweBenchAdapter
+from composer_replication.datagen.swesmith_adapter import (
+    UNREVERSED_MARKER,
+    SwesmithAdapter,
+    load_swesmith_instances,
+    parse_strategy,
+    reverse_unified_diff,
+)
+
+BUG_PATCH = """\
+diff --git a/pkg/mod.py b/pkg/mod.py
+index 1111111..2222222 100644
+--- a/pkg/mod.py
++++ b/pkg/mod.py
+@@ -1,4 +1,3 @@
+ def add(a, b):
+-    return a + b
++    return a - b
+ # trailing context
+"""
+
+
+def _instance(**over) -> dict:
+    """Build a SWE-smith-style instance dict; keyword args override fields."""
+    return {
+        "instance_id": "getmoto__moto.abc1234.lm_modify__1a2b",
+        "repo": "getmoto/moto",
+        "base_commit": "abc1234",
+        "patch": BUG_PATCH,
+        "FAIL_TO_PASS": json.dumps(["tests/test_mod.py::test_add"]),
+        "PASS_TO_PASS": json.dumps(["tests/test_mod.py::test_other"]),
+        "image_name": "swesmith.x86_64.getmoto__moto:latest",
+        **over,
+    }
+
+
+# ---------------------------------------------------------------------
+# Strategy parsing
+# ---------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(("iid", "expected"), [
+    ("r__x.abc.lm_modify__1", "lm_modify"),
+    ("r__x.abc.lm_rewrite__1", "lm_rewrite"),
+    ("r__x.abc.func_pm_ctrl_invert_if__1", "procedural"),  # func_pm_* suffix
+    ("r__x.abc.func_basic__1", "procedural"),  # func_basic suffix
+    ("r__x.abc.combine_file__1", "combine"),
+    ("r__x.abc.combine_module__2", "combine"),
+    ("r__x.abc.pr_1234", "pr_mirror"),
+    ("r__x.abc.mystery__1", "unknown"),  # unrecognized suffix
+])
+def test_parse_strategy(iid, expected):
+    assert expected == parse_strategy(iid)
+
+
+# ---------------------------------------------------------------------
+# reverse_unified_diff
+# ---------------------------------------------------------------------
+
+
+def test_reverse_swaps_adds_and_removes():
+    reversed_patch = reverse_unified_diff(BUG_PATCH)
+    assert reversed_patch is not None
+    must_contain = (
+        "-    return a - b",  # the bug ADDED this line, so the reverse REMOVES it
+        "+    return a + b",  # ...and puts the original line back
+        "@@ -1,3 +1,4 @@",  # hunk ranges swapped: -1,4 +1,3 → -1,3 +1,4
+        " def add(a, b):",  # context lines are untouched
+        " # trailing context",
+    )
+    assert all(fragment in reversed_patch for fragment in must_contain)
+
+
+def test_reverse_round_trip_is_identity_on_body():
+    """Reversing twice restores hunk headers and +/- bodies (file headers may normalize)."""
+    rev2 = reverse_unified_diff(reverse_unified_diff(BUG_PATCH) or "")
+    assert rev2 is not None  # a failed reversal now fails here, not as an AttributeError below
+    # Tuple membership, not `in "+-@"`: "" is a substring of every str, so blank lines used to be kept.
+    bodies = [[ln for ln in t.splitlines() if ln[:1] in ("+", "-", "@") and not ln.startswith(("+++", "---"))] for t in (BUG_PATCH, rev2)]
+    assert bodies[0] == bodies[1]
+
+
+def test_reverse_refuses_renames_and_binary():
+    rename_patch = "diff --git a/x b/y\nrename from x\nrename to y\n"
+    binary_patch = "diff --git a/x b/x\nGIT binary patch\nliteral 5\n"
+    for unsupported in (rename_patch, binary_patch, "", "no hunks here"):
+        assert reverse_unified_diff(unsupported) is None
+
+
+# ---------------------------------------------------------------------
+# Adapter
+# ---------------------------------------------------------------------
+
+
+def test_to_task_golden_diff_is_the_fix_not_the_bug():
+    """Semantic inversion: golden_diff re-adds `a + b` and removes the buggy `a - b`."""
+    task, meta = SwesmithAdapter().to_task_with_meta(_instance())
+    assert isinstance(task, FeatureDeletionTask)
+    assert meta.diff_reversed is True
+    assert meta.strategy == "lm_modify"
+    golden = task.golden_diff
+    assert "+    return a + b" in golden
+    assert "-    return a - b" in golden
+    assert UNREVERSED_MARKER not in golden
+
+
+def test_to_task_unreversible_patch_gets_marker():
+    rename_patch = "diff --git a/x b/y\nrename from x\nrename to y\n@@ -1 +1 @@\n-a\n+b\n"
+    task, meta = SwesmithAdapter().to_task_with_meta(_instance(patch=rename_patch))
+    assert meta.diff_reversed is False  # rename patches are refused by the reverser
+    assert task.golden_diff.startswith(UNREVERSED_MARKER)
+
+
+def test_image_resolution_prefers_instance_field_then_convention():
+    adapter = SwesmithAdapter()
+    assert adapter.image_for(_instance()) == "swesmith.x86_64.getmoto__moto:latest"  # `image_name` field
+    assert adapter.image_for(_instance(image_name=None, docker_image="custom:tag")) == "custom:tag"
+    fallback_only = _instance(image_name=None)
+    fallback_only.pop("docker_image", None)
+    assert adapter.image_for(fallback_only) == "swesmith.x86_64.getmoto__moto:latest"  # from the repo slug
+
+
+def test_f2p_p2p_tuple_handling_matches_swebench_semantics():
+    """A real list and a JSON-encoded empty list must both come out as tuples
+    on the task (two-element tuple and empty tuple respectively)."""
+    raw_list = ["t/a.py::t1", "t/a.py::t2"]  # real list, not JSON string
+    task = SwesmithAdapter().to_task(_instance(FAIL_TO_PASS=raw_list, PASS_TO_PASS=json.dumps([])))
+    assert task.fail_to_pass == ("t/a.py::t1", "t/a.py::t2")
+    assert task.pass_to_pass == ()
+
+
+def test_difficulty_priors_by_strategy():
+    pr_mirror = SwesmithAdapter().to_task(_instance(instance_id="r__x.abc.pr_99")).difficulty_prior
+    procedural = SwesmithAdapter().to_task(_instance(instance_id="r__x.abc.func_pm_remove_loop__1")).difficulty_prior
+    assert pr_mirror < procedural  # PR Mirror harder prior
+
+
+def test_redistributable_filter_interplay():
+    """Repo-level licensing belongs to repo_gate, yet the per-instance filter
+    on SweBenchAdapter still applies whenever a license field IS present."""
+    gpl_task = SwesmithAdapter().to_task(_instance(license_name="GPL-3.0"))
+    assert SweBenchAdapter.is_redistributable(gpl_task) is False
+    mit_task = SwesmithAdapter().to_task(_instance(license_name="MIT"))
+    assert SweBenchAdapter.is_redistributable(mit_task) is True
+
+
+def test_load_local_jsonl(tmp_path):
+    fixture = tmp_path / "fixtures.jsonl"
+    lines = [json.dumps(_instance(instance_id=f"r__x.abc.pr_{i}")) for i in range(5)]
+    fixture.write_text("\n".join(lines))
+    rows = load_swesmith_instances(str(fixture), limit=3)
+    assert len(rows) == 3 and rows[0]["instance_id"] == "r__x.abc.pr_0"
diff --git a/composer_replication/datagen/tests/test_trajectory.py b/composer_replication/datagen/tests/test_trajectory.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc58a1a8fb97b5a4cc47f42774857a3f156f55d
--- /dev/null
+++ b/composer_replication/datagen/tests/test_trajectory.py
@@ -0,0 +1,127 @@
+"""Tests for the canonical trajectory IR (deepread findings V2/D-11/D-8).
+
+The load-bearing test is the SENTINEL leak guard: `to_policy_row` must never
+emit golden_diff/deleted_symbols, even though the task dataclass carries them.
+"""
+from __future__ import annotations
+
+import json
+
+from composer_replication.datagen.schema import FeatureDeletionTask
+from composer_replication.datagen.trajectory import (
+    CanonicalTrajectory,
+    ToolCall,
+    TrajectoryStep,
+    from_trace_states,
+    to_policy_row,
+    to_sft_messages,
+)
+from composer_replication.teacher_replay import TraceState
+
+
+def _task(**over) -> FeatureDeletionTask:
+    base = dict(
+        task_id="t1", repo="org/repo", base_commit="abc",
+        broken_image="img:1", test_command="pytest -q",
+        fail_to_pass=("t/a.py::t1",), pass_to_pass=("t/a.py::t2",),
+        golden_diff="SENTINEL_NEVER_LEAK",
+        deleted_symbols=("secret_fn",),
+    )
+    base.update(over)
+    return FeatureDeletionTask(**base)
+
+
+def _traj() -> CanonicalTrajectory:
+    return CanonicalTrajectory(
+        task_id="t1",
+        steps=[
+            TrajectoryStep(observation="repo is broken", action=ToolCall("bash", {"command": "pytest"}),
+                           result="2 failed", tool_error=False),
+            TrajectoryStep(observation="2 failed", action="here is my final patch",
+                           result="graded", tool_error=False),
+        ],
+        grade=1.0, guard_ok=True, hacked=False,
+        provenance={"source": "test"},
+    )
+
+
+# ---------------------------------------------------------------------
+# ToolCall canonical form — the v1 divergence algebra
+# ---------------------------------------------------------------------
+
+
+def test_canonical_form_is_order_insensitive_on_args():
+    a = ToolCall("edit", {"path": "x.py", "content": "y"})
+    b = ToolCall("edit", {"content": "y", "path": "x.py"})
+    assert a.canonical_form() == b.canonical_form()
+
+
+def test_canonical_form_distinguishes_name_and_args():
+    assert ToolCall("bash", {"command": "ls"}).canonical_form() != \
+        ToolCall("bash", {"command": "ls -la"}).canonical_form()
+    assert ToolCall("read", {"f": "x"}).canonical_form() != \
+        ToolCall("write", {"f": "x"}).canonical_form()
+
+
+# ---------------------------------------------------------------------
+# THE leak guard (finding D-8)
+# ---------------------------------------------------------------------
+
+
+def test_policy_row_never_contains_golden_diff_or_deleted_symbols():
+    row = to_policy_row(_traj(), _task())
+    blob = json.dumps(row)
+    assert "SENTINEL_NEVER_LEAK" not in blob
+    assert "secret_fn" not in blob
+    assert "golden_diff" not in blob
+    assert "deleted_symbols" not in blob
+    # And the row still carries what the policy MAY see.
+    assert row["repo"] == "org/repo"
+    assert row["fail_to_pass"] == ["t/a.py::t1"]
+    assert row["grade"] == 1.0
+
+
+# ---------------------------------------------------------------------
+# IR ↔ SFT messages
+# ---------------------------------------------------------------------
+
+
+def test_to_sft_messages_alternates_roles():
+    msgs = to_sft_messages(_traj())
+    assert msgs[0] == {"role": "user", "content": "repo is broken"}
+    assert msgs[1]["role"] == "assistant"
+    assert "[TOOL_USE] name=bash" in msgs[1]["content"]
+    assert msgs[2] == {"role": "user", "content": "2 failed"}
+
+
+# ---------------------------------------------------------------------
+# Claude Code → IR adapter
+# ---------------------------------------------------------------------
+
+
+def test_from_trace_states_parses_single_tool_use_and_error_flag():
+    states: list[TraceState] = [
+        {
+            "state_id": "sess1::0000",
+            "messages": [
+                {"role": "system", "content": "sys"},
+                {"role": "user", "content": "[TOOL_RESULT (ERROR)] (id=x)\nboom",
+                 "tool_error": True},
+            ],
+            "student_action": '[TOOL_USE] name=Bash input={"command":"ls"}',
+        },
+        {
+            "state_id": "sess1::0001",
+            "messages": [{"role": "user", "content": "plain prompt"}],
+            "student_action": "I think the fix is...\n\n[TOOL_USE] name=Edit input={\"p\":1}\n\n[TOOL_USE] name=Bash input={\"c\":2}",
+        },
+    ]
+    traj = from_trace_states(states)
+    assert traj.task_id == "sess1"
+    assert traj.grade is None  # ungraded — Claude Code traces have no oracle
+    s0, s1 = traj.steps
+    assert isinstance(s0.action, ToolCall) and s0.action.name == "Bash"
+    assert s0.tool_error is True
+    # Multi-tool turn stays as the raw string (honest, not guessed).
+    assert isinstance(s1.action, str)
+    assert s1.tool_error is False
diff --git a/composer_replication/datagen/trajectory.py b/composer_replication/datagen/trajectory.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f891233807c57334e3a9129986be50a89846582
--- /dev/null
+++ b/composer_replication/datagen/trajectory.py
@@ -0,0 +1,203 @@
+"""trajectory.py — the canonical trajectory IR (deepread findings V2/D-11/D-8).
+
+THE GAP THIS CLOSES: the repo had 3 (heading to 5) incompatible trajectory
+shapes — Claude Code TraceState text serialization, Bedrock `.jsonl.out` rows,
+the planned tree/rollout/OpenHands shapes — with no shared schema, and the
+divergence gate rested on a whitespace-collapse string normalizer
+(`teacher_replay._normalize_action`, self-admitted skeleton). This module is the
+single intermediate representation every producer adapts INTO and every corpus
+writer reads FROM.
+
+SECURITY INVARIANT (finding D-8): `FeatureDeletionTask.golden_diff` uses
+`repr=False`, but `dataclasses.asdict()` and naive JSON serialization still
+include it. `to_policy_row()` is the ONE serializer allowed to produce
+policy-visible rows, and its output is unit-tested to never contain
+`golden_diff` or `deleted_symbols`.
+"""
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass, field
+from typing import Any
+
+from composer_replication.datagen.schema import FeatureDeletionTask
+from composer_replication.teacher_replay import TraceState
+
+#: Bump when the IR shape changes; carried on every trajectory + corpus row.
+SCHEMA_VERSION = "1"
+
+
+@dataclass(frozen=True)
+class ToolCall:
+    """One structured tool invocation — the unit of the action algebra.
+
+    `canonical_form()` is the v1 divergence-gate algebra (finding D-3): two
+    actions are "the same" iff their canonical forms match (tool name + sorted,
+    JSON-normalized args). This replaces the whitespace-collapse stub that made
+    the divergence gate fire on noise. v2 will need arg-level normalization
+    (path equivalence, whitespace-insensitive code args); keep that evolution
+    HERE so every consumer inherits it.
+    """
+
+    name: str
+    args: dict = field(default_factory=dict)
+
+    def canonical_form(self) -> str:
+        try:
+            args_json = json.dumps(self.args, sort_keys=True, separators=(",", ":"))
+        except (TypeError, ValueError):
+            args_json = str(self.args)
+        return f"{self.name}:{args_json}"
+
+
+@dataclass
+class TrajectoryStep:
+    """One agent turn: what it saw, what it did, what came back."""
+
+    observation: str
+    action: ToolCall | str          # str = plain text / final message
+    result: str | None = None      # tool output observed AFTER the action
+    tool_error: bool = False
+
+
+@dataclass
+class CanonicalTrajectory:
+    """The IR: an episode (or trace) as a typed step list + outcome + provenance."""
+
+    task_id: str
+    steps: list[TrajectoryStep] = field(default_factory=list)
+    grade: float | None = None      # _grade() pass-fraction; None = ungraded trace
+    guard_ok: bool = True
+    hacked: bool = False
+    provenance: dict = field(default_factory=dict)  # source, policy id, cost_usd, run_id
+    schema_version: str = SCHEMA_VERSION
+
+
+# ---------------------------------------------------------------------
+# Producers → IR
+# ---------------------------------------------------------------------
+
+# ClaudeCodeIngester serializes tool calls as "[TOOL_USE] name=<n> input=<json>".
+_TOOL_USE_RE = re.compile(r"^\[TOOL_USE\] name=(?P<name>\S+) input=(?P<input>\{.*\})$")
+
+
+def _parse_action(student_action: str) -> ToolCall | str:
+    """Parse a TraceState student_action back into a ToolCall where possible.
+
+    Assistant turns serialize as blank-line-joined blocks; if exactly one
+    [TOOL_USE] block is present we recover the structured call (the common
+    case ADR-002 chose one-node-per-turn for). Multi-tool turns and pure-text /
+    thinking turns stay as the raw string — honest about what we can't
+    structure rather than guessing.
+    """
+    # Strip before the prefix test so an indented block still counts as a tool block.
+    blocks = [b.strip() for b in student_action.split("\n\n") if b.strip()]
+    tool_blocks = [b for b in blocks if b.startswith("[TOOL_USE]")]
+    if len(tool_blocks) == 1:
+        m = _TOOL_USE_RE.match(tool_blocks[0])
+        if m:
+            try:
+                return ToolCall(name=m.group("name"), args=json.loads(m.group("input")))
+            except (json.JSONDecodeError, ValueError):
+                pass
+    return student_action
+
+
+def from_trace_states(
+    states: list[TraceState],
+    *,
+    task_id: str = "",
+    provenance: dict | None = None,
+) -> CanonicalTrajectory:
+    """Adapt a Claude Code trace (TraceState list) into the IR.
+
+    HONEST CAPABILITY NOTE (finding D-1): these traces carry no executable
+    environment — no broken_image, no FAIL_TO_PASS — so the resulting
+    trajectory is UNGRADED (`grade=None`) and is admissible only for flat
+    Channel-3 / SFT-style uses, never as a tree seed. Env-grounded rollouts
+    (rollout_harness.collect_trajectory) are the graded producers.
+    """
+    steps: list[TrajectoryStep] = []
+    for s in states:
+        # The observation for step t is the last user message before the turn.
+        obs = ""
+        tool_error = False
+        for msg in reversed(s["messages"]):
+            if msg.get("role") == "user":
+                obs = str(msg.get("content", ""))
+                tool_error = bool(msg.get("tool_error", False))
+                break
+        steps.append(TrajectoryStep(
+            observation=obs,
+            action=_parse_action(s["student_action"]),
+            result=None,
+            tool_error=tool_error,
+        ))
+    prov = {"source": "claude_code", **(provenance or {})}
+    return CanonicalTrajectory(task_id=task_id or (states[0]["state_id"].split("::")[0] if states else ""),
+                               steps=steps, grade=None, provenance=prov)
+
+
+# ---------------------------------------------------------------------
+# IR → consumers
+# ---------------------------------------------------------------------
+
+
+def _action_text(action: ToolCall | str) -> str:
+    if isinstance(action, ToolCall):
+        return f"[TOOL_USE] name={action.name} input=" + json.dumps(
+            action.args, separators=(",", ":")
+        )
+    return action
+
+
+def to_sft_messages(traj: CanonicalTrajectory) -> list[dict]:
+    """IR → OpenAI-style messages for SFT (obs→user, action→assistant)."""
+    messages: list[dict] = []
+    for step in traj.steps:
+        if step.observation:
+            messages.append({"role": "user", "content": step.observation})
+        messages.append({"role": "assistant", "content": _action_text(step.action)})
+        if step.result:
+            messages.append({"role": "user", "content": step.result})
+    return messages
+
+
+#: Task fields the POLICY may see. Everything else (golden_diff,
+#: deleted_symbols) is construction-side and must never reach a corpus row.
+_POLICY_VISIBLE_TASK_FIELDS = (
+    "task_id", "repo", "base_commit", "test_command",
+    "fail_to_pass", "pass_to_pass", "granularity", "difficulty_prior",
+)
+
+
+def to_policy_row(traj: CanonicalTrajectory, task: FeatureDeletionTask) -> dict:
+    """THE policy-visible corpus serializer (finding D-8 — the leak guard).
+
+    Builds the row field-by-field from an allowlist; never `asdict(task)`,
+    which would include `golden_diff` despite its `repr=False`. Unit-tested
+    with a sentinel to prove the absence.
+    """
+    row: dict[str, Any] = {
+        "schema_version": traj.schema_version,
+        "messages": to_sft_messages(traj),
+        "grade": traj.grade,
+        "guard_ok": traj.guard_ok,
+        "hacked": traj.hacked,
+        "provenance": dict(traj.provenance),
+    }
+    for f in _POLICY_VISIBLE_TASK_FIELDS:
+        v = getattr(task, f)
+        row[f] = list(v) if isinstance(v, tuple) else v
+    return row
+
+
+__all__ = [
+    "SCHEMA_VERSION",
+    "ToolCall",
+    "TrajectoryStep",
+    "CanonicalTrajectory",
+    "from_trace_states",
+    "to_sft_messages",
+    "to_policy_row",
+]
diff --git a/composer_replication/diloco/__init__.py b/composer_replication/diloco/__init__.py
index 3b912ad9d27dda9129821fa2c68c82c832472698..5fe9f276efd3d971fe0ca774d5129fe0fa959bf6 100644
--- a/composer_replication/diloco/__init__.py
+++ b/composer_replication/diloco/__init__.py
@@ -4,9 +4,12 @@ Wraps `torchft.local_sgd.DiLoCo` with the framework's conventions:
 - Sign convention is documented LOUDLY here once and tested via Spike 008.
 - The wrapper exposes the same constructor shape as torchft's DiLoCo so a
   future swap-in of the upstream class is a one-line change.
-- Vanilla DiLoCo (Douillard et al. 2023) = `fragment_sync_delay=0`, single
-  fragment. Streaming DiLoCo (Liu et al. 2025) = non-zero delay, multiple
-  fragments. Spike 008 uses vanilla; Streaming is configured by the same API.
+- Vanilla DiLoCo (Douillard et al. 2023, arXiv:2311.08105) =
+  `fragment_sync_delay=0`, single fragment. Streaming DiLoCo (Douillard et
+  al., arXiv:2501.18512 "Streaming DiLoCo with overlapping communication";
+  the separate Eager-Updates work is Kale et al., arXiv:2502.12996 — citation
+  corrected per deepread finding V7) = non-zero delay, multiple fragments.
+  Spike 008 uses vanilla; Streaming is configured by the same API.
 
 Reference: `docs/adrs/ADR-003-diloco-impl.md`.
 
diff --git a/composer_replication/opsd.py b/composer_replication/opsd.py
index cd0e45275cdfc3030837796aef1179bd5f7c5f32..c795c388e17c84feb2f6c854adf0c99a2b35b3f7 100644
--- a/composer_replication/opsd.py
+++ b/composer_replication/opsd.py
@@ -10,17 +10,23 @@ Mathematical reference:
 - OPSD paper: Zhao et al., "Self-Distilled Reasoner: On-Policy Self-Distillation
   for LLMs", arXiv:2601.18734.
 - SDPO paper: Hübotter et al., "Reinforcement Learning via Self-Distillation",
-  arXiv:2601.20802 (formalizes the same loss as Composer 2.5's "Targeted RL with
-  Textual Feedback").
+  arXiv:2601.20802. PROVENANCE (corrected per deepread finding V1): Cursor's
+  blog cites SDPO/OPSD only as *background* ("For more background on this
+  approach see…"), NOT as its mechanism. Published SDPO distills over the FULL
+  rollout with feedback in the prefix and an EMA-regularized teacher; this
+  repo's channel is a turn-localized hint-splice with a live (stop-grad,
+  non-EMA) teacher — a third, blog-inspired design, neither verbatim SDPO nor
+  confirmed-Composer. The kernel below matches OPSD's generalized JSD math.
 
 The loss computes JSD/KL divergence between a teacher distribution (model
 conditioned on privileged information / a hint) and a student distribution
 (model on the original context). Both come from the SAME model — the teacher
 is just "the model with hint inserted into context."
 
-Composer 2.5 uses this with the privileged information being a "hint" inserted
-at the error-turn site. We use the same loss; the data collator constructs
-ctx_teacher = ctx_student + hint_at_error_turn for us.
+Composer 2.5's blog describes inserting a "hint" at the error-turn site and
+distilling the student toward the hint-conditioned distribution "for that turn
+only". The data collator constructs ctx_teacher = ctx_student +
+hint_at_error_turn for us.
 """
 
 from __future__ import annotations
diff --git a/composer_replication/pipeline/__init__.py b/composer_replication/pipeline/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4388cf11938b7098c5d3291301745ae37ffb1485
--- /dev/null
+++ b/composer_replication/pipeline/__init__.py
@@ -0,0 +1,38 @@
+"""composer_replication.pipeline — the Stage-0 dataset-pipeline contract + driver.
+
+THE single reconciled dataset contract (supersedes the two divergent layouts in
+research/design-F1 and design-F2 — deepread finding V8/D-7), the pragmatic
+near-duplicate detector, and the local stage-driver that turns
+(tasks, env, policy) into a carded, deduped, holdout-split corpus.
+"""
+from composer_replication.pipeline.build_corpus import build_corpus
+from composer_replication.pipeline.dedup import (
+    dedup,
+    find_near_duplicates,
+    jaccard_estimate,
+    minhash_signature,
+)
+from composer_replication.pipeline.s3_contract import (
+    RunLayout,
+    RunManifest,
+    write_dataset_card,
+    write_dpo_rows,
+    write_sft_rows,
+    write_tasks,
+    write_tasks_full,
+)
+
+__all__ = [
+    "RunLayout",
+    "RunManifest",
+    "build_corpus",
+    "dedup",
+    "find_near_duplicates",
+    "jaccard_estimate",
+    "minhash_signature",
+    "write_dataset_card",
+    "write_dpo_rows",
+    "write_sft_rows",
+    "write_tasks",
+    "write_tasks_full",
+]
diff --git a/composer_replication/pipeline/build_corpus.py b/composer_replication/pipeline/build_corpus.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c214ed5fdeb3f526b8724b29e8b8d7f0524ea23
--- /dev/null
+++ b/composer_replication/pipeline/build_corpus.py
@@ -0,0 +1,137 @@
+"""build_corpus.py — the local Stage-0 stage-driver (architecture step 6-7).
+
+One function wires the whole local pipeline: holdout-split the task pool
+(holdout tasks are NEVER rolled out — they are the eval anchor), roll out a
+policy over each train task, admit + type + route trajectories
+(sft / dpo-candidate / quarantine), dedup the SFT rows (within-run AND against
+a prior generation's signatures), and write everything through the single
+s3_contract layout with a manifest + dataset card.
+
+Deliberately LOCAL-first (finding D-9): the five-service AWS orchestration is
+Stage 4; this driver must produce one real corpus end-to-end on a laptop with a
+FakeSandbox before anything is distributed. Write-once per layout
+(finding D-21): refuses to run if the manifest already exists.
+"""
+from __future__ import annotations
+
+from typing import Callable, Sequence
+
+from composer_replication.datagen.env import FeatureDeletionEnv
+from composer_replication.datagen.rollout_harness import (
+    RolloutPolicy,
+    admit,
+    collect_trajectory,
+)
+from composer_replication.datagen.schema import FeatureDeletionTask
+from composer_replication.datagen.trajectory import to_policy_row
+from composer_replication.pipeline import s3_contract
+from composer_replication.pipeline.dedup import dedup
+from composer_replication.pipeline.s3_contract import RunLayout, RunManifest
+from composer_replication.safety.holdout import HeldoutSplit
+
+
+def build_corpus(
+    source_tasks: Sequence[FeatureDeletionTask],
+    env_factory: Callable[[], FeatureDeletionEnv],
+    policy_factory: Callable[[], RolloutPolicy],
+    layout: RunLayout,
+    manifest: RunManifest,
+    *,
+    holdout_frac: float = 0.2,
+    holdout_seed: int = 0,
+    max_tasks: int | None = None,
+    cost_per_rollout_usd: float = 0.0,
+    prior_signatures: Sequence[Sequence[int]] | None = None,
+    dedup_threshold: float = 0.85,
+) -> RunManifest:
+    """Run the Stage-0 pipeline over `source_tasks`; returns the final manifest.
+
+    Args:
+        source_tasks: gate_repo-admitted FeatureDeletionTasks (the caller runs
+            `datagen.repo_gate.gate_repo` BEFORE this — the driver assumes the
+            license/decontamination gates already passed).
+        env_factory: fresh `FeatureDeletionEnv` per rollout (a sandbox is
+            stateful; sharing one across episodes leaks trajectory state).
+        policy_factory: fresh policy per rollout (ScriptedPolicy is stateful).
+        manifest: a `RunManifest` with run_id/created_at/budget set by the
+            caller (created_at is caller-passed for reproducibility).
+        cost_per_rollout_usd: accounting hook — API policies should report
+            real cost; the driver enforces `manifest.budget_usd` with it.
+        prior_signatures: previous generation's MinHash signatures
+            (cross-generation dedup, finding D-12).
+
+    Raises:
+        FileExistsError: if the layout already has a manifest (write-once).
+    """
+    if s3_contract.manifest_exists(layout):
+        raise FileExistsError(
+            f"Run layout already has a manifest at {layout.manifest_path} — "
+            "runs are write-once per (root, run_id); mint a new run_id "
+            "(finding D-21 idempotency)."
+        )
+
+    # 1. Holdout split FIRST — held-out tasks are never rolled out, so no
+    #    training signal can derive from them (the HeldoutSplit discipline).
+    split = HeldoutSplit.split(source_tasks, holdout_frac=holdout_frac,
+                               seed=holdout_seed, check_content=True)
+    by_id = {t.task_id: t for t in source_tasks}
+    holdout_tasks = [by_id[i] for i in sorted(split.holdout_ids)]
+    train_tasks = [by_id[i] for i in sorted(split.train_ids)]
+    if max_tasks is not None:
+        train_tasks = train_tasks[:max_tasks]
+
+    # 2. Rollouts + admission routing.
+    sft_rows: list[dict] = []
+    dpo_rows: list[dict] = []
+    quarantine_rows: list[dict] = []
+    traj_rows: list[dict] = []
+    partial = False
+    for task in train_tasks:
+        if manifest.over_budget:
+            partial = True
+            break
+        traj = collect_trajectory(env_factory(), task, policy_factory(),
+                                  provenance={"run_id": manifest.run_id})
+        manifest.spend(cost_per_rollout_usd)
+        verdict = admit(traj)
+        row = to_policy_row(traj, task)
+        traj_rows.append({**row, "admission": list(verdict.reasons)})
+        if verdict.sft_admitted:
+            sft_rows.append(row)
+        elif verdict.dpo_candidate:
+            dpo_rows.append(row)
+        else:
+            quarantine_rows.append({**row, "reasons": list(verdict.reasons)})
+
+    # 3. Dedup the SFT corpus (within-run + cross-generation).
+    def _key(r: dict) -> str:
+        return " ".join(m.get("content", "") for m in r.get("messages", []))
+
+    sft_rows, dedup_stats = dedup(sft_rows, _key, dedup_threshold,
+                                  prior_signatures=prior_signatures)
+
+    # 4. Write everything through the contract.
+    s3_contract.write_tasks(layout, train_tasks)
+    s3_contract.write_tasks_full(layout, train_tasks)
+    s3_contract.write_holdout(layout, holdout_tasks)
+    s3_contract.write_trajectories(layout, traj_rows)
+    s3_contract.write_sft_rows(layout, sft_rows)
+    s3_contract.write_dpo_rows(layout, dpo_rows)
+    s3_contract.write_quarantine(layout, quarantine_rows)
+
+    manifest.counts = {
+        "tasks_train": len(train_tasks),
+        "tasks_holdout": len(holdout_tasks),
+        "rollouts": len(traj_rows),
+        "sft_rows": len(sft_rows),
+        "dpo_rows": len(dpo_rows),
+        "quarantined": len(quarantine_rows),
+        **{f"dedup_{k}": v for k, v in dedup_stats.items()},
+    }
+    manifest.status = "partial" if partial else "building"
+    manifest.write(layout)
+    s3_contract.write_dataset_card(layout, manifest, dedup_stats=dedup_stats)
+    return manifest
+
+
+__all__ = ["build_corpus"]
diff --git a/composer_replication/pipeline/dedup.py b/composer_replication/pipeline/dedup.py
new file mode 100644
index 0000000000000000000000000000000000000000..30f1ea5ab539d6edc9f1e3ee70f0e774bc33a0c7
--- /dev/null
+++ b/composer_replication/pipeline/dedup.py
@@ -0,0 +1,138 @@
+"""dedup.py — MinHash near-duplicate detection (finding D-12).
+
+Cross-generation dedup is a flywheel-collapse mitigation: a self-training loop
+that re-ingests its own outputs accumulates near-identical rows, and per-batch
+`document_deduplicator` (the only dedup the old designs had) never sees across
+runs. This module computes MinHash signatures over word 5-shingles so a run can
+(a) dedup within itself and (b) accept the PRIOR run's signature file and dedup
+against it (lineage threaded by `RunManifest.parent_run_id`).
+
+Pragmatic v1: md5-seeded permutation MinHash with N=64 seeds, no banding/LSH
+(O(n^2) pair scan — fine for Stage-0 corpus sizes, thousands of rows).
+`datasketch` (MinHashLSH) is the documented upgrade path when row counts make
+the pair scan bite.
+
+NOTE on hash stability: Python's builtin `hash()` over str is salted per
+process (PYTHONHASHSEED), so signatures built on it would not be portable
+across runs — and portability is exactly what cross-generation dedup needs. We
+therefore use md5-based hashing (stable everywhere) despite the small speed cost.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+from typing import IO, Callable, Iterable, Sequence
+
+N_PERMUTATIONS = 64
+_SHINGLE_W = 5
+_WORD_RE = re.compile(r"\w+")
+_MAX64 = (1 << 64) - 1
+
+
+def _shingles(text: str, w: int = _SHINGLE_W) -> set[str]:
+    words = _WORD_RE.findall(text.lower())
+    if len(words) <= w:
+        return {" ".join(words)} if words else set()
+    return {" ".join(words[i:i + w]) for i in range(len(words) - w + 1)}
+
+
+def _stable_hash(s: str, seed: int) -> int:
+    h = hashlib.md5(f"{seed}:{s}".encode()).digest()
+    return int.from_bytes(h[:8], "big")
+
+
+def minhash_signature(text: str, n_perm: int = N_PERMUTATIONS) -> tuple[int, ...]:
+    """MinHash signature: per-seed minimum over the shingle set."""
+    sh = _shingles(text)
+    if not sh:
+        return tuple([_MAX64] * n_perm)
+    return tuple(min(_stable_hash(s, seed) for s in sh) for seed in range(n_perm))
+
+
+def jaccard_estimate(sig_a: Sequence[int], sig_b: Sequence[int]) -> float:
+    """Estimated Jaccard similarity = fraction of agreeing signature slots."""
+    if len(sig_a) != len(sig_b) or not sig_a:
+        raise ValueError("signatures must be equal-length and non-empty")
+    return sum(1 for a, b in zip(sig_a, sig_b) if a == b) / len(sig_a)
+
+
+def find_near_duplicates(
+    rows: Sequence[dict],
+    key_fn: Callable[[dict], str],
+    threshold: float = 0.85,
+    *,
+    prior_signatures: Sequence[Sequence[int]] | None = None,
+) -> list[tuple[int, int]]:
+    """All (i, j) index pairs whose estimated Jaccard >= threshold.
+
+    `prior_signatures` (from a previous run) participate as virtual rows with
+    negative indices -(k+1), so a pair (i, -1) means "row i duplicates prior
+    signature 0" — the cross-generation case.
+    """
+    sigs = [minhash_signature(key_fn(r)) for r in rows]
+    pairs: list[tuple[int, int]] = []
+    for i in range(len(sigs)):
+        for j in range(i + 1, len(sigs)):
+            if jaccard_estimate(sigs[i], sigs[j]) >= threshold:
+                pairs.append((i, j))
+        for k, prior in enumerate(prior_signatures or []):
+            if jaccard_estimate(sigs[i], prior) >= threshold:
+                pairs.append((i, -(k + 1)))
+    return pairs
+
+
+def dedup(
+    rows: Sequence[dict],
+    key_fn: Callable[[dict], str],
+    threshold: float = 0.85,
+    *,
+    prior_signatures: Sequence[Sequence[int]] | None = None,
+) -> tuple[list[dict], dict]:
+    """Keep-first dedup. Returns (kept_rows, stats).
+
+    A row duplicating a PRIOR-run signature is dropped outright (the prior run
+    already owns it); within-run duplicates keep the earliest occurrence.
+    Stats count each dropped row once; a prior-run match takes precedence.
+    """
+    pairs = find_near_duplicates(rows, key_fn, threshold,
+                                 prior_signatures=prior_signatures)
+    # Attribute each dropped row to exactly ONE cause (a prior-run match wins)
+    # so the two drop counters always sum to rows_in - rows_kept.
+    cross = {i for i, j in pairs if j < 0}            # duplicates a prior-run row
+    within = {j for _, j in pairs if j >= 0} - cross  # keep-first within this run
+    drop = cross | within
+    kept = [r for i, r in enumerate(rows) if i not in drop]
+    return kept, {
+        "rows_in": len(rows),
+        "rows_kept": len(kept),
+        "dropped_within_run": len(within),
+        "dropped_cross_generation": len(cross),
+        "threshold": threshold,
+    }
+
+
+def signatures_to_jsonl(rows: Sequence[dict], key_fn: Callable[[dict], str],
+                        fh: IO[str]) -> int:
+    """Persist this run's signatures so the NEXT generation can dedup against
+    them (pass the loaded list as `prior_signatures`)."""
+    n = 0
+    for r in rows:
+        fh.write(json.dumps(list(minhash_signature(key_fn(r)))) + "\n")
+        n += 1
+    return n
+
+
+def load_signatures(fh: IO[str]) -> list[tuple[int, ...]]:
+    return [tuple(json.loads(line)) for line in fh if line.strip()]
+
+
+__all__ = [
+    "N_PERMUTATIONS",
+    "dedup",
+    "find_near_duplicates",
+    "jaccard_estimate",
+    "load_signatures",
+    "minhash_signature",
+    "signatures_to_jsonl",
+]
diff --git a/composer_replication/pipeline/s3_contract.py b/composer_replication/pipeline/s3_contract.py
new file mode 100644
index 0000000000000000000000000000000000000000..87a490dbde5886ab134dc30ff81682cb0e1e9fbd
--- /dev/null
+++ b/composer_replication/pipeline/s3_contract.py
@@ -0,0 +1,287 @@
+"""s3_contract.py — THE single dataset layout + manifest (finding V8/D-7/D-8).
+
+Supersedes BOTH prior contracts: design-F1's `runs/<id>/{sft_corpus,dpo_pairs,
+rl_task_pool,divergence_pairs,wm_tuples,holdout,diloco_rendezvous}` and
+design-F2's `{traces,tasks,replay,task_grades,corpus}/v1/run_id=<id>` — the two
+were never reconciled and coexisted in the grounding doc. One layout, one
+manifest, two explicit serializers with a unit-tested leak guard.
+
+Deliberate exclusions from the run layout:
+  * `diloco_rendezvous/` — training-comms state, not dataset; lives in its own
+    prefix/bucket (finding D-19).
+  * `wm_tuples/` — emitted only when the P4 world-model ablation is scheduled
+    (finding D-14); not part of Stage 0.
+
+Layout (root = any local path or fsspec URI):
+    <root>/runs/<run_id>/
+        tasks/manifest.jsonl       policy-safe task rows (golden_diff -> sha256)
+        tasks_full/manifest.jsonl  construction-side full rows (RESTRICTED prefix)
+        traj/*.jsonl               CanonicalTrajectory records (audit trail)
+        corpus_sft/rows.jsonl      admitted SFT rows (to_policy_row output)
+        corpus_dpo/rows.jsonl      DPO-candidate rows
+        holdout/tasks.jsonl        held-out task ids+rows (never rolled out)
+        quarantine/*.jsonl         rejected trajectories w/ reasons (audit)
+        manifest.json              RunManifest
+        DATASET_CARD.md            human-readable card
+"""
+from __future__ import annotations
+
+import dataclasses
+import hashlib
+import json
+from dataclasses import dataclass, field
+from typing import IO, Iterable
+
+from composer_replication.datagen.schema import FeatureDeletionTask
+
+# Default for `RunManifest.schema_version`; stored as a string.
+SCHEMA_VERSION = "1"
+
+
+def _is_local(root: str) -> bool:
+    """Return True when `root` is a plain path or a `file://` URI."""
+    if root.startswith("file://"):
+        return True
+    return "://" not in root
+
+
+def _open(path: str, mode: str = "w") -> IO[str]:
+    """Open `path` for UTF-8 text IO.
+
+    Local paths (and `file://` URIs) use the builtin `open`; anything else is
+    delegated to fsspec, imported lazily so the module (and all local-corpus
+    runs) need no extra dep.
+
+    Parent directories are created only when the mode can create the file
+    (anything but an `r...` mode) and only when the path has a directory
+    part: `os.makedirs("")` raises for a bare filename, and a read should not
+    leave empty directories behind.
+
+    Raises:
+        RuntimeError: `path` is non-local and fsspec is not installed.
+    """
+    if _is_local(path):
+        import os
+        local = path.removeprefix("file://")
+        parent = os.path.dirname(local)
+        if parent and not mode.startswith("r"):
+            os.makedirs(parent, exist_ok=True)
+        return open(local, mode, encoding="utf-8")
+    try:
+        import fsspec  # noqa: PLC0415 — lazy heavy dep
+    except ImportError as e:
+        raise RuntimeError(
+            "Non-local corpus roots require fsspec; install with "
+            "`pip install -e .[serverless]`. Got: " + repr(e)
+        ) from e
+    return fsspec.open(path, mode, encoding="utf-8").open()
+
+
+def _exists(path: str) -> bool:
+    """Report whether `path` exists, locally or through fsspec."""
+    if not _is_local(path):
+        import fsspec  # noqa: PLC0415
+        fs, _, resolved = fsspec.get_fs_token_paths(path)
+        return bool(fs.exists(resolved[0]))
+    import os
+    return os.path.exists(path.removeprefix("file://"))
+
+
+@dataclass(frozen=True)
+class RunLayout:
+    """Path arithmetic for one run's prefixes; performs no IO.
+
+    Every property is `<root>/runs/<run_id>/<relative path>`, built by plain
+    string joining.
+    """
+
+    root: str
+    run_id: str
+
+    def _p(self, *parts: str) -> str:
+        """Join `parts` with "/" beneath this run's prefix."""
+        prefix = f"{self.root.rstrip('/')}/runs/{self.run_id}/"
+        return prefix + "/".join(parts)
+
+    @property
+    def tasks_path(self) -> str:
+        """Location of `tasks/manifest.jsonl`."""
+        return self._p("tasks", "manifest.jsonl")
+
+    @property
+    def tasks_full_path(self) -> str:
+        """Location of `tasks_full/manifest.jsonl`."""
+        # RESTRICTED prefix: carries golden_diff/deleted_symbols. On S3 this
+        # prefix gets a deny-by-default policy; locally it is still separated
+        # so a naive `corpus_*` glob can never sweep it up.
+        return self._p("tasks_full", "manifest.jsonl")
+
+    @property
+    def traj_path(self) -> str:
+        """Location of `traj/trajectories.jsonl`."""
+        return self._p("traj", "trajectories.jsonl")
+
+    @property
+    def sft_path(self) -> str:
+        """Location of `corpus_sft/rows.jsonl`."""
+        return self._p("corpus_sft", "rows.jsonl")
+
+    @property
+    def dpo_path(self) -> str:
+        """Location of `corpus_dpo/rows.jsonl`."""
+        return self._p("corpus_dpo", "rows.jsonl")
+
+    @property
+    def holdout_path(self) -> str:
+        """Location of `holdout/tasks.jsonl`."""
+        return self._p("holdout", "tasks.jsonl")
+
+    @property
+    def quarantine_path(self) -> str:
+        """Location of `quarantine/rejected.jsonl`."""
+        return self._p("quarantine", "rejected.jsonl")
+
+    @property
+    def manifest_path(self) -> str:
+        """Location of the run's `manifest.json`."""
+        return self._p("manifest.json")
+
+    @property
+    def card_path(self) -> str:
+        """Location of the run's `DATASET_CARD.md`."""
+        return self._p("DATASET_CARD.md")
+
+
+@dataclass
+class RunManifest:
+    """Run-level record: counts, cost, lineage, budget and acceptance status.
+
+    `created_at` is supplied by the CALLER (this class never calls
+    datetime.now()) so manifests are reproducible in tests. `parent_run_id`
+    threads flywheel lineage so cross-generation dedup (finding D-12) can
+    find prior signatures.
+    """
+
+    run_id: str
+    created_at: str
+    source: str = ""
+    counts: dict = field(default_factory=dict)
+    cost_usd: float = 0.0
+    parent_run_id: str | None = None
+    schema_version: str = SCHEMA_VERSION
+    status: str = "building"          # building | accepted | rejected | partial
+    budget_usd: float | None = None
+
+    def spend(self, usd: float) -> None:
+        """Add `usd` to the running cost."""
+        self.cost_usd = self.cost_usd + usd
+
+    @property
+    def over_budget(self) -> bool:
+        """True once a budget is set and the cost has reached or passed it."""
+        if self.budget_usd is None:
+            return False
+        return self.cost_usd >= self.budget_usd
+
+    def write(self, layout: RunLayout) -> None:
+        """Serialize every field as indented JSON at `layout.manifest_path`."""
+        with _open(layout.manifest_path) as sink:
+            json.dump(dataclasses.asdict(self), sink, indent=2)
+
+    @classmethod
+    def read(cls, layout: RunLayout) -> RunManifest:
+        """Load the JSON at `layout.manifest_path` into a new manifest."""
+        with _open(layout.manifest_path, "r") as source:
+            payload = json.load(source)
+        return cls(**payload)
+
+
+# ---------------------------------------------------------------------
+# Writers — the leak guard lives here (finding D-8)
+# ---------------------------------------------------------------------
+
+
+def _task_row_policy_safe(task: FeatureDeletionTask) -> dict:
+    """Return the task as a dict with construction-side secrets REPLACED.
+
+    `asdict()` includes `golden_diff` despite `repr=False` — exactly the leak
+    D-8 flagged. Provenance survives as a sha256 under `golden_diff_sha256`
+    (verifiable, not recoverable; empty string when there is no diff), and
+    `deleted_symbols` is dropped entirely because it names the answer.
+    """
+    row = dataclasses.asdict(task)
+    row.pop("deleted_symbols", None)
+    gold = row.pop("golden_diff", "")
+    if gold:
+        digest = hashlib.sha256(gold.encode()).hexdigest()
+    else:
+        digest = ""
+    row["golden_diff_sha256"] = digest
+    return row
+
+
+def write_tasks(layout: RunLayout, tasks: Iterable[FeatureDeletionTask]) -> int:
+    """Write the POLICY-SAFE task manifest (the default everything reads).
+
+    Each task is passed through `_task_row_policy_safe` and written as one
+    JSON line to `layout.tasks_path`. Returns the number of rows written.
+    """
+    safe_rows = (_task_row_policy_safe(task) for task in tasks)
+    return _write_jsonl(layout.tasks_path, safe_rows)
+
+
+def write_tasks_full(layout: RunLayout, tasks: Iterable[FeatureDeletionTask]) -> int:
+    """Write FULL task rows (incl. golden_diff) to the RESTRICTED prefix.
+
+    Only the validator/monitor side reads this; never corpus consumers.
+    Rows are the unfiltered `dataclasses.asdict` form, one JSON line each at
+    `layout.tasks_full_path`. Returns the number of rows written.
+    """
+    full_rows = (dataclasses.asdict(task) for task in tasks)
+    return _write_jsonl(layout.tasks_full_path, full_rows)
+
+
+def _write_jsonl(path: str, rows: Iterable[dict]) -> int:
+    """Write each row as one JSON line to `path`; return the row count."""
+    count = 0
+    with _open(path) as sink:
+        for count, record in enumerate(rows, start=1):
+            sink.write(json.dumps(record) + "\n")
+    return count
+
+
+def write_sft_rows(layout: RunLayout, rows: Iterable[dict]) -> int:
+    """Write `rows` as JSON lines to `layout.sft_path`; return the count."""
+    target = layout.sft_path
+    return _write_jsonl(target, rows)
+
+
+def write_dpo_rows(layout: RunLayout, rows: Iterable[dict]) -> int:
+    """Write `rows` as JSON lines to `layout.dpo_path`; return the count."""
+    target = layout.dpo_path
+    return _write_jsonl(target, rows)
+
+
+def write_quarantine(layout: RunLayout, rows: Iterable[dict]) -> int:
+    """Write `rows` as JSON lines to `layout.quarantine_path`; return the count."""
+    target = layout.quarantine_path
+    return _write_jsonl(target, rows)
+
+
+def write_holdout(layout: RunLayout, tasks: Iterable[FeatureDeletionTask]) -> int:
+    """Write policy-safe rows for `tasks` to `layout.holdout_path`.
+
+    Rows go through `_task_row_policy_safe`, so `golden_diff` is hashed and
+    `deleted_symbols` dropped. Returns the number of rows written.
+    """
+    safe_rows = map(_task_row_policy_safe, tasks)
+    return _write_jsonl(layout.holdout_path, safe_rows)
+
+
+def write_trajectories(layout: RunLayout, rows: Iterable[dict]) -> int:
+    """Write `rows` as JSON lines to `layout.traj_path`; return the count."""
+    target = layout.traj_path
+    return _write_jsonl(target, rows)
+
+
+def write_dataset_card(layout: RunLayout, manifest: RunManifest,
+                       *, license_tiers: dict[str, int] | None = None,
+                       dedup_stats: dict | None = None,
+                       decontamination_note: str = "") -> None:
+    """Write a small human-readable dataset card (finding D-18).
+
+    The card is Markdown written to `layout.card_path`: run metadata from
+    `manifest`, its counts sorted by key, then the optional sections.
+
+    Args:
+        layout: supplies the destination via `card_path`.
+        manifest: source of the header fields and the "Counts" section.
+        license_tiers: optional tier -> count mapping; the section is omitted
+            when empty or None.
+        dedup_stats: optional dedup statistics; the section is omitted when
+            empty or None.
+        decontamination_note: replaces the default decontamination sentence
+            when non-empty.
+    """
+    # `is not None`, not truthiness: a 0.0 budget is still a budget (the same
+    # test `RunManifest.over_budget` applies), so the card must show it.
+    budget = (f" / budget {manifest.budget_usd:.2f}"
+              if manifest.budget_usd is not None else "")
+    lines = [
+        f"# Dataset card — run `{manifest.run_id}`",
+        "",
+        f"- **created:** {manifest.created_at}",
+        f"- **source:** {manifest.source}",
+        f"- **status:** {manifest.status}",
+        f"- **schema_version:** {manifest.schema_version}",
+        f"- **cost (USD):** {manifest.cost_usd:.2f}" + budget,
+        f"- **lineage:** parent_run_id={manifest.parent_run_id or 'none'}",
+        "",
+        "## Counts",
+        "",
+    ]
+    lines += [f"- {k}: {v}" for k, v in sorted(manifest.counts.items())]
+    if license_tiers:
+        lines += ["", "## License tiers seen", ""]
+        lines += [f"- {k}: {v}" for k, v in sorted(license_tiers.items())]
+    lines += ["", "## Decontamination", "",
+              decontamination_note or
+              "All source repos checked against the SWE-bench-family eval list "
+              "(datagen.repo_gate.DECONTAMINATION_LIST) at ingest."]
+    if dedup_stats:
+        lines += ["", "## Dedup", ""]
+        lines += [f"- {k}: {v}" for k, v in sorted(dedup_stats.items())]
+    lines += ["", "Policy-safe rows only: `golden_diff` is sha256-hashed and "
+              "`deleted_symbols` dropped in `tasks/`, `corpus_*/`, `holdout/` "
+              "(full rows live in the restricted `tasks_full/`).", ""]
+    with _open(layout.card_path) as f:
+        f.write("\n".join(lines))
+
+
+def manifest_exists(layout: RunLayout) -> bool:
+    """Return True when this run's manifest is already present.
+
+    Serves as the driver's write-once guard (finding D-21 idempotency).
+    """
+    manifest_location = layout.manifest_path
+    return _exists(manifest_location)
+
+
+# Names exported on `from ... import *`; the underscore-prefixed IO helpers
+# (`_open`, `_exists`, `_write_jsonl`) are intentionally left out.
+__all__ = [
+    "SCHEMA_VERSION",
+    "RunLayout",
+    "RunManifest",
+    "manifest_exists",
+    "write_dataset_card",
+    "write_dpo_rows",
+    "write_holdout",
+    "write_quarantine",
+    "write_sft_rows",
+    "write_tasks",
+    "write_tasks_full",
+    "write_trajectories",
+]
diff --git a/composer_replication/pipeline/tests/__init__.py b/composer_replication/pipeline/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/composer_replication/pipeline/tests/test_pipeline.py b/composer_replication/pipeline/tests/test_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..38a189a38351b19d356d970b0b792b96c5768181
--- /dev/null
+++ b/composer_replication/pipeline/tests/test_pipeline.py
@@ -0,0 +1,223 @@
+"""Tests for the Stage-0 pipeline: contract, dedup, driver.
+
+Load-bearing coverage: the sentinel leak guard on write_tasks (finding D-8),
+holdout exclusion + budget stop + idempotency in build_corpus (D-21), and the
+cross-generation dedup path (D-12).
+"""
+from __future__ import annotations
+
+import io
+import json
+from pathlib import Path
+
+import pytest
+
+from composer_replication.datagen.env import FeatureDeletionEnv
+from composer_replication.datagen.rollout_harness import ScriptedPolicy
+from composer_replication.datagen.sandbox import FakeSandbox
+from composer_replication.datagen.schema import FeatureDeletionTask
+from composer_replication.datagen.trajectory import ToolCall
+from composer_replication.pipeline.build_corpus import build_corpus
+from composer_replication.pipeline.dedup import (
+    dedup,
+    find_near_duplicates,
+    jaccard_estimate,
+    load_signatures,
+    minhash_signature,
+    signatures_to_jsonl,
+)
+from composer_replication.pipeline.s3_contract import (
+    RunLayout,
+    RunManifest,
+    write_dataset_card,
+    write_tasks,
+    write_tasks_full,
+)
+
+
+def _task(i: int, **over) -> FeatureDeletionTask:
+    """Build fixture task `i`; keyword arguments override the default fields.
+
+    The defaults carry sentinel `golden_diff` / `deleted_symbols` values that
+    the leak-guard tests search for in written output.
+    """
+    fields = {
+        "task_id": f"task-{i:03d}",
+        "repo": "org/repo",
+        "base_commit": "abc",
+        "broken_image": "img:1",
+        "test_command": "pytest -q",
+        "fail_to_pass": (f"t/a.py::t{i}",),
+        "pass_to_pass": ("t/a.py::keep",),
+        "golden_diff": "SENTINEL_NEVER_LEAK",
+        "deleted_symbols": ("secret_fn",),
+    }
+    fields.update(over)
+    return FeatureDeletionTask(**fields)
+
+
+# ---------------------------------------------------------------------
+# RunLayout / RunManifest
+# ---------------------------------------------------------------------
+
+
+def test_layout_paths_are_pure_and_namespaced():
+    """Paths are plain strings under `<root>/runs/<run_id>/`, local or S3."""
+    local = RunLayout(root="/data/corpora", run_id="run42")
+    run_prefix = "/data/corpora/runs/run42"
+    assert local.sft_path == f"{run_prefix}/corpus_sft/rows.jsonl"
+    assert local.manifest_path == f"{run_prefix}/manifest.json"
+    # A trailing slash on the root must not produce a double slash.
+    remote = RunLayout(root="s3://bucket/prefix/", run_id="r")
+    assert remote.tasks_path == "s3://bucket/prefix/runs/r/tasks/manifest.jsonl"
+
+
+def test_manifest_round_trip_and_budget(tmp_path):
+    """Reaching the budget trips `over_budget`; write/read round-trips."""
+    layout = RunLayout(root=str(tmp_path), run_id="r1")
+    original = RunManifest(run_id="r1", created_at="2026-06-09T00:00:00Z",
+                           source="test", budget_usd=1.0)
+    original.spend(0.4)
+    assert not original.over_budget
+    original.spend(0.6)
+    assert original.over_budget
+    original.write(layout)
+    restored = RunManifest.read(layout)
+    assert restored.cost_usd == pytest.approx(1.0)
+    assert restored.budget_usd == 1.0
+
+
+# ---------------------------------------------------------------------
+# THE leak guard (finding D-8)
+# ---------------------------------------------------------------------
+
+
+def test_write_tasks_never_leaks_golden_diff(tmp_path):
+    """The policy-safe manifest omits the secrets; the restricted one keeps them."""
+    layout = RunLayout(root=str(tmp_path), run_id="r1")
+    write_tasks(layout, [_task(1)])
+    safe_text = Path(layout.tasks_path).read_text()
+    for secret in ("SENTINEL_NEVER_LEAK", "secret_fn"):
+        assert secret not in safe_text
+    first_row = json.loads(safe_text.splitlines()[0])
+    assert first_row["golden_diff_sha256"]  # provenance preserved as a hash
+    # The restricted full writer DOES carry it (construction side only).
+    write_tasks_full(layout, [_task(1)])
+    full_text = Path(layout.tasks_full_path).read_text()
+    assert "SENTINEL_NEVER_LEAK" in full_text
+
+
+# ---------------------------------------------------------------------
+# MinHash dedup
+# ---------------------------------------------------------------------
+
+# _TEXT_A and _TEXT_A2 differ only in their final word (near-duplicates);
+# _TEXT_B is unrelated text sharing no words with them.
+_TEXT_A = "the quick brown fox jumps over the lazy dog and then runs far away home tonight"
+_TEXT_A2 = "the quick brown fox jumps over the lazy dog and then runs far away home today"
+_TEXT_B = "import numpy as np def main(): return np.zeros(10) print(main()) totally different content here"
+
+
+def test_jaccard_estimate_near_duplicates_high_disjoint_low():
+    """Near-duplicates score high, unrelated text low, identical exactly 1.0."""
+    sig_a = minhash_signature(_TEXT_A)
+    sig_a2 = minhash_signature(_TEXT_A2)
+    sig_b = minhash_signature(_TEXT_B)
+    assert jaccard_estimate(sig_a, sig_a2) > 0.5
+    assert jaccard_estimate(sig_a, sig_b) < 0.2
+    assert jaccard_estimate(sig_a, sig_a) == 1.0
+
+
+def test_dedup_keeps_first_and_drops_near_dup():
+    """Of two near-duplicate rows the earlier survives; the unrelated row stays."""
+    rows = [{"text": text} for text in (_TEXT_A, _TEXT_A2, _TEXT_B)]
+    kept, stats = dedup(rows, lambda r: r["text"], threshold=0.5)
+    kept_texts = [r["text"] for r in kept]
+    assert kept_texts == [_TEXT_A, _TEXT_B]
+    assert stats["dropped_within_run"] == 1
+
+
+def test_cross_generation_dedup_via_signature_file():
+    """A row near-duplicating a persisted prior-run signature is dropped."""
+    def text_of(row):
+        return row["text"]
+
+    # Round-trip the prior run's signatures through an in-memory JSONL file.
+    signature_file = io.StringIO()
+    signatures_to_jsonl([{"text": _TEXT_A}], text_of, signature_file)
+    signature_file.seek(0)
+    prior_sigs = load_signatures(signature_file)
+
+    current = [{"text": _TEXT_A2}, {"text": _TEXT_B}]
+    kept, stats = dedup(current, text_of, threshold=0.5,
+                        prior_signatures=prior_sigs)
+    assert [text_of(r) for r in kept] == [_TEXT_B]
+    assert stats["dropped_cross_generation"] == 1
+
+
+def test_find_near_duplicates_pairs():
+    """Two near-duplicate rows are reported as the single pair (0, 1)."""
+    rows = [{"t": text} for text in (_TEXT_A, _TEXT_A2)]
+    pairs = find_near_duplicates(rows, lambda r: r["t"], 0.5)
+    assert pairs == [(0, 1)]
+
+
+# ---------------------------------------------------------------------
+# build_corpus end-to-end (FakeSandbox + ScriptedPolicy)
+# ---------------------------------------------------------------------
+
+
+def _passing_policy():
+    """Policy scripted with one `set_outcome` tool call followed by "done"."""
+    # FakeSandbox's set_outcome takes explicit test names. Every fixture
+    # task's fail_to_pass name follows the `t/a.py::t{i}` pattern, so one
+    # superset of outcomes flips the F2P tests green for any of them.
+    outcomes = dict.fromkeys((f"t/a.py::t{i}" for i in range(20)), True)
+    outcomes["t/a.py::keep"] = True
+    set_all = ToolCall("set_outcome", {"outcomes": outcomes})
+    return ScriptedPolicy(actions=[set_all, "done"])
+
+
+def _failing_policy():
+    """Policy scripted with one plain-string action and no tool call."""
+    script = ["gave up immediately"]
+    return ScriptedPolicy(actions=script)
+
+
+def _env():
+    """Fresh env over a FakeSandbox whose only preset outcome is `keep` = True."""
+    sandbox = FakeSandbox(test_outcomes={"t/a.py::keep": True})
+    return FeatureDeletionEnv(sandbox)
+
+
+def test_build_corpus_end_to_end(tmp_path):
+    """Passing rollouts reach the SFT corpus; holdout tasks are never rolled out."""
+    layout = RunLayout(root=str(tmp_path), run_id="e2e")
+    manifest = RunManifest(run_id="e2e", created_at="2026-06-09T00:00:00Z", source="fixture")
+    fixture_tasks = [_task(i) for i in range(6)]
+
+    out = build_corpus(fixture_tasks, _env, _passing_policy, layout, manifest,
+                       holdout_frac=0.34, holdout_seed=7)
+    counts = out.counts
+
+    # Holdout exclusion: holdout tasks were never rolled out.
+    assert counts["tasks_holdout"] >= 1
+    assert counts["rollouts"] == counts["tasks_train"]
+    # Full passes are routed to SFT. Only ">= 1" is asserted because the
+    # fixture tasks produce near-identical messages, which dedup collapses —
+    # itself a realistic dedup scenario.
+    assert counts["sft_rows"] >= 1
+    assert counts["quarantined"] == 0
+    # Files exist and the SFT corpus never leaks the sentinel.
+    sft_text = Path(layout.sft_path).read_text()
+    assert "SENTINEL_NEVER_LEAK" not in sft_text
+    assert Path(layout.card_path).exists()
+    assert Path(layout.holdout_path).exists()
+
+
+def test_build_corpus_quarantines_failures(tmp_path):
+    """With the failing policy every rollout is quarantined and none reach SFT."""
+    layout = RunLayout(root=str(tmp_path), run_id="fail")
+    manifest = RunManifest(run_id="fail", created_at="2026-06-09T00:00:00Z", source="fixture")
+    fixture_tasks = [_task(i) for i in range(3)]
+    out = build_corpus(fixture_tasks, _env, _failing_policy, layout, manifest,
+                       holdout_frac=0.34, holdout_seed=7)
+    counts = out.counts
+    assert counts["sft_rows"] == 0
+    assert counts["quarantined"] == counts["rollouts"]
+    assert counts["rollouts"] > 0
+
+
+def test_build_corpus_budget_stop_marks_partial(tmp_path):
+    """A budget below the full rollout cost stops early with status "partial"."""
+    layout = RunLayout(root=str(tmp_path), run_id="budget")
+    manifest = RunManifest(run_id="budget", created_at="2026-06-09T00:00:00Z",
+                           source="fixture", budget_usd=0.25)
+    fixture_tasks = [_task(i) for i in range(6)]
+    out = build_corpus(fixture_tasks, _env, _passing_policy, layout, manifest,
+                       holdout_frac=0.2, holdout_seed=7,
+                       cost_per_rollout_usd=0.1)
+    assert out.status == "partial"
+    rolled_out = out.counts["rollouts"]
+    assert rolled_out < out.counts["tasks_train"]
+
+
+def test_build_corpus_is_write_once(tmp_path):
+    """A second build into the same run id raises FileExistsError."""
+    layout = RunLayout(root=str(tmp_path), run_id="once")
+    fixture_tasks = [_task(i) for i in range(3)]
+    first = RunManifest(run_id="once", created_at="2026-06-09T00:00:00Z", source="fixture")
+    build_corpus(fixture_tasks, _env, _passing_policy, layout, first, holdout_frac=0.34)
+    second = RunManifest(run_id="once", created_at="2026-06-09T00:00:01Z", source="fixture")
+    with pytest.raises(FileExistsError, match="write-once"):
+        build_corpus(fixture_tasks, _env, _passing_policy, layout, second, holdout_frac=0.34)
+
+
+def test_dataset_card_contents(tmp_path):
+    """The card names the run and shows counts, license tiers and decontamination."""
+    layout = RunLayout(root=str(tmp_path), run_id="card")
+    manifest = RunManifest(run_id="card", created_at="2026-06-09T00:00:00Z",
+                           source="fixture", counts={"sft_rows": 3})
+    write_dataset_card(layout, manifest, license_tiers={"REDISTRIBUTABLE": 3},
+                       dedup_stats={"rows_kept": 3})
+    card = Path(layout.card_path).read_text()
+    expected_fragments = (
+        "run `card`",
+        "sft_rows: 3",
+        "REDISTRIBUTABLE: 3",
+        "Decontamination",
+    )
+    for fragment in expected_fragments:
+        assert fragment in card
diff --git a/composer_replication/teacher_replay.py b/composer_replication/teacher_replay.py
index 6cbb3cb4b04cfa1bff71b635dc0205a2ba19dd5f..d6d210924a2157ec544baf713c270b98ded0a95d 100644
--- a/composer_replication/teacher_replay.py
+++ b/composer_replication/teacher_replay.py
@@ -4,8 +4,12 @@ This is channel 3 of the integrated trainer: at each step of a frozen agentic
 trace, query N pre-trained external teachers (frontier models from different
 labs) and convert teacher disagreement into preference pairs for DPO loss.
 
-Generalized from spike-001's `replay.py`. Verified economic floor (✅ spike 001):
-$0.98 mean per-trace cost ungated, $0.30/trace projected with VOI gating.
+Generalized from spike-001's `replay.py`. Cost calibration (✅ spike 001,
+relabeled per deepread finding V11): $0.98 mean per-TRACE cost ungated was
+measured on a ~50-state SYNTHETIC trace at N=3 teachers; real Claude Code
+sessions run 125–2,830 tool-use messages (ADR-002), so a full real session is
+~2 orders of magnitude more (~$70–80 flat, before VOI gating). The $0.30/trace
+VOI-gated projection rests on the same synthetic basis.
 
 Usage:
     from teacher_replay import replay_trace, extract_dpo_pairs
diff --git a/composer_replication/trainer/kl_in_reward.py b/composer_replication/trainer/kl_in_reward.py
index a6db8fc58dea5bbd98285cce68e21f687e41c89b..108bc709079f4a1d2f8e2d76bf10b7fa83e87ad8 100644
--- a/composer_replication/trainer/kl_in_reward.py
+++ b/composer_replication/trainer/kl_in_reward.py
@@ -9,7 +9,9 @@ literature says this is not cosmetic:
 
   * arXiv:2512.21852 ("A Comedy of Estimators") — k1-in-reward improves OOD
     generalization; k3-in-reward can collapse.
-  * verl adopted k1-in-reward as its *only* reverse-KL option.
+  * verl ships k1-in-reward as its default/recommended reverse-KL option
+    (it also supports a k3-family "low_var_kl" — wording corrected per
+    deepread finding V13).
   * TRL issue #4967 tracks the same divergence.
 
 OOD generalization is exactly the "take any model to the next level" axis, so
diff --git a/docs/COMPOSER_RECIPE_MAPPING.md b/docs/COMPOSER_RECIPE_MAPPING.md
index 9d000e984d266c180d224a2dcb3568ea19042079..6435c9db9625e435f253ef0ddd2c06bb828da338 100644
--- a/docs/COMPOSER_RECIPE_MAPPING.md
+++ b/docs/COMPOSER_RECIPE_MAPPING.md
@@ -22,7 +22,7 @@ The Cursor blog discusses **only three** training innovations explicitly. Everyt
 
 **Cited prior art** (Cursor's footnote 1):
 - **OPSD: Self-Distilled Reasoner — On-Policy Self-Distillation for LLMs** (Zhao et al., 2026, [arXiv:2601.18734](https://arxiv.org/abs/2601.18734), [GitHub: siyan-zhao/OPSD](https://github.com/siyan-zhao/OPSD)). The original on-policy-self-distillation framework: single LLM, teacher conditioned on privileged information (e.g. ground-truth answer), student sees only the question, loss = per-token KL on student's own rollouts.
-- **SDPO: Reinforcement Learning via Self-Distillation** (Hübotter et al., 2026, [arXiv:2601.20802](https://arxiv.org/abs/2601.20802), ICLR 2026 Scaling Post-training Workshop). Generalizes OPSD to RL with rich feedback: *"SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy."* This is **mathematically the same** as Composer's targeted-textual-feedback method. **There is published code.** Comparison table from the SDPO paper:
+- **SDPO: Reinforcement Learning via Self-Distillation** (Hübotter et al., 2026, [arXiv:2601.20802](https://arxiv.org/abs/2601.20802), ICLR 2026 Scaling Post-training Workshop). Generalizes OPSD to RL with rich feedback: *"SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy."* Cursor's blog cites this paper only as **background** ("For more background on this approach see…") — NOT as its mechanism; SDPO's published loss is full-rollout with feedback-in-prefix and an EMA-regularized teacher, while Composer's blog describes a turn-localized hint splice. Closely related, **not verified identical** (deepread finding V1). **There is published code.** Comparison table from the SDPO paper:
 
   | Method | Sampling | Signal | Feedback |
   |---|---|---|---|
@@ -72,7 +72,7 @@ This is **infrastructure, not algorithm**. It only matters at MoE-1T scale; for
 | Composer 2.5 stage | Blog mechanism | Our replication target | v0.0 | v0.1 | v0.2 |
 |---|---|---|---|---|---|
 | **(a)** Continued pretraining on code | Standard pretraining, code-weighted | Skip — start from already-code-tuned `Qwen3-Coder-7B` or `Qwen3-Coder-30B-A3B` | ✗ | ✗ | ✗ |
-| **(b)** Synthetic data at scale | Feature Deletion + 24 other (unnamed) generators | Build 1 generator (Feature Deletion) as OpenEnv-compatible env. Use SWE-bench-lite and SWE-Gym as drop-in alternatives. | ✗ (use SWE-bench-lite only) | ✓ (build Feature Deletion) | scale generator suite |
+| **(b)** Synthetic data at scale | Feature Deletion + an unspecified number of other generators (the blog says only "a range of approaches" — the old "24" was a back-formation from the 25x task multiplier; deepread finding V5) | Build 1 generator (Feature Deletion) as OpenEnv-compatible env. Use SWE-bench-lite and SWE-Gym as drop-in alternatives. | ✗ (use SWE-bench-lite only) | ✓ (build Feature Deletion) | scale generator suite |
 | **(c)** Realistic-environment RL (RLVR) | Async sandboxes, same tool harness as production | TRL `GRPOTrainer` + verifiers + OpenEnv; SWE-bench-lite env in v0.0; build sandboxed code execution env in v0.1 | ✓ baseline | ✓ + DAPO patches | + decentralized rollouts |
 | **(d)** Targeted RL w/ textual feedback (Composer's secret sauce) | Same-model self-distill: insert hint into context → teacher; original → student; on-policy KL at the turn | **Lift the OPSD/SDPO loss directly from `siyan-zhao/OPSD`** (published code, MIT). Generate hints via templates (v0.1) or LLM (v0.2). | ✗ (deferred) | ✓ (this is the Composer-recipe channel) | + learned hint generator |
 | **(e)** Trace-replay multi-teacher distill (NOVEL — our addition) | N/A (not in Composer) | N=3 teachers (Opus 4.7, GPT-5, DeepSeek V4 Pro) replay each step; disagreement → DPO pairs | ✓ (this is the v0.0 novelty bet) | ✓ + VOI gating | + tiered teachers |
@@ -148,7 +148,7 @@ Primary sources for each Composer-2.5 component, post-audit:
 - **Cursor blog** — [Introducing Composer 2.5](https://cursor.com/blog/composer-2-5) (2026)
 - **Cursor blog** — [Composer 2 technical report](https://cursor.com/blog/composer-2-technical-report) (predecessor; named the "Anyrun" environment per subagent — verify if needed)
 - **OPSD paper** — Zhao et al., *Self-Distilled Reasoner: On-Policy Self-Distillation for LLMs*, [arXiv:2601.18734](https://arxiv.org/abs/2601.18734), code at [siyan-zhao/OPSD](https://github.com/siyan-zhao/OPSD). MIT.
-- **SDPO paper** — Hübotter et al., *Reinforcement Learning via Self-Distillation*, [arXiv:2601.20802](https://arxiv.org/abs/2601.20802), ICLR 2026 Scaling Post-training Workshop. The direct formalization of Composer's hint-distill.
+- **SDPO paper** — Hübotter et al., *Reinforcement Learning via Self-Distillation*, [arXiv:2601.20802](https://arxiv.org/abs/2601.20802), ICLR 2026 Scaling Post-training Workshop. The closest published formalization; cited by Cursor only as background (deepread finding V1).
 - **Self-Distillation continual-learning** — [arXiv:2601.19897](https://arxiv.org/abs/2601.19897). Cited by Cursor; less directly relevant.
 - **Moonshot Kimi K2.5** — base model, [HF model card](https://huggingface.co/moonshotai/Kimi-K2-Thinking).
 
diff --git a/docs/adrs/ADR-016-stage0-dataset-pipeline.md b/docs/adrs/ADR-016-stage0-dataset-pipeline.md
new file mode 100644
index 0000000000000000000000000000000000000000..f1ba7a7061216aca4b7ce230da8b8c730af69dba
--- /dev/null
+++ b/docs/adrs/ADR-016-stage0-dataset-pipeline.md
@@ -0,0 +1,119 @@
+---
+status: accepted
+date: 2026-06-09
+deciders: [Codeseys, ARIA]
+---
+
+# ADR-016: Stage-0 dataset-generation pipeline — SWE-smith engine + rollout harness + ingest gates + single contract
+
+## Context and Problem Statement
+
+The user asked to "architect and build a pipeline that builds out a dataset like
+the Composer 2.5 blog mentions, with our vision enhancements — point to an
+open-source repo and use that to build the dataset, or use traces or other
+datasets and enhance them."
+
+Before building, a full-source critical review re-read every foundational paper
+and blog (8 source clusters, `research/deepread/01-08`), ground-mapped the repo
+(`00`), ran adversarial fidelity + design critics, and independently VERIFIED
+every finding (`12` — 0 refuted). The verified verdict: the envisioned pipeline
+had four structural breaks (seed-trace/oracle disjointness; no rollout harness —
+the SFT corpus had NO producer; an uncomputable divergence gate; no
+`Sandbox.fork()`), several missing controls (zero benchmark decontamination,
+no secrets gate, a `golden_diff` serialization leak, two unreconciled S3
+contracts, no cross-generation dedup), and a buy-vs-build inversion (the
+planned image-builder duplicates `pip install swesmith`, whose PR-Mirror
+strategy IS this repo's gold-patch-reversion mechanic and is validated
+best-of-five by SWE-smith's own ablation, Table 5 of arXiv:2504.21798).
+
+## Decision
+
+Build **Stage 0 local-first** (architecture: `research/deepread/13-synthesis-architecture.md`):
+
+1. **SWE-smith is the synthesis engine** for "point at a repo" (`[swesmith]`
+   extra; `datagen/swesmith_adapter.py` bridges its instances into
+   `FeatureDeletionTask`, handling the patch-semantics INVERSION — SWE-smith's
+   patch introduces the bug, so `golden_diff` = `reverse_unified_diff(patch)`).
+   `SweBenchAdapter` remains the bridge for SWE-bench-shaped substrates.
+2. **Ingest gates before anything else** (`datagen/repo_gate.py`): SPDX-ish
+   license detection → three tiers (REDISTRIBUTABLE / TRAINABLE_ONLY /
+   EXCLUDED, fail-closed) + **benchmark decontamination** against the
+   SWE-bench-family eval-repo list (hard fail).
+3. **The rollout harness is the corpus producer** (`datagen/rollout_harness.py`):
+   `collect_trajectory(env, task, policy)` runs a pluggable policy
+   (ScriptedPolicy for tests; OpenRouterPolicy stub; mini-swe-agent/SWE-agent
+   adoption is the documented upgrade) through `FeatureDeletionEnv` to
+   `_grade()`. Its env-grounded trajectories are ALSO the tree-of-work's seed
+   nodes — fixing the seed/oracle disjointness as a byproduct. `admit()` routes
+   typed signal: clean full pass → SFT; clean near-miss → DPO candidate;
+   guard-broken/hacked → quarantine (never raw negative gradient).
+4. **One canonical trajectory IR** (`datagen/trajectory.py`): `ToolCall` (whose
+   `canonical_form()` is the v1 divergence-gate action algebra, replacing the
+   whitespace stub), `CanonicalTrajectory`, adapters from Claude Code traces
+   (explicitly UNGRADED — demoted to flat/SFT uses), and `to_policy_row()` —
+   the ONE policy-visible serializer, unit-tested to never emit
+   `golden_diff`/`deleted_symbols` (sentinel test).
+5. **One reconciled dataset contract** (`pipeline/s3_contract.py`, supersedes
+   design-F1's and design-F2's divergent layouts): `runs/<id>/{tasks,
+   tasks_full(RESTRICTED), traj, corpus_sft, corpus_dpo, holdout, quarantine}`
+   + `RunManifest` (counts, cost, budget, `parent_run_id` lineage, status) +
+   dataset card. Policy-safe task rows carry `golden_diff_sha256`, never the
+   diff. DiLoCo rendezvous and `wm_tuples/` are deliberately OUT (separate
+   concern; ablation-gated respectively).
+6. **Cross-generation dedup** (`pipeline/dedup.py`): stable-hash MinHash over
+   word 5-shingles; a run can dedup against the prior generation's signature
+   file (flywheel-collapse mitigation). datasketch/LSH is the upgrade path.
+7. **The local stage-driver** (`pipeline/build_corpus.py`): holdout-split FIRST
+   (held-out tasks never rolled out), rollouts under a budget ceiling
+   (partial-marking), typed routing, dedup, write-once-per-run idempotency.
+
+## Fidelity corrections shipped with this ADR (deepread findings, all verified)
+
+- **V1:** "SDPO is mathematically the same as Composer's mechanism" corrected
+  in `opsd.py` + `COMPOSER_RECIPE_MAPPING.md` — Cursor cites SDPO/OPSD as
+  *background*; our channel is a third, blog-inspired design (turn-localized
+  hint splice, live stop-grad teacher, no EMA).
+- **V5:** fabricated numbers struck/tagged: 69.3%/Terminal-Bench parity (no
+  primary source), "24 other generators" (back-formed), "85% post-training
+  compute" (community speculation) — `research/01`, mapping doc, `research/06`,
+  `research/09`.
+- **V7:** Streaming DiLoCo citation fixed (`diloco/__init__.py`): 2501.18512 =
+  Douillard et al.; Eager Updates = Kale et al. 2502.12996.
+- **V11:** `teacher_replay.py` cost docstring relabeled ($0.98 = 50-state
+  synthetic trace; real sessions ~2 OOM more).
+- **V13:** `kl_in_reward.py` "verl's only reverse-KL option" →
+  "default/recommended" (verl also ships a k3-family option).
+
+## What is deliberately NOT in Stage 0
+
+- AWS orchestration (Glue/EMR/Batch/Bedrock-batch/Step Functions) — Stage 4,
+  only after local runs are routine (finding D-9).
+- Tree depth>1 — gated on a `Sandbox.fork()` spike + a measured divergence-gate
+  firing rate (findings D-3/D-4). Depth-1 multi-candidate rollouts need no fork.
+- World-model `wm_tuples/` emission — gated on the P4 ablation being scheduled
+  (finding D-14; CWM evidence is mid-training, not RL-time aux head — V6).
+- Secrets/PII scrub at trace ingest (finding V9) — REQUIRED before any raw
+  Claude Code session is uploaded to shared storage; tracked as the next
+  pipeline item. Local-only runs are unaffected.
+
+## Acceptance gate
+
+- [x] `repo_gate`: 53 tests (license tiers, decontamination, gate verdicts).
+- [x] `swesmith_adapter`: 18 tests (patch INVERSION semantics, reverse round-trip,
+      strategy provenance, image conventions).
+- [x] `trajectory` + `rollout_harness`: 13 tests (IR round-trips, SENTINEL leak
+      guard, env-grounded episode to grade 1.0 / guard-broken / near-miss,
+      admission routing).
+- [x] `pipeline`: 12 tests (layout, manifest+budget, leak guard at the writer,
+      MinHash within-run + cross-generation, build_corpus e2e with holdout
+      exclusion + budget stop + write-once).
+- [x] Full suite green: 511 passed / 66 skipped.
+- [ ] Live swesmith synthesis on a real pointed-at repo (needs Docker+Linux) —
+      the documented `[~]` gate, same shape as ADR-010's Docker e2e.
+
+## More Information
+
+- `research/deepread/13-synthesis-architecture.md` — the architecture this implements.
+- `research/deepread/12-verified-findings.md` — the verified finding ledger (V1–V15).
+- `research/deepread/02-swe-task-synthesis.md` — the SWE-smith/R2E-Gym/SWE-Gym deep-read.
+- ADR-010 (the substrate-inversion base this extends), ADR-002 (trace source).
diff --git a/pyproject.toml b/pyproject.toml
index 11a211bf2667587d1ebaa26694170594bc97392d..0e097b6c6eb035f05596e9f003ce52a7e5feb8b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -83,6 +83,14 @@ aws = [
     "boto3>=1.34",
     "sagemaker>=2.200,<3",
 ]
+# SWE-smith task-synthesis engine (deepread finding V4 buy-vs-build verdict):
+# the swesmith toolkit builds env images from arbitrary GitHub repos and
+# synthesizes bugs (PR Mirror = this repo's gold-patch-reversion mechanic).
+# LIVE synthesis needs Docker on Linux (the toolkit does not support macOS/
+# Windows officially); the SwesmithAdapter itself needs nothing beyond core.
+swesmith = [
+    "swesmith>=0.1",
+]
 # Replaysim dataset normalization (per ADR-004)
 #
 # NOTE: data-juicer is intentionally NOT pinned as an extra. The package
diff --git a/research/01-composer-2.5.md b/research/01-composer-2.5.md
index f2d535bd8d9971855b65b584f0206dec161ba405..427b16c65e1eb573b0becf5d6dfe2d0b1ec07d55 100644
--- a/research/01-composer-2.5.md
+++ b/research/01-composer-2.5.md
@@ -11,7 +11,7 @@
 > The targeted-textual-feedback method is correctly described, but this file does **not** cite the three self-distillation papers Cursor cites in footnote 1 (OPSD `arXiv:2601.18734`, SDPO `arXiv:2601.20802`, Self-Distillation Continual Learning `arXiv:2601.19897`). The mapping document does.
 
 ## Overview
-Cursor's Composer 2.5 is an advanced agentic coding model that powers the Cursor IDE. Released in mid-May 2026, it represents a massive leap in agentic capabilities, particularly for long-running, multi-file software engineering tasks. While the base weights are Moonshot AI's open-source **Kimi K2.5** model, roughly 85% of the total compute budget for Composer 2.5 was spent on Cursor's proprietary post-training and Reinforcement Learning (RL) pipeline. 
+Cursor's Composer 2.5 is an advanced agentic coding model that powers the Cursor IDE. Released in mid-May 2026, it represents a massive leap in agentic capabilities, particularly for long-running, multi-file software engineering tasks. While the base weights are Moonshot AI's open-source **Kimi K2.5** model, a large share of the compute budget went to Cursor's proprietary post-training/RL pipeline (the widely-circulated "85%" figure is community speculation, in NO primary source — deepread finding V5). 
 
 The resulting model is highly optimized for the exact constraints and tools of the Cursor environment (file edits, terminal usage, LSP interaction). Composer 2.5 is praised for having fewer "false-start" tool calls, avoiding prompt-baiting, and demonstrating a much calmer, more effective collaboration loop than its predecessors.
 
@@ -60,8 +60,8 @@ During post-training, Cursor employs **Sharded Muon** and **Dual Mesh HSDP (Hybr
 ## Performance Characteristics
 Cursor claims Composer 2.5 achieves a Pareto-optimal tradeoff between intelligence and inference cost compared to frontier models (Opus 4.5/4.6, GPT-5.4/5.5).
 
-*   **Intelligence Improvements**: On Cursor's internal *CursorBench* (which tests sweeping, multi-file edits with ambiguous prompts), Composer 2.5 scored 69.3% (or ~61-63% depending on the specific benchmark version cited), a massive jump from Composer 1.5's ~44% and Composer 2's ~52%.
-*   **Frontier Parity**: On public agentic benchmarks like *Terminal-Bench 2.0*, it hit 69.3%. On *SWE-bench Multilingual*, it achieved parity with or slightly surpassed OpenAI's GPT-5.5.
+*   **Intelligence Improvements**: On Cursor's internal *CursorBench* (which tests sweeping, multi-file edits with ambiguous prompts), Composer 2.5's score is NOT in any primary source (the circulating 69.3% figure appears in neither the 2.5 blog nor the Composer 2 techreport — deepread finding V5; the techreport's Table 1 gives Composer 2 = 61.3 CursorBench). Treat all 2.5 benchmark numbers as unverified.
+*   **Frontier Parity**: Claims of Terminal-Bench 2.0 / SWE-bench Multilingual parity circulate in secondary commentary only; neither primary source contains benchmark numbers for 2.5 (deepread finding V5).
 *   **Cost Efficiency**: 
     *   Standard Tier: $0.50 per 1M input / $2.50 per 1M output tokens.
     *   Fast Tier: $3.00 per 1M input / $15.00 per 1M output tokens.
diff --git a/research/06-feature-deletion-datagen.md b/research/06-feature-deletion-datagen.md
index 44e8bd937ec28bfe3ac83a73ba7aef7ea5fa26fa..d18b00b01bc7b1cbfe7ff05772d360fd0c9d4997 100644
--- a/research/06-feature-deletion-datagen.md
+++ b/research/06-feature-deletion-datagen.md
@@ -327,7 +327,7 @@ Feature-Deletion is **embarrassingly parallel and CPU-bound** — no GPU in the
 
 1. **Deletion-target selection heuristic** — blog silent (`research/09` §1 "NO CHANGE"). We propose coverage-selectivity (§5 Path B); Cursor's actual heuristic is unknown.
 2. **Deleter model vs. program** — blog implies an agent deletes ("asked to delete code… such that the codebase remains functional"); we default to *programmatic* deletion (cheaper, deterministic, no second model). An LLM-deleter is a v0.2 escalation.
-3. **The other ~24 generators** — Feature Deletion is "one synthetic approach… a range of approaches"; the rest are unnamed. Out of scope here; this brief delivers the one named generator.
+3. **The other generators (count UNKNOWN)** — Feature Deletion is "one synthetic approach… a range of approaches"; the rest are unnamed and uncounted (the old "~24" was a back-formation from the 25x task multiplier — deepread finding V5). Out of scope here; this brief delivers the one named generator.
 4. **"Agentic monitoring tools" internals** — unspecified; our §3c monitor is a best-effort programmatic stand-in.
 5. **Composer2.pdf (arXiv:2603.24477)** — flagged by `research/09` action-item #1 as the likely home of data-mix % and generator inventory; **not yet extracted**. Recommend a follow-up pull before scaling the generator suite.
 
diff --git a/research/09-composer-blog-delta-2026.md b/research/09-composer-blog-delta-2026.md
index f26972dde93c9bc694e0106540699b68e5583407..f35a88ef9f89cc7d3886f47e1c9ff8695252230e 100644
--- a/research/09-composer-blog-delta-2026.md
+++ b/research/09-composer-blog-delta-2026.md
@@ -20,7 +20,7 @@ The **2.5 blog body is byte-for-byte unchanged** from what the mapping doc captu
 
 **DELTAS (not in / under-stated in COMPOSER_RECIPE_MAPPING.md):**
 
-- **[DELTA — new emphasis]** The phrase *"we both **select for** and **create** harder tasks **dynamically throughout the run**"* is a **dynamic curriculum / online task-selection** signal. The mapping doc captured "Feature Deletion + 24 unnamed generators" but did **not** flag that task difficulty is filtered *online* (the model "begins to get most training problems correct," so hard tasks are up-weighted live). This is a data-*mix*/curriculum detail with direct replication impact: our generator suite needs a difficulty filter / pass-rate gate, not just a static task bank.
+- **[DELTA — new emphasis]** The phrase *"we both **select for** and **create** harder tasks **dynamically throughout the run**"* is a **dynamic curriculum / online task-selection** signal. The mapping doc captured "Feature Deletion + other unnamed generators" (its old "24" count was a back-formation — deepread finding V5) but did **not** flag that task difficulty is filtered *online* (the model "begins to get most training problems correct," so hard tasks are up-weighted live). This is a data-*mix*/curriculum detail with direct replication impact: our generator suite needs a difficulty filter / pass-rate gate, not just a static task bank.
 - **[DELTA — new authoritative source for CPT data mix]** The Composer 2 technical-report blog states the CPT data mix explicitly: *"continued pretraining on a data mix that **emphasizes code** to deepen the base model's coding knowledge"* and *"We find that **reducing pretraining loss improves downstream RL performance**, with better base knowledge reliably translating into a better agent."* The mapping doc marked "continued pretraining on heavily code-weighted data" as `[BLOG-VERIFIED]` from the 2.5 Muon section — but the **causal claim (CPT loss ↓ ⇒ RL performance ↑)** is new and is the stated *justification* for doing CPT at all. Relevant to our "skip CPT, start from Qwen3-Coder" decision: Cursor's own evidence says base-knowledge quality gates RL ceiling, which strengthens the case for starting from an already-code-tuned base.
 - **[DELTA — new artifact]** There is now a **full Composer 2 arXiv technical report: [arXiv:2603.24477](https://arxiv.org/abs/2603.24477)** and a downloadable PDF at **`https://cursor.com/resources/Composer2.pdf`** (authored by Sasha Rush et al.). The report explicitly *"covers... ablations on the training recipe, our approach to agent behavior shaping, and the design of our evaluation suite."* The mapping doc cited only the blog stub and never the arXiv ID/PDF. **This PDF is the most likely place to resolve the data-mix weighting %, the RL algorithm name, and the hint-generation mechanism — none of which are in either blog.** → Recommend a dedicated follow-up extraction of Composer2.pdf.
 - **[CONFIRM — "Anyrun"]** Mapping doc flagged "Anyrun" as possibly not Cursor-sourced. **Confirmed real:** the Composer 2 report blog says *"**Anyrun**, our internal compute platform for running hundreds of thousands of sandboxed coding environments."* It is a Composer-**2** artifact (carried into 2.5), correctly attributed. Resolves the mapping doc's open flag.
diff --git a/research/notes/230406767-raft-reward-ranked-finetuning-for-generative-foundation-model-alignmen.md b/research/notes/230406767-raft-reward-ranked-finetuning-for-generative-foundation-model-alignmen.md
new file mode 100644
index 0000000000000000000000000000000000000000..245e87604212ad52b628ff1ee7a6a6b050108b18
--- /dev/null
+++ b/research/notes/230406767-raft-reward-ranked-finetuning-for-generative-foundation-model-alignmen.md
@@ -0,0 +1,224 @@
+---
+title: '[2304.06767] RAFT: Reward rAnked FineTuning for Generative Foundation Model
+  Alignment'
+id: 230406767-raft-reward-ranked-finetuning-for-generative-foundation-model-alignmen
+tags:
+- deepread
+created: '2026-06-10T00:31:18.566124Z'
+source: https://arxiv.org/abs/2304.06767
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:31:18.565918Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2304.06767] RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment
+Computer Science > Machine Learning
+arXiv:2304.06767
+(cs)
+[Submitted on 13 Apr 2023 (
+v1
+), last revised 1 Dec 2023 (this version, v4)]
+Title:
+RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment
+Authors:
+Hanze Dong
+,
+Wei Xiong
+,
+Deepanshu Goyal
+,
+Yihan Zhang
+,
+Winnie Chow
+,
+Rui Pan
+,
+Shizhe Diao
+,
+Jipeng Zhang
+,
+Kashun Shum
+,
+Tong Zhang
+View a PDF of the paper titled RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment, by Hanze Dong and 9 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Generative foundation models are susceptible to implicit biases that can arise from extensive unsupervised training data. Such biases can produce suboptimal samples, skewed outcomes, and unfairness, with potentially serious consequences. Consequently, aligning these models with human ethics and preferences is an essential step toward ensuring their responsible and effective deployment in real-world applications. Prior research has primarily employed Reinforcement Learning from Human Feedback (RLHF) to address this problem, where generative models are fine-tuned with RL algorithms guided by a human-feedback-informed reward model. However, the inefficiencies and instabilities associated with RL algorithms frequently present substantial obstacles to the successful alignment, necessitating the development of a more robust and streamlined approach. To this end, we introduce a new framework, Reward rAnked FineTuning (RAFT), designed to align generative models effectively. Utilizing a reward model and a sufficient number of samples, our approach selects the high-quality samples, discarding those that exhibit undesired behavior, and subsequently enhancing the model by fine-tuning on these filtered samples. Our studies show that RAFT can effectively improve the model performance in both reward learning and other automated metrics in both large language models and diffusion models.
+Comments:
+29 pages, 12 figures, Published in Transactions on Machine Learning Research (TMLR)
+Subjects:
+Machine Learning (cs.LG)
+; Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (stat.ML)
+Cite as:
+arXiv:2304.06767
+[cs.LG]
+(or
+arXiv:2304.06767v4
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2304.06767
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Hanze Dong [
+view email
+]
+[v1]
+Thu, 13 Apr 2023 18:22:40 UTC (62,967 KB)
+[v2]
+Thu, 25 May 2023 06:27:31 UTC (42,022 KB)
+[v3]
+Wed, 30 Aug 2023 01:25:29 UTC (33,955 KB)
+[v4]
+Fri, 1 Dec 2023 14:28:06 UTC (34,049 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment, by Hanze Dong and 9 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2023-04
+Change to browse by:
+cs
+cs.AI
+cs.CL
+cs.CV
+stat
+stat.ML
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models-2.md b/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..0a6136f6d7b1ffa8f18204c4d776d9f8030ee5f1
--- /dev/null
+++ b/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models-2.md
@@ -0,0 +1,2735 @@
+---
+title: '[2305.10601] Tree of Thoughts: Deliberate Problem Solving with Large Language
+  Models'
+id: 230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models-2
+tags:
+- deepread
+created: '2026-06-10T00:41:12.876142Z'
+source: https://ar5iv.labs.arxiv.org/html/2305.10601
+source_domain: ar5iv.labs.arxiv.org
+fetched_at: '2026-06-10T00:41:12.875985Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2305.10601] Tree of Thoughts: Deliberate Problem Solving with Large Language Models
+Tree of Thoughts: Deliberate Problem Solving
+with Large Language Models
+Shunyu Yao
+Princeton University
+Dian Yu
+Google DeepMind
+Jeffrey Zhao
+Google DeepMind
+Izhak Shafran
+Google DeepMind
+Thomas L. Griffiths
+Princeton University
+Yuan Cao
+Google DeepMind
+Karthik Narasimhan
+Princeton University
+Abstract
+Language models are increasingly being deployed for general problem solving across a wide range of tasks, but are still confined to token-level, left-to-right decision-making processes during inference. This means they can fall short in tasks that require exploration, strategic lookahead, or where initial decisions play a pivotal role.
+To surmount these challenges, we introduce a new framework for language model inference, “Tree of Thoughts” (ToT), which generalizes over the popular “Chain of Thought” approach to prompting language models, and enables exploration over coherent units of text (“thoughts”) that serve as intermediate steps toward problem solving.
+ToT allows LMs to perform deliberate decision making by considering multiple different reasoning paths and self-evaluating choices to decide the next course of action, as well as looking ahead or backtracking when necessary to make global choices.
+Our experiments show that ToT significantly enhances language models’ problem-solving abilities on three novel tasks requiring non-trivial planning or search: Game of 24, Creative Writing, and Mini Crosswords.
+For instance, in Game of 24, while GPT-4 with chain-of-thought prompting only solved 4% of tasks, our method achieved a success rate of 74%. Code repo with all prompts:
+https://github.com/princeton-nlp/tree-of-thought-llm
+.
+1
+Introduction
+Originally designed to generate text, scaled-up versions of language models (LMs) such as GPT [25, 26, 1, 23] and PaLM [5] have been shown to be increasingly capable of performing an ever wider range of tasks requiring mathematical, symbolic, commonsense, and knowledge reasoning. It is perhaps surprising that underlying all this progress is still the original autoregressive mechanism for generating text, which makes token-level decisions one by one and in a left-to-right fashion.
+Is such a simple mechanism sufficient for a LM to be built toward a general problem solver?
+If not, what problems would challenge the current paradigm, and what should be alternative mechanisms?
+The literature on human cognition provides some clues to answer these questions.
+Research on “dual process” models suggests that people have two modes in which they engage with decisions – a fast, automatic, unconscious mode (“System 1”) and a slow, deliberate, conscious mode (“System 2”) [30, 31, 16, 15].
+These two modes have previously been connected to a variety of mathematical models used in machine learning. For example, research on reinforcement learning in humans and other animals has explored the circumstances under which they engage in associative “model free” learning or more deliberative “model based” planning [7].
+The simple associative token-level choices of LMs are also reminiscent of “System 1”, and thus might benefit from augmentation by a more deliberate “System 2” planning process that (1) maintains and explores diverse alternatives for current choices instead of just picking one, and (2) evaluates its current status and actively looks ahead or backtracks to make more global decisions.
+To design such a planning process, we return to the origins of artificial intelligence (and cognitive science), drawing inspiration from the planning processes explored by Newell, Shaw, and Simon starting in the 1950s [21, 22]. Newell and colleagues characterized problem solving [21] as search through a combinatorial problem space, represented as a tree. We thus propose the Tree of Thoughts (ToT) framework for general problem solving with language models. As Figure 1 illustrates, while existing methods (detailed below) sample continuous language sequences for problem solving, ToT actively maintains a tree of thoughts, where each thought is a coherent language sequence that serves as an intermediate step toward problem solving (Table 1). Such a high-level semantic unit allows the LM to self-evaluate the progress different intermediate thoughts make towards solving the problem through a deliberate reasoning process that is also instantiated in language (Figures 2, 4, 6). This implementation of search heuristics via LM self-evaluation and deliberation is novel, as previous search heuristics are either programmed or learned. Finally, we combine this language-based capability to generate and evaluate diverse thoughts with search algorithms, such as breadth-first search (BFS) or depth-first search (DFS), which allow systematic exploration of the tree of thoughts with lookahead and backtracking.
+Empirically, we propose three new problems that challenge existing LM inference methods even with the state-of-the-art language model, GPT-4
+[
+23
+]
+: Game of 24, Creative Writing, and Crosswords (Table
+1
+).
+These tasks require deductive, mathematical, commonsense, lexical reasoning abilities, and a way to incorporate systematic planning or search.
+We show ToT obtains superior results on all three tasks by being general and flexible enough to support different levels of thoughts, different ways to generate and evaluate thoughts, and different search algorithms that adapt to the nature of different problems. We also analyze how such choices affect model performances via systematic ablations and discuss future directions to better train and use LMs.
+Figure 1:
+Schematic illustrating various approaches to problem solving with LLMs. Each rectangle box represents a
+thought
+, which is a coherent language sequence that serves as an intermediate step toward problem solving. See concrete examples of how thoughts are generated, evaluated, and searched in Figures
+2
+,
+4
+,
+6
+.
+2
+Background
+We first formalize some existing methods that use large language models for problem-solving, which our approach is inspired by and later compared with.
+We use $p_{\theta}$ to denote a pre-trained LM with parameters $\theta$, and lowercase letters $x,y,z,s,\cdots$ to denote a language sequence, i.e. $x=(x[1],\cdots,x[n])$ where each $x[i]$ is a token, so that $p_{\theta}(x)=\prod_{i=1}^{n}p_{\theta}(x[i]|x[1...i])$. We use uppercase letters $S,\cdots$ to denote a collection of language sequences.
+Input-output (IO) prompting is the most common way to turn a problem input $x$ into output $y$ with LM: $y\sim p_{\theta}(y|\texttt{prompt}_{IO}(x))$, where $\texttt{prompt}_{IO}(x)$ wraps input $x$ with task instructions and/or few-shot input-output examples. For simplicity, let us denote $p_{\theta}^{{\rm prompt}}(\texttt{output}\mid\texttt{input})=p_{\theta}(\texttt{output}\mid\texttt{prompt}(\texttt{input}))$, so that IO prompting can be formulated as $y\sim p_{\theta}^{IO}(y|x)$.
+Chain-of-thought (CoT) prompting [38] was proposed to address cases where the mapping of input $x$ to output $y$ is non-trivial (e.g. when $x$ is a math question and $y$ is the final numerical answer). The key idea is to introduce a chain of thoughts $z_{1},\cdots,z_{n}$ to bridge $x$ and $y$, where each $z_{i}$ is a coherent language sequence that serves as a meaningful intermediate step toward problem solving (e.g. $z_{i}$ could be an intermediate equation for math QA). To solve problems with CoT, each thought $z_{i}\sim p_{\theta}^{CoT}(z_{i}\mid x,z_{1\cdots i-1})$ is sampled sequentially, then the output $y\sim p_{\theta}^{CoT}(y|x,z_{1\cdots n})$. In practice, $[z_{1\cdots n},y]\sim p_{\theta}^{CoT}(z_{1\cdots n},y|x)$ is sampled as a continuous language sequence, and the decomposition of thoughts (e.g. is each $z_{i}$ a phrase, a sentence, or a paragraph) is left ambiguous.
+Self-consistency with CoT (CoT-SC) [36] is an ensemble approach that samples $k$ i.i.d. chains of thought: $[z^{(i)}_{1\cdots n},y^{(i)}]\sim p_{\theta}^{CoT}(z_{1\cdots n},y|x)\ (i=1\cdots k)$, then returns the most frequent output: $\arg\max_{y}\#\{i\mid y^{(i)}=y\}$. CoT-SC improves upon CoT, because there are generally different thought processes for the same problem (e.g. different ways to prove the same theorem), and the output decision can be more faithful by exploring a richer set of thoughts. However, within each chain there is no local exploration of different thought steps, and the “most frequent” heuristic only applies when the output space is limited (e.g. multi-choice QA).
+3
+Tree of Thoughts: Deliberate Problem Solving with LM
+A genuine problem-solving process involves the repeated use of available information to initiate exploration, which discloses, in turn, more information until a way to attain the solution is finally discovered.——
+Newell et al. [
+21
+]
+Research on human problem-solving suggests that people search through a combinatorial problem-space – a tree where the nodes represent partial solutions, and the branches correspond to operators that modify them
+[
+21
+,
+22
+]
+. Which branch to take is determined by heuristics that help to navigate the problem-space and guide the problem-solver towards a solution. This perspective highlights two key shortcomings of existing approaches that use LMs to solve general problems: 1) Locally, they do not explore
+different
+continuations within a thought process – the branches of the tree. 2) Globally, they do not incorporate any type of planning, lookahead, or backtracking to help evaluate these different options – the kind of heuristic-guided search that seems characteristic of human problem-solving.
+To address these shortcomings, we introduce Tree of Thoughts (ToT), a paradigm that allows LMs to explore multiple reasoning paths over thoughts (Figure 1(c)). ToT frames any problem as a search over a tree, where each node is a state $s=[x,z_{1\cdots i}]$ representing a partial solution with the input and the sequence of thoughts so far. A specific instantiation of ToT involves answering four questions: 1. How to decompose the intermediate process into thought steps; 2. How to generate potential thoughts from each state; 3. How to heuristically evaluate states; 4. What search algorithm to use.
+1. Thought decomposition.
+While CoT samples thoughts coherently without explicit decomposition, ToT leverages problem properties to design and decompose intermediate thought steps. As Table
+1
+shows, depending on different problems, a thought could be a couple of words (Crosswords), a line of equation (Game of 24), or a whole paragraph of writing plan (Creative Writing). In general, a thought should be “small” enough so that LMs can generate promising and diverse samples (e.g. generating a whole book is usually too “big” to be coherent), yet “big” enough so that LMs can evaluate its prospect toward problem solving (e.g. generating one token is usually too “small” to evaluate).
+2. Thought generator $G(p_{\theta},s,k)$.
+Given a tree state $s=[x,z_{1\cdots i}]$, we consider two strategies to generate $k$ candidates for the next thought step:
+(a) Sample i.i.d. thoughts from a CoT prompt (Creative Writing, Figure 4): $z^{(j)}\sim p_{\theta}^{CoT}(z_{i+1}|s)=p_{\theta}^{CoT}(z_{i+1}|x,z_{1\cdots i})\ (j=1\cdots k)$. This works better when the thought space is rich (e.g. each thought is a paragraph), and i.i.d. samples lead to diversity;
+(b) Propose thoughts sequentially using a “propose prompt” (Game of 24, Figure 2; Crosswords, Figure 6): $[z^{(1)},\cdots,z^{(k)}]\sim p_{\theta}^{propose}(z_{i+1}^{(1\cdots k)}\mid s)$. This works better when the thought space is more constrained (e.g. each thought is just a word or a line), so proposing different thoughts in the same context avoids duplication.
+3. State evaluator $V(p_{\theta},S)$.
+Given a frontier of different states, the state evaluator evaluates the progress they make towards solving the problem, serving as a heuristic for the search algorithm to determine which states to keep exploring and in which order. While heuristics are a standard approach to solving search problems, they are typically either programmed (e.g. DeepBlue [3]) or learned (e.g. AlphaGo [29]). We propose a third alternative, by using the LM to deliberately reason about states. When applicable, such a deliberate heuristic can be more flexible than programmed rules, and more sample-efficient than learned models.
+Similar to the thought generator, we consider two strategies to evaluate states either independently or together:
+(a)
+Value
+each state independently:
+$V(p_{\theta},S)(s)\sim p_{\theta}^{value}(v|s)\ \forall s\in S$
+, where a value prompt reasons about the state
+$s$
+to generate a scalar value
+$v$
+(e.g. 1-10) or a classification (e.g. sure/likely/impossible) that could be heuristically turned into a value. The basis of such evaluative reasoning can vary across problems and thought steps. In this work, we explore evaluation via a few
+lookahead
+simulations (e.g. quickly confirm that 5, 5, 14 can reach 24 via 5 + 5 + 14, or “hot_l” can mean “inn” via filling “e” in “_”) plus commonsense (e.g. 1 2 3 are too small to reach 24, or no word can start with “tzxc”). While the former might promote “good” states, the latter could help eliminate “bad” states. Such valuations do not need to be perfect, and only need to be approximately helpful for decision making.
+(b)
+Vote
+across states:
+$V(p_{\theta},S)(s)=\mathds{1}[s=s^{*}]$
+, where a “good” state
+$s^{*}\sim p_{\theta}^{vote}(s^{*}|S)$
+is voted out based on deliberately comparing different states in
+$S$
+in a vote prompt.
+When problem success is harder to directly value (e.g. passage coherency), it is natural to instead compare different partial solutions and vote for the most promising one. This is similar in spirit to a “step-wise” self-consistency strategy, i.e. cast “which state to explore” as a multi-choice QA, and use LM samples to vote for it.
+For both strategies, we could prompt the LM multiple times to aggregate the value or vote results to trade time/resource/cost for more faithful/robust heuristics.
+Algorithm 1
+ToT-BFS($x,p_{\theta},G,k,V,T,b$)
+Input $x$, LM $p_{\theta}$, thought generator $G()$ & size limit $k$, states evaluator $V()$, step limit $T$, breadth limit $b$.
+$S_{0}\leftarrow\{x\}$
+for $t=1,\cdots,T$ do
+    $S^{\prime}_{t}\leftarrow\{[s,z]\mid s\in S_{t-1},z_{t}\in G(p_{\theta},s,k)\}$
+    $V_{t}\leftarrow V(p_{\theta},S^{\prime}_{t})$
+    $S_{t}\leftarrow\arg\max_{S\subset S^{\prime}_{t},|S|=b}\sum_{s\in S}V_{t}(s)$
+end for
+return $G(p_{\theta},\arg\max_{s\in S_{T}}V_{T}(s),1)$
+Algorithm 2
+ToT-DFS($s,t,p_{\theta},G,k,V,T,v_{th}$)
+Current state $s$, step $t$, LM $p_{\theta}$, thought generator $G()$ and size limit $k$, states evaluator $V()$, step limit $T$, threshold $v_{th}$
+if $t>T$ then
+    record output $G(p_{\theta},s,1)$
+end if
+for $s^{\prime}\in G(p_{\theta},s,k)$ do  $\triangleright$ sorted candidates
+    if $V(p_{\theta},\{s^{\prime}\})(s)>v_{th}$ then  $\triangleright$ pruning
+        DFS$(s^{\prime},t+1)$
+    end if
+end for
+4. Search algorithm.
+Finally, within the ToT framework, one can plug and play different search algorithms depending on the tree structure. We explore two relatively simple search algorithms and leave more advanced ones (e.g. A*
+[
+11
+]
+, MCTS
+[
+2
+]
+) for future work:
+(a)
+Breadth-first search (BFS)
+(Algorithm
+1
+) maintains a set of the
+$b$
+most promising states per step. This is used for Game of 24 and Creative Writing where the tree depth is limited (
+$T\leq 3$
+), and initial thought steps can be evaluated and pruned to a small set (
+$b\leq 5$
+).
+(b)
+Depth-first search (DFS)
+(Algorithm
+2
+) explores the most promising state first, until the final output is reached (
+$t>T$
+), or the state evaluator deems it impossible to solve the problem from the current
+$s$
+(
+$V(p_{\theta},\{s\})(s)\leq v_{th}$
+for a value threshold
+$v_{th}$
+). In the latter case, the subtree from
+$s$
+is
+pruned
+to trade exploration for exploitation. In both cases, DFS
+backtracks
+to the parent state of
+$s$
+to continue exploration.
+Conceptually, ToT has several benefits as a method for general problem-solving with LMs: (1)
+Generality.
+IO, CoT, CoT-SC, and self-refinement can be seen as special cases of ToT (i.e. trees of limited depth and breadth; Figure
+1
+). (2)
+Modularity.
+The base LM, as well as the thought decomposition, generation, evaluation, and search procedures can all be varied independently. (3)
+Adaptability
+. Different problem properties, LM capabilities, and resource constraints can be accommodated. (4)
+Convenience.
+No extra training is needed, just a pre-trained LM is sufficient. The next section will show how these conceptual benefits translate to strong empirical performance in different problems.
+4
+Experiments
+Game of 24
+Creative Writing
+5x5 Crosswords
+Input
+4 numbers
+(4 9 10 13)
+4 random sentences
+10 clues
+(h1. presented;..)
+Output
+An equation to reach 24
+(13-9)*(10-4)=24
+A passage of 4 paragraphs ending in the 4 sentences
+5x5 letters:
+SHOWN; WIRRA; AVAIL; …
+Thoughts
+3 intermediate equations
+(13-9=4 (left 4,4,10); 10-4=6 (left 4,6); 4*6=24)
+A short writing plan
+(1. Introduce a book that connects…)
+Words to fill in for clues:
+(h1. shown; v5. naled; …)
+#ToT steps
+3
+1
+5-10 (variable)
+Table 1:
+Task overview. Input, output, thought examples are in blue.
+We propose three tasks that are hard even when sampling from the state-of-the-art language model, GPT-4
+[
+23
+]
+, using standard IO prompting or chain-of-thought (CoT) prompting. We show how deliberate search in trees of thoughts (ToT) produces better results, and more importantly, interesting and promising new ways to use language models to solve problems requiring search or planning.
+Unless otherwise stated, we perform experiments using a Chat Completion mode GPT-4
+(footnote 1: Experiments were done between May 5-16, 2023.)
+with a sampling temperature of 0.7.
+4.1
+Game of 24
+Game of 24 is a mathematical reasoning challenge, where the goal is to use 4 numbers and basic arithmetic operations (+-*/) to obtain 24.
+For example, given input “4 9 10 13”, a solution output could be “(10 - 4) * (13 - 9) = 24”.
+Figure 2:
+ToT in a game of 24. The LM is prompted for (a) thought generation and (b) valuation.
+Method
+Success
+IO prompt
+7.3%
+CoT prompt
+4.0%
+CoT-SC
+(k=100)
+9.0%
+ToT (ours)
+(b=1)
+45%
+ToT (ours)
+(b=5)
+74%
+IO + Refine
+(k=10)
+27%
+IO
+(best of 100)
+33%
+CoT
+(best of 100)
+49%
+Table 2:
+Game of 24 Results.
+Figure 3:
+Game of 24 (a) scale analysis & (b) error analysis.
+Task Setup.
+We scrape data from
+4nums.com
+, which has 1,362 games that are sorted from easy to hard by human solving time, and use a subset of relatively hard games indexed 901-1,000 for testing. For each task, we consider the output as success if it is a valid equation that equals 24 and uses the input numbers each exactly once. We report the success rate across 100 games as the metric.
+Baselines.
+We use a standard input-output (IO) prompt with 5 in-context examples. For chain-of-thought (CoT) prompting, we augment each input-output pair with 3 intermediate equations, each operating on two remaining numbers. For example, given input “4 9 10 13”, the thoughts could be “13 - 9 = 4 (left: 4 4 10); 10 - 4 = 6 (left: 4 6); 4 * 6 = 24 (left: 24)”. For each game, we sample IO and CoT prompting for 100 times for average performance.
+We also consider a CoT self-consistency baseline, which takes the majority output from 100 CoT samples, and an iterative-refine approach on top of an IO sample for at most
+$10$
+iterations. At each iteration, the LM is conditioned on all previous history to “reflect on your mistakes and generate a refined answer” if the output is incorrect. Note that it uses groundtruth feedback signals about equation correctness.
+ToT Setup.
+To frame Game of 24 into ToT, it is natural to decompose the thoughts into 3 steps, each an intermediate equation. As shown in Figure
+2
+(a), at each tree node, we extract the remaining numbers and prompt the LM to propose some possible next steps.
+The same “propose prompt” is used for all 3 thought steps, though it only has one example with 4 input numbers.
+We perform a breadth-first search (BFS) in ToT, where at each step we keep the best
+$b=5$
+candidates.
+To perform deliberate BFS in ToT, as shown in Figure
+2
+(b), we prompt LM to evaluate each thought candidate as “sure/maybe/impossible” with regard to reaching 24. The aim is to promote correct partial solutions that can be verified within a few lookahead trials, eliminate impossible partial solutions based on “too big/small” commonsense, and keep the rest as “maybe”. We sample values
+$3$
+times for each thought.
+Results.
+As shown in Table
+2
+, IO, CoT, and CoT-SC prompting methods perform badly on the task, achieving only 7.3%, 4.0%, and 9.0% success rates. In contrast, ToT with a breadth of
+$b=1$
+already achieves a success rate of
+$45\%$
+, while
+$b=5$
+achieves
+$74\%$
+.
+We also consider an oracle setup for IO/CoT, by calculating the success rate using best of
+$k$
+samples
+$(1\leq k\leq 100)$
+. To compare IO/CoT (best of $k$) with ToT, we consider calculating the tree nodes visited per task in ToT across
+$b=1\cdots 5$
+, and map the 5 success rates in Figure
+3
+(a), treating IO/CoT (best of
+$k$
+) as visiting
+$k$
+nodes in a bandit. Not surprisingly, CoT scales better than IO, and best of 100 CoT samples achieve a success rate of
+$49\%$
+, but still much worse than exploring more nodes in ToT (
+$b>1$
+).
+Error analysis.
+Figure
+3
+(b) breaks down at which step CoT and ToT samples fail the task, i.e. the thought (in CoT) or all
+$b$
+thoughts (in ToT) are invalid or impossible to reach 24. Notably, around 60% of CoT samples already failed the task after generating the first step, or equivalently, the first three words (e.g. “
+$4+9$
+”). This highlights the issues with direct left-to-right decoding.
+4.2
+Creative writing
+Next, we invent a creative writing task where the input is 4 random sentences and the output should be a coherent passage with 4 paragraphs that end in the 4 input sentences respectively.
+Such a task is open-ended and exploratory, and challenges creative thinking as well as high-level planning.
+Task setup.
+We sample random sentences from
+randomwordgenerator.com
+to form 100 inputs, and there is no groundtruth passage for each input constraint. As we find that GPT-4 can follow the input constraints most of the time, we focus on evaluating passage coherency in two ways: using a GPT-4 zero-shot prompt to provide a 1-10 scalar score, or using human judgments to compare pairs of outputs from different methods. For the former, we sample 5 scores and average them for each task output, and we find these 5 scores usually consistent, with a standard deviation of around
+$0.56$
+on average across outputs. For the latter, we employ a subset of the authors in a blind study to compare the coherency of CoT vs. ToT generated passage pairs, where the order of passages is randomly flipped over 100 inputs.
+Baselines.
+Given the creative nature of the task, both IO and CoT prompts are zero-shot. While the former prompts the LM to directly generate a coherent passage given input constraints, the latter prompts the LM to first make a brief plan then write the passage, i.e. the plan serves as the intermediate thought step. We generate 10 IO and CoT samples per task.
+We also consider an iterative-refine (
+$k\leq 5$
+) method on top of a random IO sample for each task, where the LM is conditioned on input constraints and the last generated passage to decide if the passage is already “perfectly coherent”, and if not generate a refined one.
+ToT setup.
+We build a ToT with depth 2 (and only 1 intermediate thought step) — the LM first generates
+$k=5$
+plans and votes for the best one (Figure
+4
+), then similarly generates
+$k=5$
+passages based on the best plan, then votes for the best one. Here the breadth limit
+$b=1$
+, as only one choice is kept per step. A simple zero-shot vote prompt (“analyze choices below, then conclude which is most promising for the instruction”) is used to sample 5 votes at both steps.
+Results.
+Figure
+5
+(a) shows average GPT-4 scores across 100 tasks, where ToT (7.56) is deemed to generate more coherent passages than IO (6.19) and CoT (6.93) on average. While such an automatic metric might be noisy, Figure
+5
+(b) confirms the finding by showing that humans prefer ToT over CoT in 41 out of 100 passage pairs, while only preferring CoT over ToT in 21 (the other 38 pairs are found “similarly coherent”). Lastly, iterative-refine is more effective on this natural language task, where it improves IO coherency score from 6.19 to 7.67, and ToT coherency score from 7.56 to 7.91.
+We believe it could be thought of as a third approach to thought generation in the ToT framework, where new thoughts can arise from refining old thoughts instead of i.i.d. or sequentially generated.
+Figure 4:
+A step of deliberate search in a randomly picked Creative Writing task. Given the input, the LM samples 5 different plans, then votes 5 times to decide which plan is best. The majority choice is used to consequently write the output passage with the same sample-vote procedure.
+Figure 5:
+Creative Writing results.
+Method
+Success Rate (%)
+Letter
+Word
+Game
+IO
+38.7
+14
+0
+CoT
+40.6
+15.6
+1
+ToT (ours)
+78
+60
+20
++best state
+82.4
+67.5
+35
+-prune
+65.4
+41.5
+5
+-backtrack
+54.6
+20
+5
+Table 3:
+Mini Crosswords results.
+4.3
+Mini crosswords
+Figure 6:
+In Mini Crosswords, (a) how thoughts are proposed and aggregated in a priority queue for depth-first search (DFS), and (b) how a state is evaluated based on the possibility of filling in each remaining word clue, and pruned if any remaining clue is deemed not possible to fill by the LM. Then DFS backtracks to the parent state and explores the next promising thought for a clue.
+In Game of 24 and Creative Writing, ToT is relatively shallow — at most 3 thought steps are needed to reach the final output. Here we explore
+$5\times 5$
+mini crosswords as a harder search problem involving natural language. Again, the goal is not just to solve the task, as more general crosswords can be readily solved with specialized NLP pipelines
+[
+34
+]
+that leverage large-scale retrieval instead of LM. Rather, we aim to explore the limit of LM as a general problem solver that explores its own thoughts and guides its own exploration with deliberate reasoning as heuristics.
+Task setup.
+We scrape data from
+GooBix
+, which contains 156 games of
+$5\times 5$
+mini crosswords. As we observe adjacent games contain similar clues, we use 20 games with indices
+$1,6,\cdots,91,96$
+for testing, and games
+$136,141,146,151,156$
+for prompting.
+For each task, the input describes the 5 horizontal clues and 5 vertical clues, and the output should be a board of
+$5\times 5=25$
+letters to solve the crosswords. For evaluation, we consider three levels of success: the portion of correct letters (25 per game), words (10 per game), and games.
+Baselines.
+We provide 5 example input-output pairs in the IO prompt, and in the CoT prompt additionally include intermediate words in the order h1..5 then v1..5. We run each prompt for 10 samples and average the results.
+ToT setup.
+We leverage a depth-first search (Algorithm
+2
+) that keeps exploring the most promising subsequent word clue until the state is no longer promising, then backtracks to the parent state to explore alternative thoughts.
+To make search tractable, subsequent thoughts are constrained not to change any filled words or letters, so that the ToT has at most 10 intermediate steps.
+For thought generation, at each state we translate all existing thoughts (e.g. “h2.motor; h1.tasks” for the state in Figure
+6
+(a)) into letter constraints for remaining clues (e.g. “v1.To heap: tm___;…”) and prompt a proposal prompt
+$5$
+times to come up with candidates for where and what to fill in the next word. Importantly, we also prompt the LM to give a confidence level for different thoughts, and aggregate these across proposals to obtain a sorted list of next thoughts to explore (Figure
+6
+(a)).
+For state evaluations, we similarly translate each state into letter constraints for remaining clues, then evaluate for each clue if it is possible to fill given the constraints. If any remaining clue is deemed “impossible” to fill in (e.g. “v1. To heap: tm_s_”), then the exploration of the state’s subtree is pruned and DFS backtracks to its parent to explore the next promising thought. We limit DFS search steps to 100, and simply render the deepest explored state (the first explored one if multiple) into the final output.
+Results.
+As shown in Table
+3
+, IO and CoT prompting methods perform poorly with a word-level success rate less than
+$16\%$
+, while ToT significantly improves all metrics, achieving a word-level success rate of
+$60\%$
+and solving 4 out of 20 games. Such an improvement is not surprising, given IO and CoT lack mechanisms to try different clues, make changes to decisions, or backtrack.
+Oracle and ablation studies.
+When outputting from the oracle best DFS state (instead of the heuristically determined best state) per task, ToT performance is even higher and actually solves 7/20 games (Table
+3
+, “+best state”), indicating our simple output heuristics can be readily improved. Interestingly, sometimes when the crosswords game is actually solved, the state evaluator might still deem some words as “impossible” and prune — possibly because
+$5\times 5$
+crosswords by design have some rare or obsolete words that GPT-4 cannot recognize
+(footnote 2: For example, “agend” is an obsolete form of “agendum”, but GPT-4 deems it a typo for “agenda”. External retrieval
+or web interaction
+could augment LM for problem solving under knowledge uncertainty.)
+.
+Given the state evaluation as a pruning heuristic is imperfect, we also explore ablating the pruning, and find the performance generally worse (Table
+3
+, “-prune”). However, it could actually find the correct solution for 4/20 games (though only outputting 1 via heuristic), 3 of which are games ToT+pruning cannot solve within 100 steps. Thus, better heuristics for DFS pruning are critical for problem solving in this case.
+Lastly, we confirm the importance of backtracking by running an ablation that keeps filling the most promising clue for at most 20 steps, allowing overwrites. This is similar to a “greedy” BFS search with breadth limit of
+$b=1$
+, and performs poorly with a word-level success of only
+$20\%$
+(Table
+3
+, “-backtrack”).
+5
+Related Work
+Planning and decision making.
+Smart planning and decision making are critical to achieving predefined goals. As they are trained on vast amounts of world knowledge and human examples,
+LMs are known to have already absorbed rich commonsense that makes it possible to propose reasonable plans conditioned on problem setting and environmental states
+[
+12
+,
+42
+,
+37
+,
+13
+,
+35
+,
+41
+,
+40
+]
+. Our proposed ToT approach extends existing planning formulations by considering multiple potentially feasible plans simultaneously at each problem-solving step, and proceeding with the most promising ones. The integration between thought sampling and value feedback organically integrates planning and decision-making mechanisms, enabling effective search inside a solution tree. On the other hand, traditional decision-making procedures usually require training dedicated reward and policy models as in reinforcement learning (for example CHAI
+[
+33
+]
+), whereas we use the LM itself to provide the value estimates for decision making.
+RAP
+[
+9
+]
+is a concurrent work that treats language model reasoning as planning with its internal world model, and proposes a MCTS-based method similar to ToT. However, its tasks are simpler than ours, and its framework lacks the modularity to incorporate different tree search algorithms.
+Self-reflection.
+Using LLMs to assess the viability of their own predictions is becoming an increasingly important procedure in problem solving.
+[
+28
+,
+20
+,
+24
+]
+introduced the “self-reflection” mechanism, in which LMs provide feedback to their generation candidates.
+[
+4
+]
+improves LMs code generation accuracy by injecting feedback messages generated by the LM itself based on its code execution results. Similarly,
+[
+17
+]
+also introduces “critic” or review steps over the actions and states, deciding the next action to take in solving computer operation tasks. Another recent work very relevant to ours is “self-eval guided decoding”
+[
+39
+]
+. Similar to our method, self-eval decoding also follows a tree-search procedure with leaves sampled from stochastic beam search decoding, which are then evaluated by LLM itself with carefully prepared self-eval prompts. Their approach however, uses the PAL formulation
+[
+8
+]
+which represents thoughts as codes, which makes it difficult to tackle challenging tasks like creative writing which we consider in this paper. Our Tree-of-Thought formulation is thus more versatile and handles challenging tasks on which GPT-4 only achieves very low accuracy with standard prompts.
+Program-guided LLM generation.
+Our proposal is also related to recent advancements that organize LM’s behavior with systematic procedures
+[
+14
+,
+44
+,
+6
+,
+43
+]
+or symbolic program guidance. For example,
+Schlag et al. [
+27
+]
+embeds LMs in an algorithmic search procedure to help solve problems like question answering step-by-step, in which the search trees are expanded by relevant paragraphs that might provide answers. This approach however differs from ours in that trees are expanded by sampling external paragraphs instead of the LM’s own thoughts, and there is no reflection or voting steps. Another approach, LLM+P
+[
+18
+]
+, goes one step further and delegates the actual planning process to a classical planner.
+Classical search methods.
+Last but not least, our approach can be treated as a modern rendition of classical search methods for problem solving. For example it can be considered as a heuristic search algorithm like A*
+[
+10
+]
+, in which the heuristic at each search node is provided by the LM’s self-assessment. From this perspective, our method is also related to NeuroLogic A*esque decoding
+[
+19
+]
+, which is inspired by A* search but introduces look-ahead heuristics that are efficient for LMs to improve the beam-search or top-k sampling decoding. This method however is constrained to sentence generation tasks, whereas our framework is designed for complex, multi-step problem solving guarded by value feedback.
+6
+Discussion
+Limitations and future directions.
+Deliberate search such as ToT might not be necessary for many existing tasks that GPT-4 already excels at (see Appendix
+B.1
+), and as an initial step this work only explores three relatively simple tasks that challenge GPT-4 (see Appendix
+B.2
+for some GPT-3.5 experiment results) and call for better search and planning abilities incorporated with LMs. However, as we begin to deploy LMs for more real-world decision making applications (e.g. coding, data analysis, robotics, etc.), more complex tasks could emerge and present new opportunities to study these research questions. Also, search methods like ToT require more resources (e.g. GPT-4 API cost) than sampling methods in order to improve task performance, but the modular flexibility of ToT allows users to customize such performance-cost tradeoffs, and ongoing open-source efforts
+[
+32
+]
+should readily reduce such costs in the near future. More details about cost and efficiency are in Appendix
+B.3
+. Lastly, this work focuses on using an off-the-shelf LM, and fine-tuning LMs using a ToT-style high-level counterfactual decision making (e.g. deliberating over potential choices for the next paragraph, instead of predicting the next token) might present opportunities to enhance the problem-solving capabilities of LMs.
+Conclusion.
+The associative “System 1” of LMs can be beneficially augmented by a “System 2” based on searching a tree of possible paths to the solution to a problem. The Tree of Thoughts framework provides a way to translate classical insights about problem-solving into actionable methods for contemporary LMs. At the same time, LMs address a weakness of these classical methods, providing a way to solve complex problems that are not easily formalized, such as creative writing. We see this intersection of LMs with classical approaches to AI as an exciting direction.
+Broader Impact
+ToT is a framework that empowers LMs to more autonomously and intelligently make decisions and solve problems. While current tasks are limited to reasoning and search problems, future applications involving interaction with external environments or humans could bring potential danger, e.g. facilitating harmful uses of LMs. On the other hand, ToT also improves the interpretability of model decisions and the opportunity for human alignment, as the resulting representations are readable, high-level language reasoning instead of implicit, low-level token values.
+Acknowledgements
+SY and KN acknowledge support from an Oracle Collaborative Research award and the National Science Foundation under Grant No. 2239363. Any opinions, findings, conclusions, or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation. SY is also supported by the Harold W. Dodds Fellowship from Princeton.
+References
+Brown et al. [2020]
+T. Brown, B. Mann, N. Ryder, M. Subbiah, J. D. Kaplan, P. Dhariwal,
+A. Neelakantan, P. Shyam, G. Sastry, A. Askell, et al.
+Language models are few-shot learners.
+Advances in neural information processing systems
+,
+33:1877–1901, 2020.
+Browne et al. [2012]
+C. Browne, E. J. Powley, D. Whitehouse, S. M. M. Lucas, P. I. Cowling,
+P. Rohlfshagen, S. Tavener, D. P. Liebana, S. Samothrakis, and S. Colton.
+A survey of monte carlo tree search methods.
+IEEE Transactions on Computational Intelligence and AI in
+Games
+, 4:1–43, 2012.
+Campbell et al. [2002]
+M. Campbell, A. J. Hoane Jr, and F.-h. Hsu.
+Deep blue.
+Artificial intelligence
+, 134(1-2):57–83,
+2002.
+Chen et al. [2023]
+X. Chen, M. Lin, N. Schärli, and D. Zhou.
+Teaching large language models to self-debug, 2023.
+Chowdhery et al. [2022]
+A. Chowdhery, S. Narang, J. Devlin, M. Bosma, G. Mishra, A. Roberts, P. Barham,
+H. W. Chung, C. Sutton, S. Gehrmann, et al.
+Palm: Scaling language modeling with pathways.
+arXiv preprint arXiv:2204.02311
+, 2022.
+Creswell and Shanahan [2022]
+A. Creswell and M. Shanahan.
+Faithful reasoning using large language models.
+arXiv preprint arXiv:2208.14271
+, 2022.
+Daw et al. [2005]
+N. D. Daw, Y. Niv, and P. Dayan.
+Uncertainty-based competition between prefrontal and dorsolateral
+striatal systems for behavioral control.
+Nature neuroscience
+, 8(12):1704–1711,
+2005.
+Gao et al. [2023]
+L. Gao, A. Madaan, S. Zhou, U. Alon, P. Liu, Y. Yang, J. Callan, and G. Neubig.
+Pal: Program-aided language models, 2023.
+Hao et al. [2023]
+S. Hao, Y. Gu, H. Ma, J. J. Hong, Z. Wang, D. Z. Wang, and Z. Hu.
+Reasoning with language model is planning with world model.
+arXiv preprint arXiv:2305.14992
+, 2023.
+Hart et al. [1968a]
+P. E. Hart, N. J. Nilsson, and B. Raphael.
+A formal basis for the heuristic determination of minimum cost paths.
+IEEE Transactions on Systems Science and Cybernetics
+,
+4(2):100–107, 1968a.
+doi:
+10.1109/TSSC.1968.300136
+.
+Hart et al. [1968b]
+P. E. Hart, N. J. Nilsson, and B. Raphael.
+A formal basis for the heuristic determination of minimum cost paths.
+IEEE transactions on Systems Science and Cybernetics
+,
+4(2):100–107, 1968b.
+Huang et al. [2022a]
+W. Huang, P. Abbeel, D. Pathak, and I. Mordatch.
+Language models as zero-shot planners: Extracting actionable
+knowledge for embodied agents, 2022a.
+Huang et al. [2022b]
+W. Huang, F. Xia, T. Xiao, H. Chan, J. Liang, P. Florence, A. Zeng, J. Tompson,
+I. Mordatch, Y. Chebotar, et al.
+Inner monologue: Embodied reasoning through planning with language
+models.
+arXiv preprint arXiv:2207.05608
+, 2022b.
+Jung et al. [2022]
+J. Jung, L. Qin, S. Welleck, F. Brahman, C. Bhagavatula, R. L. Bras, and
+Y. Choi.
+Maieutic prompting: Logically consistent reasoning with recursive
+explanations.
+arXiv preprint arXiv:2205.11822
+, 2022.
+Kahneman [2011]
+D. Kahneman.
+Thinking, fast and slow
+.
+Macmillan, 2011.
+Kahneman et al. [2002]
+D. Kahneman, S. Frederick, et al.
+Representativeness revisited: Attribute substitution in intuitive
+judgment.
+Heuristics and biases: The psychology of intuitive judgment
+,
+49(49-81):74, 2002.
+Kim et al. [2023]
+G. Kim, P. Baldi, and S. McAleer.
+Language models can solve computer tasks, 2023.
+Liu et al. [2023]
+B. Liu, Y. Jiang, X. Zhang, Q. Liu, S. Zhang, J. Biswas, and P. Stone.
+Llm+p: Empowering large language models with optimal planning
+proficiency, 2023.
+Lu et al. [2021]
+X. Lu, S. Welleck, P. West, L. Jiang, J. Kasai, D. Khashabi, R. L. Bras,
+L. Qin, Y. Yu, R. Zellers, N. A. Smith, and Y. Choi.
+Neurologic a*esque decoding: Constrained text generation with
+lookahead heuristics.
+In
+North American Chapter of the Association for Computational
+Linguistics
+, 2021.
+Madaan et al. [2023]
+A. Madaan, N. Tandon, P. Gupta, S. Hallinan, L. Gao, S. Wiegreffe, U. Alon,
+N. Dziri, S. Prabhumoye, Y. Yang, S. Welleck, B. P. Majumder, S. Gupta,
+A. Yazdanbakhsh, and P. Clark.
+Self-refine: Iterative refinement with self-feedback, 2023.
+Newell et al. [1959]
+A. Newell, J. C. Shaw, and H. A. Simon.
+Report on a general problem solving program.
+In
+IFIP congress
+, volume 256, page 64. Pittsburgh, PA, 1959.
+Newell et al. [1972]
+A. Newell, H. A. Simon, et al.
+Human problem solving
+.
+Prentice-Hall, 1972.
+OpenAI [2023]
+OpenAI.
+Gpt-4 technical report.
+ArXiv
+, abs/2303.08774, 2023.
+Paul et al. [2023]
+D. Paul, M. Ismayilzada, M. Peyrard, B. Borges, A. Bosselut, R. West, and
+B. Faltings.
+Refiner: Reasoning feedback on intermediate representations, 2023.
+Radford et al. [2018]
+A. Radford, K. Narasimhan, T. Salimans, I. Sutskever, et al.
+Improving language understanding by generative pre-training.
+OpenAI blog
+, 2018.
+Radford et al. [2019]
+A. Radford, J. Wu, R. Child, D. Luan, D. Amodei, I. Sutskever, et al.
+Language models are unsupervised multitask learners.
+OpenAI blog
+, 1(8):9, 2019.
+Schlag et al. [2023]
+I. Schlag, S. Sukhbaatar, A. Celikyilmaz, W. tau Yih, J. Weston,
+J. Schmidhuber, and X. Li.
+Large language model programs, 2023.
+Shinn et al. [2023]
+N. Shinn, B. Labash, and A. Gopinath.
+Reflexion: an autonomous agent with dynamic memory and
+self-reflection, 2023.
+Silver et al. [2017]
+D. Silver, J. Schrittwieser, K. Simonyan, I. Antonoglou, A. Huang, A. Guez,
+T. Hubert, L. Baker, M. Lai, A. Bolton, et al.
+Mastering the game of go without human knowledge.
+nature
+, 550(7676):354–359, 2017.
+Sloman [1996]
+S. A. Sloman.
+The empirical case for two systems of reasoning.
+Psychological bulletin
+, 119(1):3, 1996.
+Stanovich [1999]
+K. E. Stanovich.
+Who is rational? Studies of individual differences in
+reasoning
+.
+Psychology Press, 1999.
+Touvron et al. [2023]
+H. Touvron, T. Lavril, G. Izacard, X. Martinet, M.-A. Lachaux, T. Lacroix,
+B. Rozière, N. Goyal, E. Hambro, F. Azhar, et al.
+Llama: Open and efficient foundation language models.
+arXiv preprint arXiv:2302.13971
+, 2023.
+Verma et al. [2022]
+S. Verma, J. Fu, S. Yang, and S. Levine.
+Chai: A chatbot ai for task-oriented dialogue with offline
+reinforcement learning.
+In
+Proceedings of the 2022 Conference of the North American
+Chapter of the Association for Computational Linguistics: Human Language
+Technologies
+, pages 4471–4491, 2022.
+Wallace et al. [2022]
+E. Wallace, N. Tomlin, A. Xu, K. Yang, E. Pathak, M. Ginsberg, and D. Klein.
+Automated crossword solving.
+arXiv preprint arXiv:2205.09665
+, 2022.
+Wang et al. [2023a]
+L. Wang, W. Xu, Y. Lan, Z. Hu, Y. Lan, R. K.-W. Lee, and E.-P. Lim.
+Plan-and-solve prompting: Improving zero-shot chain-of-thought
+reasoning by large language models, 2023a.
+Wang et al. [2022]
+X. Wang, J. Wei, D. Schuurmans, Q. Le, E. Chi, and D. Zhou.
+Self-consistency improves chain of thought reasoning in language
+models.
+arXiv preprint arXiv:2203.11171
+, 2022.
+Wang et al. [2023b]
+Z. Wang, S. Cai, A. Liu, X. Ma, and Y. Liang.
+Describe, explain, plan and select: Interactive planning with large
+language models enables open-world multi-task agents, 2023b.
+Wei et al. [2022]
+J. Wei, X. Wang, D. Schuurmans, M. Bosma, E. Chi, Q. Le, and D. Zhou.
+Chain of thought prompting elicits reasoning in large language
+models.
+arXiv preprint arXiv:2201.11903
+, 2022.
+Xie et al. [2023]
+Y. Xie, K. Kawaguchi, Y. Zhao, X. Zhao, M.-Y. Kan, J. He, and Q. Xie.
+Decomposition enhances reasoning via self-evaluation guided decoding,
+2023.
+Yang et al. [2023]
+S. Yang, O. Nachum, Y. Du, J. Wei, P. Abbeel, and D. Schuurmans.
+Foundation models for decision making: Problems, methods, and
+opportunities, 2023.
+Yao et al. [2022]
+S. Yao, J. Zhao, D. Yu, N. Du, I. Shafran, K. Narasimhan, and Y. Cao.
+ReAct: Synergizing reasoning and acting in language models.
+arXiv preprint arXiv:2210.03629
+, 2022.
+Zhang et al. [2023]
+S. Zhang, Z. Chen, Y. Shen, M. Ding, J. B. Tenenbaum, and C. Gan.
+Planning with large language models for code generation.
+In
+The Eleventh International Conference on Learning
+Representations
+, 2023.
+URL
+https://openreview.net/forum?id=Lr8cOOtYbfL
+.
+Zhou et al. [2022]
+D. Zhou, N. Schärli, L. Hou, J. Wei, N. Scales, X. Wang, D. Schuurmans,
+C. Cui, O. Bousquet, Q. Le, et al.
+Least-to-most prompting enables complex reasoning in large language
+models.
+arXiv preprint arXiv:2205.10625
+, 2022.
+Zhu et al. [2022]
+X. Zhu, J. Wang, L. Zhang, Y. Zhang, R. Gan, J. Zhang, and Y. Yang.
+Solving math word problem via cooperative reasoning induced language
+models.
+arXiv preprint arXiv:2210.16257
+, 2022.
+Appendix A
+Code, Prompts, Trajectories
+All code is available at
+https://github.com/princeton-nlp/tree-of-thought-llm
+.
+All prompts are available at
+https://github.com/princeton-nlp/tree-of-thought-llm/tree/master/src/tot/prompts
+.
+Trajectories are available at
+https://github.com/princeton-nlp/tree-of-thought-llm/tree/master/logs
+.
+Appendix B
+Additional Experiment Results
+Given the motivation of exploring and extending the capability frontier of language models, our experiments in the main paper have focused on a setup with the state-of-the-art language model (GPT-4), and three hard tasks invented to challenge it. Here, we report additional experiments with weaker LLM or easier tasks, and discuss cost and efficiency.
+GSM8K
+StrategyQA
+IO
+51
+73
+CoT
+86
+82
+ToT
+90
+83
+Table 4:
+New tasks with
+zero-shot ToT and GPT-4.
+GPT-4
+GPT-3.5
+IO
+7.3%
+6%
+CoT
+4.0%
+3%
+ToT
+74%
+19%
+Table 5:
+Game of 24 with
+GPT-4 vs GPT-3.5.
+GPT-4
+GPT-3.5
+IO
+6.19
+4.47
+CoT
+6.93
+5.16
+ToT
+7.56
+6.62
+Table 6:
+Creative Writing with
+GPT-4 vs. GPT-3.5.
+B.1
+Extension to new tasks (GSM8k, StrategyQA) with zero-shot ToT
+While more common NLP tasks might be too easy for GPT-4 and do not require ToT (which is why we considered harder new tasks), we believe applying ToT to new tasks could be straightforward. For example, we implemented a simple and generic zero-shot ToT-BFS similar to creative writing (sample 5 problem solving strategies then vote for the best one; then sample 5 solutions based on the best strategy then vote for the best one) for GSM8K and StrategyQA with a few extra lines of code:
+# define the answer format of new tasks
+gsm8k_format = ‘"the answer is n" where n is a number’
+strategyqa_format = ‘either "the answer is yes" or "the answer is no"’
+
+# define zero-shot io prompting
+standard_prompt = ‘Answer the following question with {format}: {input}’
+
+# define thought format for zero-shot cot and zero-shot tot
+cot_prompt = ‘‘‘Answer the following question: {input}
+
+Make a strategy then write. Your output should be of the following format:
+
+Strategy:
+Your strategy about how to answer the question.
+
+Answer:
+Your answer to the question. It should end with {format}.
+’’’
+
+# define zero-shot voting used for zero-shot tot
+vote_prompt = ‘‘‘Given an instruction and several choices,
+decide which choice is most promising.
+Analyze each choice in detail, then conclude in the last line
+"The best choice is {s}", where s the integer id of the choice.
+’’’
+We evaluated on a subset of 100 random GSM8K test and StrategyQA dev questions. As shown in Table
+4
+and as expected, ToT improves over CoT on both tasks (but only slightly, given GPT-4 + CoT is already very good on such tasks, and StrategyQA’s bottleneck is external knowledge, not reasoning). Considering computational costs, it is more suitable to try smaller LLMs + ToT for traditional NLP tasks, or GPT-4 + ToT for hard tasks that challenge GPT-4 + CoT’s reasoning.
+B.2
+Extension to new LMs (GPT-3.5)
+To understand how ToT works with other LLMs, we also ran GPT-3.5-turbo for Creative Writing (Table
+6
+) and Game of 24 (Table
+5
+).
+On both tasks, “ToT > CoT > IO” remains true for GPT-3.5.
+On Creative Writing, we find GPT-3.5+ToT outperforms GPT-4+IO and is similar to GPT-4+CoT, which suggests ToT could also work well on weaker language models.
+On Game of 24 (we changed 1-shot proposal prompt to 3-shot to make it work), GPT-3.5+ToT’s 19% is far worse than GPT-4+ToT’s 74%. To further understand the importance of generation vs. evaluation, we ran GPT-4 generation + GPT-3.5 evaluation (64%) and GPT-3.5 generation + GPT-4 evaluation (31%). This suggests the game’s bottleneck is thought generation, and different generation/evaluation language models might attain decent results while reducing costs.
+B.3
+Cost and efficiency
+Running ToT requires significantly more computations than IO or CoT prompting. For example, in Game of 24 (Table
+7
+below), solving a problem with ToT requires 5.5k completion tokens, close to 100 CoT trials (6.7k tokens). But the performance of ToT is better than best of 100 independent CoT trials.
+Game of 24
+Generate/Prompt tokens
+Cost per case
+Success
+IO (best of 100)
+1.8k / 1.0k
+$0.13
+33%
+CoT (best of 100)
+6.7k / 2.2k
+$0.47
+49%
+ToT
+5.5k / 1.4k
+$0.74
+74%
+Table 7:
+Cost analysis on Game of 24.
+On Creative Writing (Table
+8
+below), we found ToT takes around 5x completion tokens and money cost, which is intuitive as $b=5$ and most tokens are generated passages.
+Creative Writing
+Generate/Prompt tokens
+Cost per case
+IO
+0.9k / 0.4k
+$0.06
+CoT
+0.9k / 0.4k
+$0.07
+ToT
+4k / 2.9k
+$0.32
+Table 8:
+Cost analysis on Creative Writing.
+So completing Game of 24 and Creative Writing’s main ToT experiments cost around $0.74\times 100+0.32\times 100=106$ dollars. Crosswords’ DFS experiments should be also within $100$ dollars. In general, cost and efficiency of ToT highly depend on the prompts and search algorithms used, and could require 5-100 times more generated tokens than CoT. Some actionable insights:
+•
+We recommend using ToT on tasks requiring deliberate reasoning, on which CoT struggles.
+•
+Flexibility of ToT allows some performance-cost tradeoff, e.g., change beam size or vote number in BFS, few-shot vs. zero-shot prompting, GPT-3.5 vs. GPT-4, etc. One could configure the setup based on some resource constraints or performance goal.
+•
+There is much room for improving efficiency, e.g., BFS could stop early when a solution is found, or trim down the beam size when some thoughts are “impossible”.
+•
+We believe that more computation is indeed required in order for the model to achieve stronger intelligence, and this should not become a blocking issue as in the long run, (open-source) LMs will become much cheaper and more efficient. It is also a great direction how to better train/finetune LMs for thought generation and/or evaluation.
+◄
+Feeling
+lucky?
+Conversion
+report
+Report
+an issue
+View original
+on arXiv
+►
\ No newline at end of file
diff --git a/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models.md b/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models.md
new file mode 100644
index 0000000000000000000000000000000000000000..8112d1616db6d69cf8f81687e4e0653027333c1a
--- /dev/null
+++ b/research/notes/230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models.md
@@ -0,0 +1,213 @@
+---
+title: '[2305.10601] Tree of Thoughts: Deliberate Problem Solving with Large Language
+  Models'
+id: 230510601-tree-of-thoughts-deliberate-problem-solving-with-large-language-models
+tags:
+- deepread
+created: '2026-06-10T00:39:55.627654Z'
+source: https://arxiv.org/abs/2305.10601
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:39:55.627506Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2305.10601] Tree of Thoughts: Deliberate Problem Solving with Large Language Models
+Computer Science > Computation and Language
+arXiv:2305.10601
+(cs)
+[Submitted on 17 May 2023 (
+v1
+), last revised 3 Dec 2023 (this version, v2)]
+Title:
+Tree of Thoughts: Deliberate Problem Solving with Large Language Models
+Authors:
+Shunyu Yao
+,
+Dian Yu
+,
+Jeffrey Zhao
+,
+Izhak Shafran
+,
+Thomas L. Griffiths
+,
+Yuan Cao
+,
+Karthik Narasimhan
+View a PDF of the paper titled Tree of Thoughts: Deliberate Problem Solving with Large Language Models, by Shunyu Yao and 6 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Language models are increasingly being deployed for general problem solving across a wide range of tasks, but are still confined to token-level, left-to-right decision-making processes during inference. This means they can fall short in tasks that require exploration, strategic lookahead, or where initial decisions play a pivotal role. To surmount these challenges, we introduce a new framework for language model inference, Tree of Thoughts (ToT), which generalizes over the popular Chain of Thought approach to prompting language models, and enables exploration over coherent units of text (thoughts) that serve as intermediate steps toward problem solving. ToT allows LMs to perform deliberate decision making by considering multiple different reasoning paths and self-evaluating choices to decide the next course of action, as well as looking ahead or backtracking when necessary to make global choices. Our experiments show that ToT significantly enhances language models' problem-solving abilities on three novel tasks requiring non-trivial planning or search: Game of 24, Creative Writing, and Mini Crosswords. For instance, in Game of 24, while GPT-4 with chain-of-thought prompting only solved 4% of tasks, our method achieved a success rate of 74%. Code repo with all prompts:
+https://github.com/princeton-nlp/tree-of-thought-llm
+.
+Comments:
+NeurIPS 2023 camera ready version. Code repo with all prompts:
+https://github.com/princeton-nlp/tree-of-thought-llm
+Subjects:
+Computation and Language (cs.CL)
+; Artificial Intelligence (cs.AI); Machine Learning (cs.LG)
+Cite as:
+arXiv:2305.10601
+[cs.CL]
+(or
+arXiv:2305.10601v2
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2305.10601
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Shunyu Yao [
+view email
+]
+[v1]
+Wed, 17 May 2023 23:16:17 UTC (609 KB)
+[v2]
+Sun, 3 Dec 2023 22:50:35 UTC (623 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Tree of Thoughts: Deliberate Problem Solving with Large Language Models, by Shunyu Yao and 6 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2023-05
+Change to browse by:
+cs
+cs.AI
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+5 blog links
+(
+what is this?
+)
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-2.md b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..3704173207d5ba9f751b4ab7e1bc04b1dcdef9a4
--- /dev/null
+++ b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-2.md
@@ -0,0 +1,4095 @@
+---
+title: '[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning
+  in Language Models'
+id: 231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-2
+tags:
+- deepread
+created: '2026-06-10T00:40:44.608943Z'
+source: https://ar5iv.labs.arxiv.org/html/2310.04406
+source_domain: ar5iv.labs.arxiv.org
+fetched_at: '2026-06-10T00:40:44.608803Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models
+Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models
+Andy Zhou
+University of Illinois at Urbana-Champaign
+AI@UIUC
+Kai Yan
+University of Illinois at Urbana-Champaign
+Michal Shlapentokh-Rothman
+University of Illinois at Urbana-Champaign
+Haohan Wang
+University of Illinois at Urbana-Champaign
+Yu-Xiong Wang
+University of Illinois at Urbana-Champaign
+Abstract
+While large language models (LLMs) have demonstrated impressive performance on a range of decision-making tasks, they rely on simple acting processes and fall short of broad deployment as autonomous agents. We introduce LATS (Language Agent Tree Search), a general framework that synergizes the capabilities of LLMs in planning, acting, and reasoning. Drawing inspiration from Monte Carlo tree search commonly used in model-based reinforcement learning, LATS employs LLMs as agents, value functions, and optimizers, repurposing their latent strengths for enhanced decision-making. What is crucial in this method is the use of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that moves beyond the limitations of existing techniques. Our experimental evaluation across diverse domains, such as programming, HotPotQA, and WebShop, illustrates the applicability of LATS for decision-making while maintaining competitive reasoning performance. In particular, LATS achieves 94.4% for programming on HumanEval with GPT-4 and an average score of 75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness and generality of our method.
+1
+Introduction
+General autonomous agents capable of reasoning and decision-making in a variety of environments
+(Wooldridge & Jennings,
+1995
+)
+have been of longstanding interest in the field of artificial intelligence. While this has traditionally been studied in reinforcement learning, the recent rise of large language models (LLMs)
+(Brown et al.,
+2020
+; Chowdhery et al.,
+2022
+; Touvron et al.,
+2023
+; OpenAI,
+2023
+)
+with strong reasoning and general adaptability offers an alternative paradigm. Not only have LLMs excelled on standard NLP tasks such as text summarization
+(Nallapati et al.,
+2016
+)
+or natural language inference
+(Bowman et al.,
+2015
+)
+, but they have been adapted to an increasingly diverse set of tasks that often require advanced common-sense reasoning or quantitative skills
+(Cobbe et al.,
+2021
+; Saparov & He,
+2022
+)
+. LLMs are also capable of performing in complex environments that involve knowledge and reasoning, such as web navigation
+(Yao et al.,
+2022
+; Deng et al.,
+2023
+)
+, tool-use
+(Schick et al.,
+2023
+)
+, or open-ended games
+(Fan et al.,
+2022
+)
+.
+Figure 1:
+An overview of LATS. LATS uses an external environment and self-reflection to improve reasoning and decision-making.
+Reasoning and acting abilities have also been improved by prompting techniques that augment LLMs with feedback or observations from an external environment
+(Yao et al.,
+2023b
+; Gao et al.,
+2022
+; Shinn et al.,
+2023
+)
+. This eliminates the need to rely entirely on the base abilities of the Language Model (LM), enhancing it through external tools or semantic feedback. Despite this strength, these methods are reflexive and fall short of humans’ deliberate and thoughtful decision-making characteristics to solve problems
+(Sloman,
+1996
+; Evans,
+2010
+)
+. In particular, such methods fail to consider multiple reasoning paths or to plan ahead. Recent search-guided LLM works
+(Xie et al.,
+2023
+; Yao et al.,
+2023a
+; Hao et al.,
+2023
+)
+address this issue by searching over multiple reasoning chains. While these methods enable planning, they operate in isolation and do not incorporate external feedback that can improve reasoning.
+To help address these issues, we propose LATS (Language Agent Tree Search), a general framework for decision-making and reasoning with language models. LATS unifies LM planning, acting, and reasoning strategies by expanding ReAct
+(Yao et al.,
+2023b
+)
+into a search over a combinatorial space of possible reasoning and acting steps. We adapt Monte Carlo tree search (MCTS) from model-based reinforcement learning
+(Silver et al.,
+2017
+; Anthony et al.,
+2017
+; Jiang et al.,
+2018
+)
+to language agents, repurposing a pretrained LLM as an agent, value function, and optimizer. Utilizing the strong natural language understanding and in-context learning ability of modern LMs, we use text as an interface between each component of the framework, allowing LATS to adapt planning to environmental conditions without additional training. To the best of our knowledge,
+LATS is the first framework that combines reasoning, acting, and planning to enhance LLMs
+. Notably, LATS doubles the performance of GPT-3.5 on HotPotQA
+(Yang et al.,
+2018
+)
+over ReAct
+(Yao et al.,
+2023b
+)
+and raises the average score by
+$22.1$
+on WebShop
+(Yao et al.,
+2022
+)
+. When used with GPT-4, LATS achieves a
+$94.4$
+Pass@1 rate for programming on HumanEval
+(Chen et al.,
+2021
+)
+, setting the state of the art. To summarize, our
+contributions
+are the following:
+•
+We introduce an LM-based Monte Carlo tree search variant to deliberately construct the best trajectory from sampled actions, enabling more flexible and adaptive problem-solving compared to reflexive prompting methods. This is guided by heuristics from the LM.
+•
+By integrating external feedback and self-reflection, LATS enhances model sensibility and enables agents to learn from experience, surpassing reasoning-based search methods.
+•
+Through experiments across diverse domains like programming, interactive QA, and web navigation, we demonstrate the versatility of LATS in harnessing LLMs for autonomous reasoning and decision-making.
+2
+Related Work
+
+| Approach | Reasoning | Acting | Planning | Self Reflection | External Memory |
+|---|---|---|---|---|---|
+| CoT (Wei et al., 2022) | ✓ | × | × | × | × |
+| ReAct (Yao et al., 2023b) | ✓ | ✓ | × | × | × |
+| ToT (Yao et al., 2023a) | ✓ | × | ✓ | ✓ | ✓ |
+| RAP (Hao et al., 2023) | ✓ | × | ✓ | × | ✓ |
+| Self-Refine (Madaan et al., 2023) | ✓ | × | × | ✓ | × |
+| Beam Search (Xie et al., 2023) | ✓ | × | × | ✓ | × |
+| Reflexion (Shinn et al., 2023) | ✓ | ✓ | × | ✓ | ✓ |
+| LATS (Ours) | ✓ | ✓ | ✓ | ✓ | ✓ |
+
+Table 1:
+A summary of related work on reasoning, acting, and planning. LATS is the first work incorporating designs from all three domains, allowing use in all corresponding tasks. We refer to planning as the use of a search algorithm, self-reflection as the use of LM-generated feedback, and external memory as storing past text context for future updates of the solution.
+a) Tree-of-Thoughts
+b) Reasoning via Planning
+c) Language Agent Tree Search
+Figure 2:
+An overview of the differences between LATS and recently proposed LM search algorithms ToT
+(Yao et al.,
+2023a
+)
+and RAP
+(Hao et al.,
+2023
+)
+. LATS leverages environmental feedback and self-reflection to further adapt search and improve performance.
+LLMs for reasoning.
+For LLMs, reasoning typically involves decomposing complex inputs into sequential intermediate steps towards a final answer
+(Cobbe et al.,
+2021
+)
+, demonstrated with Chain-of-Thought (CoT) prompting
+(Wei et al.,
+2022
+)
+and its variants
+(Wei et al.,
+2022
+; Kojima et al.,
+2022
+; Wang et al.,
+2022
+)
+. However, these methods, which create chains autoregressively in a single step, often suffer from error propagation as the number of steps increases
+(Guo et al.,
+2018
+; Chen et al.,
+2022b
+)
+due to compound errors. Various advancements aim to mitigate this issue; some approaches, such as Self-Consistency
+(Wang et al.,
+2022
+)
+, employ majority voting over sampled chains, while others focus on multi-step decomposition, such as least-to-most prompting
+(Zhou et al.,
+2022
+)
+, or use of external tools such as a scratchpad
+(Nye et al.,
+2021
+)
+or compiler
+(Gao et al.,
+2022
+)
+. Recently, CoT has been improved with search algorithms
+(Yao et al.,
+2023a
+; Hao et al.,
+2023
+; Besta et al.,
+2023
+)
+that can sample trajectories more effectively. Tree-of-thought (ToT) prompting
+(Yao et al.,
+2023a
+)
+uses DFS or BFS-based search guided by an LM-generated heuristic while Reasoning via Planning (RAP)
+(Hao et al.,
+2023
+)
+uses MCTS with rollouts simulated by the LM. However, they rely solely on LM internal knowledge and cannot adapt to useful external feedback.
+LLMs for acting.
+The strong reasoning and common-sense abilities of LLMs have also been adapted for decision-making or acting tasks as a policy model in interactive environments. In the realm of robotics LLMs have been employed as high-level controllers of control policies
+(Ahn et al.,
+2022
+; Huang et al.,
+2022
+; Driess et al.,
+2023
+)
+. Similar work
+(Baker et al.,
+2022
+; Wang et al.,
+2023
+; Zhu et al.,
+2023
+)
+has also adapted LLM agents to complex multimodal games such as Minecraft
+(Guss et al.,
+2019
+; Fan et al.,
+2022
+)
+. LLMs are particularly useful in text-based environments
+(Liu et al.,
+2018
+; Shridhar et al.,
+2020
+; Liu et al.,
+2023
+)
+, where acting-based prompting techniques such as ReAct
+(Yao et al.,
+2023b
+)
+have seen success. Similar to CoT, ReAct is limited by its simplicity and cannot effectively adapt to environment conditions. Many extensions have been proposed to address this, including Self-refine
+(Madaan et al.,
+2023
+)
+and Reflexion
+(Shinn et al.,
+2023
+; Yao et al.,
+2023c
+)
+, which uses self-reflection to enhance reasoning and decision-making, and AdaPlanner
+(Sun et al.,
+2023
+)
+, which incorporates both positive and negative environmental feedback. However these methods focus on refining an individual plan or trajectory and do not consider alternative choices at each step. In addition, recent work
+(Huang et al.,
+2023
+)
+has suggested LLMs cannot self-correct their internal reasoning, making it critical to use external feedback. Alternatively to pure decision-making environments, the reasoning and practical abilities of LLMs have been enhanced by access to external tools, such as APIs, search engines, calculators, or other models
+(Schick et al.,
+2023
+; Shen et al.,
+2023
+; Surís et al.,
+2023
+)
+. Contrary to reasoning-based approaches, these methods have not been improved with planning, limiting their effectiveness. We summarize them in Tab.
+1
+.
+Tree-based search.
+Tree-based search, where multiple branches of outcomes are explored during search, is widely used in many planning algorithms
+(Świechowski et al.,
+2023
+; LaValle et al.,
+2001
+)
+and Reinforcement Learning (RL)
+(Hafner et al.,
+2019
+; Du et al.,
+2023
+; Wu et al.,
+2023
+)
+algorithms for its good exploration-exploitation trade-off. Though tree-based search requires an environment model that can expand from arbitrary state
+(Vodopivec et al.,
+2017
+)
+, which often requires extra training in RL
+(Hafner et al.,
+2023
+)
+, such a problem does not exist for LM tasks, as we can conveniently back up to any state by setting the input to be the context and the corresponding previous output by the LM. Thus, we work on the tree-based framework and use MCTS
+(Świechowski et al.,
+2023
+)
+to fully release the potential of LMs, while avoiding the cost of training a value function over language descriptions by leveraging the in-context learning
+(Brown et al.,
+2020
+)
+abilities of LLMs.
+3
+Preliminaries
+3.1
+Problem Setting and Prompting
+Before describing LATS, we first define our problem and outline a few established methods that leverage large language models for reasoning or decision-making. In LM reasoning or decision making, we are given an input $x$ in natural language and a pretrained language model $p_{\theta}(x)$ parameterized by $\theta$; our goal is to generate a final output $y\sim p_{\theta}(x)$ that corresponds to the answer (reasoning) or completes the task (decision-making). Both $x$ and $y$ are language sequences, which are comprised of a list of tokens (the basic elements of natural language, often words), denoted as $x=(x[1],\dots,x[n])$ and $y=(y[1],\dots,y[n])$. The LM decodes text autoregressively, i.e., without other inputs, the probability for an LM to generate a sequence $x$ is given by $p_{\theta}(x)=\prod_{i=1}^{n}p_{\theta}(x[i]|x[1\dots i-1])$. Usually, to improve the LM, prompts are provided along with the input $x$, which are specific instructions or few-shot input-output examples. We denote the generic process where an input $x$ is transformed into an output $y$ by LM: $y\sim p_{\theta}(y|\texttt{prompt}_{IO}(x))$, where $\texttt{prompt}_{IO}(x)$ denotes the input $x$.
+Chain-of-thought (CoT) Prompting
+(Wei et al.,
+2022
+)
+was introduced to cater to scenarios where direct mapping from $x$ to $y$ is intricate, such as when $x$ is from a mathematical query or challenging question. This method hinges on creating thoughts $z_{1},\dots,z_{n}$ that act as stepping stones between $x$ and $y$; each thought $z_{i}$ is a language sequence. To employ CoT prompting, thoughts are extracted sequentially as $z_{i}\sim p_{\theta}^{CoT}(z_{i}|x,z_{1\cdots i-1})$, with the final output being $y\sim p_{\theta}^{CoT}(y|x,z_{1\cdots n})$.
+Tree-of-thought (ToT) Prompting
+(Yao et al.,
+2023a
+)
+extends CoT prompting by exploring multiple reasoning paths over thoughts. It frames problems as a search over a tree where each node $s=[x,z_{1\cdots i}]$ represents a partial solution state comprising the original input $x$ and thought sequence $z_{1\cdots i}$. Thoughts $z_{i}$ are generated by proposal or sampling with CoT $z_{i}\sim p_{\theta}^{CoT}(z_{i}\mid x,z_{1\cdots i-1})$. Deliberate search algorithms like breadth-first or depth-first search are used to systematically explore the tree, guided by heuristics based on language model evaluations $V(s)$ of each state.
+Reasoning via Planning
+(RAP)
+(Hao et al.,
+2023
+)
+is similar to ToT, except that MCTS is used over DFS or BFS. Heuristics are designed from an LM, such as the likelihood or confidence of an action, and the LM is used as a world model to predict subsequent states during the simulation step.
+ReAct
+(Yao et al.,
+2023b
+)
+extends language models to tasks where the mapping from $x$ to $y$ is enhanced by or requires interactions with an external environment, such as a game or API. This technique constructs an action space $\hat{A}=A\cup Z$ that adds permissible actions $a$ to the reasoning traces $z$ from CoT. Observations $o$ from the environment are used to improve both reasoning and acting. To solve problems with ReAct, after each observation, actions are generated from $p_{\theta}$ sequentially as $a_{i}\sim p_{\theta}^{ReAct}(a_{i}\mid x,o_{1\cdots i-1},a_{1\cdots i-1})$, with the final output being $y\sim p_{\theta}^{ReAct}(y\mid x,o_{1\cdots n},a_{1\cdots n})$.
+While the previously described prompting techniques improve LM performance on reasoning tasks, they falter on difficult tasks that involve multifaceted decision-making due to several shortcomings: 1)
+Flexibility
+: Base prompting methods (CoT or ReAct) autoregressively sample from the LM, neglecting potential alternative continuations from specific states. 2)
+Sensibility
+: Reasoning-based methods (CoT, RAP, or ToT) rely solely on the internal representations of the LM and cannot consider external observations. This dependency risks fact hallucination and error propagation while setting a performance ceiling. 3)
+Adaptability
+: Current planning frameworks (RAP or ToT) use simple search algorithms such as BFS or cannot leverage environmental feedback to improve planning. Additionally, the agent is static and cannot reuse previous experience or learn from trial and error. While RAP also adopts MCTS, it is constrained to tasks where the LM can become a world model and accurately predict states. These shortcomings limit the ability of LMs to be deployed as general problem-solving agents and form the motivation for LATS.
+3.2
+Monte-Carlo Tree Search (MCTS)
+Monte-Carlo Tree Search (MCTS) is a heuristic search algorithm that has proved successful in many decision-making environments such as Atari
+(Ye et al.,
+2021
+)
+and Go
+(Silver et al.,
+2016
+)
+. MCTS builds a decision tree where every node in the tree is a state and every edge is an action. MCTS runs for $k$ episodes; for each episode, it starts from the root (i.e., initial state) and iteratively conducts two steps to expand the tree: 1) Expansion, where multiple children states $s$ are explored from the current parent state $p$ by sampling $n$ actions, and 2) Selection, where the child with the highest UCT (Upper Confidence bounds applied to Trees) (Kocsis & Szepesvári, 2006) value is selected for the next iteration. The UCT of a child state $s$ is calculated as follows:
+$$UCT(s)=V(s)+w\sqrt{\frac{\ln N(p)}{N(s)}},\tag{1}$$
+where $N(s)$ is the number of visits to a node $s$, $V(s)$ is the value function (expected return) from the subtree of $s$, $w$ is the exploration weight, and $p$ is the parent node of $s$. The child node with the highest UCT value is selected for expansion in the next iteration. When the end of an episode is reached, a backpropagation is carried out: the return $r$ is used for updating every $V(s)$ along the path with the formula $V(s)=\frac{V_{\text{old}}(s)(N(s)-1)+r}{N(s)}$, where $V_{\text{old}}(s)$ is the old value function. Normally, the major shortcoming of MCTS is that it requires an environment model to undo previous steps and form a searching tree, which is often a strong assumption. However, such a limitation does not exist for LMs, as we can conveniently reset to any step by simply copy-pasting historical text input. Such a special property is the key motivation of our work.
+4
+Unifying Planning, Reasoning, and Acting
+4.1
+LM Agent
+LATS supports sequential reasoning or decision-making tasks on the basis of ReAct. At time step $t$, an agent receives an observation $o_{t}\in O$ from the environment and takes an action $a_{t}\in A$ following some policy $\pi(a_{t}\mid x,o_{1\cdots t-1},a_{1\cdots t-1})$, where $x$ consists of the task instruction and a number of few-shot examples. We initialize the agent with $p_{\theta}$ to leverage the useful language representations of an LM as a base decision-maker. We follow the ReAct instantiation in which the action space $\hat{A}=A\cup Z$ consists of both the space of permissible actions $A$ and language space of reasoning traces $Z$. Actions directly affect the environment and result in observation, while thoughts are used to formalize decisions by organizing information, planning future actions, or injecting internal knowledge. The exact instantiation of the action space depends on the particular environment; for decision-making tasks actions might consist of commands on a website while for reasoning tasks the action space might be limited to a few external tools or APIs.
+Instead of greedily decoding one trajectory or solution, we sample $n$ actions from $p_{\theta}$ using the current state. This is based on the intuition that for complex decision-making tasks, there is likely to be a range of potential trajectories or reasoning paths that are correct (Evans, 2010). Sampling a diverse set of candidates at each step mitigates the stochastic nature of LM text generation and enables greater exploration in both the decision-making and reasoning space. We wrap $p_{\theta}$ within our proposed search algorithm to deliberately construct the best trajectory from sampled actions.
+4.2
+LATS
+Figure 3:
+An overview of the six operations of LATS. A node is selected, expanded, evaluated, then simulated until a terminal node is reached, then the resulting value is backpropagated. If the trajectory fails, a reflection is generated and used as additional context for future trials. These operations are performed in succession until the budget is reached or the task is successful.
+The main component of LATS is a search algorithm that controls the overall problem-solving process with deliberate planning. To find the most promising trajectory and systematically balance exploration with exploitation, we adopt a variant of Monte Carlo Tree Search (MCTS) that frames decision-making as a tree search, in which each node $s=[x,a_{1\cdots i},o_{1\cdots i}]$ represents a state comprising the original input $x$, action sequence $a_{1\cdots i}$, and observation sequence $o_{1\cdots i}$.
+To adapt MCTS for language agents, LATS repurposes $p_{\theta}$ as an agent, state evaluator, and feedback generator, leveraging the useful language priors of modern LMs to facilitate planning. While standard MCTS and RAP (Hao et al., 2023) rely on internal dynamics models to facilitate simulation, LATS is model-free and uses environment interaction. LATS consists of a series of operations, selection, expansion, evaluation, simulation, backpropagation, and reflection, performed in succession until the task is successfully completed or a computational limit is reached. The full pseudocode of LATS can be found in Sec. A in the Appendix.
+Selection.
+In the first operation, the algorithm identifies a segment of the current tree most suitable for subsequent expansion. Starting from the root node, denoted as the initial state $s_{0}$, a child node is selected at each tree level until a leaf node is reached. To balance exploration and exploitation, we use the UCT algorithm as shown in Eq. 1.
+Expansion.
+After selecting a node, the second operation expands the tree by sampling $n$ actions from $p_{\theta}$, as described in the prior section. The environment receives each action and returns corresponding feedback as an observation. This results in $n$ new child nodes added to the tree. This tree is stored in an external long-term memory structure.
+Evaluation.
+The third operation assigns a scalar value to each new child node to be used for selection and backpropagation. This value effectively quantifies the agent’s progress in task completion, serving as a heuristic to steer the search algorithm towards the most promising regions of the tree. Following Yao et al. (2023a), we repurpose $p_{\theta}$ into a value function by prompting it to reason about a given state. To obtain a scalar value, we instruct $p_{\theta}$ to end its reasoning trace with a score indicating the correctness of the trajectory. This method offers enhanced flexibility over programmed heuristics (Campbell et al., 2002) and greater efficiency than learned heuristics (Silver et al., 2017).
+Simulation.
+The fourth operation expands the currently selected node until a terminal state is reached. At each depth level we sample and evaluate nodes with the same operations, but prioritize nodes of highest value. Reaching a terminal state provides objective feedback on the correctness of a trajectory. If the task is completed successfully, then LATS terminates the search. If the solution is partially successful or unsuccessful, then we perform two additional operations as described below.
+Backpropagation.
+This operation updates the values of the tree based on the outcome of a trajectory. For each node $s_{0},s_{1},\dots,s_{n}$ in the trajectory from root (initial state $s_{0}$) of the searching tree to leaf (terminal state $s_{n}$), its value is updated to reflect the outcome of the simulation by $N(s_{i})=N_{\text{old}}(s_{i})+1$ and $V(s_{i})=\frac{r+N_{\text{old}}(s_{i})V_{\text{old}}(s_{i})}{N(s_{i})}$, where $r$ is the return and $N_{\text{old}},V_{\text{old}}$ are the old number of visits and value function. These updated values are used in the UCT formula (Eq. 1) to guide the selection of the next node for exploration.
+Reflection.
+In addition to the environmental feedback, we also leverage self-reflection to further refine the decision-making process (Shinn et al., 2023; Madaan et al., 2023). Upon encountering an unsuccessful terminal node, $p_{\theta}$ is prompted with the trajectory and final reward to provide a verbal self-reflection that summarizes the errors in the reasoning or acting process and proposes superior alternatives. We store both failed trajectories and corresponding reflections in the memory. In subsequent iterations, these are integrated as additional context to the agent and value function, refining both through in-context learning. This imparts a semantic gradient signal more useful than a scalar value, enabling the agent to learn from trial and error without the cost of expensive optimization processes such as reinforcement learning.
+Conceptually, LATS has the following advantages as a general framework for reasoning and decision-making with LM agents.
+(1)
+Generality
+: LATS supports both reasoning and decision-making tasks by defining a shared space of thoughts and actions. (2)
+Deliberate
+: The use of MCTS and LM value function ensures a principled search that selects options with high value while exploring promising alternatives. (3)
+Adaptability
+: LATS is designed around the use of external feedback through observations and self-reflection, enabling greater adaptation during problem-solving. (4)
+Flexibility
+: LATS can accommodate different scenarios, environments, and resource stipulations by modifying state design and tree dimensions. (5)
+Modularity
+: The base LM agent, reflection generator, and value function can be independently altered and adapted to individual LM properties.
+5
+Experiments
+To demonstrate the general applicability of LATS, we evaluate our method on a variety of decision-making domains that require both reasoning and acting ability: programming
+(Chen et al.,
+2021
+; Austin et al.,
+2021
+)
+, HotPotQA
+(Yang et al.,
+2018
+)
+, and WebShop
+(Yao et al.,
+2022
+)
+.
+5.1
+HotPotQA
+For a task that can be approached with both reasoning-based and acting-based strategies, we consider HotPotQA
+(Yang et al.,
+2018
+)
+, a multi-hop question-answering benchmark that requires retrieval over two or more Wikipedia passages. For the action space, in addition to LM thoughts we follow the setup from
+Yao et al. (
+2023b
+)
+, which provides the agent with API calls to search and look up information. The output of these API calls and self-generated reflections form the observation space. We use a subset of 100 questions and three few-shot examples for each method. For ToT, we use DFS as the base search algorithm and scoring with the LM as the heuristic. For all methods that involve sampling, including LATS, we sample $k=50$ trajectories. More details and prompts can be found in Sec. D and Sec. E in the Appendix.
+We evaluate internal reasoning strategies by removing actions and observations from the context, corresponding to CoT
+(Wei et al.,
+2022
+)
+and its variants, CoT-SC
+(Wang et al.,
+2022
+)
+, ToT
+(Yao et al.,
+2023a
+)
+, and RAP
+(Hao et al.,
+2023
+)
+. These methods rely solely on the agent’s existing knowledge to answer the question. We also consider acting-based methods ReAct, Reflexion, and LATS, which augment the agent with the interactive API environment and primarily evaluate its information retrieval abilities. While LATS is designed for scenarios where external feedback can enhance reasoning, we also implement a reasoning-only version with CoT as the base prompt. We also combine internal and external reasoning in LATS by first prompting with a CoT-based prompt, then switching to a ReAct-based prompt upon failure. This is closer to how humans might approach this task, by using tools to lookup additional information only when the answer is not already known.
+Prompt Method
+HotpotQA (EM)
+I/O
+0.32
+CoT
+(Wei et al.,
+2022
+)
+0.34
+CoT - SC
+(Wang et al.,
+2022
+)
+0.38
+ToT
+(Yao et al.,
+2023a
+)
+0.55
+RAP
+(Hao et al.,
+2023
+)
+0.60
+RAP (n = 10)
+0.60
+LATS (CoT)
+0.60
+Prompt Method
+HotpotQA (EM)
+ReAct
+(Yao et al.,
+2023b
+)
+0.32
+ReAct (best of k)
+0.38
+Reflexion
+(Shinn et al.,
+2023
+)
+0.51
+LATS
+0.61
+LATS (n = 3)
+0.56
+LATS (n = 10)
+0.64
+LATS (CoT + ReAct)
+0.71
+Table 2:
+GPT-3.5 reasoning-based prompting (left) and acting-based prompting (right) results on HotpotQA. LATS achieves the highest exact match (EM) for acting and is competitive on reasoning. Unless otherwise specified, we sample $n=5$ nodes during expansion and $k=50$ trajectories.
+Results.
+We observe in Tab. 2 that both internal reasoning and external retrieval strategies perform well on HotPotQA. Due to their large-scale training corpus, modern LLMs already encode factual knowledge and can often directly answer the question correctly. While CoT can slightly enhance performance on questions requiring reasoning, larger gains are observed with search methods ToT and RAP, which can sample and explore more outputs. We observe similar results for acting-based methods. LATS surpasses ReAct, even when sampling the same number of trajectories, by expanding more nodes with principled search (see Fig. 5 in Appendix D for a qualitative sample). This is demonstrated when modifying $n$, the number of nodes expanded during each iteration. Increasing $n$ can consistently improve performance, although at greater computational and inference costs. LATS is also competitive with RAP on internal reasoning but performs worse than in the acting setting. Combining internal and external reasoning in LATS results in the highest performance, indicating the importance of external feedback in augmenting reasoning even in tasks the base LM can already perform.
+5.2
+Programming
+Prompt Method
+Model
+Pass@1
+CoT
+(Wei et al.,
+2022
+)
+GPT-3.5
+46.9
+ReAct
+(Yao et al.,
+2023b
+)
+GPT-3.5
+56.9
+Reflexion
+(Shinn et al.,
+2023
+)
+GPT-3.5
+68.1
+ToT
+(Yao et al.,
+2023a
+)
+GPT-3.5
+54.4
+RAP
+(Hao et al.,
+2023
+)
+GPT-3.5
+63.1
+LATS (Ours)
+GPT-3.5
+83.8
+I/O
+GPT-4
+80.1
+Reflexion
+GPT-4
+91.0
+LATS
+GPT-4
+94.4
+Prompt Method
+Pass@1
+CoT
+(Wei et al.,
+2022
+)
+54.9
+ReAct
+(Wei et al.,
+2022
+)
+67.0
+Reflexion
+(Shinn et al.,
+2023
+)
+70.0
+ToT
+(Yao et al.,
+2023a
+)
+65.8
+RAP
+(Hao et al.,
+2023
+)
+71.4
+LATS (Ours)
+81.1
+Table 3:
+GPT-3.5 and GPT-4 Pass@1 accuracy on HumanEval
+(Chen et al.,
+2021
+)
+and MBPP
+(Austin et al.,
+2021
+)
+. Prompting with LATS achieves the highest performance. We sample 5 solutions during expansion for
+8
+iterations.
+To demonstrate the importance of external observations for complex reasoning tasks, we evaluate the baselines and LATS on programming with HumanEval
+(Chen et al.,
+2021
+)
+and MBPP
+(Austin et al.,
+2021
+)
+. Both datasets measure the correctness of synthesized programs in Python from natural language docstrings. We use individual solutions as the action space and test suite and compiler feedback as the external observation. We follow
+Chen et al. (
+2022a
+)
+and use an LLM to generate a synthetic test suite of syntactically valid “assert” statements for each question. For each step, the solution is evaluated on this test suite, and the results including successful and failed tests and compiler output, are added to the context as an observation. We use the same test suite for Reflexion.
+For this task, the reasoning and acting baselines share an action space, but acting methods are able to incorporate observations as additional context. For LATS, since each action corresponds to a complete solution, we skip the simulation step of LATS and directly use the percentage of passed tests as the backpropagated reward. We use $k=8$ iterations, set the number of generated tests at $4$, and sample $n=5$ solutions during expansion. After the search is completed, we select the solution with the highest value and evaluate it on the real test suite for the pass@1 accuracy evaluation. More details and prompts can be found in Sec. D and Sec. F in the Appendix.
+Results.
+We find in Tab. 3 that both search and semantic feedback are crucial for better performance. Despite not using observations, ToT and RAP are competitive with Reflexion. LATS has the highest performance on both datasets. Since RAP uses a search algorithm similar to that of LATS, this reveals the importance of external feedback for difficult reasoning tasks such as programming. With GPT-4, using LATS sets the state of the art for HumanEval, showing LATS can be used with more advanced LLMs for higher performance.
+5.3
+Webshop
+For a complex decision-making environment with practical applications, we consider WebShop
+(Yao et al.,
+2022
+)
+, an online shopping environment composed of a website with 1.18M real-world products and 12k human instructions. Agents must navigate a website through a variety of commands to purchase an item matching a user specification. We use the preconstructed action space of search and click commands and browser feedback and reflections for the observation. The performance is gauged using two metrics: an average score, reflecting the percentage of user-specified attributes met by the selected product, and a success rate, indicating the frequency with which the chosen product fulfills all given conditions. We compare against acting-based prompting methods and RL-based approaches. We evaluate on 50 instructions, expand $n=5$ children for LATS, and set $k=30$ for LATS, ReAct best of $k$, and Reflexion. More details and prompts are in Appendix D and G.
+Results.
+We find in Tab. 4 that GPT-3.5 with ReAct is competitive with imitation learning, and can exceed reinforcement learning techniques with stronger prompting strategies. Sampling $k=30$ trajectories with ReAct and Reflexion results in a similar performance, suggesting the semantic feedback is not as helpful in complex environments like WebShop. Indeed, like in Shinn et al. (2023), we find that generated reflections are often generic and do not provide useful feedback, resulting in a tendency for the agent to become stuck in local minima. However, using LATS indeed results in a noticeable improvement, indicating a more effective exploration for the same number of iterations.
+5.4
+Additional Observations
+Method
+Score
+SR
+ReAct
+(Yao et al.,
+2023b
+)
+53.8
+28.0
+ReAct (best of k)
+59.1
+32.0
+Reflexion
+(Shinn et al.,
+2023
+)
+64.2
+35.0
+LATS
+75.9
+38.0
+IL
+59.9
+29.1
+IL+RL
+62.4
+28.7
+Fine-tuning
+(Furuta et al.,
+2023
+)
+67.5
+45.0
+Expert
+82.1
+59.6
+Table 4:
+Score and success rate (SR) on Webshop. Table is separated into prompting, RL-based training, and human performance. For the same number of iterations, LATS improves both score and success rate, and surpasses RL-based training. IL/IL+RL taken from
+Yao et al. (
+2022
+)
+.
+Prompt Method
+HotPotQA (EM)
+ToT (ReAct)
+0.39
+RAP (ReAct)
+0.54
+LATS (No LM Heuristic)
+0.37
+LATS (DFS)
+0.42
+LATS (No Reflection)
+0.56
+LATS
+0.61
+Table 5:
+Ablation results on LATS and baseline variants in HotPotQA; we use ReAct as the base prompt and sample $n=5$ children and $k=50$ maximum trajectories. LATS requires every component and operation for optimal performance.
+We also conduct additional experiments on HotPotQA to demonstrate the effect of each component of LATS. We also design versions of ToT and RAP that use the ReAct prompt and can handle external observations. We use HotPotQA as our setup incorporates both reasoning (through thoughts) and acting (through API calls); the results are shown in Tab. 5. More ablations for token consumption on HotPotQA are in Tab. 7 in Appendix C. Note that baselines generally perform worse than the reasoning-only setting of HotPotQA, which indicates that the acting-based setting is more challenging and adaptation of search algorithms to decision-making scenarios is non-trivial.
+Self-reflection.
+We use self-reflection to provide additional semantic signals for the agent. We observe a $0.05$ performance drop when it is removed from LATS, suggesting this is useful. This is a smaller gain than Reflexion (Shinn et al., 2023) observes over ReAct (Yao et al., 2023b) as shown in Tab. 2, suggesting overlap between the types of questions where there is an improvement with self-reflection and search. This variant outperforms RAP-ReAct, reflecting our improvements to MCTS.
+Search Algorithm.
+MCTS is a more principled search algorithm than variants like A* or DFS and is the basis for observed performance gains. We observe the effects of using DFS, and incorporate the LM-based heuristic used in ToT (Yao et al., 2023a) in which branches with low values are pruned. This removes the selection and backpropagation operations, and we observe a $0.08$ drop in performance when sampling the same number of nodes, although this variant still outperforms ToT-ReAct.
+6
+Conclusion
+In this work, we introduce Language Agent Tree Search (LATS), the first framework to unify planning, acting, and reasoning for enhanced LLM problem solving. By deliberately constructing trajectories with search algorithms, incorporating external feedback, and enabling agents to learn from experience, LATS addresses key limitations of prior prompting techniques. Our evaluations demonstrate the ability of LATS to harness LLM capabilities for a variety of decision-making tasks while keeping its reasoning ability without additional training. The proposed synergies between search, interaction, and reflection offer a versatile approach to autonomous decision-making, highlighting the potential of LLMs as generalist agents. A full discussion of the limitations and broader impacts is in Appendix
+B
+.
+References
+Ahn et al. (2022)
+Michael Ahn, Anthony Brohan, Noah Brown, Yevgen Chebotar, Omar Cortes, Byron David, Chelsea Finn, Chuyuan Fu, Keerthana Gopalakrishnan, Karol Hausman, Alex Herzog, Daniel Ho, Jasmine Hsu, Julian Ibarz, Brian Ichter, Alex Irpan, Eric Jang, Rosario Jauregui Ruano, Kyle Jeffrey, Sally Jesmonth, Nikhil J Joshi, Ryan Julian, Dmitry Kalashnikov, Yuheng Kuang, Kuang-Huei Lee, Sergey Levine, Yao Lu, Linda Luu, Carolina Parada, Peter Pastor, Jornell Quiambao, Kanishka Rao, Jarek Rettinghouse, Diego Reyes, Pierre Sermanet, Nicolas Sievers, Clayton Tan, Alexander Toshev, Vincent Vanhoucke, Fei Xia, Ted Xiao, Peng Xu, Sichun Xu, Mengyuan Yan, and Andy Zeng.
+Do as i can, not as i say: Grounding language in robotic affordances.
+arXiv:2204.01691
+, 2022.
+Anthony et al. (2017)
+T. Anthony, Z. Tian, and D. Barber.
+Thinking fast and slow with deep learning and tree search.
+In
+NIPS
+, 2017.
+Austin et al. (2021)
+Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, et al.
+Program synthesis with large language models.
+arXiv:2108.07732
+, 2021.
+Baker et al. (2022)
+Bowen Baker, Ilge Akkaya, Peter Zhokhov, Joost Huizinga, Jie Tang, Adrien Ecoffet, Brandon Houghton, Raul Sampedro, and Jeff Clune.
+Video pretraining (vpt): Learning to act by watching unlabeled online videos.
+arXiv:2206.11795
+, 2022.
+Besta et al. (2023)
+Maciej Besta, Nils Blach, Ales Kubicek, Robert Gerstenberger, Lukas Gianinazzi, Joanna Gajda, Tomasz Lehmann, Michal Podstawski, Hubert Niewiadomski, Piotr Nyczyk, and Torsten Hoefler.
+Graph of thoughts: Solving elaborate problems with large language models.
+arXiv:2308.09687
+, 2023.
+Bowman et al. (2015)
+Samuel R Bowman, Gabor Angeli, Christopher Potts, and Christopher D Manning.
+A large annotated corpus for learning natural language inference.
+In
+EMNLP
+, 2015.
+Brown et al. (2020)
+Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei.
+Language models are few-shot learners.
+In
+NeurIPS
+, 2020.
+Campbell et al. (2002)
+Murray Campbell, A Joseph Hoane Jr, and Feng-hsiung Hsu.
+Deep blue.
+Artificial intelligence
+, 2002.
+Chen et al. (2022a)
+Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen.
+Codet: Code generation with generated tests.
+arXiv:2207.10397
+, 2022a.
+Chen et al. (2021)
+Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.
+Evaluating large language models trained on code.
+arXiv:2107.03374
+, 2021.
+Chen et al. (2022b)
+Wenhu Chen, Xueguang Ma, Xinyi Wang, and William W Cohen.
+Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks.
+arXiv preprint arXiv:2211.12588
+, 2022b.
+Chowdhery et al. (2022)
+Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al.
+Palm: Scaling language modeling with pathways.
+arXiv:2204.02311
+, 2022.
+Cobbe et al. (2021)
+Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al.
+Training verifiers to solve math word problems.
+arXiv:2110.14168
+, 2021.
+Deng et al. (2023)
+Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Samuel Stevens, Boshi Wang, Huan Sun, and Yu Su.
+Mind2web: Towards a generalist agent for the web.
+arXiv:2306.06070
+, 2023.
+Driess et al. (2023)
+Danny Driess, Fei Xia, Mehdi S. M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Pete Florence.
+Palm-e: An embodied multimodal language model.
+arXiv:2303.03378
+, 2023.
+Du et al. (2023)
+Yilun Du, Mengjiao Yang, Bo Dai, Hanjun Dai, Ofir Nachum, Joshua B. Tenenbaum, Dale Schuurmans, and Pieter Abbeel.
+Learning universal policies via text-guided video generation.
+arXiv:2302.00111
+, 2023.
+Evans (2010)
+Jonathan St BT Evans.
+Intuition and reasoning: A dual-process perspective.
+Psychological Inquiry
+, 2010.
+Fan et al. (2022)
+Linxi Fan, Guanzhi Wang, Yunfan Jiang, Ajay Mandlekar, Yuncong Yang, Haoyi Zhu, Andrew Tang, De-An Huang, Yuke Zhu, and Anima Anandkumar.
+Minedojo: Building open-ended embodied agents with internet-scale knowledge.
+In
+NeurIPS Datasets and Benchmarks Track
+, 2022.
+Furuta et al. (2023)
+Hiroki Furuta, Ofir Nachum, Kuang-Huei Lee, Yutaka Matsuo, Shixiang Shane Gu, and Izzeddin Gur.
+Multimodal web navigation with instruction-finetuned foundation models.
+arXiv preprint arXiv:2305.11854
+, 2023.
+Gao et al. (2022)
+Luyu Gao, Aman Madaan, Shuyan Zhou, Uri Alon, Pengfei Liu, Yiming Yang, Jamie Callan, and Graham Neubig.
+Pal: Program-aided language models.
+arXiv preprint arXiv:2211.10435
+, 2022.
+Guo et al. (2018)
+Jiaxian Guo, Sidi Lu, Han Cai, Weinan Zhang, Yong Yu, and Jun Wang.
+Long text generation via adversarial training with leaked information.
+AAAI
+, 2018.
+Guss et al. (2019)
+William H. Guss, Brandon Houghton, Nicholay Topin, Phillip Wang, Cayden Codel, Manuela Veloso, and Ruslan Salakhutdinov.
+Minerl: A large-scale dataset of minecraft demonstrations.
+In
+IJCAI
+, 2019.
+Hafner et al. (2019)
+Danijar Hafner, Timothy Lillicrap, Ian Fischer, Ruben Villegas, David Ha, Honglak Lee, and James Davidson.
+Learning latent dynamics for planning from pixels.
+In
+ICML
+, 2019.
+Hafner et al. (2023)
+Danijar Hafner, Jurgis Pasukonis, Jimmy Ba, and Timothy Lillicrap.
+Mastering diverse domains through world models.
+arXiv:2301.04104
+, 2023.
+Hao et al. (2023)
+Shibo Hao, Yi Gu, Haodi Ma, Joshua Jiahua Hong, Zhen Wang, Daisy Zhe Wang, and Zhiting Hu.
+Reasoning with language model is planning with world model.
+arXiv:2305.14992
+, 2023.
+Huang et al. (2023)
+Jie Huang, Xinyun Chen, Swaroop Mishra, Huaixiu Steven Zheng, Adams Wei Yu, Xinying Song, and Denny Zhou.
+Large language models cannot self-correct reasoning yet.
+arXiv:2310.01798
+, 2023.
+Huang et al. (2022)
+Wenlong Huang, Fei Xia, Ted Xiao, Harris Chan, Jacky Liang, Pete Florence, Andy Zeng, Jonathan Tompson, Igor Mordatch, Yevgen Chebotar, et al.
+Inner monologue: Embodied reasoning through planning with language models.
+arXiv:2207.05608
+, 2022.
+Jiang et al. (2018)
+D. Jiang, E. Ekwedike, and H. Liu.
+Feedback-based tree search for reinforcement learning.
+In
+ICML
+, 2018.
+Kocsis & Szepesvári (2006)
+Levente Kocsis and Csaba Szepesvári.
+Bandit based monte-carlo planning.
+In
+ECML
+, 2006.
+Kojima et al. (2022)
+Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa.
+Large language models are zero-shot reasoners.
+arXiv:2205.11916
+, 2022.
+LaValle et al. (2001)
+Steven M LaValle, James J Kuffner, BR Donald, et al.
+Rapidly-exploring random trees: Progress and prospects.
+Algorithmic and computational robotics: new directions
+, 2001.
+Liu et al. (2018)
+Evan Zheran Liu, Kelvin Guu, Panupong Pasupat, Tianlin Shi, and Percy Liang.
+Reinforcement learning on web interfaces using workflow-guided exploration.
+In
+ICLR
+, 2018.
+Liu et al. (2023)
+Xiao Liu, Hao Yu, Hanchen Zhang, Yifan Xu, Xuanyu Lei, Hanyu Lai, Yu Gu, Hangliang Ding, Kaiwen Men, Kejuan Yang, Shudan Zhang, Xiang Deng, Aohan Zeng, Zhengxiao Du, Chenhui Zhang, Sheng Shen, Tianjun Zhang, Yu Su, Huan Sun, Minlie Huang, Yuxiao Dong, and Jie Tang.
+Agentbench: Evaluating llms as agents.
+arXiv:2308.03688
+, 2023.
+Madaan et al. (2023)
+Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, Shashank Gupta, Bodhisattwa Prasad Majumder, Katherine Hermann, Sean Welleck, Amir Yazdanbakhsh, and Peter Clark.
+Self-refine: Iterative refinement with self-feedback.
+arXiv:2303.17651
+, 2023.
+Nallapati et al. (2016)
+Ramesh Nallapati, Bowen Zhou, Cicero dos Santos, Caglar Gulcehre, and Bing Xiang.
+Abstractive text summarization using sequence-to-sequence rnns and beyond.
+In
+SIGNLL
+, 2016.
+Nye et al. (2021)
+Maxwell Nye, Anders Johan Andreassen, Guy Gur-Ari, Henryk Michalewski, Jacob Austin, David Bieber, David Dohan, Aitor Lewkowycz, Maarten Bosma, David Luan, et al.
+Show your work: Scratchpads for intermediate computation with language models.
+arXiv:2112.00114
+, 2021.
+OpenAI (2023)
+OpenAI.
+Gpt-4 technical report.
+arXiv:2303.08774
+, 2023.
+Saparov & He (2022)
+Abulhair Saparov and He He.
+Language models are greedy reasoners: A systematic formal analysis of chain-of-thought.
+arXiv:2210.01240
+, 2022.
+Schick et al. (2023)
+Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta Raileanu, Maria Lomeli, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom.
+Toolformer: Language models can teach themselves to use tools.
+arXiv:2302.04761
+, 2023.
+Shen et al. (2023)
+Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, and Yueting Zhuang.
+Hugginggpt: Solving ai tasks with chatgpt and its friends in huggingface.
+arXiv:2303.17580
+, 2023.
+Shinn et al. (2023)
+Noah Shinn, Federico Cassano, Beck Labash, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao.
+Reflexion: Language agents with verbal reinforcement learning.
+arXiv:2303.11366
+, 2023.
+Shridhar et al. (2020)
+Mohit Shridhar, Xingdi Yuan, Marc-Alexandre Côté, Yonatan Bisk, Adam Trischler, and Matthew Hausknecht.
+Alfworld: Aligning text and embodied environments for interactive learning.
+arXiv:2010.03768
+, 2020.
+Silver et al. (2016)
+David Silver, Aja Huang, Chris J Maddison, Arthur Guez, Laurent Sifre, George Van Den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al.
+Mastering the game of go with deep neural networks and tree search.
+Nature
+, 2016.
+Silver et al. (2017)
+David Silver, Julian Schrittwieser, Karen Simonyan, Ioannis Antonoglou, Aja Huang, Arthur Guez, Thomas Hubert, Lucas Baker, Matthew Lai, Adrian Bolton, Yutian Chen, Timothy P. Lillicrap, Fan Hui, L. Sifre, George van den Driessche, Thore Graepel, and Demis Hassabis.
+Mastering the game of go without human knowledge.
+Nature
+, 2017.
+Sloman (1996)
+Steven A. Sloman.
+The empirical case for two systems of reasoning.
+Psychological Bulletin
+, 1996.
+Sun et al. (2023)
+Haotian Sun, Yuchen Zhuang, Lingkai Kong, Bo Dai, and Chao Zhang.
+Adaplanner: Adaptive planning from feedback with language models.
+arXiv:2305.16653
+, 2023.
+Surís et al. (2023)
+Dídac Surís, Sachit Menon, and Carl Vondrick.
+Vipergpt: Visual inference via python execution for reasoning.
+arXiv preprint arXiv:2303.08128
+, 2023.
+Świechowski et al. (2023)
+Maciej Świechowski, Konrad Godlewski, Bartosz Sawicki, and Jacek Mańdziuk.
+Monte carlo tree search: A review of recent modifications and applications.
+Artificial Intelligence Review
+, 2023.
+Touvron et al. (2023)
+Hugo Touvron, Louis Martin, Kevin R. Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Daniel M. Bikel, Lukas Blecher, Cristian Cantón Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony S. Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel M. Kloumann, A. V. Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, R. Subramanian, Xia Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zhengxu Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and
+Thomas Scialom.
+Llama 2: Open foundation and fine-tuned chat models.
+arXiv:2307.09288
+, 2023.
+Vodopivec et al. (2017)
+Tom Vodopivec, Spyridon Samothrakis, and Branko Ster.
+On monte carlo tree search and reinforcement learning.
+Journal of Artificial Intelligence Research
+, 2017.
+Wang et al. (2023)
+Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, and Anima Anandkumar.
+Voyager: An open-ended embodied agent with large language models.
+arXiv:2305.16291
+, 2023.
+Wang et al. (2022)
+Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, and Denny Zhou.
+Self-consistency improves chain of thought reasoning in language models.
+arXiv:2203.11171
+, 2022.
+Wei et al. (2022)
+Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Chi, Quoc Le, and Denny Zhou.
+Chain of thought prompting elicits reasoning in large language models.
+arXiv:2201.11903
+, 2022.
+Wooldridge & Jennings (1995)
+Michael Wooldridge and Nicholas R Jennings.
+Intelligent agents: Theory and practice.
+The knowledge engineering review
+, 1995.
+Wu et al. (2023)
+Philipp Wu, Alejandro Escontrela, Danijar Hafner, Pieter Abbeel, and Ken Goldberg.
+Daydreamer: World models for physical robot learning.
+In
+CoRL
+. PMLR, 2023.
+Xie et al. (2023)
+Yuxi Xie, Kenji Kawaguchi, Yiran Zhao, Xu Zhao, Min-Yen Kan, Junxian He, and Qizhe Xie.
+Decomposition enhances reasoning via self-evaluation guided decoding.
+arXiv:2305.00633
+, 2023.
+Yang et al. (2018)
+Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning.
+Hotpotqa: A dataset for diverse, explainable multi-hop question answering.
+arXiv:1809.09600
+, 2018.
+Yao et al. (2022)
+Shunyu Yao, Howard Chen, John Yang, and Karthik R Narasimhan.
+Webshop: Towards scalable real-world web interaction with grounded language agents.
+In
+NeurIPS
+, 2022.
+Yao et al. (2023a)
+Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Thomas L. Griffiths, Yuan Cao, and Karthik Narasimhan.
+Tree of thoughts: Deliberate problem solving with large language models.
+arXiv:2305.10601
+, 2023a.
+Yao et al. (2023b)
+Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao.
+ReAct: Synergizing reasoning and acting in language models.
+In
+ICLR
+, 2023b.
+Yao et al. (2023c)
+Weiran Yao, Shelby Heinecke, Juan Carlos Niebles, Zhiwei Liu, Yihao Feng, Le Xue, Rithesh Murthy, Zeyuan Chen, Jianguo Zhang, Devansh Arpit, Ran Xu, Phil Mui, Huan Wang, Caiming Xiong, and Silvio Savarese.
+Retroformer: Retrospective large language agents with policy gradient optimization.
+arXiv preprint arXiv:2308.02151
+, 2023c.
+Ye et al. (2021)
+Weirui Ye, Shaohuai Liu, Thanard Kurutach, Pieter Abbeel, and Yang Gao.
+Mastering atari games with limited data.
+In
+NeurIPS
+, 2021.
+Zhou et al. (2022)
+Denny Zhou, Nathanael Schärli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Olivier Bousquet, Quoc Le, and Ed Chi.
+Least-to-most prompting enables complex reasoning in large language models.
+arXiv:2205.10625
+, 2022.
+Zhu et al. (2023)
+Xizhou Zhu, Yuntao Chen, Hao Tian, Chenxin Tao, Weijie Su, Chenyu Yang, Gao Huang, Bin Li, Lewei Lu, Xiaogang Wang, Yu Qiao, Zhaoxiang Zhang, and Jifeng Dai.
+Ghost in the minecraft: Generally capable agents for open-world environments via large language models with text-based knowledge and memory.
+arXiv:2305.17144
+, 2023.
+7
+Appendix
+The appendix is organized as follows. First in Sec.
+A
+, we show the pseudocode of our proposed algorithm, LATS; then in Sec.
+B
+, we provide further discussion of our method and its limitations, future directions, and broader impact; then in Sec.
+C
+we provide additional experimental results; then in Sec.
+D
+, we specify the environment details in our experiments; finally, we list our prompts used for the three environments in Sec.
+E
+(HotPotQA), Sec.
+F
+(Programming) and Sec.
+G
+(Webshop) respectively.
+Appendix A
+LATS Pseudocode
+Alg.
+1
+shows the pseudocode of our algorithm LATS. Nodes are stored explicitly in the memory. Unless otherwise specified, in all experiments we use
+$n=5$
+and
+$w=1$
+.
+Algorithm 1
+$\operatorname{LATS}(S_{0},p_{\theta},{p_{V}},p_{\text{ref}},d,k,n,w)$
+Initial state $s_{1}$, action generator $p_{\theta}$, value function $p_{V}$, reflection generator $p_{\text{ref}}$, number of generated actions $n$, depth limit $L$, number of roll-outs $K$, context $c$, and exploration weight $w$
+Initialize action space $A$, observation space $O$
+Initialize the state-action value function ${p_{V}}:S\times A\mapsto\mathbb{R}$ and visit counter ${N}:S\mapsto\mathbb{N}$ to zero
+for $k\leftarrow 0,\dots,K-1$ do
+for $t\leftarrow 0,\dots,L-1$ do
+if $s_{t}$ not terminal then $\triangleright$ Expansion & Simulation
+for $i\leftarrow 1,\dots,n$ do
+Sample $a_{t}^{(i)}\sim p_{\theta}(a\mid s_{t})$
+Get $o_{t}^{(i)}$ from environment, $s_{t+1}^{(i)}\leftarrow(c_{t}^{(i)},o_{t}^{(i)},a_{t}^{(i)})$, $c_{t+1}^{(i)}\leftarrow(o_{t}^{(i)},a_{t}^{(i)})$
+Evaluate ${V}_{t}^{(i)}\sim{p_{V}}(s_{t}^{(i)})$ $\triangleright$ Evaluation
+${V}(s_{t})\leftarrow{V}_{t}^{(i)}$
+Add $s_{t}^{(i)}$ to children
+end for
+end if
+if $s_{t}$ is terminal then $\triangleright$ Reflection
+Get $r$ from environment
+if $r$ not success then
+$\text{reflection}\leftarrow p_{\text{ref}}(c_{t})$
+$c\leftarrow\text{reflection}$
+end if
+end if
+$a_{t}\leftarrow\arg\max_{a\in e(s_{t})}\left[{V(s_{t})}+w\sqrt{\frac{\ln{N}(s_{t-1})}{{N}(s_{t})}}\right]$ $\triangleright$ Selection
+${N}(s_{t+1})\leftarrow{N}(s_{t+1})+1$
+if $a_{t}$ is an output action then break
+end for
+$T\leftarrow$ the actual number of steps
+for $t\leftarrow T-1,\dots,0$ do $\triangleright$ Backpropagation
+$V(s_{t})\leftarrow\frac{V(s_{t})(N(s_{t})-1)+r}{N(s_{t})}$
+end for
+end for
+Appendix B
+Discussion
+Limitations.
+Although LATS can improve reasoning and decision-making, this arrives at a higher computational cost relative to simpler prompting methods like ReAct or Reflexion. The search process takes more time than standard prompting or simpler techniques, and requires greater inference costs. While such an issue is mitigated by the fact that the number of nodes
+$n$
+expanded at every step provides a natural trade-off between performance and efficiency (setting
+$n=1$
+makes the method as efficient as ReAct with multiple trials or CoT-SC), in practice we recommend using LATS for difficult tasks like programming or for situations where performance is prioritized over efficiency. We hope that continued advancements in LLMs will reduce costs and increase the practicality of LATS.
+Additionally, the benchmarks we use in this paper are relatively simple and focused on decision-making, compared to the complexity of real-world interactive environments. In addition, some environments might not easily support rollbacks to previous states. However, the design of LATS is flexible and can be adjusted to various resource constraints. Using planning-based prompting methods like LATS in environments like Minecraft
+(Fan et al.,
+2022
+)
+and more reasoning benchmarks would be interesting avenues for future work.
+Broader impact.
+LATS is a framework that enhances LLM performance through interactions with an environment. This improvement in autonomous decision-making may facilitate harmful uses of LLMs. Alternatively, LATS enhances interpretability and the potential for greater alignment, as it generates understandable, high-level linguistic reasoning and actions through several rounds of decision-making and reflection, rather than relying on implicit, low-level token values.
+Appendix C
+Ablations
+Prompt Method
+HotpotQA (EM)
+LATS (w=0.5)
+0.55
+LATS (w=2.0)
+0.61
+LATS (d=4)
+0.58
+LATS (CoT)
+0.60
+LATS (No LM Heuristic)
+0.37
+LATS
+0.61
+Table 6:
+Ablation results on LATS and baseline variants in HotPotQA measured by Exact Match (EM). We test different depth
+$d$
+, exploration factor
+$w$
+, and versions of LATS using CoT and without the LM value function. We sample
+$n=5$
+and
+$k=50$
+trajectories.
+Figure 4:
+Performance over successive iterations on HumanEval with GPT-3.5.
+In this section, we ablate various designs of LATS. Experiments are conducted on HotPotQA with a maximum of
+$k=50$
+trajectories and sampling size of
+$n=5$
+and HumanEval with a maximum of
+$k=8$
+trajectories and sampling size of
+$n=5$
+. The result for HotPotQA is shown in Tab.
+5
+and HumanEval in Fig.
+4
+.
+Exploration weight.
+We find that there is lower performance on HotPotQA when the exploration weight
+$w$
+in the selection formula is decreased to
+$0.5$
+, suggesting that this reduces the effectiveness of the search. Increasing
+$w$
+to
+$2.0$
+does not lead to a performance improvement, but we tend to observe faster convergence. The optimal setting depends on the particular environment and complexity of the state space.
+Depth.
+In our main experiments we use a maximum depth of
+$d=7$
+on HotPotQA for all methods, following previous work
+(Yao et al.,
+2023b
+)
+. We ablate the effect on LATS after reducing it to
+$d=4$
+. This results in only a slight drop in performance. We find that most questions can be answered within four steps, and using a greater number of steps tends to force the agent into local minima and rarely improves success.
+LM value function.
+The LM value function scores states based on expected future reward. Without this heuristic, the only signal to guide search would be from environment rewards for completed trajectories, which are scarce and often binary. When we remove the evaluation operation, we observe a dramatic
+$0.24$
+drop in performance.
+Performance over time.
+To see the effects of increasing the number of trajectories sampled, we change
+$k$
+to different values. We conduct this experiment on HumanEval, which has a more noticeable difference due to sampling fewer trajectories. The results are shown in Fig.
+4
+, in which LATS scales better with more iterations than Reflexion.
+Sample complexity and Token cost.
+One possible concern of LATS is that the tree-structured search might consume many more tokens than existing methods. To further study the computational cost of LATS compared to prior methods, we examine the sample complexity (i.e. asymptotic token cost) of all methods considered in this paper, and count the average number of nodes expanded by our method and other tree-structured methods (ToT and RAP) upon successful search on HotPotQA. We present the results in Tab.
+7
+; the result shows that our method has the same sample complexity as other tree-based search methods, and expands fewer nodes on average upon success, which indicates a lower token cost. The token cost gap will be even larger when taking failed trajectories into account, since our method has a higher success rate and reaches the computational budget limit less often.
+Method
+Performance ($\uparrow$)
+Sample complexity ($\downarrow$)
+Avg. #nodes upon success ($\downarrow$)
+ReAct (Best $k=250$)
+$0.42$
+$O(k)$
+N/A
+CoT-SC ($n=1,k=250$)
+$0.40$
+$O(k)$
+N/A
+LATS ($n=1,k=50$)
+$0.48$
+$O(k)$
+N/A
+ToT (ReAct)
+$0.49$
+$O(kn)$
+$84.05$
+RAP (ReAct)
+$0.54$
+$O(kn)$
+$70.60$
+LATS ($n=5,k=50$)
+$0.61$
+$O(kn)$
+$66.65$
+Table 7:
+The performance, sample complexity of different methods and average number of nodes expanded upon success by methods with tree-based search.
+$n$
+is the number of children nodes expanded at every step and
+$k$
+is the number of trajectories. Our method has the same sample complexity as other methods with tree-based search and expands fewer nodes upon success, which indicates lower token cost.
+Appendix D
+Environment Details
+D.1
+HotPotQA
+Figure 5:
+Example trajectories on HotPotQA for ReAct (left) and LATS (right). LATS can sample more actions and avoid failure from previous mistakes by evaluating states with an LM to guide the search toward promising areas of the tree.
+HotPotQA
+(Yang et al.,
+2018
+)
+is a question-answering dataset that requires reasoning over multiple supporting documents to answer questions. It contains 113k Wikipedia-based question-answer pairs crafted by crowdworkers to be diverse, multi-hop, and explainable. Questions cover a range of types like entities, locations, dates, and comparison of shared properties between two entities. Crowdworkers also provide supporting facts from the documents that justify the answer. We use the HotPotQA benchmark setting with all the Wikipedia paragraphs to test retrieval. We use a randomly selected subset of 100 questions for our experiments and a maximum depth limit of 6. Fig.
+5
+illustrates how ReAct and LATS work on an example task of HotPotQA, and gives a qualitative example on how LATS outperforms ReAct on the task.
+Action Space.
+We adopt the Wikipedia web API proposed in
+Yao et al. (
+2023b
+)
+, with three types of actions to support interactive information retrieval:
+(1) search[entity], which returns the first 5 sentences from the corresponding entity wiki page if it exists, or else suggests top-5 similar entities from the Wikipedia search engine,
+(2) lookup[string], which returns the next sentence in the page containing string,
+(3) finish[answer], which finishes the current task with answer.
+These API calls and free-form thoughts form the action space for this environment.
+D.2
+Programming
+The HumanEval dataset
+(Chen et al.,
+2021
+)
+is a collection of 164 handwritten programming problems introduced to evaluate the functional correctness of models for synthesizing programs from natural language descriptions. Each problem includes a function signature, docstring description, reference implementation, and multiple unit tests, with an average of 7.7 tests per problem. The programming tasks assess comprehension of natural language, reasoning, algorithms, and basic mathematics, at a difficulty level comparable to simple software interview questions. Pass rates are evaluated with the pass@k metric, where k samples are generated per problem and a problem is considered solved if any sample passes all tests. We use all 164 problems for our experiments and a maximum depth limit of 8.
+The Mostly Basic Programming Problems (MBPP)
+Austin et al. (
+2021
+)
+benchmark contains 974 short Python functions designed to evaluate program synthesis techniques. The dataset was constructed by crowdsourcing from workers with basic Python knowledge. Each data point consists of a natural language description of a programming task, a reference solution implementation, and three test cases for functional correctness. The natural language prompts are typically short, one-sentence descriptions. Solutions cover common programming constructs including mathematical operations, list processing, string manipulation, and usage of the Python standard library. On average, solutions are 6.8 lines of code. The dataset is also supplemented with an additional set of 426 problems that were manually verified for unambiguous specifications, standard function signatures, and accurate test cases. We use a randomly selected subset of 397 problems for our experiments.
+D.3
+WebShop
+WebShop
+(Yao et al.,
+2022
+)
+is an interactive web-based environment designed to evaluate agents on grounded language understanding and decision-making. It simulates an e-commerce shopping task by providing agents with over 1 million real-world products scraped from Amazon, spanning 5 categories and 113 subcategories. These products contain rich linguistic information, with an average text length of 262 words and a vocabulary size of 224k. In addition, there are over 800k unique product options available for customization. The environment renders webpages in two modes: HTML mode provides pixel-level observations with interactive elements, while simple mode converts the raw HTML into a structured text observation more amenable for training agents. The action space consists of query searches and button clicks, which transition between 4 page types: search, results, item and item-detail. Instructions are crowdsourced natural language specifying product attributes and options, with a total of 12k collected. Automatic rewards are computed by comparing the product purchased by the agent against the attributes and options specified in the instruction, using both lexical matching and semantic similarity metrics.
+Type
+Argument
+State $\rightarrow$ Next State
+search
+[Query]
+Search $\rightarrow$ Results
+choose
+Back to search
+$*$ $\rightarrow$ Search
+choose
+Prev/Next page
+Results $\rightarrow$ Results
+choose
+[Product title]
+Results $\rightarrow$ Item
+choose
+[Option]
+Item $\rightarrow$ Item
+choose
+Desc/Overview
+Item $\rightarrow$ Item-Detail
+choose
+Previous
+Item-Detail $\rightarrow$ Item
+choose
+Buy
+Item $\rightarrow$ Episode End
+Table 8:
+Action space of webshop.
+There are two evaluation metrics used in WebShop: (1)
+Task Score
+: defined as
+$(100\times\text{avg. reward})$
+, which captures the average reward obtained across episodes; and (2)
+Success Rate (SR)
+defined as the portion of instructions where
+$r=1$
+. The reward is calculated based on the number of attributes satisfied by the selected item. We use 50 environments for our experiments and a maximum depth limit of 15.
+Appendix E
+HotPotQA Prompts
+E.1
+Base Acting Prompt
+Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
+(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
+(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
+(3) Finish[answer], which returns the answer and finishes the task.
+After each observation, provide the next Thought and next Action.
+Here are some examples:
+Question: Which magazine was started first Arthur’s Magazine or First for Women?
+Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first.
+Action 1: Search[Arthur’s Magazine]
+Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3]
+Thought 2: Arthur’s Magazine was started in 1844. I need to search First for Women next.
+Action 2: Search[First for Women]
+Observation 2: First for Women is a woman’s magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. It is based in Englewood Cliffs, New Jersey.[2] In 2011 the circulation of the magazine was 1,310,696 copies.[3]
+Thought 3: First for Women was started in 1989. 1844 (Arthur’s Magazine) < 1989 (First for Women), so Arthur’s Magazine was started first.
+Action 3: Finish[Arthur’s Magazine]
+(examples)
+You have attempted to answer the following question before and failed. The following reflection(s) give a plan to avoid failing to answer the question in the same way you did previously. Use them to improve your strategy of correctly answering the given question.
+(trajectories)
+(input)
+E.2
+Base Reasoning Prompt
+Solve a question answering task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task. You will be given context that you should use to help you answer the question. Start your response with either Action or an indexed Thought
+Here are some examples:
+Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
+Let’s think step by step.
+Thought 1: The eastern sector of Colorado orogeny extends into the High Plains.
+Thought 2: High Plains rise in elevation from around 1,800 to 7,000 ft
+Thought 3: The answer is 1,800 to 7,000 ft.
+Action: Finish[1,800 to 7,000 ft]
+(examples)
+Previous trial:
+(trajectories)
+(input)
+E.3
+Value Function Prompt
+Analyze the trajectories of a solution to a question answering task. The trajectories are labeled by environmental observations about the situation, thoughts that can reason about the current situation and actions that can be three types:
+(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
+(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
+(3) Finish[answer], which returns the answer and finishes the task.
+Given a question and a trajectory, evaluate its correctness and provide your reasoning and analysis in detail. Focus on the latest thought, action, and observation. Incomplete trajectories can be correct if the thoughts and actions so far are correct, even if the answer is not found yet. Do not generate additional thoughts or actions. Then at the last line conclude “Thus the correctness score is s”, where s is an integer from 1 to 10.
+Question: Which magazine was started first Arthur’s Magazine or First for Women?
+Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first.
+Action 1: Search[Arthur’s Magazine]
+Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3]
+This trajectory is correct as it is reasonable to search for the first magazine provided in the question. It is also better to have simple searches corresponding to a single entity, making this the best action.
+Thus the correctness score is 10
+(other examples)
+(failed trajectories)
+(context)
+E.4
+Reflection Prompt
+Analyze the trajectories of a solution to a question answering task. The trajectories are labeled by environmental observations about the situation, thoughts that can reason about the current situation and actions that can be three types:
+(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
+(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
+(3) Finish[answer], which returns the answer and finishes the task.
+Given a question and a trajectory, evaluate its correctness and provide your reasoning and analysis in detail. Focus on the latest thought, action, and observation. Incomplete trajectories can be correct if the thoughts and actions so far are correct, even if the answer is not found yet. Do not generate additional thoughts or actions. Then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10.
+Question: Which magazine was started first Arthur’s Magazine or First for Women?
+Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first.
+Action 1: Search[Arthur’s Magazine]
+Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3]
+This trajectory is correct as it is reasonable to search for the first magazine provided in the question. It is also better to have simple searches corresponding to a single entity, making this the best action.
+Thus the correctness score is 10
+(other examples)
+(failed trajectories)
+(context)
+Appendix F
+Programming Prompts
+F.1
+HumanEval function implementation example
+Sample function signature:
+⬇
+def
+minSubArraySum
+(
+nums
+):
+Given
+an
+array
+of
+integers
+nums
+,
+find
+the
+minimum
+sum
+of
+any
+non
+-
+empty
+sub
+-
+array
+of
+nums
+.
+Example
+minSubArraySum
+([2,
+3,
+4,
+1,
+2,
+4])
+==
+1
+minSubArraySum
+([-1,
+-2,
+-3])
+==
+-6
+Sample function body implementation:
+⬇
+min_sum
+=
+float
+(’
+inf
+’)
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+current_sum
+=
+0
+for
+j
+in
+range
+(
+i
+,
+len
+(
+nums
+)):
+current_sum
++=
+nums
+[
+j
+]
+if
+current_sum
+<
+min_sum
+:
+min_sum
+=
+current_sum
+return
+min_sum
+F.2
+Base Acting/Reasoning Prompt
+You are an AI Python assistant. You will be given your previous implementation of a function, a series of unit tests results, and your self-reflection on your previous implementation. Write your full implementation (restate the function signature).
+Example 1:
+[previous impl]:
+⬇
+def
+add
+(
+a
+:
+int
+,
+b
+:
+int
+)
+->
+int
+:
+”””
+Given
+integers
+a
+and
+b
+,
+return
+the
+total
+value
+of
+a
+and
+b
+.
+”””
+return
+a
+-
+b
+[unit test results from previous impl]:
+Tested passed:
+Tests failed:
+assert add(1, 2) == 3 # output: -1
+assert add(1, 2) == 4 # output: -1
+[reflection on previous impl]:
+The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from ‘-‘ to ‘+‘ in the return statement. This will ensure that the function returns the correct output for the given input.
+[improved impl]:
+⬇
+def
+add
+(
+a
+:
+int
+,
+b
+:
+int
+)
+->
+int
+:
+”””
+Given
+integers
+a
+and
+b
+,
+return
+the
+total
+value
+of
+a
+and
+b
+.
+”””
+return
+a
++
+b
+F.3
+Reflection Prompt
+You are a Python programming assistant. You will be given a function implementation and a series of unit test results. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as guidance when you try again later. Only provide the few sentence description in your answer, not the implementation. You will be given a few examples by the user.
+Example 1:
+[previous impl]:
+⬇
+def
+add
+(
+a
+:
+int
+,
+b
+:
+int
+)
+->
+int
+:
+”””
+Given
+integers
+a
+and
+b
+,
+return
+the
+total
+value
+of
+a
+and
+b
+.
+”””
+return
+a
+-
+b
+[unit test results from previous impl]:
+Tested passed:
+Tests failed:
+assert add(1, 2) == 3 # output: -1
+assert add(1, 2) == 4 # output: -1
+[reflection on previous impl]:
+The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from ‘-‘ to ‘+‘ in the return statement. This will ensure that the function returns the correct output for the given input.
+F.4
+Test Case Generation Prompt
+You are an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring.
+(input)
+Appendix G
+WebShop Prompts
+G.1
+Acting Prompt
+Webshop
+Instruction:
+i would like a 3 ounce bottle of bright citrus deodorant for sensitive skin, and price lower than 50.00 dollars
+[Search]
+Action: search[3 ounce bright citrus deodorant sensitive skin]
+Observation:
+[Back to Search]
+Page 1 (Total results: 50)
+[Next]
+[B078GWRC1J]
+Bright Citrus Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce
+$10.99
+[B078GTKVXY]
+Ginger Fresh Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce
+$10.99
+[B08KBVJ4XN]
+Barrel and Oak - Aluminum-Free Deodorant, Deodorant for Men, Essential Oil-Based Scent, 24-Hour Odor Protection, Cedar & Patchouli Blend, Gentle on Sensitive Skin (Mountain Sage, 2.7 oz, 2-Pack)
+$15.95
+Action: think[B078GWRC1J and B078GTKVXY are bright citrus deodorant less then 50 dollars. I can check B078GWRC1J first.]
+Observation: OK.
+Action: click[B078GWRC1J]
+Observation:
+[Back to Search]
+[Prev]
+scent [assorted scents][bright citrus][calming lavender][ginger fresh][simply non-scents]
+size [travel set (4-pack)][3 ounce (pack of 1)][3-ounce (2-pack)]
+Bright Citrus Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce
+Price: $10.99
+Rating: N.A.
+[Description]
+[Features]
+[Reviews]
+[Buy Now]
+Action: think[For 3 ounce bottle of bright citrus deodorant for sensitive skin, the item has options ’bright citrus’ and ’3 ounce (pack of 1)’ and seems good to buy.]
+Observation: OK.
+Action: click[bright citrus]
+Observation: You have clicked bright citrus.
+Action: click[3 ounce (pack of 1)]
+Observation: You have clicked 3 ounce (pack of 1).
+Action: click[Buy Now]
+G.2
+Value Function Prompt
+Given an item to purchase and a trajectory that aims to buy an item that exactly matches the specification, which corresponds to the ideal score of 1.0, analyze the following trajectory, then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10.
+Here are some examples
+Webshop
+Instruction:
+i am looking for dairy free and apple variety pack of chips, and price lower than 30.00 dollars
+[Search]
+Action: search[dairy free and apple variety pack of chips]
+Observation:
+[Back to Search]
+Page 1 (Total results: 50)
+[Next]
+[B07HRFJWP8]
+Enjoy Life Foods Soft Baked Ovals, Breakfast Bars, Nut Free Bars, Soy Free, Dairy Free, Non GMO, Gluten Free, Vegan, Variety Pack, 4 Boxes (20 Bars Total)
+$100.0
+[B01KMHY5PG]
+Enjoy Life Soft Baked Chewy Bars, Variety Pack, Nut Free Bars, Soy Free, Dairy Free, Gluten Free, 6 Boxes (30 Total Bars)
+$21.49
+[B008D2X8C4]
+Enjoy Life Lentil Chips Variety Pack, Dairy Free Chips, Soy Free, Nut Free, Non GMO, Vegan, Gluten Free, 24 Bags (0.8 oz)
+$100.0
+Action: think[The first two items are not what I want. The third item is the dairy free and apple variety pack of chips I want.]
+Observation: OK.
+Action: click[B008D2X8C4]
+Observation:
+[Back to Search]
+[Prev]
+flavor name [dill and sour cream][garlic & parmesan][light sea salt][margherita pizza][thai chili lime][variety pack]
+size [0.8 ounce (pack of 24)][4 ounce (pack of 12)]
+Price: $100.0
+Rating: N.A.
+[Description]
+[Features]
+[Reviews]
+[Buy Now]
+Action: think[The item has options ’variety pack’ and ’0.8 ounce (pack of 24)’ and seems good to buy.]
+Observation: OK.
+Action: click[variety pack]
+Observation: You have clicked variety pack.
+Action: click[0.8 ounce (pack of 24)]
+Observation: You have clicked 0.8 ounce (pack of 24).
+Action: click[Buy Now]
+Reflection: In this attempt, I was unsuccessful. I accidentally bought a product that was $100, which is more than my budget of $30. Either way, the initial search results were not good. Next time, I will do search[”variety pack of chips”] and then check if the results meet the dairy free and the $30 budget constraints. I will continue to refine my searches so that I can find more products.
+Thus the correctness score is 5
+(input)
+G.3
+Reflection Prompt
+Given an item to purchase and a trajectory that aims to buy an item that exactly matches the specification, which corresponds to the ideal score of 1.0, analyze the following trajectory, then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10.
+Here are some examples
+Webshop
+Instruction:
+i am looking for dairy free and apple variety pack of chips, and price lower than 30.00 dollars
+[Search]
+Action: search[dairy free and apple variety pack of chips]
+Observation:
+[Back to Search]
+Page 1 (Total results: 50)
+[Next]
+[B07HRFJWP8]
+Enjoy Life Foods Soft Baked Ovals, Breakfast Bars, Nut Free Bars, Soy Free, Dairy Free, Non GMO, Gluten Free, Vegan, Variety Pack, 4 Boxes (20 Bars Total)
+$100.0
+[B01KMHY5PG]
+Enjoy Life Soft Baked Chewy Bars, Variety Pack, Nut Free Bars, Soy Free, Dairy Free, Gluten Free, 6 Boxes (30 Total Bars)
+$21.49
+[B008D2X8C4]
+Enjoy Life Lentil Chips Variety Pack, Dairy Free Chips, Soy Free, Nut Free, Non GMO, Vegan, Gluten Free, 24 Bags (0.8 oz)
+$100.0
+Action: think[The first two items are not what I want. The third item is the dairy free and apple variety pack of chips I want.]
+Observation: OK.
+Action: click[B008D2X8C4]
+Observation:
+[Back to Search]
+[Prev]
+flavor name [dill and sour cream][garlic & parmesan][light sea salt][margherita pizza][thai chili lime][variety pack]
+size [0.8 ounce (pack of 24)][4 ounce (pack of 12)]
+Price: $100.0
+Rating: N.A.
+[Description]
+[Features]
+[Reviews]
+[Buy Now]
+Action: think[The item has options ’variety pack’ and ’0.8 ounce (pack of 24)’ and seems good to buy.]
+Observation: OK.
+Action: click[variety pack]
+Observation: You have clicked variety pack.
+Action: click[0.8 ounce (pack of 24)]
+Observation: You have clicked 0.8 ounce (pack of 24).
+Action: click[Buy Now]
+Reflection: In this attempt, I was unsuccessful. I accidentally bought a product that was $100, which is more than my budget of $30. Either way, the initial search results were not good. Next time, I will do search[”variety pack of chips”] and then check if the results meet the dairy free and the $30 budget constraints. I will continue to refine my searches so that I can find more products.
+(input)
+Reflection:
+◄
+Feeling
+lucky?
+Conversion
+report
+Report
+an issue
+View original
+on arXiv
+►
\ No newline at end of file
diff --git a/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-3.md b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-3.md
new file mode 100644
index 0000000000000000000000000000000000000000..5b8107c035d219f37354da26d53aab293e7415c2
--- /dev/null
+++ b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-3.md
@@ -0,0 +1,4095 @@
+---
+title: '[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning
+  in Language Models'
+id: 231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la-3
+tags:
+- deepread
+created: '2026-06-10T00:40:52.405072Z'
+source: https://ar5iv.labs.arxiv.org/html/2310.04406
+source_domain: ar5iv.labs.arxiv.org
+fetched_at: '2026-06-10T00:40:52.404928Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models
+Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models
+Andy Zhou
+University of Illinois at Urbana-Champaign
+AI@UIUC
+Kai Yan
+University of Illinois at Urbana-Champaign
+Michal Shlapentokh-Rothman
+University of Illinois at Urbana-Champaign
+Haohan Wang
+University of Illinois at Urbana-Champaign
+Yu-Xiong Wang
+University of Illinois at Urbana-Champaign
+Abstract
+While large language models (LLMs) have demonstrated impressive performance on a range of decision-making tasks, they rely on simple acting processes and fall short of broad deployment as autonomous agents. We introduce LATS (Language Agent Tree Search), a general framework that synergizes the capabilities of LLMs in planning, acting, and reasoning. Drawing inspiration from Monte Carlo tree search commonly used in model-based reinforcement learning, LATS employs LLMs as agents, value functions, and optimizers, repurposing their latent strengths for enhanced decision-making. What is crucial in this method is the use of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that moves beyond the limitations of existing techniques. Our experimental evaluation across diverse domains, such as programming, HotPotQA, and WebShop, illustrates the applicability of LATS for decision-making while maintaining competitive reasoning performance. In particular, LATS achieves 94.4% for programming on HumanEval with GPT-4 and an average score of 75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness and generality of our method.
+1
+Introduction
+General autonomous agents capable of reasoning and decision-making in a variety of environments
+(Wooldridge & Jennings,
+1995
+)
+have been of longstanding interest in the field of artificial intelligence. While this has traditionally been studied in reinforcement learning, the recent rise of large language models (LLMs)
+(Brown et al.,
+2020
+; Chowdhery et al.,
+2022
+; Touvron et al.,
+2023
+; OpenAI,
+2023
+)
+with strong reasoning and general adaptability offers an alternative paradigm. Not only have LLMs excelled on standard NLP tasks such as text summarization
+(Nallapati et al.,
+2016
+)
+or natural language inference
+(Bowman et al.,
+2015
+)
+, but they have been adapted to an increasingly diverse set of tasks that often require advanced common-sense reasoning or quantitative skills
+(Cobbe et al.,
+2021
+; Saparov & He,
+2022
+)
+. LLMs are also capable of performing in complex environments that involve knowledge and reasoning, such as web navigation
+(Yao et al.,
+2022
+; Deng et al.,
+2023
+)
+, tool-use
+(Schick et al.,
+2023
+)
+, or open-ended games
+(Fan et al.,
+2022
+)
+.
+Figure 1:
+An overview of LATS. LATS uses an external environment and self-reflection to improve reasoning and decision-making.
+Reasoning and acting abilities have also been improved by prompting techniques that augment LLMs with feedback or observations from an external environment
+(Yao et al.,
+2023b
+; Gao et al.,
+2022
+; Shinn et al.,
+2023
+)
+. This eliminates the need to rely entirely on the base abilities of the Language Model (LM), enhancing it through external tools or semantic feedback. Despite this strength, these methods are reflexive and fall short of humans’ deliberate and thoughtful decision-making characteristics to solve problems
+(Sloman,
+1996
+; Evans,
+2010
+)
+. In particular, such methods fail to consider multiple reasoning paths or to plan ahead. Recent search-guided LLM works
+(Xie et al.,
+2023
+; Yao et al.,
+2023a
+; Hao et al.,
+2023
+)
+address this issue by searching over multiple reasoning chains. While these methods enable planning, these methods operate in isolation and do not incorporate external feedback that can improve reasoning.
+To help address these issues, we propose LATS (Language Agent Tree Search), a general framework for decision-making and reasoning with language models. LATS unifies LM planning, acting, and reasoning strategies by expanding ReAct (Yao et al., 2023b) into a search over a combinatorial space of possible reasoning and acting steps. We adapt Monte Carlo tree search (MCTS) from model-based reinforcement learning (Silver et al., 2017; Anthony et al., 2017; Jiang et al., 2018) to language agents, repurposing a pretrained LLM as an agent, value function, and optimizer. Utilizing the strong natural language understanding and in-context learning ability of modern LMs, we use text as an interface between each component of the framework, allowing LATS to adapt planning to environmental conditions without additional training. To the best of our knowledge, LATS is the first framework that combines reasoning, acting, and planning to enhance LLMs. Notably, LATS doubles the performance of GPT-3.5 on HotPotQA (Yang et al., 2018) over ReAct (Yao et al., 2023b) and raises the average score by $22.1$ on WebShop (Yao et al., 2022). When used with GPT-4, LATS achieves a $94.4$ Pass@1 rate for programming on HumanEval (Chen et al., 2021), setting the state of the art. To summarize, our contributions are the following:
+•
+We introduce an LM-based Monte Carlo tree search variant to deliberately construct the best trajectory from sampled actions, enabling more flexible and adaptive problem-solving compared to reflexive prompting methods. This is guided by heuristics from the LM.
+•
+By integrating external feedback and self-reflection, LATS enhances model sensibility and enables agents to learn from experience, surpassing reasoning-based search methods.
+•
+Through experiments across diverse domains like programming, interactive QA, and web navigation, we demonstrate the versatility of LATS in harnessing LLMs for autonomous reasoning and decision-making.
+2
+Related Work
+
+| Approach | Reasoning | Acting | Planning | Self Reflection | External Memory |
+|---|---|---|---|---|---|
+| CoT (Wei et al., 2022) | ✓ | × | × | × | × |
+| ReAct (Yao et al., 2023b) | ✓ | ✓ | × | × | × |
+| ToT (Yao et al., 2023a) | ✓ | × | ✓ | ✓ | ✓ |
+| RAP (Hao et al., 2023) | ✓ | × | ✓ | × | ✓ |
+| Self-Refine (Madaan et al., 2023) | ✓ | × | × | ✓ | × |
+| Beam Search (Xie et al., 2023) | ✓ | × | × | ✓ | × |
+| Reflexion (Shinn et al., 2023) | ✓ | ✓ | × | ✓ | ✓ |
+| LATS (Ours) | ✓ | ✓ | ✓ | ✓ | ✓ |
+
+Table 1:
+A summary of related work on reasoning, acting, and planning. LATS is the first work incorporating designs from all three domains, allowing use in all corresponding tasks. We refer to planning as the use of a search algorithm, self-reflection as the use of LM-generated feedback, and external memory as storing past text context for future updates of the solution.
+a) Tree-of-Thoughts
+b) Reasoning via Planning
+c) Language Agent Tree Search
+Figure 2:
+An overview of the differences between LATS and recently proposed LM search algorithms ToT
+(Yao et al.,
+2023a
+)
+and RAP
+(Hao et al.,
+2023
+)
+. LATS leverages environmental feedback and self-reflection to further adapt search and improve performance.
+LLMs for reasoning.
+For LLMs, reasoning typically involves decomposing complex inputs into sequential intermediate steps towards a final answer
+(Cobbe et al.,
+2021
+)
+, demonstrated with Chain-of-Thought (CoT) prompting
+(Wei et al.,
+2022
+)
+and its variants
+(Wei et al.,
+2022
+; Kojima et al.,
+2022
+; Wang et al.,
+2022
+)
+. However, these methods, which create chains autoregressively in a single step, often suffer from error propagation as the number of steps increases
+(Guo et al.,
+2018
+; Chen et al.,
+2022b
+)
+due to compound errors. Various advancements aim to mitigate this issue; some approaches, such as Self-Consistency
+(Wang et al.,
+2022
+)
+, employ majority voting over sampled chains, while others focus on multi-step decomposition, such as least-to-most prompting
+(Zhou et al.,
+2022
+)
+, or use of external tools such as a scratchpad
+(Nye et al.,
+2021
+)
+or compiler
+(Gao et al.,
+2022
+)
+. Recently, CoT has been improved with search algorithms
+(Yao et al.,
+2023a
+; Hao et al.,
+2023
+; Besta et al.,
+2023
+)
+that can sample trajectories more effectively. Tree-of-thought (ToT) prompting
+(Yao et al.,
+2023a
+)
+uses DFS or BFS-based search guided by an LM-generated heuristic while Reasoning via Planning (RAP)
+(Hao et al.,
+2023
+)
+uses MCTS with rollouts simulated by the LM. However, they rely solely on LM internal knowledge and cannot adapt to useful external feedback.
+LLMs for acting.
+The strong reasoning and common-sense abilities of LLMs have also been adapted for decision-making or acting tasks as a policy model in interactive environments. In the realm of robotics LLMs have been employed as high-level controllers of control policies
+(Ahn et al.,
+2022
+; Huang et al.,
+2022
+; Driess et al.,
+2023
+)
+. Similar work
+(Baker et al.,
+2022
+; Wang et al.,
+2023
+; Zhu et al.,
+2023
+)
+has also adapted LLM agents to complex multimodal games such as Minecraft
+(Guss et al.,
+2019
+; Fan et al.,
+2022
+)
+. LLMs are particularly useful in text-based environments
+(Liu et al.,
+2018
+; Shridhar et al.,
+2020
+; Liu et al.,
+2023
+)
+, where acting-based prompting techniques such as ReAct
+(Yao et al.,
+2023b
+)
+have seen success. Similar to CoT, ReAct is limited by its simplicity and cannot effectively adapt to environment conditions. Many extensions have been proposed to address this, including Self-refine
+(Madaan et al.,
+2023
+)
+and Reflexion
+(Shinn et al.,
+2023
+; Yao et al.,
+2023c
+)
+, which uses self-reflection to enhance reasoning and decision-making, and AdaPlanner
+(Sun et al.,
+2023
+)
+, which incorporates both positive and negative environmental feedback. However these methods focus on refining an individual plan or trajectory and do not consider alternative choices at each step. In addition, recent work
+(Huang et al.,
+2023
+)
+has suggested LLMs cannot self-correct their internal reasoning, making it critical to use external feedback. Alternatively to pure decision-making environments, the reasoning and practical abilities of LLMs have been enhanced by access to external tools, such as APIs, search engines, calculators, or other models
+(Schick et al.,
+2023
+; Shen et al.,
+2023
+; Surís et al.,
+2023
+)
+. Contrary to reasoning-based approaches, these methods have not been improved with planning, limiting their effectiveness. We summarize them in Tab.
+1
+.
+Tree-based search.
+Tree-based search, where multiple branches of outcomes are explored during search, is widely used in many planning algorithms
+(Świechowski et al.,
+2023
+; LaValle et al.,
+2001
+)
+and Reinforcement Learning (RL)
+(Hafner et al.,
+2019
+; Du et al.,
+2023
+; Wu et al.,
+2023
+)
+algorithms for its good exploration-exploitation trade-off. Though tree-based search requires an environment model that can expand from arbitrary state
+(Vodopivec et al.,
+2017
+)
+, which often requires extra training in RL
+(Hafner et al.,
+2023
+)
+, such a problem does not exist for LM tasks, as we can conveniently back up to any state by setting the input to be the context and corresponding previous output by the LM. Thus, we work on the tree-based framework and use MCTS
+(Świechowski et al.,
+2023
+)
+to fully release the potential of LMs, while avoiding the cost of training a value function over language descriptions by leveraging the in-context learning
+(Brown et al.,
+2020
+)
+abilities of LLMs.
+3
+Preliminaries
+3.1
+Problem Setting and Prompting
+Before describing LATS, we first define our problem and outline a few established methods that leverage large language models for reasoning or decision-making. In LM reasoning or decision making, we are given an input $x$ in natural language and a pretrained language model $p_{\theta}(x)$ parameterized by $\theta$; our goal is to generate a final output $y\sim p_{\theta}(x)$ that corresponds to the answer (reasoning) or completes the task (decision-making). Both $x$ and $y$ are language sequences, which are comprised of a list of tokens (the basic elements of natural language, often words), denoted as $x=(x[1],\dots,x[n])$ and $y=(y[1],\dots,y[n])$. The LM decodes text autoregressively, i.e., without other inputs, the probability for an LM to generate a sequence $x$ is given by $p_{\theta}(x)=\prod_{i=1}^{n}p_{\theta}(x[i]|x[1\dots i-1])$. Usually, to improve the LM, prompts are provided along with the input $x$, which are specific instructions or few-shot input-output examples. We denote the generic process where an input $x$ is transformed into an output $y$ by LM: $y\sim p_{\theta}(y|\texttt{prompt}_{IO}(x))$, where $\texttt{prompt}_{IO}(x)$ denotes the input $x$.
+Chain-of-thought (CoT) Prompting (Wei et al., 2022) was introduced to cater to scenarios where direct mapping from $x$ to $y$ is intricate, such as when $x$ is from a mathematical query or challenging question. This method hinges on creating thoughts $z_{1},\dots,z_{n}$ that act as stepping stones between $x$ and $y$; each thought $z_{i}$ is a language sequence. To employ CoT prompting, thoughts are extracted sequentially as $z_{i}\sim p_{\theta}^{CoT}(z_{i}|x,z_{1\cdots i-1})$, with the final output being $y\sim p_{\theta}^{CoT}(y|x,z_{1\cdots n})$.
+Tree-of-thought (ToT) Prompting (Yao et al., 2023a) extends CoT prompting by exploring multiple reasoning paths over thoughts. It frames problems as a search over a tree where each node $s=[x,z_{1\cdots i}]$ represents a partial solution state comprising the original input $x$ and thought sequence $z_{1\cdots i}$. Thoughts $z_{i}$ are generated by proposal or sampling with CoT $z_{i}\sim p_{\theta}^{CoT}(z_{i}|x,z_{1\cdots i-1})$. Deliberate search algorithms like breadth-first or depth-first search are used to systematically explore the tree, guided by heuristics based on language model evaluations $V(s)$ of each state.
+Reasoning via Planning (RAP) (Hao et al., 2023) is similar to ToT, except that MCTS is used over DFS or BFS. Heuristics are designed from an LM, such as the likelihood or confidence of an action, and the LM is used as a world model to predict subsequent states during the simulation step.
+ReAct (Yao et al., 2023b) extends language models to tasks where the mapping from $x$ to $y$ is enhanced by or requires interactions with an external environment, such as a game or API. This technique constructs an action space $\hat{A}=A\cup Z$ that adds permissible actions $a$ to the reasoning traces $z$ from CoT. Observations $o$ from the environment are used to improve both reasoning and acting. To solve problems with ReAct, after each observation, actions are generated from $p_{\theta}$ sequentially as $a_{i}\sim p_{\theta}^{ReAct}(a_{i}|x,o_{1\cdots i-1},a_{1\cdots i-1})$, with the final output being $y\sim p_{\theta}^{ReAct}(y\mid x,o_{1\cdots n},a_{1\cdots n})$.
+While the previously described prompting techniques improve LM performance on reasoning tasks, they falter on difficult tasks that involve multifaceted decision-making due to several shortcomings: 1) Flexibility: Base prompting methods (CoT or ReAct) autoregressively sample from the LM, neglecting potential alternative continuations from specific states. 2) Sensibility: Reasoning-based methods (CoT, RAP, or ToT) rely solely on the internal representations of the LM and cannot consider external observations. This dependency risks fact hallucination and error propagation while setting a performance ceiling. 3) Adaptability: Current planning frameworks (RAP or ToT) use simple search algorithms such as BFS or cannot leverage environmental feedback to improve planning. Additionally, the agent is static and cannot reuse previous experience or learn from trial and error. While RAP also adopts MCTS, it is constrained to tasks where the LM can become a world model and accurately predict states. These shortcomings limit the ability of LMs to be deployed as general problem-solving agents and form the motivation for LATS.
+3.2
+Monte-Carlo Tree Search (MCTS)
+Monte-Carlo Tree Search (MCTS) is a heuristic search algorithm that is proved successful on many decision-making environments such as Atari
+(Ye et al.,
+2021
+)
+and Go
+(Silver et al.,
+2016
+)
+. MCTS builds a decision tree where every node in the tree is a state and every edge is an action. MCTS runs for $k$ episodes; for each episode, it starts from the root (i.e., initial state) and iteratively conducts two steps to expand the tree: 1)
+Expansion
+, where multiple child states $s$ are explored from the current parent state $p$ by sampling $n$ actions, and 2)
+Selection
+, where the child with the highest UCT
+(Upper Confidence bounds applied to Trees)
+(Kocsis & Szepesvári,
+2006
+)
+value is selected for the next iteration. The UCT of a child state $s$ is calculated as follows:
+$$UCT(s)=V(s)+w\sqrt{\frac{\ln N(p)}{N(s)}},\tag{1}$$
+where $N(s)$ is the number of visits to a node $s$, $V(s)$ is the value function (expected return) from the subtree of $s$, $w$ is the exploration weight, and $p$ is the parent node of $s$
+. The child node with the highest UCT value is selected for expansion in the next iteration. When the end of an episode is reached, a
+backpropagation
+is carried out: the return $r$ is used for updating every $V(s)$ along the path with the formula $V(s)=\frac{V_{\text{old}}(s)(N(s)-1)+r}{N(s)}$, where $V_{\text{old}}(s)$
+is the old value function. Normally, the major shortcoming of MCTS is that it requires an environment model to undo previous steps and form a search tree, which is often a strong assumption. However, such a limitation does not exist for LMs, as we can conveniently reset to any step by simply copy-pasting historical text input. This special property is the key motivation of our work.
+4
+Unifying Planning, Reasoning, and Acting
+4.1
+LM Agent
+LATS supports sequential reasoning or decision-making tasks on the basis of ReAct. At time step
+$t$, an agent receives an observation $o_{t}\in O$ from the environment and takes an action $a_{t}\in A$ following some policy $\pi(a_{t}\mid x,o_{1\cdots t-1},a_{1\cdots t-1})$, where $x$
+consists of the task instruction and a number of few-shot examples. We initialize the agent with
+$p_{\theta}$ to leverage the useful language representations of an LM as a base decision-maker. We follow the ReAct instantiation in which the action space $\hat{A}=A\cup Z$ consists of both the space of permissible actions $A$ and the language space of reasoning traces $Z$
+. Actions directly affect the environment and result in observation, while thoughts are used to formalize decisions by organizing information, planning future actions, or injecting internal knowledge. The exact instantiation of the action space depends on the particular environment; for decision-making tasks actions might consist of commands on a website while for reasoning tasks the action space might be limited to a few external tools or APIs.
+Instead of greedily decoding one trajectory or solution, we sample
+$n$ actions from $p_{\theta}$
+using the current state. This is based on the intuition that for complex decision-making tasks, there is likely to be a range of potential trajectories or reasoning paths that are correct
+(Evans,
+2010
+)
+. Sampling a diverse set of candidates at each step mitigates the stochastic nature of LM text generation and enables greater exploration in both the decision-making and reasoning space. We wrap
+$p_{\theta}$
+within our proposed search algorithm to deliberately construct the best trajectory from sampled actions.
+4.2
+LATS
+Figure 3:
+An overview of the six operations of LATS. A node is
+selected
+,
+expanded
+,
+evaluated
+, then
+simulated
+until a terminal node is reached, then the resulting value is
+backpropagated
+. If the trajectory fails, a
+reflection
+is generated and used as additional context for future trials. These operations are performed in succession until the budget is reached or the task is successful.
+The main component of LATS is a search algorithm that controls the overall problem-solving process with deliberate planning. To find the most promising trajectory and systematically balance exploration with exploitation, we adopt a variant of Monte Carlo Tree Search (MCTS) that frames decision-making as a tree search, in which each node
+$s=[x,a_{1\cdots i},o_{1\cdots i}]$ represents a state comprising the original input $x$, action sequence $a_{1\cdots i}$, and observation sequence $o_{1\cdots i}$
+.
+To adapt MCTS for language agents, LATS repurposes
+$p_{\theta}$
+as an agent, state evaluator, and feedback generator, leveraging the useful language priors of modern LMs to facilitate planning. While standard MCTS and RAP
+Hao et al. (
+2023
+)
+rely on internal dynamics models to facilitate simulation, LATS is model-free and uses environment interaction. LATS consists of a series of operations,
+selection, expansion, evaluation, simulation, backpropagation, and reflection
+, performed in succession until the task is successfully completed or a computational limit is reached. The full pseudocode of LATS can be found in Sec.
+A
+in the Appendix.
+Selection.
+In the first operation, the algorithm identifies a segment of the current tree most suitable for subsequent expansion. Starting from the root node, denoted as the initial state
+$s_{0}$
+, a child node is selected at each tree level until a leaf node is reached. To balance exploration and exploitation, we use the UCT algorithm as shown in Eq.
+1
+.
+Expansion.
+After selecting a node, the second operation expands the tree by sampling
+$n$ actions from $p_{\theta}$
+, as described in the prior section. The environment receives each action and returns corresponding feedback as an observation. This results in
+$n$
+new child nodes added to the tree. This tree is stored in an external long-term memory structure.
+Evaluation.
+The third operation assigns a scalar value to each new child node to be used for selection and backpropagation. This value effectively quantifies the agent’s progress in task completion, serving as a heuristic to steer the search algorithm towards the most promising regions of the tree. Following
+Yao et al. (
+2023a
+)
+we repurpose
+$p_{\theta}$
+into a value function by prompting it to reason about a given state. To obtain a scalar value, we instruct
+$p_{\theta}$
+to end its reasoning trace with a score indicating the correctness of the trajectory. This method offers enhanced flexibility over programmed heuristics
+(Campbell et al.,
+2002
+)
+and greater efficiency than learned heuristics
+(Silver et al.,
+2017
+)
+.
+Simulation.
+The fourth operation expands the currently selected node until a terminal state is reached. At each depth level we sample and evaluate nodes with the same operations, but prioritize nodes of highest value. Reaching a terminal state provides objective feedback on the correctness of a trajectory. If the task is completed successfully, then LATS terminates the search. If the solution is partially successful or unsuccessful, then we perform two additional operations as described below.
+Backpropagation.
+This operation updates the values of the tree based on the outcome of a trajectory. For each node
+$s_{0},s_{1},\dots,s_{n}$ in the trajectory from root (initial state $s_{0}$) of the search tree to leaf (terminal state $s_{n}$), its value is updated to reflect the outcome of the simulation by $N(s_{i})=N_{\text{old}}(s_{i})+1$ and $V(s_{i})=\frac{r+N_{\text{old}}(s_{i})V_{\text{old}}(s_{i})}{N(s_{i})}$, where $r$ is the return and $N_{\text{old}},V_{\text{old}}$
+are the old number of visits and value function. These updated values are used in the UCT formula (Eq.
+1
+) to guide the selection of the next node for exploration.
+Reflection.
+In addition to the environmental feedback, we also leverage
+self-reflection
+to further refine the decision-making process
+(Shinn et al.,
+2023
+; Madaan et al.,
+2023
+)
+. Upon encountering an unsuccessful terminal node,
+$p_{\theta}$
+is prompted with the trajectory and final reward to provide a verbal self-reflection that summarizes the errors in the reasoning or acting process and proposes superior alternatives. We store both failed trajectories and corresponding reflections in the memory. In subsequent iterations, these are integrated as additional context to the agent and value function, refining both through in-context learning. This imparts a semantic gradient signal more useful than a scalar value, enabling the agent to learn from trial and error without the cost of expensive optimization processes such as reinforcement learning.
+Conceptually, LATS has the following advantages as a general framework for reasoning and decision-making with LM agents.
+(1)
+Generality
+: LATS supports both reasoning and decision-making tasks by defining a shared space of thoughts and actions. (2)
+Deliberate
+: The use of MCTS and LM value function ensures a principled search that selects options with high value while exploring promising alternatives. (3)
+Adaptability
+: LATS is designed around the use of external feedback through observations and self-reflection, enabling greater adaptation during problem-solving. (4)
+Flexibility
+: LATS can accommodate different scenarios, environments, and resource stipulations by modifying state design and tree dimensions. (5)
+Modularity
+: The base LM agent, reflection generator, and value function can be independently altered and adapted to individual LM properties.
+5
+Experiments
+To demonstrate the general applicability of LATS, we evaluate our method on a variety of decision-making domains that require both reasoning and acting ability: programming
+(Chen et al.,
+2021
+; Austin et al.,
+2021
+)
+, HotPotQA
+(Yang et al.,
+2018
+)
+, and WebShop
+(Yao et al.,
+2022
+)
+.
+5.1
+HotPotQA
+For a task that can be approached with both reasoning-based and acting-based strategies, we consider HotPotQA
+(Yang et al.,
+2018
+)
+, a multi-hop question-answering benchmark that requires retrieval over two or more Wikipedia passages. For the action space, in addition to LM thoughts we follow the setup from
+Yao et al. (
+2023b
+)
+, which provides the agent with API calls to search and lookup information. The output of these API calls and self-generated reflections form the observation space. We use a subset of 100 questions and three few-shot examples for each method. For ToT, we use DFS as the base search algorithm and scoring with the LM as the heuristic. For all methods that involve sampling, including LATS, we sample
+$k=50$
+trajectories. More details and prompts can be found in Sec.
+D
+and Sec.
+E
+in the Appendix.
+We evaluate internal reasoning strategies by removing actions and observations from the context, corresponding to CoT
+(Wei et al.,
+2022
+)
+and its variants, CoT-SC
+(Wang et al.,
+2022
+)
+, ToT
+(Yao et al.,
+2023a
+)
+, and RAP
+(Hao et al.,
+2023
+)
+. These methods rely solely on the agent’s existing knowledge to answer the question. We also consider acting-based methods ReAct, Reflexion, and LATS, which augment the agent with the interactive API environment and primarily evaluate its information retrieval abilities. While LATS is designed for scenarios where external feedback can enhance reasoning, we also implement a reasoning-only version with CoT as the base prompt. We also combine internal and external reasoning in LATS by first prompting with a CoT-based prompt, then switching to a ReAct-based prompt upon failure. This is closer to how humans might approach this task, by using tools to lookup additional information only when the answer is not already known.
+Prompt Method
+HotpotQA (EM)
+I/O
+0.32
+CoT
+(Wei et al.,
+2022
+)
+0.34
+CoT - SC
+(Wang et al.,
+2022
+)
+0.38
+ToT
+(Yao et al.,
+2023a
+)
+0.55
+RAP
+(Hao et al.,
+2023
+)
+0.60
+RAP (n = 10)
+0.60
+LATS (CoT)
+0.60
+Prompt Method
+HotpotQA (EM)
+ReAct
+(Yao et al.,
+2023b
+)
+0.32
+ReAct (best of k)
+0.38
+Reflexion
+(Shinn et al.,
+2023
+)
+0.51
+LATS
+0.61
+LATS (n = 3)
+0.56
+LATS (n = 10)
+0.64
+LATS (CoT + ReAct)
+0.71
+Table 2:
+GPT-3.5 reasoning-based prompting (left) and acting-based prompting (right) results on HotpotQA. LATS achieves the highest exact match (EM) for acting and is competitive on reasoning. Unless otherwise specified, we sample
+$n=5$ nodes during expansion and $k=50$
+trajectories.
+Results.
+We observe in Tab.
+2
+that both internal reasoning and external retrieval strategies perform well on HotPotQA. Due to their large-scale training corpus, modern LLMs already encode factual knowledge and can often directly answer the question correctly. While CoT can slightly enhance performance on questions requiring reasoning, larger gains are observed with search methods ToT and RAP, which can sample and explore more outputs. We observe similar results for acting-based methods. LATS surpasses ReAct, even when sampling the same number of trajectories, by expanding more nodes with principled search (see Fig.
+5
+in Appendix
+D
+for a qualitative sample). This is demonstrated when modifying
+$n$, the number of nodes expanded during each iteration. Increasing $n$
+can consistently improve performance, although at greater computational and inference costs. LATS is also competitive with RAP on internal reasoning but performs worse than when acting. Combining internal and external reasoning in LATS results in the highest performance, indicating the importance of external feedback in augmenting reasoning even in tasks the base LM can already perform.
+5.2
+Programming
+Prompt Method
+Model
+Pass@1
+CoT
+(Wei et al.,
+2022
+)
+GPT-3.5
+46.9
+ReAct
+(Yao et al.,
+2023b
+)
+GPT-3.5
+56.9
+Reflexion
+(Shinn et al.,
+2023
+)
+GPT-3.5
+68.1
+ToT
+(Yao et al.,
+2023a
+)
+GPT-3.5
+54.4
+RAP
+(Hao et al.,
+2023
+)
+GPT-3.5
+63.1
+LATS (Ours)
+GPT-3.5
+83.8
+I/O
+GPT-4
+80.1
+Reflexion
+GPT-4
+91.0
+LATS
+GPT-4
+94.4
+Prompt Method
+Pass@1
+CoT
+(Wei et al.,
+2022
+)
+54.9
+ReAct
+(Yao et al.,
+2023b
+)
+67.0
+Reflexion
+(Shinn et al.,
+2023
+)
+70.0
+ToT
+(Yao et al.,
+2023a
+)
+65.8
+RAP
+(Hao et al.,
+2023
+)
+71.4
+LATS (Ours)
+81.1
+Table 3:
+GPT-3.5 and GPT-4 Pass@1 accuracy on HumanEval
+(Chen et al.,
+2021
+)
+and MBPP
+(Austin et al.,
+2021
+)
+. Prompting with LATS achieves the highest performance. We sample 5 solutions during expansion for
+8
+iterations.
+To demonstrate the importance of external observations for complex reasoning tasks, we evaluate the baselines and LATS on programming with HumanEval
+(Chen et al.,
+2021
+)
+and MBPP
+(Austin et al.,
+2021
+)
+. Both datasets measure the correctness of synthesized programs in Python from natural language docstrings. We use individual solutions as the action space and test suite and compiler feedback as the external observation. We follow
+Chen et al. (
+2022a
+)
+and use an LLM to generate a synthetic test suite of syntactically valid “assert” statements for each question. For each step, the solution is evaluated on this test suite, and the results including successful and failed tests and compiler output, are added to the context as an observation. We use the same test suite for Reflexion.
+For this task, the reasoning and acting baselines share an action space, but acting methods are able to incorporate observations as additional context. For LATS, since each action corresponds to a complete solution, we skip the simulation step of LATS and directly use the percentage of passed tests as the backpropagated reward. We use
+$k=8$ iterations, set the number of generated tests at $4$, and sample $n=5$
+solutions during expansion. After the search is completed, we select the solution with the highest value and evaluate it on the real test suite for the pass@1 accuracy evaluation. More details and prompts can be found in Sec.
+D
+and Sec.
+F
+in the Appendix.
+Results.
+We find in Tab.
+3
+that both search and semantic feedback are crucial for better performance. Despite not using observations, ToT and RAP are competitive with Reflexion. LATS has the highest performance on both datasets. Since RAP uses a similar search algorithm as LATS, this reveals the importance of external feedback for difficult reasoning tasks such as programming. With GPT-4, using LATS sets the state of the art for HumanEval, showing LATS can be used with more advanced LLMs for higher performance.
+5.3
+Webshop
+For a complex decision-making environment with practical applications, we consider WebShop
+(Yao et al.,
+2022
+)
+, an online shopping environment composed of a website with 1.18M real-world products and 12k human instructions. Agents must navigate a website through a variety of commands to purchase an item matching a user specification. We use the preconstructed action space of search and click commands and browser feedback and reflections for the observation. The performance is gauged using two metrics: an average score, reflecting the percentage of user-specified attributes met by the selected product, and a success rate, indicating the frequency with which the chosen product fulfills all given conditions. We compare against acting-based prompting methods and RL-based approaches. We evaluate on 50 instructions, expand
+$n=5$ children for LATS, and set $k=30$ for LATS, ReAct best of $k$
+, and Reflexion. More details and prompts are in Appendix
+D
+and
+G
+.
+Results.
+We find in Tab.
+4
+that GPT-3.5 with ReAct is competitive to imitation learning, and can exceed reinforcement learning techniques with stronger prompting strategies. Sampling
+$k=30$
+trajectories with ReAct and Reflexion results in a similar performance, suggesting the semantic feedback is not as helpful in complex environments like WebShop. Indeed like in
+Shinn et al. (
+2023
+)
+, we find that generated reflections are often generic and do not provide useful feedback, resulting in a tendency for the agent to become stuck in local minima. However, using LATS indeed results in a noticeable improvement, indicating a more effective exploration for the same number of iterations.
+5.4
+Additional Observations
+Method
+Score
+SR
+ReAct
+(Yao et al.,
+2023b
+)
+53.8
+28.0
+ReAct (best of k)
+59.1
+32.0
+Reflexion
+(Shinn et al.,
+2023
+)
+64.2
+35.0
+LATS
+75.9
+38.0
+IL
+59.9
+29.1
+IL+RL
+62.4
+28.7
+Fine-tuning
+(Furuta et al.,
+2023
+)
+67.5
+45.0
+Expert
+82.1
+59.6
+Table 4:
+Score and success rate (SR) on Webshop. Table is separated into prompting, RL-based training, and human performance. For the same number of iterations, LATS improves both score and success rate, and surpasses RL-based training. IL/IL+RL taken from
+Yao et al. (
+2022
+)
+.
+Prompt Method
+HotPotQA (EM)
+ToT (ReAct)
+0.39
+RAP (ReAct)
+0.54
+LATS (No LM Heuristic)
+0.37
+LATS (DFS)
+0.42
+LATS (No Reflection)
+0.56
+LATS
+0.61
+Table 5:
+Ablation results on LATS and baseline variants in HotPotQA; we use ReAct as the base prompt and sample
+$n=5$ children and $k=50$
+maximum trajectories. LATS requires every component and operation for optimal performance.
+We also conduct additional experiments on HotPotQA to demonstrate the effect of each component of LATS. We also design versions of ToT and RAP that use the ReAct prompt and can handle external observations. We use HotPotQA as our setup incorporates both reasoning (through thoughts) and acting (through API calls); the results are shown in Tab.
+5
+. More ablations for token consumption on HotPotQA are in Tab.
+7
+in Appendix
+C
+. Note that baselines generally perform worse than the reasoning-only setting of HotPotQA, which indicates that the acting-based setting is more challenging and adaptation of search algorithms to decision-making scenarios is non-trivial.
+Self-reflection.
+We use self-reflection to provide additional semantic signals for the agent. We observe a
+$0.05$
+performance drop when removed from LATS, suggesting this is useful. This is a smaller gain than Reflexion
+(Shinn et al.,
+2023
+)
+observes over ReAct
+(Yao et al.,
+2023b
+)
+as shown in Tab.
+2
+, suggesting overlap between the types of questions where there is an improvement with self-reflection and search. This variant outperforms RAP-ReAct, reflecting our improvements to MCTS.
+Search Algorithm.
+MCTS is a more principled search algorithm than variants like A* or DFS search and the basis for observed performance gains. We observe the effects of using DFS, and incorporate the LM-based heuristic used in ToT
+(Yao et al.,
+2023a
+)
+in which branches with low values are pruned. This removes the selection and backpropagation operations, and we observe a
+$0.08$
+drop in performance when sampling the same number of nodes, but outperforms ToT-ReAct.
+6
+Conclusion
+In this work, we introduce Language Agent Tree Search (LATS), the first framework to unify planning, acting, and reasoning for enhanced LLM problem solving. By deliberately constructing trajectories with search algorithms, incorporating external feedback, and enabling agents to learn from experience, LATS addresses key limitations of prior prompting techniques. Our evaluations demonstrate the ability of LATS to harness LLM capabilities for a variety of decision-making tasks while keeping its reasoning ability without additional training. The proposed synergies between search, interaction, and reflection offer a versatile approach to autonomous decision-making, highlighting the potential of LLMs as generalist agents. A full discussion of the limitations and broader impacts is in Appendix
+B
+.
+References
+Ahn et al. (2022)
+Michael Ahn, Anthony Brohan, Noah Brown, Yevgen Chebotar, Omar Cortes, Byron David, Chelsea Finn, Chuyuan Fu, Keerthana Gopalakrishnan, Karol Hausman, Alex Herzog, Daniel Ho, Jasmine Hsu, Julian Ibarz, Brian Ichter, Alex Irpan, Eric Jang, Rosario Jauregui Ruano, Kyle Jeffrey, Sally Jesmonth, Nikhil J Joshi, Ryan Julian, Dmitry Kalashnikov, Yuheng Kuang, Kuang-Huei Lee, Sergey Levine, Yao Lu, Linda Luu, Carolina Parada, Peter Pastor, Jornell Quiambao, Kanishka Rao, Jarek Rettinghouse, Diego Reyes, Pierre Sermanet, Nicolas Sievers, Clayton Tan, Alexander Toshev, Vincent Vanhoucke, Fei Xia, Ted Xiao, Peng Xu, Sichun Xu, Mengyuan Yan, and Andy Zeng.
+Do as i can, not as i say: Grounding language in robotic affordances.
+arXiv:2204.01691
+, 2022.
+Anthony et al. (2017)
+T. Anthony, Z. Tian, and D. Barber.
+Thinking fast and slow with deep learning and tree search.
+In
+NIPS
+, 2017.
+Austin et al. (2021)
+Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, et al.
+Program synthesis with large language models.
+arXiv:2108.07732
+, 2021.
+Baker et al. (2022)
+Bowen Baker, Ilge Akkaya, Peter Zhokhov, Joost Huizinga, Jie Tang, Adrien Ecoffet, Brandon Houghton, Raul Sampedro, and Jeff Clune.
+Video pretraining (vpt): Learning to act by watching unlabeled online videos.
+arXiv:2206.11795
+, 2022.
+Besta et al. (2023)
+Maciej Besta, Nils Blach, Ales Kubicek, Robert Gerstenberger, Lukas Gianinazzi, Joanna Gajda, Tomasz Lehmann, Michal Podstawski, Hubert Niewiadomski, Piotr Nyczyk, and Torsten Hoefler.
+Graph of thoughts: Solving elaborate problems with large language models.
+arXiv:2308.09687
+, 2023.
+Bowman et al. (2015)
+Samuel R Bowman, Gabor Angeli, Christopher Potts, and Christopher D Manning.
+A large annotated corpus for learning natural language inference.
+In
+EMNLP
+, 2015.
+Brown et al. (2020)
+Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei.
+Language models are few-shot learners.
+In
+NeurIPS
+, 2020.
+Campbell et al. (2002)
+Murray Campbell, A Joseph Hoane Jr, and Feng-hsiung Hsu.
+Deep blue.
+Artificial intelligence
+, 2002.
+Chen et al. (2022a)
+Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen.
+Codet: Code generation with generated tests.
+arXiv:2207.10397
+, 2022a.
+Chen et al. (2021)
+Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.
+Evaluating large language models trained on code.
+arXiv:2107.03374
+, 2021.
+Chen et al. (2022b)
+Wenhu Chen, Xueguang Ma, Xinyi Wang, and William W Cohen.
+Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks.
+arXiv preprint arXiv:2211.12588
+, 2022b.
+Chowdhery et al. (2022)
+Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al.
+Palm: Scaling language modeling with pathways.
+arXiv:2204.02311
+, 2022.
+Cobbe et al. (2021)
+Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al.
+Training verifiers to solve math word problems.
+arXiv:2110.14168
+, 2021.
+Deng et al. (2023)
+Xiang Deng, Yu Gu, Boyuan Zheng, Shijie Chen, Samuel Stevens, Boshi Wang, Huan Sun, and Yu Su.
+Mind2web: Towards a generalist agent for the web.
+arXiv:2306.06070
+, 2023.
+Driess et al. (2023)
+Danny Driess, Fei Xia, Mehdi S. M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Pete Florence.
+Palm-e: An embodied multimodal language model.
+arXiv:2303.03378
+, 2023.
+Du et al. (2023)
+Yilun Du, Mengjiao Yang, Bo Dai, Hanjun Dai, Ofir Nachum, Joshua B. Tenenbaum, Dale Schuurmans, and Pieter Abbeel.
+Learning universal policies via text-guided video generation.
+arXiv:2302.00111
+, 2023.
+Evans (2010)
+Jonathan St BT Evans.
+Intuition and reasoning: A dual-process perspective.
+Psychological Inquiry
+, 2010.
+Fan et al. (2022)
+Linxi Fan, Guanzhi Wang, Yunfan Jiang, Ajay Mandlekar, Yuncong Yang, Haoyi Zhu, Andrew Tang, De-An Huang, Yuke Zhu, and Anima Anandkumar.
+Minedojo: Building open-ended embodied agents with internet-scale knowledge.
+In
+NeurIPS Datasets and Benchmarks Track
+, 2022.
+Furuta et al. (2023)
+Hiroki Furuta, Ofir Nachum, Kuang-Huei Lee, Yutaka Matsuo, Shixiang Shane Gu, and Izzeddin Gur.
+Multimodal web navigation with instruction-finetuned foundation models.
+arXiv preprint arXiv:2305.11854
+, 2023.
+Gao et al. (2022)
+Luyu Gao, Aman Madaan, Shuyan Zhou, Uri Alon, Pengfei Liu, Yiming Yang, Jamie Callan, and Graham Neubig.
+Pal: Program-aided language models.
+arXiv preprint arXiv:2211.10435
+, 2022.
+Guo et al. (2018)
+Jiaxian Guo, Sidi Lu, Han Cai, Weinan Zhang, Yong Yu, and Jun Wang.
+Long text generation via adversarial training with leaked information.
+AAAI
+, 2018.
+Guss et al. (2019)
+William H. Guss, Brandon Houghton, Nicholay Topin, Phillip Wang, Cayden Codel, Manuela Veloso, and Ruslan Salakhutdinov.
+Minerl: A large-scale dataset of minecraft demonstrations.
+In
+IJCAI
+, 2019.
+Hafner et al. (2019)
+Danijar Hafner, Timothy Lillicrap, Ian Fischer, Ruben Villegas, David Ha, Honglak Lee, and James Davidson.
+Learning latent dynamics for planning from pixels.
+In
+ICML
+, 2019.
+Hafner et al. (2023)
+Danijar Hafner, Jurgis Pasukonis, Jimmy Ba, and Timothy Lillicrap.
+Mastering diverse domains through world models.
+arXiv:2301.04104
+, 2023.
+Hao et al. (2023)
+Shibo Hao, Yi Gu, Haodi Ma, Joshua Jiahua Hong, Zhen Wang, Daisy Zhe Wang, and Zhiting Hu.
+Reasoning with language model is planning with world model.
+arXiv:2305.14992
+, 2023.
+Huang et al. (2023)
+Jie Huang, Xinyun Chen, Swaroop Mishra, Huaixiu Steven Zheng, Adams Wei Yu, Xinying Song, and Denny Zhou.
+Large language models cannot self-correct reasoning yet.
+arXiv:2310.01798
+, 2023.
+Huang et al. (2022)
+Wenlong Huang, Fei Xia, Ted Xiao, Harris Chan, Jacky Liang, Pete Florence, Andy Zeng, Jonathan Tompson, Igor Mordatch, Yevgen Chebotar, et al.
+Inner monologue: Embodied reasoning through planning with language models.
+arXiv:2207.05608
+, 2022.
+Jiang et al. (2018)
+D. Jiang, E. Ekwedike, and H. Liu.
+Feedback-based tree search for reinforcement learning.
+In
+ICML
+, 2018.
+Kocsis & Szepesvári (2006)
+Levente Kocsis and Csaba Szepesvári.
+Bandit based monte-carlo planning.
+In
+ECML
+, 2006.
+Kojima et al. (2022)
+Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa.
+Large language models are zero-shot reasoners.
+arXiv:2205.11916
+, 2022.
+LaValle et al. (2001)
+Steven M LaValle, James J Kuffner, BR Donald, et al.
+Rapidly-exploring random trees: Progress and prospects.
+Algorithmic and computational robotics: new directions
+, 2001.
+Liu et al. (2018)
+Evan Zheran Liu, Kelvin Guu, Panupong Pasupat, Tianlin Shi, and Percy Liang.
+Reinforcement learning on web interfaces using workflow-guided exploration.
+In
+ICLR
+, 2018.
+Liu et al. (2023)
+Xiao Liu, Hao Yu, Hanchen Zhang, Yifan Xu, Xuanyu Lei, Hanyu Lai, Yu Gu, Hangliang Ding, Kaiwen Men, Kejuan Yang, Shudan Zhang, Xiang Deng, Aohan Zeng, Zhengxiao Du, Chenhui Zhang, Sheng Shen, Tianjun Zhang, Yu Su, Huan Sun, Minlie Huang, Yuxiao Dong, and Jie Tang.
+Agentbench: Evaluating llms as agents.
+arXiv:2308.03688
+, 2023.
+Madaan et al. (2023)
+Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, Shashank Gupta, Bodhisattwa Prasad Majumder, Katherine Hermann, Sean Welleck, Amir Yazdanbakhsh, and Peter Clark.
+Self-refine: Iterative refinement with self-feedback.
+arXiv:2303.17651
+, 2023.
+Nallapati et al. (2016)
+Ramesh Nallapati, Bowen Zhou, Cicero dos Santos, Caglar Gulcehre, and Bing Xiang.
+Abstractive text summarization using sequence-to-sequence rnns and beyond.
+In
+SIGNLL
+, 2016.
+Nye et al. (2021)
+Maxwell Nye, Anders Johan Andreassen, Guy Gur-Ari, Henryk Michalewski, Jacob Austin, David Bieber, David Dohan, Aitor Lewkowycz, Maarten Bosma, David Luan, et al.
+Show your work: Scratchpads for intermediate computation with language models.
+arXiv:2112.00114
+, 2021.
+OpenAI (2023)
+OpenAI.
+Gpt-4 technical report.
+arXiv:2303.08774
+, 2023.
+Saparov & He (2022)
+Abulhair Saparov and He He.
+Language models are greedy reasoners: A systematic formal analysis of chain-of-thought.
+arXiv:2210.01240
+, 2022.
+Schick et al. (2023)
+Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta Raileanu, Maria Lomeli, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom.
+Toolformer: Language models can teach themselves to use tools.
+arXiv:2302.04761
+, 2023.
+Shen et al. (2023)
+Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, and Yueting Zhuang.
+Hugginggpt: Solving ai tasks with chatgpt and its friends in huggingface.
+arXiv:2303.17580
+, 2023.
+Shinn et al. (2023)
+Noah Shinn, Federico Cassano, Beck Labash, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao.
+Reflexion: Language agents with verbal reinforcement learning.
+arXiv:2303.11366
+, 2023.
+Shridhar et al. (2020)
+Mohit Shridhar, Xingdi Yuan, Marc-Alexandre Côté, Yonatan Bisk, Adam Trischler, and Matthew Hausknecht.
+Alfworld: Aligning text and embodied environments for interactive learning.
+arXiv:2010.03768
+, 2020.
+Silver et al. (2016)
+David Silver, Aja Huang, Chris J Maddison, Arthur Guez, Laurent Sifre, George Van Den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al.
+Mastering the game of go with deep neural networks and tree search.
+Nature
+, 2016.
+Silver et al. (2017)
+David Silver, Julian Schrittwieser, Karen Simonyan, Ioannis Antonoglou, Aja Huang, Arthur Guez, Thomas Hubert, Lucas Baker, Matthew Lai, Adrian Bolton, Yutian Chen, Timothy P. Lillicrap, Fan Hui, L. Sifre, George van den Driessche, Thore Graepel, and Demis Hassabis.
+Mastering the game of go without human knowledge.
+Nature
+, 2017.
+Sloman (1996)
+Steven A. Sloman.
+The empirical case for two systems of reasoning.
+Psychological Bulletin
+, 1996.
+Sun et al. (2023)
+Haotian Sun, Yuchen Zhuang, Lingkai Kong, Bo Dai, and Chao Zhang.
+Adaplanner: Adaptive planning from feedback with language models.
+arXiv:2305.16653
+, 2023.
+Surís et al. (2023)
+Dídac Surís, Sachit Menon, and Carl Vondrick.
+Vipergpt: Visual inference via python execution for reasoning.
+arXiv preprint arXiv:2303.08128
+, 2023.
+Świechowski et al. (2023)
+Maciej Świechowski, Konrad Godlewski, Bartosz Sawicki, and Jacek Mańdziuk.
+Monte carlo tree search: A review of recent modifications and applications.
+Artificial Intelligence Review
+, 2023.
+Touvron et al. (2023)
+Hugo Touvron, Louis Martin, Kevin R. Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Daniel M. Bikel, Lukas Blecher, Cristian Cantón Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony S. Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel M. Kloumann, A. V. Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, R. Subramanian, Xia Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zhengxu Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and
+Thomas Scialom.
+Llama 2: Open foundation and fine-tuned chat models.
+arXiv:2307.09288
+, 2023.
+Vodopivec et al. (2017)
+Tom Vodopivec, Spyridon Samothrakis, and Branko Ster.
+On monte carlo tree search and reinforcement learning.
+Journal of Artificial Intelligence Research
+, 2017.
+Wang et al. (2023)
+Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, and Anima Anandkumar.
+Voyager: An open-ended embodied agent with large language models.
+arXiv:2305.16291
+, 2023.
+Wang et al. (2022)
+Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, and Denny Zhou.
+Self-consistency improves chain of thought reasoning in language models.
+arXiv:2203.11171
+, 2022.
+Wei et al. (2022)
+Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Chi, Quoc Le, and Denny Zhou.
+Chain of thought prompting elicits reasoning in large language models.
+arXiv:2201.11903
+, 2022.
+Wooldridge & Jennings (1995)
+Michael Wooldridge and Nicholas R Jennings.
+Intelligent agents: Theory and practice.
+The knowledge engineering review
+, 1995.
+Wu et al. (2023)
+Philipp Wu, Alejandro Escontrela, Danijar Hafner, Pieter Abbeel, and Ken Goldberg.
+Daydreamer: World models for physical robot learning.
+In
+CoRL
+. PMLR, 2023.
+Xie et al. (2023)
+Yuxi Xie, Kenji Kawaguchi, Yiran Zhao, Xu Zhao, Min-Yen Kan, Junxian He, and Qizhe Xie.
+Decomposition enhances reasoning via self-evaluation guided decoding.
+arXiv:2305.00633
+, 2023.
+Yang et al. (2018)
+Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning.
+Hotpotqa: A dataset for diverse, explainable multi-hop question answering.
+arXiv:1809.09600
+, 2018.
+Yao et al. (2022)
+Shunyu Yao, Howard Chen, John Yang, and Karthik R Narasimhan.
+Webshop: Towards scalable real-world web interaction with grounded language agents.
+In
+NeurIPS
+, 2022.
+Yao et al. (2023a)
+Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Thomas L. Griffiths, Yuan Cao, and Karthik Narasimhan.
+Tree of thoughts: Deliberate problem solving with large language models.
+arXiv:2305.10601
+, 2023a.
+Yao et al. (2023b)
+Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao.
+ReAct: Synergizing reasoning and acting in language models.
+In
+ICLR
+, 2023b.
+Yao et al. (2023c)
+Weiran Yao, Shelby Heinecke, Juan Carlos Niebles, Zhiwei Liu, Yihao Feng, Le Xue, Rithesh Murthy, Zeyuan Chen, Jianguo Zhang, Devansh Arpit, Ran Xu, Phil Mui, Huan Wang, Caiming Xiong, and Silvio Savarese.
+Retroformer: Retrospective large language agents with policy gradient optimization.
+arXiv preprint arXiv:2308.02151
+, 2023c.
+Ye et al. (2021)
+Weirui Ye, Shaohuai Liu, Thanard Kurutach, Pieter Abbeel, and Yang Gao.
+Mastering atari games with limited data.
+In
+NeurIPS
+, 2021.
+Zhou et al. (2022)
+Denny Zhou, Nathanael Schärli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Olivier Bousquet, Quoc Le, and Ed Chi.
+Least-to-most prompting enables complex reasoning in large language models.
+arXiv:2205.10625
+, 2022.
+Zhu et al. (2023)
+Xizhou Zhu, Yuntao Chen, Hao Tian, Chenxin Tao, Weijie Su, Chenyu Yang, Gao Huang, Bin Li, Lewei Lu, Xiaogang Wang, Yu Qiao, Zhaoxiang Zhang, and Jifeng Dai.
+Ghost in the minecraft: Generally capable agents for open-world environments via large language models with text-based knowledge and memory.
+arXiv:2305.17144
+, 2023.
+7
+Appendix
+The appendix is organized as follows. First in Sec.
+A
+, we show the pseudocode of our proposed algorithm, LATS; then in Sec.
+B
+, we provide further discussion of our method and its limitations, future direction and broader impact; then in Sec.
+C
+we provide additional experimental results; then in Sec.
+D
+, we specify the environment details in our experiments; finally, we list our prompts used for the three environments in Sec.
+E
+(HotPotQA), Sec.
+F
+(Programming) and Sec.
+G
+(Webshop) respectively.
+Appendix A
+LATS Pseudocode
+Alg.
+1
+shows the pseudocode of our algorithm LATS. Nodes are stored explicitly in the memory. Unless otherwise specified, in all experiments we use
+$n=5$ and $w=1$.
+Algorithm 1
+$\operatorname{LATS}(S_{0},p_{\theta},p_{V},p_{\text{ref}},d,k,n,w)$
+Initial state $s_{1}$, action generator $p_{\theta}$, value function $p_{V}$, reflection generator $p_{\text{ref}}$, number of generated actions $n$, depth limit $L$, number of roll-outs $K$, context $c$, and exploration weight $w$
+Initialize action space $A$, observation space $O$
+Initialize the state-action value function $p_{V}:S\times A\mapsto\mathbb{R}$ and visit counter $N:S\mapsto\mathbb{N}$ to zero
+for $k\leftarrow 0,\dots,K-1$ do
+for $t\leftarrow 0,\dots,L-1$ do
+if $s_{t}$ not terminal then $\triangleright$ Expansion & Simulation
+for $i\leftarrow 1,\dots,n$ do
+Sample $a_{t}^{(i)}\sim p_{\theta}(a\mid s_{t})$
+Get $o_{t}^{(i)}$ from environment, $s_{t+1}^{(i)}\leftarrow(c_{t}^{(i)},o_{t}^{(i)},a_{t}^{(i)})$, $c_{t+1}^{(i)}\leftarrow(o_{t}^{(i)},a_{t}^{(i)})$
+Evaluate $V_{t}^{(i)}\sim p_{V}(s_{t}^{(i)})$ $\triangleright$ Evaluation
+$V(s_{t})\leftarrow V_{t}^{(i)}$
+Add $s_{t}^{(i)}$ to children
+end for
+end if
+if $s_{t}$ is terminal then $\triangleright$ Reflection
+Get $r$ from environment
+if $r$ not success then
+$\text{reflection}\leftarrow p_{\text{ref}}(c_{t})$
+$c\leftarrow\text{reflection}$
+end if
+end if
+$a_{t}\leftarrow\arg\max_{a\in e(s_{t})}\left[V(s_{t})+w\sqrt{\frac{\ln N(s_{t-1})}{N(s_{t})}}\right]$ $\triangleright$ Selection
+$N(s_{t+1})\leftarrow N(s_{t+1})+1$
+if $a_{t}$ is an output action then break
+end for
+$T\leftarrow$ the actual number of steps
+for $t\leftarrow T-1,\dots,0$ do $\triangleright$ Backpropagation
+$V(s_{t})\leftarrow\frac{V(s_{t})(N(s_{t})-1)+r}{N(s_{t})}$
+end for
+end for
+Appendix B
+Discussion
+Limitations.
+Although LATS can improve reasoning and decision-making, this arrives at a higher computational cost relative to simpler prompting methods like ReAct or Reflexion. The search process takes more time than standard prompting or simpler techniques, and requires greater inference costs. While such an issue is mitigated by the fact that the number of nodes $n$ expanded at every step provides a natural trade-off between performance and efficiency (setting $n=1$ makes the method as efficient as ReAct with multiple trials or CoT-SC), in practice we recommend using LATS for difficult tasks like programming or for situations where performance is prioritized over efficiency. We hope that continued advancements in LLMs will reduce costs and increase the practicality of LATS.
+Additionally, the benchmarks we use in this paper are relatively simple and focused on decision-making, compared to the complexity of real-world interactive environments. In addition, some environments might not easily support rollbacks to previous states. However, the design of LATS is flexible and can be adjusted to various resource constraints. Using planning-based prompting methods like LATS in environments like Minecraft
+(Fan et al.,
+2022
+)
+and more reasoning benchmarks would be interesting avenues for future work.
+Broader impact.
+LATS is a framework that enhances LLM performance through interactions with an environment. This improvement in autonomous decision-making may facilitate harmful uses of LLMs. Alternatively, LATS enhances interpretability and the potential for greater alignment, as it generates understandable, high-level linguistic reasoning and actions through several rounds of decision-making and reflection, rather than relying on implicit, low-level token values.
+Appendix C
+Ablations
+Prompt Method
+HotpotQA (EM)
+LATS (w=0.5)
+0.55
+LATS (w=2.0)
+0.61
+LATS (d=4)
+0.58
+LATS (CoT)
+0.60
+LATS (No LM Heuristic)
+0.37
+LATS
+0.61
+Table 6:
+Ablation results on LATS and baseline variants in HotPotQA measured by Exact Match (EM). We test different depth $d$, exploration factor $w$, and versions of LATS using CoT and without the LM value function. We sample $n=5$ and $k=50$ trajectories.
+Figure 4:
+Performance over successive iterations on HumanEval with GPT-3.5.
+In this section, we ablate various designs of LATS. Experiments are conducted on HotPotQA with a maximum of $k=50$ trajectories and sampling size of $n=5$ and HumanEval with a maximum of $k=8$ trajectories and sampling size of $n=5$. The result for HotPotQA is shown in Tab. 5 and HumanEval in Fig. 4.
+Exploration weight.
+We find that there is lower performance on HotPotQA when the exploration weight $w$ in the selection formula is decreased to $0.5$, suggesting that this reduces the effectiveness of the search. Increasing $w$ to $2.0$ does not lead to a performance improvement, but we tend to observe faster convergence. The optimal setting depends on the particular environment and complexity of the state space.
+Depth.
+In our main experiments we use a maximum depth of $d=7$ on HotPotQA for all methods, following previous work (Yao et al., 2023b). We ablate the effect on LATS after reducing it to $d=4$. This results in only a slight drop in performance. We find that most questions can be answered within four steps, and using a greater number of steps tends to force the agent into local minima and rarely improves success.
+LM value function.
+The LM value function scores states based on expected future reward. Without this heuristic, the only signal to guide search would be from environment rewards for completed trajectories, which are scarce and often binary. When we remove the evaluation operation, we observe a dramatic $0.24$ drop in performance.
+Performance over time.
+To see the effects of increasing the number of trajectories sampled, we change $k$ to different values. We conduct this experiment on HumanEval, which has a more noticeable difference due to sampling fewer trajectories. The results are shown in Fig. 4, in which LATS scales better with more iterations than Reflexion.
+Sample complexity and Token cost.
+One possible concern of LATS is that the tree-structured search might consume many more tokens than existing methods. To further study the computational cost of LATS compared to prior methods, we examine the sample complexity (i.e. asymptotic token cost) of all methods considered in this paper, and count the average number of nodes expanded by our method and other tree-structured methods (ToT and RAP) upon successful search on HotPotQA. We present the results in Tab. 7; the result shows that our method has the same sample complexity as other tree-based search methods, and has a lower average number of nodes expanded upon success, which indicates less token cost. The token cost gap will be even larger when taking failed trajectories into account, since our method has a higher success rate and reaches the computational budget limit less often.
+Method
+Performance ($\uparrow$)
+Sample complexity ($\downarrow$)
+Avg. #nodes upon success ($\downarrow$)
+ReAct (Best $k=250$)
+$0.42$
+$O(k)$
+N/A
+CoT-SC ($n=1,k=250$)
+$0.40$
+$O(k)$
+N/A
+LATS ($n=1,k=50$)
+$0.48$
+$O(k)$
+N/A
+ToT (ReAct)
+$0.49$
+$O(kn)$
+$84.05$
+RAP (ReAct)
+$0.54$
+$O(kn)$
+$70.60$
+LATS ($n=5,k=50$)
+$0.61$
+$O(kn)$
+$66.65$
+Table 7:
+The performance, sample complexity of different methods and average number of nodes expanded upon success by methods with tree-based search. $n$ is the number of children nodes expanded at every step and $k$ is the number of trajectories. Our method has the same sample complexity as other methods with tree-based search and expands fewer nodes upon success, which indicates lower token cost.
+Appendix D
+Environment Details
+D.1
+HotPotQA
+Figure 5:
+Example trajectories on HotPotQA for ReAct (left) and LATS (right). LATS can sample more actions and avoid failure from previous mistakes by evaluating states with an LM to guide the search toward promising areas of the tree.
+HotPotQA
+(Yang et al.,
+2018
+)
+is a question-answering dataset that requires reasoning over multiple supporting documents to answer questions. It contains 113k Wikipedia-based question-answer pairs crafted by crowdworkers to be diverse, multi-hop, and explainable. Questions cover a range of types like entities, locations, dates, and comparison of shared properties between two entities. Crowdworkers also provide supporting facts from the documents that justify the answer. We use the HotPotQA benchmark setting with all the Wikipedia paragraphs to test retrieval. We use a randomly selected subset of 100 questions for our experiments and a maximum depth limit of 6. Fig.
+5
+illustrates how ReAct and LATS work on an example task of HotPotQA, and gives a qualitative example on how LATS outperforms ReAct on the task.
+Action Space.
+We adopt the Wikipedia web API proposed in
+Yao et al. (
+2023b
+)
+, with three types of actions to support interactive information retrieval:
+(1) search[entity], which returns the first 5 sentences from the corresponding entity wiki page if it exists, or else suggests top-5 similar entities from the Wikipedia search engine,
+(2) lookup[string], which returns the next sentence in the page containing string,
+(3) finish[answer], which finishes the current task with answer.
+These API calls and free-form thoughts form the action space for this environment.
+D.2
+Programming
+The HumanEval dataset
+(Chen et al.,
+2021
+)
+is a collection of 164 handwritten programming problems introduced to evaluate the functional correctness of models for synthesizing programs from natural language descriptions. Each problem includes a function signature, docstring description, reference implementation, and multiple unit tests, with an average of 7.7 tests per problem. The programming tasks assess comprehension of natural language, reasoning, algorithms, and basic mathematics, at a difficulty level comparable to simple software interview questions. Pass rates are evaluated with the pass@k metric, where k samples are generated per problem and a problem is considered solved if any sample passes all tests. We use all 164 problems for our experiments and a maximum depth limit of 8.
+The Mostly Basic Programming Problems (MBPP)
+Austin et al. (
+2021
+)
+benchmark contains 974 short Python functions designed to evaluate program synthesis techniques. The dataset was constructed by crowdsourcing from workers with basic Python knowledge. Each data point consists of a natural language description of a programming task, a reference solution implementation, and three test cases for functional correctness. The natural language prompts are typically short, one-sentence descriptions. Solutions cover common programming constructs including mathematical operations, list processing, string manipulation, and usage of the Python standard library. On average, solutions are 6.8 lines of code. The dataset is also supplemented with an additional set of 426 problems that were manually verified for unambiguous specifications, standard function signatures, and accurate test cases. We use a randomly selected subset of 397 problems for our experiments.
+D.3
+WebShop
+WebShop
+(Yao et al.,
+2022
+)
+is an interactive web-based environment designed to evaluate agents on grounded language understanding and decision-making. It simulates an e-commerce shopping task by providing agents with over 1 million real-world products scraped from Amazon, spanning 5 categories and 113 subcategories. These products contain rich linguistic information, with an average text length of 262 words and a vocabulary size of 224k. In addition, there are over 800k unique product options available for customization. The environment renders webpages in two modes: HTML mode provides pixel-level observations with interactive elements, while simple mode converts the raw HTML into a structured text observation more amenable for training agents. The action space consists of query searches and button clicks, which transition between 4 page types: search, results, item and item-detail. Instructions are crowdsourced natural language specifying product attributes and options, with a total of 12k collected. Automatic rewards are computed by comparing the product purchased by the agent against the attributes and options specified in the instruction, using both lexical matching and semantic similarity metrics.
+Type
+Argument
+State $\rightarrow$ Next State
+search
+[Query]
+Search $\rightarrow$ Results
+choose
+Back to search
+$*$ $\rightarrow$ Search
+choose
+Prev/Next page
+Results $\rightarrow$ Results
+choose
+[Product title]
+Results $\rightarrow$ Item
+choose
+[Option]
+Item $\rightarrow$ Item
+choose
+Desc/Overview
+Item $\rightarrow$ Item-Detail
+choose
+Previous
+Item-Detail $\rightarrow$ Item
+choose
+Buy
+Item $\rightarrow$ Episode End
+Table 8:
+Action space of webshop.
+There are two evaluation metrics used in WebShop: (1) Task Score: defined as $(100\times\text{avg. reward})$, which captures the average reward obtained across episodes; and (2) Success Rate (SR) defined as the portion of instructions where $r=1$. The reward is calculated based on the number of attributes satisfied by the selected item. We use 50 environments for our experiments and a maximum depth limit of 15.
+Appendix E
+HotPotQA Prompts
+E.1
+Base Acting Prompt
+Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
+(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
+(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
+(3) Finish[answer], which returns the answer and finishes the task.
+After each observation, provide the next Thought and next Action.
+Here are some examples:
+Question: Which magazine was started first Arthur’s Magazine or First for Women?
+Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first.
+Action 1: Search[Arthur’s Magazine]
+Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3]
+Thought 2: Arthur’s Magazine was started in 1844. I need to search First for Women next.
+Action 2: Search[First for Women]
+Observation 2: First for Women is a woman’s magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. It is based in Englewood Cliffs, New Jersey.[2] In 2011 the circulation of the magazine was 1,310,696 copies.[3]
+Thought 3: First for Women was started in 1989. 1844 (Arthur’s Magazine) < 1989 (First for Women), so Arthur’s Magazine was started first.
+Action 3: Finish[Arthur’s Magazine]
+(examples)
+You have attempted to answer the following question before and failed. The following reflection(s) give a plan to avoid failing to answer the question in the same way you did previously. Use them to improve your strategy of correctly answering the given question.
+(trajectories)
+(input)
+E.2
+Base Reasoning Prompt
+Solve a question answering task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task. You will be given context that you should use to help you answer the question. Start your response with either Action or an indexed Thought
+Here are some examples:
+Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
+Let’s think step by step.
+Thought 1: The eastern sector of Colorado orogeny extends into the High Plains.
+Thought 2: High Plains rise in elevation from around 1,800 to 7,000 ft
+Thought 3: The answer is 1,800 to 7,000 ft.
+Action: Finish[1,800 to 7,000 ft]
+(examples)
+Previous trial:
+(trajectories)
+(input)
+E.3
+Value Function Prompt
+Analyze the trajectories of a solution to a question answering task. The trajectories are labeled by environmental observations about the situation, thoughts that can reason about the current situation and actions that can be three types:
+(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
+(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
+(3) Finish[answer], which returns the answer and finishes the task.
+Given a question and a trajectory, evaluate its correctness and provide your reasoning and analysis in detail. Focus on the latest thought, action, and observation. Incomplete trajectories can be correct if the thoughts and actions so far are correct, even if the answer is not found yet. Do not generate additional thoughts or actions. Then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10.
+Question: Which magazine was started first Arthur’s Magazine or First for Women?
+Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first.
+Action 1: Search[Arthur’s Magazine]
+Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3]
+This trajectory is correct as it is reasonable to search for the first magazine provided in the question. It is also better to have simple searches corresponding to a single entity, making this the best action.
+Thus the correctness score is 10
+(other examples)
+(failed trajectories)
+(context)
+E.4
+Reflection Prompt
+Analyze the trajectories of a solution to a question answering task. The trajectories are labeled by environmental observations about the situation, thoughts that can reason about the current situation and actions that can be three types:
+(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
+(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
+(3) Finish[answer], which returns the answer and finishes the task.
+Given a question and a trajectory, evaluate its correctness and provide your reasoning and analysis in detail. Focus on the latest thought, action, and observation. Incomplete trajectories can be correct if the thoughts and actions so far are correct, even if the answer is not found yet. Do not generate additional thoughts or actions. Then at the last line conclude ”Thus the correctness score is s”, where s is an integer from 1 to 10.
+Question: Which magazine was started first Arthur’s Magazine or First for Women?
+Thought 1: I need to search Arthur’s Magazine and First for Women, and find which was started first.
+Action 1: Search[Arthur’s Magazine]
+Observation 1: Arthur’s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by Timothy Shay Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others.[1][2] In May 1846 it was merged into Godey’s Lady’s Book.[3]
+This trajectory is correct as it is reasonable to search for the first magazine provided in the question. It is also better to have simple searches corresponding to a single entity, making this the best action.
+Thus the correctness score is 10
+(other examples)
+(failed trajectories)
+(context)
+Appendix F
+Programming Prompts
+F.1
+HumanEval function implementation example
+Sample function signature:
+⬇
+def
+minSubArraySum
+(
+nums
+):
+Given
+an
+array
+of
+integers
+nums
+,
+find
+the
+minimum
+sum
+of
+any
+non
+-
+empty
+sub
+-
+array
+of
+nums
+.
+Example
+minSubArraySum
+([2,
+3,
+4,
+1,
+2,
+4])
+==
+1
+minSubArraySum
+([-1,
+-2,
+-3])
+==
+-6
+Sample function body implementation:
+⬇
+min_sum
+=
+float
+(’
+inf
+’)
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+current_sum
+=
+0
+for
+j
+in
+range
+(
+i
+,
+len
+(
+nums
+)):
+current_sum
++=
+nums
+[
+j
+]
+if
+current_sum
+<
+min_sum
+:
+min_sum
+=
+current_sum
+return
+min_sum
+F.2
+Base Acting/Reasoning Prompt
+You are an AI Python assistant. You will be given your previous implementation of a function, a series of unit tests results, and your self-reflection on your previous implementation. Write your full implementation (restate the function signature).
+Example 1:
+[previous impl]:
+⬇
+def
+add
+(
+a
+:
+int
+,
+b
+:
+int
+)
+->
+int
+:
+”””
+Given
+integers
+a
+and
+b
+,
+return
+the
+total
+value
+of
+a
+and
+b
+.
+”””
+return
+a
+-
+b
+[unit test results from previous impl]:
+Tested passed:
+Tests failed:
+assert add(1, 2) == 3 # output: -1
+assert add(1, 2) == 4 # output: -1
+[reflection on previous impl]:
+The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from ‘-‘ to ‘+‘ in the return statement. This will ensure that the function returns the correct output for the given input.
+[improved impl]:
+⬇
+```python
+def add(a: int, b: int) -> int:
+    """
+    Given integers a and b, return the total value of a and b.
+    """
+    return a + b
+```
+F.3
+Reflection Prompt
+You are a Python programming assistant. You will be given a function implementation and a series of unit test results. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as guidance when you try again later. Only provide the few sentence description in your answer, not the implementation. You will be given a few examples by the user.
+Example 1:
+[previous impl]:
+⬇
+```python
+def add(a: int, b: int) -> int:
+    """
+    Given integers a and b, return the total value of a and b.
+    """
+    return a - b
+```
+[unit test results from previous impl]:
+Tested passed:
+Tests failed:
+assert add(1, 2) == 3 # output: -1
+assert add(1, 2) == 4 # output: -1
+[reflection on previous impl]:
+The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from ‘-’ to ‘+’ in the return statement. This will ensure that the function returns the correct output for the given input.
+F.4
+Test Case Generation Prompt
+You are an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring.
+(input)
+Appendix G
+WebShop Prompts
+G.1
+Acting Prompt
+Webshop
+Instruction:
+i would like a 3 ounce bottle of bright citrus deodorant for sensitive skin, and price lower than 50.00 dollars
+[Search]
+Action: search[3 ounce bright citrus deodorant sensitive skin]
+Observation:
+[Back to Search]
+Page 1 (Total results: 50)
+[Next]
+[B078GWRC1J]
+Bright Citrus Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce
+$10.99
+[B078GTKVXY]
+Ginger Fresh Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce
+$10.99
+[B08KBVJ4XN]
+Barrel and Oak - Aluminum-Free Deodorant, Deodorant for Men, Essential Oil-Based Scent, 24-Hour Odor Protection, Cedar & Patchouli Blend, Gentle on Sensitive Skin (Mountain Sage, 2.7 oz, 2-Pack)
+$15.95
+Action: think[B078GWRC1J and B078GTKVXY are bright citrus deodorant less then 50 dollars. I can check B078GWRC1J first.]
+Observation: OK.
+Action: click[B078GWRC1J]
+Observation:
+[Back to Search]
+[Prev]
+scent [assorted scents][bright citrus][calming lavender][ginger fresh][simply non-scents]
+size [travel set (4-pack)][3 ounce (pack of 1)][3-ounce (2-pack)]
+Bright Citrus Deodorant by Earth Mama — Natural and Safe for Sensitive Skin, Pregnancy and Breastfeeding, Contains Organic Calendula 3-Ounce
+Price: $10.99
+Rating: N.A.
+[Description]
+[Features]
+[Reviews]
+[Buy Now]
+Action: think[For 3 ounce bottle of bright citrus deodorant for sensitive skin, the item has options ‘bright citrus’ and ‘3 ounce (pack of 1)’ and seems good to buy.]
+Observation: OK.
+Action: click[bright citrus]
+Observation: You have clicked bright citrus.
+Action: click[3 ounce (pack of 1)]
+Observation: You have clicked 3 ounce (pack of 1).
+Action: click[Buy Now]
+G.2
+Value Function Prompt
+Given an item to purchase and a trajectory that aims to buy an item that exactly matches the specification, which corresponds to the ideal score of 1.0, analyze the following trajectory, then at the last line conclude “Thus the correctness score is s”, where s is an integer from 1 to 10.
+Here are some examples
+Webshop
+Instruction:
+i am looking for dairy free and apple variety pack of chips, and price lower than 30.00 dollars
+[Search]
+Action: search[dairy free and apple variety pack of chips]
+Observation:
+[Back to Search]
+Page 1 (Total results: 50)
+[Next]
+[B07HRFJWP8]
+Enjoy Life Foods Soft Baked Ovals, Breakfast Bars, Nut Free Bars, Soy Free, Dairy Free, Non GMO, Gluten Free, Vegan, Variety Pack, 4 Boxes (20 Bars Total)
+$100.0
+[B01KMHY5PG]
+Enjoy Life Soft Baked Chewy Bars, Variety Pack, Nut Free Bars, Soy Free, Dairy Free, Gluten Free, 6 Boxes (30 Total Bars)
+$21.49
+[B008D2X8C4]
+Enjoy Life Lentil Chips Variety Pack, Dairy Free Chips, Soy Free, Nut Free, Non GMO, Vegan, Gluten Free, 24 Bags (0.8 oz)
+$100.0
+Action: think[The first two items are not what I want. The third item is the dairy free and apple variety pack of chips I want.]
+Observation: OK.
+Action: click[B008D2X8C4]
+Observation:
+[Back to Search]
+[Prev]
+flavor name [dill and sour cream][garlic & parmesan][light sea salt][margherita pizza][thai chili lime][variety pack]
+size [0.8 ounce (pack of 24)][4 ounce (pack of 12)]
+Price: $100.0
+Rating: N.A.
+[Description]
+[Features]
+[Reviews]
+[Buy Now]
+Action: think[The item has options ‘variety pack’ and ‘0.8 ounce (pack of 24)’ and seems good to buy.]
+Observation: OK.
+Action: click[variety pack]
+Observation: You have clicked variety pack.
+Action: click[0.8 ounce (pack of 24)]
+Observation: You have clicked 0.8 ounce (pack of 24).
+Action: click[Buy Now]
+Reflection: In this attempt, I was unsuccessful. I accidentally bought a product that was $100, which is more than my budget of $30. Either way, the initial search results were not good. Next time, I will do search[“variety pack of chips”] and then check if the results meet the dairy free and the $30 budget constraints. I will continue to refine my searches so that I can find more products.
+Thus the correctness score is 5
+(input)
+G.3
+Reflection Prompt
+Given an item to purchase and a trajectory that aims to buy an item that exactly matches the specification, which corresponds to the ideal score of 1.0, analyze the following trajectory, then at the last line conclude “Thus the correctness score is s”, where s is an integer from 1 to 10.
+Here are some examples
+Webshop
+Instruction:
+i am looking for dairy free and apple variety pack of chips, and price lower than 30.00 dollars
+[Search]
+Action: search[dairy free and apple variety pack of chips]
+Observation:
+[Back to Search]
+Page 1 (Total results: 50)
+[Next]
+[B07HRFJWP8]
+Enjoy Life Foods Soft Baked Ovals, Breakfast Bars, Nut Free Bars, Soy Free, Dairy Free, Non GMO, Gluten Free, Vegan, Variety Pack, 4 Boxes (20 Bars Total)
+$100.0
+[B01KMHY5PG]
+Enjoy Life Soft Baked Chewy Bars, Variety Pack, Nut Free Bars, Soy Free, Dairy Free, Gluten Free, 6 Boxes (30 Total Bars)
+$21.49
+[B008D2X8C4]
+Enjoy Life Lentil Chips Variety Pack, Dairy Free Chips, Soy Free, Nut Free, Non GMO, Vegan, Gluten Free, 24 Bags (0.8 oz)
+$100.0
+Action: think[The first two items are not what I want. The third item is the dairy free and apple variety pack of chips I want.]
+Observation: OK.
+Action: click[B008D2X8C4]
+Observation:
+[Back to Search]
+[Prev]
+flavor name [dill and sour cream][garlic & parmesan][light sea salt][margherita pizza][thai chili lime][variety pack]
+size [0.8 ounce (pack of 24)][4 ounce (pack of 12)]
+Price: $100.0
+Rating: N.A.
+[Description]
+[Features]
+[Reviews]
+[Buy Now]
+Action: think[The item has options ‘variety pack’ and ‘0.8 ounce (pack of 24)’ and seems good to buy.]
+Observation: OK.
+Action: click[variety pack]
+Observation: You have clicked variety pack.
+Action: click[0.8 ounce (pack of 24)]
+Observation: You have clicked 0.8 ounce (pack of 24).
+Action: click[Buy Now]
+Reflection: In this attempt, I was unsuccessful. I accidentally bought a product that was $100, which is more than my budget of $30. Either way, the initial search results were not good. Next time, I will do search[“variety pack of chips”] and then check if the results meet the dairy free and the $30 budget constraints. I will continue to refine my searches so that I can find more products.
+(input)
+Reflection:
+◄
+Feeling
+lucky?
+Conversion
+report
+Report
+an issue
+View original
+on arXiv
+►
\ No newline at end of file
diff --git a/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la.md b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la.md
new file mode 100644
index 0000000000000000000000000000000000000000..25e1ac5f2aefc5ff394b5636c0cce4552436d1cd
--- /dev/null
+++ b/research/notes/231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la.md
@@ -0,0 +1,202 @@
+---
+title: '[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning
+  in Language Models'
+id: 231004406-language-agent-tree-search-unifies-reasoning-acting-and-planning-in-la
+tags:
+- deepread
+created: '2026-06-10T00:39:54.848871Z'
+source: https://arxiv.org/abs/2310.04406
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:39:54.848723Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2310.04406] Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models
+Computer Science > Artificial Intelligence
+arXiv:2310.04406
+(cs)
+[Submitted on 6 Oct 2023 (
+v1
+), last revised 6 Jun 2024 (this version, v3)]
+Title:
+Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models
+Authors:
+Andy Zhou
+,
+Kai Yan
+,
+Michal Shlapentokh-Rothman
+,
+Haohan Wang
+,
+Yu-Xiong Wang
+View a PDF of the paper titled Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models, by Andy Zhou and 4 other authors
+View PDF
+HTML (experimental)
+Abstract:
+While language models (LMs) have shown potential across a range of decision-making tasks, their reliance on simple acting processes limits their broad deployment as autonomous agents. In this paper, we introduce Language Agent Tree Search (LATS) -- the first general framework that synergizes the capabilities of LMs in reasoning, acting, and planning. By leveraging the in-context learning ability of LMs, we integrate Monte Carlo Tree Search into LATS to enable LMs as agents, along with LM-powered value functions and self-reflections for proficient exploration and enhanced decision-making. A key feature of our approach is the incorporation of an environment for external feedback, which offers a more deliberate and adaptive problem-solving mechanism that surpasses the constraints of existing techniques. Our experimental evaluation across diverse domains, including programming, interactive question-answering (QA), web navigation, and math, validates the effectiveness and generality of LATS in decision-making while maintaining competitive or improved reasoning performance. Notably, LATS achieves state-of-the-art pass@1 accuracy (92.7%) for programming on HumanEval with GPT-4 and demonstrates gradient-free performance (average score of 75.9) comparable to gradient-based fine-tuning for web navigation on WebShop with GPT-3.5. Code can be found at
+this https URL
+Comments:
+Code at
+this https URL
+Subjects:
+Artificial Intelligence (cs.AI)
+; Computation and Language (cs.CL); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)
+Cite as:
+arXiv:2310.04406
+[cs.AI]
+(or
+arXiv:2310.04406v3
+[cs.AI]
+for this version)
+https://doi.org/10.48550/arXiv.2310.04406
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Andy Zhou [
+view email
+]
+[v1]
+Fri, 6 Oct 2023 17:55:11 UTC (371 KB)
+[v2]
+Tue, 5 Dec 2023 05:25:55 UTC (465 KB)
+[v3]
+Thu, 6 Jun 2024 02:51:17 UTC (960 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models, by Andy Zhou and 4 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.AI
+< prev
+|
+next >
+new
+|
+recent
+|
+2023-10
+Change to browse by:
+cs
+cs.CL
+cs.CV
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/231006770-swe-bench-can-language-models-resolve-real-world-github-issues.md b/research/notes/231006770-swe-bench-can-language-models-resolve-real-world-github-issues.md
new file mode 100644
index 0000000000000000000000000000000000000000..00485e0e2b28c9396efd9f7bfd2978c177e464af
--- /dev/null
+++ b/research/notes/231006770-swe-bench-can-language-models-resolve-real-world-github-issues.md
@@ -0,0 +1,203 @@
+---
+title: '[2310.06770] SWE-bench: Can Language Models Resolve Real-World GitHub Issues?'
+id: 231006770-swe-bench-can-language-models-resolve-real-world-github-issues
+tags:
+- deepread
+created: '2026-06-10T00:23:35.577828Z'
+source: https://arxiv.org/abs/2310.06770
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:35.577638Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2310.06770] SWE-bench: Can Language Models Resolve Real-World GitHub Issues?
+Computer Science > Computation and Language
+arXiv:2310.06770
+(cs)
+[Submitted on 10 Oct 2023 (
+v1
+), last revised 11 Nov 2024 (this version, v3)]
+Title:
+SWE-bench: Can Language Models Resolve Real-World GitHub Issues?
+Authors:
+Carlos E. Jimenez
+,
+John Yang
+,
+Alexander Wettig
+,
+Shunyu Yao
+,
+Kexin Pei
+,
+Ofir Press
+,
+Karthik Narasimhan
+View a PDF of the paper titled SWE-bench: Can Language Models Resolve Real-World GitHub Issues?, by Carlos E. Jimenez and 6 other authors
+View PDF
+Abstract:
+Language models have outpaced our ability to evaluate them effectively, but for their future development it is essential to study the frontier of their capabilities. We find real-world software engineering to be a rich, sustainable, and challenging testbed for evaluating the next generation of language models. To this end, we introduce SWE-bench, an evaluation framework consisting of $2,294$ software engineering problems drawn from real GitHub issues and corresponding pull requests across $12$ popular Python repositories. Given a codebase along with a description of an issue to be resolved, a language model is tasked with editing the codebase to address the issue. Resolving issues in SWE-bench frequently requires understanding and coordinating changes across multiple functions, classes, and even files simultaneously, calling for models to interact with execution environments, process extremely long contexts and perform complex reasoning that goes far beyond traditional code generation tasks. Our evaluations show that both state-of-the-art proprietary models and our fine-tuned model SWE-Llama can resolve only the simplest issues. The best-performing model, Claude 2, is able to solve a mere $1.96$% of the issues. Advances on SWE-bench represent steps towards LMs that are more practical, intelligent, and autonomous.
+Comments:
+Data, code, and leaderboard are available at
+this https URL
+ICLR 2024,
+this https URL
+Subjects:
+Computation and Language (cs.CL)
+; Artificial Intelligence (cs.AI); Software Engineering (cs.SE)
+Cite as:
+arXiv:2310.06770
+[cs.CL]
+(or
+arXiv:2310.06770v3
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2310.06770
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Carlos E. Jimenez [
+view email
+]
+[v1]
+Tue, 10 Oct 2023 16:47:29 UTC (2,003 KB)
+[v2]
+Fri, 5 Apr 2024 18:16:29 UTC (2,258 KB)
+[v3]
+Mon, 11 Nov 2024 23:05:04 UTC (2,398 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled SWE-bench: Can Language Models Resolve Real-World GitHub Issues?, by Carlos E. Jimenez and 6 other authors
+View PDF
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2023-10
+Change to browse by:
+cs
+cs.AI
+cs.SE
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/231108105-diloco-distributed-low-communication-training-of-language-models.md b/research/notes/231108105-diloco-distributed-low-communication-training-of-language-models.md
new file mode 100644
index 0000000000000000000000000000000000000000..f19fdb9491233e227c9191c36b05347ed20ca256
--- /dev/null
+++ b/research/notes/231108105-diloco-distributed-low-communication-training-of-language-models.md
@@ -0,0 +1,208 @@
+---
+title: '[2311.08105] DiLoCo: Distributed Low-Communication Training of Language Models'
+id: 231108105-diloco-distributed-low-communication-training-of-language-models
+tags:
+- deepread
+created: '2026-06-10T00:30:20.411067Z'
+source: https://arxiv.org/abs/2311.08105
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:20.410923Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2311.08105] DiLoCo: Distributed Low-Communication Training of Language Models
+Computer Science > Machine Learning
+arXiv:2311.08105
+(cs)
+[Submitted on 14 Nov 2023 (
+v1
+), last revised 23 Sep 2024 (this version, v3)]
+Title:
+DiLoCo: Distributed Low-Communication Training of Language Models
+Authors:
+Arthur Douillard
+,
+Qixuan Feng
+,
+Andrei A. Rusu
+,
+Rachita Chhaparia
+,
+Yani Donchev
+,
+Adhiguna Kuncoro
+,
+Marc'Aurelio Ranzato
+,
+Arthur Szlam
+,
+Jiajun Shen
+View a PDF of the paper titled DiLoCo: Distributed Low-Communication Training of Language Models, by Arthur Douillard and 8 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Large language models (LLM) have become a critical component in many applications of machine learning. However, standard approaches to training LLM require a large number of tightly interconnected accelerators, with devices exchanging gradients and other intermediate states at each optimization step. While it is difficult to build and maintain a single computing cluster hosting many accelerators, it might be easier to find several computing clusters each hosting a smaller number of devices. In this work, we propose a distributed optimization algorithm, Distributed Low-Communication (DiLoCo), that enables training of language models on islands of devices that are poorly connected. The approach is a variant of federated averaging, where the number of inner steps is large, the inner optimizer is AdamW, and the outer optimizer is Nesterov momentum. On the widely used C4 dataset, we show that DiLoCo on 8 workers performs as well as fully synchronous optimization while communicating 500 times less. DiLoCo exhibits great robustness to the data distribution of each worker. It is also robust to resources becoming unavailable over time, and vice versa, it can seamlessly leverage resources that become available during training.
+Subjects:
+Machine Learning (cs.LG)
+; Computation and Language (cs.CL)
+Cite as:
+arXiv:2311.08105
+[cs.LG]
+(or
+arXiv:2311.08105v3
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2311.08105
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Arthur Douillard [
+view email
+]
+[v1]
+Tue, 14 Nov 2023 12:05:45 UTC (1,609 KB)
+[v2]
+Sat, 2 Dec 2023 14:10:14 UTC (1,610 KB)
+[v3]
+Mon, 23 Sep 2024 10:41:27 UTC (1,610 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled DiLoCo: Distributed Low-Communication Training of Language Models, by Arthur Douillard and 8 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2023-11
+Change to browse by:
+cs
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/231108516-llms-cannot-find-reasoning-errors-but-can-correct-them-given-the-error.md b/research/notes/231108516-llms-cannot-find-reasoning-errors-but-can-correct-them-given-the-error.md
new file mode 100644
index 0000000000000000000000000000000000000000..4eb50797128480d47d6e8e18f198a2dda0b43986
--- /dev/null
+++ b/research/notes/231108516-llms-cannot-find-reasoning-errors-but-can-correct-them-given-the-error.md
@@ -0,0 +1,204 @@
+---
+title: '[2311.08516] LLMs cannot find reasoning errors, but can correct them given
+  the error location'
+id: 231108516-llms-cannot-find-reasoning-errors-but-can-correct-them-given-the-error
+tags:
+- deepread
+created: '2026-06-10T00:40:16.980357Z'
+source: https://arxiv.org/abs/2311.08516
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:40:16.980220Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2311.08516] LLMs cannot find reasoning errors, but can correct them given the error location
+Computer Science > Artificial Intelligence
+arXiv:2311.08516
+(cs)
+[Submitted on 14 Nov 2023 (
+v1
+), last revised 4 Jun 2024 (this version, v3)]
+Title:
+LLMs cannot find reasoning errors, but can correct them given the error location
+Authors:
+Gladys Tyen
+,
+Hassan Mansoor
+,
+Victor Cărbune
+,
+Peter Chen
+,
+Tony Mak
+View a PDF of the paper titled LLMs cannot find reasoning errors, but can correct them given the error location, by Gladys Tyen and 4 other authors
+View PDF
+HTML (experimental)
+Abstract:
+While self-correction has shown promise in improving LLM outputs in terms of style and quality (e.g. Chen et al., 2023b; Madaan et al., 2023), recent attempts to self-correct logical or reasoning errors often cause correct answers to become incorrect, resulting in worse performances overall (Huang et al., 2023). In this paper, we show that poor self-correction performance stems from LLMs' inability to find logical mistakes, rather than their ability to correct a known mistake. Firstly, we benchmark several state-of-the-art LLMs on their mistake-finding ability and demonstrate that they generally struggle with the task, even in highly objective, unambiguous cases. Secondly, we test the correction abilities of LLMs -- separately from mistake finding -- using a backtracking setup that feeds ground truth mistake location information to the model. We show that this boosts downstream task performance across our 5 reasoning tasks, indicating that LLMs' correction abilities are robust. Finally, we show that it is possible to obtain mistake location information without ground truth labels or in-domain training data. We train a small classifier with out-of-domain data, which exhibits stronger mistake-finding performance than prompting a large model. We release our dataset of LLM-generated logical mistakes, BIG-Bench Mistake, to enable further research into locating LLM reasoning mistakes.
+Comments:
+ACL 2024 Findings
+Subjects:
+Artificial Intelligence (cs.AI)
+; Computation and Language (cs.CL); Machine Learning (cs.LG)
+Cite as:
+arXiv:2311.08516
+[cs.AI]
+(or
+arXiv:2311.08516v3
+[cs.AI]
+for this version)
+https://doi.org/10.48550/arXiv.2311.08516
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Gladys Tyen [
+view email
+]
+[v1]
+Tue, 14 Nov 2023 20:12:38 UTC (7,191 KB)
+[v2]
+Tue, 9 Jan 2024 03:32:32 UTC (7,191 KB)
+[v3]
+Tue, 4 Jun 2024 10:25:13 UTC (7,319 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled LLMs cannot find reasoning errors, but can correct them given the error location, by Gladys Tyen and 4 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.AI
+< prev
+|
+next >
+new
+|
+recent
+|
+2023-11
+Change to browse by:
+cs
+cs.CL
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/231209152-evaluating-augmented-reality-communication-how-can-we-teach-procedural.md b/research/notes/231209152-evaluating-augmented-reality-communication-how-can-we-teach-procedural.md
new file mode 100644
index 0000000000000000000000000000000000000000..d97a39f0a95537904058cff394c144e2e58510f8
--- /dev/null
+++ b/research/notes/231209152-evaluating-augmented-reality-communication-how-can-we-teach-procedural.md
@@ -0,0 +1,203 @@
+---
+title: '[2312.09152] Evaluating Augmented Reality Communication: How Can We Teach
+  Procedural Skill in AR?'
+id: 231209152-evaluating-augmented-reality-communication-how-can-we-teach-procedural
+tags:
+- deepread
+created: '2026-06-10T00:40:10.692238Z'
+source: https://arxiv.org/abs/2312.09152
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:40:10.692096Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2312.09152] Evaluating Augmented Reality Communication: How Can We Teach Procedural Skill in AR?
+Computer Science > Human-Computer Interaction
+arXiv:2312.09152
+(cs)
+[Submitted on 14 Dec 2023]
+Title:
+Evaluating Augmented Reality Communication: How Can We Teach Procedural Skill in AR?
+Authors:
+Manuel Rebol
+,
+Krzysztof Pietroszek
+,
+Neal Sikka
+,
+Claudia Ranniger
+,
+Colton Hood
+,
+Adam Rutenberg
+,
+Puja Sasankan
+,
+Christian Gütl
+View a PDF of the paper titled Evaluating Augmented Reality Communication: How Can We Teach Procedural Skill in AR?, by Manuel Rebol and 7 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Augmented reality (AR) has great potential for use in healthcare applications, especially remote medical training and supervision. In this paper, we analyze the usage of an AR communication system to teach a medical procedure, the placement of a central venous catheter (CVC) under ultrasound guidance. We examine various AR communication and collaboration components, including gestural communication, volumetric information, annotations, augmented objects, and augmented screens. We compare how teaching in AR differs from teaching through videoconferencing-based communication. Our results include a detailed medical training steps analysis in which we compare how verbal and visual communication differs between video and AR training. We identify procedural steps in which medical experts give visual instructions utilizing AR components. We examine the change in AR usage and interaction over time and recognize patterns between users. Moreover, AR design recommendations are given based on post-training interviews.
+Comments:
+this https URL
+Subjects:
+Human-Computer Interaction (cs.HC)
+Cite as:
+arXiv:2312.09152
+[cs.HC]
+(or
+arXiv:2312.09152v1
+[cs.HC]
+for this version)
+https://doi.org/10.48550/arXiv.2312.09152
+Focus to learn more
+arXiv-issued DOI via DataCite
+Journal reference:
+Proceedings of the 29th ACM Symposium on Virtual Reality Software and Technology (VRST 2023)
+Related DOI
+:
+https://doi.org/10.1145/3611659.3615685
+Focus to learn more
+DOI(s) linking to related resources
+Submission history
+From: Manuel Rebol [
+view email
+]
+[v1]
+Thu, 14 Dec 2023 17:22:22 UTC (2,671 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Evaluating Augmented Reality Communication: How Can We Teach Procedural Skill in AR?, by Manuel Rebol and 7 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.HC
+< prev
+|
+next >
+new
+|
+recent
+|
+2023-12
+Change to browse by:
+cs
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/240201817-llms-cant-plan-but-can-help-planning-in-llm-modulo-frameworks.md b/research/notes/240201817-llms-cant-plan-but-can-help-planning-in-llm-modulo-frameworks.md
new file mode 100644
index 0000000000000000000000000000000000000000..ae4cc129135cc1ad5be27eb3cb3363ef79d0d658
--- /dev/null
+++ b/research/notes/240201817-llms-cant-plan-but-can-help-planning-in-llm-modulo-frameworks.md
@@ -0,0 +1,203 @@
+---
+title: '[2402.01817] LLMs Can''t Plan, But Can Help Planning in LLM-Modulo Frameworks'
+id: 240201817-llms-cant-plan-but-can-help-planning-in-llm-modulo-frameworks
+tags:
+- deepread
+created: '2026-06-10T00:40:16.134883Z'
+source: https://arxiv.org/abs/2402.01817
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:40:16.134739Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2402.01817] LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks
+Computer Science > Artificial Intelligence
+arXiv:2402.01817
+(cs)
+[Submitted on 2 Feb 2024 (
+v1
+), last revised 12 Jun 2024 (this version, v3)]
+Title:
+LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks
+Authors:
+Subbarao Kambhampati
+,
+Karthik Valmeekam
+,
+Lin Guan
+,
+Mudit Verma
+,
+Kaya Stechly
+,
+Siddhant Bhambri
+,
+Lucas Saldyt
+,
+Anil Murthy
+View a PDF of the paper titled LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks, by Subbarao Kambhampati and 7 other authors
+View PDF
+HTML (experimental)
+Abstract:
+There is considerable confusion about the role of Large Language Models (LLMs) in planning and reasoning tasks. On one side are over-optimistic claims that LLMs can indeed do these tasks with just the right prompting or self-verification strategies. On the other side are perhaps over-pessimistic claims that all that LLMs are good for in planning/reasoning tasks are as mere translators of the problem specification from one syntactic format to another, and ship the problem off to external symbolic solvers. In this position paper, we take the view that both these extremes are misguided. We argue that auto-regressive LLMs cannot, by themselves, do planning or self-verification (which is after all a form of reasoning), and shed some light on the reasons for misunderstandings in the literature. We will also argue that LLMs should be viewed as universal approximate knowledge sources that have much more meaningful roles to play in planning/reasoning tasks beyond simple front-end/back-end format translators. We present a vision of **LLM-Modulo Frameworks** that combine the strengths of LLMs with external model-based verifiers in a tighter bi-directional interaction regime. We will show how the models driving the external verifiers themselves can be acquired with the help of LLMs. We will also argue that rather than simply pipelining LLMs and symbolic components, this LLM-Modulo Framework provides a better neuro-symbolic approach that offers tighter integration between LLMs and symbolic components, and allows extending the scope of model-based planning/reasoning regimes towards more flexible knowledge, problem and preference specifications.
+Subjects:
+Artificial Intelligence (cs.AI)
+; Machine Learning (cs.LG)
+Cite as:
+arXiv:2402.01817
+[cs.AI]
+(or
+arXiv:2402.01817v3
+[cs.AI]
+for this version)
+https://doi.org/10.48550/arXiv.2402.01817
+Focus to learn more
+arXiv-issued DOI via DataCite
+Journal reference:
+Proceedings of the 41st International Conference on Machine Learning, Vienna, Austria. PMLR 235, 2024
+Submission history
+From: Subbarao Kambhampati [
+view email
+]
+[v1]
+Fri, 2 Feb 2024 14:43:18 UTC (4,551 KB)
+[v2]
+Tue, 6 Feb 2024 01:29:37 UTC (4,552 KB)
+[v3]
+Wed, 12 Jun 2024 01:13:11 UTC (6,405 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled LLMs Can't Plan, But Can Help Planning in LLM-Modulo Frameworks, by Subbarao Kambhampati and 7 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.AI
+< prev
+|
+next >
+new
+|
+recent
+|
+2024-02
+Change to browse by:
+cs
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/240203300-deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-lang.md b/research/notes/240203300-deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-lang.md
new file mode 100644
index 0000000000000000000000000000000000000000..4efc88d1f34d445152e0fc496d8c2e6fbf97158c
--- /dev/null
+++ b/research/notes/240203300-deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-lang.md
@@ -0,0 +1,214 @@
+---
+title: '[2402.03300] DeepSeekMath: Pushing the Limits of Mathematical Reasoning in
+  Open Language Models'
+id: 240203300-deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-lang
+tags:
+- deepread
+created: '2026-06-09T23:28:29.232007Z'
+source: https://arxiv.org/abs/2402.03300
+source_domain: arxiv.org
+fetched_at: '2026-06-09T23:28:29.231807Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2402.03300] DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models
+Computer Science > Computation and Language
+arXiv:2402.03300
+(cs)
+[Submitted on 5 Feb 2024 (
+v1
+), last revised 27 Apr 2024 (this version, v3)]
+Title:
+DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models
+Authors:
+Zhihong Shao
+,
+Peiyi Wang
+,
+Qihao Zhu
+,
+Runxin Xu
+,
+Junxiao Song
+,
+Xiao Bi
+,
+Haowei Zhang
+,
+Mingchuan Zhang
+,
+Y.K. Li
+,
+Y. Wu
+,
+Daya Guo
+View a PDF of the paper titled DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models, by Zhihong Shao and 10 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline. Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO.
+Subjects:
+Computation and Language (cs.CL)
+; Artificial Intelligence (cs.AI); Machine Learning (cs.LG)
+Cite as:
+arXiv:2402.03300
+[cs.CL]
+(or
+arXiv:2402.03300v3
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2402.03300
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Zhihong Shao [
+view email
+]
+[v1]
+Mon, 5 Feb 2024 18:55:32 UTC (3,417 KB)
+[v2]
+Tue, 6 Feb 2024 18:39:38 UTC (3,417 KB)
+[v3]
+Sat, 27 Apr 2024 15:25:53 UTC (3,417 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models, by Zhihong Shao and 10 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2024-02
+Change to browse by:
+cs
+cs.AI
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/240411018-many-shot-in-context-learning.md b/research/notes/240411018-many-shot-in-context-learning.md
new file mode 100644
index 0000000000000000000000000000000000000000..fa692e1af0f5c28a7c33033d651709c8da2795d4
--- /dev/null
+++ b/research/notes/240411018-many-shot-in-context-learning.md
@@ -0,0 +1,228 @@
+---
+title: '[2404.11018] Many-Shot In-Context Learning'
+id: 240411018-many-shot-in-context-learning
+tags:
+- deepread
+created: '2026-06-10T00:40:15.011649Z'
+source: https://arxiv.org/abs/2404.11018
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:40:15.011513Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2404.11018] Many-Shot In-Context Learning
+Computer Science > Machine Learning
+arXiv:2404.11018
+(cs)
+[Submitted on 17 Apr 2024 (
+v1
+), last revised 17 Oct 2024 (this version, v3)]
+Title:
+Many-Shot In-Context Learning
+Authors:
+Rishabh Agarwal
+,
+Avi Singh
+,
+Lei M. Zhang
+,
+Bernd Bohnet
+,
+Luis Rosias
+,
+Stephanie Chan
+,
+Biao Zhang
+,
+Ankesh Anand
+,
+Zaheer Abbas
+,
+Azade Nova
+,
+John D. Co-Reyes
+,
+Eric Chu
+,
+Feryal Behbahani
+,
+Aleksandra Faust
+,
+Hugo Larochelle
+View a PDF of the paper titled Many-Shot In-Context Learning, by Rishabh Agarwal and 13 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Large language models (LLMs) excel at few-shot in-context learning (ICL) -- learning from a few examples provided in context at inference, without any weight updates. Newly expanded context windows allow us to investigate ICL with hundreds or thousands of examples -- the many-shot regime. Going from few-shot to many-shot, we observe significant performance gains across a wide variety of generative and discriminative tasks. While promising, many-shot ICL can be bottlenecked by the available amount of human-generated examples. To mitigate this limitation, we explore two new settings: Reinforced and Unsupervised ICL. Reinforced ICL uses model-generated chain-of-thought rationales in place of human examples. Unsupervised ICL removes rationales from the prompt altogether, and prompts the model only with domain-specific questions. We find that both Reinforced and Unsupervised ICL can be quite effective in the many-shot regime, particularly on complex reasoning tasks. Finally, we demonstrate that, unlike few-shot learning, many-shot learning is effective at overriding pretraining biases, can learn high-dimensional functions with numerical inputs, and performs comparably to fine-tuning. We also find that inference cost increases linearly in the many-shot regime, and frontier LLMs benefit from many-shot ICL to varying degrees. Our analysis also reveals the limitations of next-token prediction loss as an indicator of downstream ICL performance.
+Comments:
+NeurIPS (Spotlight)
+Subjects:
+Machine Learning (cs.LG)
+; Artificial Intelligence (cs.AI); Computation and Language (cs.CL)
+Cite as:
+arXiv:2404.11018
+[cs.LG]
+(or
+arXiv:2404.11018v3
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2404.11018
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Rishabh Agarwal [
+view email
+]
+[v1]
+Wed, 17 Apr 2024 02:49:26 UTC (327 KB)
+[v2]
+Wed, 22 May 2024 17:06:10 UTC (370 KB)
+[v3]
+Thu, 17 Oct 2024 17:45:09 UTC (414 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Many-Shot In-Context Learning, by Rishabh Agarwal and 13 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2024-04
+Change to browse by:
+cs
+cs.AI
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/240612543-phase-controlled-heat-modulation-with-aharonov-bohm-interferometers.md b/research/notes/240612543-phase-controlled-heat-modulation-with-aharonov-bohm-interferometers.md
new file mode 100644
index 0000000000000000000000000000000000000000..8e3fe7b47c78c3e1400c5c5cc674a318a4575a07
--- /dev/null
+++ b/research/notes/240612543-phase-controlled-heat-modulation-with-aharonov-bohm-interferometers.md
@@ -0,0 +1,197 @@
+---
+title: '[2406.12543] Phase-controlled heat modulation with Aharonov-Bohm interferometers'
+id: 240612543-phase-controlled-heat-modulation-with-aharonov-bohm-interferometers
+tags:
+- deepread
+created: '2026-06-10T00:40:09.876451Z'
+source: https://arxiv.org/abs/2406.12543
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:40:09.876309Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2406.12543] Phase-controlled heat modulation with Aharonov-Bohm interferometers
+Condensed Matter > Mesoscale and Nanoscale Physics
+arXiv:2406.12543
+(cond-mat)
+[Submitted on 18 Jun 2024]
+Title:
+Phase-controlled heat modulation with Aharonov-Bohm interferometers
+Authors:
+Sun-Yong Hwang
+,
+Björn Sothmann
+,
+Rosa López
+View a PDF of the paper titled Phase-controlled heat modulation with Aharonov-Bohm interferometers, by Sun-Yong Hwang and 2 other authors
+View PDF
+HTML (experimental)
+Abstract:
+A heat modulator is proposed based on a voltage-biased Aharonov-Bohm interferometer. Once an electrical bias is applied, Peltier effects give rise to a flow of heat that can be modulated by a magnetic flux. We determine the corresponding temperature changes using a simple thermal model. Our calculations demonstrate that the modulated temperature difference can be as large as 80 mK at base temperature about 600 mK with relative temperature variations reaching 10%. Our model also predicts, quite generally, the emergence of spin-polarized heat flows without any ferromagnetic contacts, if Rashba spin-orbit interaction is combined with the applied magnetic flux, which potentially paves the way towards caloritronic information processing.
+Comments:
+8 pages, 4 figures
+Subjects:
+Mesoscale and Nanoscale Physics (cond-mat.mes-hall)
+Cite as:
+arXiv:2406.12543
+[cond-mat.mes-hall]
+(or
+arXiv:2406.12543v1
+[cond-mat.mes-hall]
+for this version)
+https://doi.org/10.48550/arXiv.2406.12543
+Focus to learn more
+arXiv-issued DOI via DataCite
+Journal reference:
+Phys. Rev. Research 6, 013215 (2024)
+Related DOI
+:
+https://doi.org/10.1103/PhysRevResearch.6.013215
+Focus to learn more
+DOI(s) linking to related resources
+Submission history
+From: Sun-Yong Hwang [
+view email
+]
+[v1]
+Tue, 18 Jun 2024 12:22:44 UTC (1,894 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Phase-controlled heat modulation with Aharonov-Bohm interferometers, by Sun-Yong Hwang and 2 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cond-mat.mes-hall
+< prev
+|
+next >
+new
+|
+recent
+|
+2024-06
+Change to browse by:
+cond-mat
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers-2.md b/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..f8f39064136b250b4ddafbd13e3aaace2f447b7c
--- /dev/null
+++ b/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers-2.md
@@ -0,0 +1,2384 @@
+---
+title: '[2408.06195] Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers'
+id: 240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers-2
+tags:
+- deepread
+created: '2026-06-10T00:40:45.751799Z'
+source: https://ar5iv.labs.arxiv.org/html/2408.06195
+source_domain: ar5iv.labs.arxiv.org
+fetched_at: '2026-06-10T00:40:45.751621Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2408.06195] Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers
+Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers
+Zhenting Qi
+∗‡†
+Mingyuan Ma
+∗‡†
+Jiahang Xu
+∗‡
+Li Lyna Zhang
+‡⋄
+Fan Yang
+‡
+Mao Yang
+‡
+‡
+Microsoft Research Asia
+†
+Harvard University
+Abstract
+This paper introduces rStar, a self-play mutual reasoning approach that significantly improves reasoning capabilities of small language models (SLMs) without fine-tuning or superior models. rStar decouples reasoning into a self-play mutual generation-discrimination process. First, a target SLM augments the Monte Carlo Tree Search (MCTS) with
+a rich set of human-like reasoning actions
+to construct higher quality reasoning trajectories. Next, another SLM, with capabilities similar to the target SLM, acts as a discriminator to verify each trajectory generated by the target SLM. The mutually agreed reasoning trajectories are considered
+mutual consistent
+, thus are more likely to be correct. Extensive experiments across five SLMs demonstrate rStar can effectively solve diverse reasoning problems, including GSM8K, GSM-Hard, MATH, SVAMP, and StrategyQA. Remarkably, rStar boosts GSM8K accuracy from 12.51% to 63.91% for LLaMA2-7B, from 36.46% to 81.88% for Mistral-7B, from 74.53% to 91.13% for LLaMA3-8B-Instruct. Code will be available at
+here
+.
+$*$
+$*$
+footnotetext:
+Equal contribution. Zhenting Qi and Mingyuan Ma did the work during an internship at MSRA
+$\diamond$
+$\diamond$
+footnotetext:
+Corresponding author: lzhani@microsoft.com
+1
+Introduction
+Despite their success, large language models (LLMs) face significant challenges in complex reasoning
+(Valmeekam et al.,
+2022
+; Weng et al.,
+2023
+)
+. For example, state of the art models like Mistral-7B
+(Jiang et al.,
+2023
+)
+can only achieve 36.5% accuracy on the GSM8K dataset, even with techniques like Chain-of-Thought (CoT)
+(Wei et al.,
+2022
+)
+. Although fine-tuning is shown to be an effective way to improve reasoning capability, most LLMs rely on fine-tuning data distilled or synthesized by
+superior
+models like GPT-4
+(Wang et al.,
+2024a
+; Gou et al.,
+2023
+)
+. Meanwhile, the community has been actively working on a complementary and yet more challenging approach: Reasoning improvements
+without
+a superior teacher LLM.
+Figure 1:
+With 32 rounds of inference, rStar makes SLMs highly capable problem-solvers, matching or even surpassing the reasoning performance achieved after domain-specialized SFT.
+A promising paradigm to improve reasoning without superior models is to leverage the knowledge within LLMs themselves
+(Wang et al.,
+2023
+; Hao et al.,
+2023
+; Madaan et al.,
+2024
+)
+. For example, RAP
+(Hao et al.,
+2023
+)
+adopts a self-exploration solution to iteratively improve LLM’s reasoning performance through self-rewarded feedback. Unfortunately, study suggests that this paradigm often suffers from two fundamental issues.
+First, LLMs often struggle to effectively explore the solution space during reasoning. The self-exploration often traps in a solution space with low-quality reasoning steps even after many attempts. For example, our experiments reveal that after 32 rounds of self-exploration with RAP
+(Hao et al.,
+2023
+)
+, only 24% of the trajectories generated by LLaMA2-7B on GSM8K are correct.
+Second, even if the self-exploration can find high quality reasoning steps, it is difficult for SLMs to tell which reasoning steps are of higher quality or determine which final answers are correct, thus it is hard to effectively guide the self-exploration. Our study shows that a naïve reward-based self-exploration guidance can lead to results no better than random guesses (see Appendix
+A.1
+).
+A more troublesome fact is that the above two issues are more pronounced in the smaller version of LLMs, i.e.,
+SLM
+s, due to their weaker capabilities. For instance, while GPT-4 can improve by self-refining its output
+(Madaan et al.,
+2024
+; Wu et al.,
+2024
+; Zhou et al.,
+2024
+)
+, the approaches are less effective in SLMs and may even lead to worse performance
+(Forsman,
+2024
+)
+. This significantly hinders the adoption of neural language models.
+This paper introduces
+**S**elf-play mu**T**u**A**l **R**easoning
+(rStar), a novel approach that boosts SLMs’ reasoning capability during inference without fine-tuning or superior models. To address the aforementioned challenges, rStar decouples reasoning into a self-play mutual generation-discrimination process as illustrated in Fig.
+2
+.
+Specifically, rStar is unique in the following approaches. First, although relying on a conventional Monte Carlo Tree Search (MCTS) for SLMs to self-generate reasoning steps, rStar advocates
+a richer set of reasoning actions
+in the self-exploration. The new proposed actions simulate human reasoning behaviors given the current reasoning state, such as decomposing and searching for a specific reasoning step, proposing a new sub-question, or rephrasing the given question. This enables SLMs to generate high-quality candidate reasoning trajectories during self-exploration.
+Second, to effectively guide the exploration among the generated reasoning trajectories, rStar augments the MCTS process with a new discrimination process called
+mutual consistency
+. In particular, rStar employs a second SLM with the similar capability, acting as a discriminator to provide unsupervised feedback on each candidate reasoning trajectory generated by MCTS. To improve the accuracy of the feedback, rStar hints the second SLM with sampled partial reasoning trajectories, asking it to complete the remaining reasoning steps. And rStar deems the mutually agreed reasoning trajectories of higher quality. Mutual consistency mirrors the common human practice in the absence of supervision, where agreement among peers (i.e., two SLMs) on derived answers suggests a higher likelihood of correctness.
+As a result, mutual consistency offers more effective reasoning across diverse tasks than other approaches like self-consistency
+(Wang et al.,
+2023
+)
+and avoids the risk of overfitting when training a reward model
+(Chen et al.,
+2024a
+; Wang et al.,
+2024b
+)
+.
+Figure 2:
+Our self-play mutual reasoning is a generation-discrimination process: (1) a self-generator augments the target SLM to generate candidate reasoning trajectories using MCTS; (2) the discriminator uses another SLM to provide unsupervised feedback on each trajectory based on partial hints; (3) based on this feedback, the target SLM decides a final reasoning trajectory as the solution.
+Extensive experiments across five SLMs and five diverse reasoning tasks demonstrate the effectiveness of rStar. With just 32 rounds of MCTS inference, rStar significantly enhances SLMs’ reasoning capabilities, matching or even surpassing the accuracy achieved after fine-tuning. For example, rStar boosts GSM8K accuracy from 12.51% to 63.91% for LLaMA2-7B, from 36.46% to 81.88% for Mistral, and from 47.23% to 85.52% for LLaMA3-8B. Furthermore, we conduct comprehensive experiments to verify rStar’s superiority over state-of-the-art baselines, including single-round inference techniques like few-shot CoT, multi-round prompting approaches such as self-consistency, and self-improvement techniques such as RAP, ToT, self-evaluation and self-verification.
+2
+Related Work
+Prompting Language Models to Reason
+.
+Prompting-based methods, such as Chain-of-Thought
+(Wei et al.,
+2022
+)
+, focus on designing instructions and pipelines to enhance LLMs’ reasoning performance during inference. Recent advances include planning
+(Hao et al.,
+2023
+; Ding et al.,
+2023
+)
+, problem decomposition
+(Zhou et al.,
+2022
+; Khot et al.,
+2022
+; Hao et al.,
+2023
+)
+, abstraction
+(Zheng et al.,
+2023
+)
+, programming
+(Chen et al.,
+2022
+; Zhou et al.,
+2023
+)
+.
+These methods aim to improve single-round inference performance and are orthogonal to ours.
+LLM Self-improvement
+. Recently, research on the self-improvement of LLMs has rapidly increased.
+Fine-tuning based methods
+(Chen et al.,
+2024b
+;
+a
+)
+leverage the capabilities of a well-pretrained LLM to synthesize data and progressively enhance its performance. Advanced prompting techniques, such as self-verification
+(Gero et al.,
+2023
+; Zhou et al.,
+2023
+)
+, and RAP
+(Hao et al.,
+2023
+)
+, improve performance through iterative self-exploring based on self-diagnosed feedback at inference time.
+However, as illustrated in the previous section, the achieved performance often depends on the LLM’s inherent capabilities, and for SLMs, their weaker instruction-following ability and unreliable self-rewarding can mislead self-improvement.
+Sampling Reasoning Paths
+. Recent works
+(Brown et al.,
+2024
+; Li et al.,
+2024
+; Snell et al.,
+2024
+)
+on mathematical reasoning have shown that sampling diverse reasoning paths can significantly enhance performance compared to greedy one-time decoding. Self-Consistency
+(Wang et al.,
+2023
+)
+samples a complete CoT path each time. Tree-search approaches
+(Yao et al.,
+2024
+; Hao et al.,
+2023
+; Zhang et al.,
+2024
+)
+, like MCTS, further improve the performance by breaking down tasks and sampling simpler, individual intermediate reasoning steps. However, most approaches have limited action spaces. For example, RAP
+(Hao et al.,
+2023
+)
+decomposes only subproblems, while AlphaMath
+(Chen et al.,
+2024a
+)
+searches only for one CoT step, limiting effectiveness in generating better trajectories.
+Answer Verification
+. To select correct reasoning trajectories, majority voting
+(Wang et al.,
+2023
+)
+is a widely-used approach. To improve accuracy, some works train value or rewards model for verification
+(Wang et al.,
+2024b
+; Chen et al.,
+2024a
+)
+, but these require additional annotations and have risks in overfitting to specific tasks. Self-verification
+(Weng et al.,
+2023
+)
+leverages LLM capabilities for backward self-verification. Nevertheless, its effectiveness hinges on its inherent ability to reason effectively. Recent studies have shown that LLM struggles to evaluate itself and rectify its initial responses without any external feedbacks
+(Huang et al.,
+2023
+; Feng et al.,
+2023
+)
+.
+3
+Methodology
+3.1
+Overview
+Problem Formulation
+. To solve a reasoning problem by SLMs, we formulate it as a multi-step reasoning generation task, which breaks
+the problem into simpler sub-tasks. This is more effective than traditional CoT-based reasoning
+(Wei et al.,
+2022
+; Wang et al.,
+2023
+)
+, as it is much easier for SLMs to correctly generate one step than complete reasoning steps in a single inference. We leverage the Monte-Carlo Tree Search (MCTS) algorithm
+(Kocsis & Szepesvári,
+2006
+)
+to augment the target SLM for self-generating multi-step reasoning solutions.
+Formally, for a given problem
+$x$
+and a target SLM
+$M$
+, the MCTS augments
+$M$
+to incrementally build a search tree
+$\mathcal{T}$
+. As illustrated in Fig.
+3
+, the root node represents the question
+$x$
+, an edge represents an action
+$a$
+, each child node is an intermediate step
+$s$
+generated by
+$M$
+under the corresponding action. A path from the root node to a leaf node (denoted as
+$s_{d}$
+, also called a terminal node) constitutes a candidate solution trajectory
+$\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{d}$
+. From the search tree
+$\mathcal{T}$
+, we can extract a set of solution trajectories
+$\mathbb{T}=\{\mathbf{t}^{1},\mathbf{t}^{2},...,\mathbf{t}^{n}\}\ (n\geq 1)$
+. Our goal is to find the trajectories that can achieve the correct answer for the given question.
+Challenges in SLM Self-Improvement
+. MCTS allows an SLM to explore and evaluate multiple potential solutions. Ideally, by balancing exploration of new possibilities with the exploitation of high-reward actions, the SLM can gradually refine its reasoning steps to generate a final correct reasoning trajectory. However, due to the limited capabilities in SLMs, traditional MCTS yields minimal improvement. First, the vast solution space makes it challenging for SLMs to generate effective solutions. Existing MCTS-based methods
+(Hao et al.,
+2023
+; Kang et al.,
+2024
+)
+that use single actions limit diversity and struggle to generalize across tasks. Approaches like self-consistency
+(Wang et al.,
+2023
+)
+use random sampling to ensure diversity, but SLMs often produce poor-quality solutions, requiring many attempts to find a correct solution, thereby increasing inference costs.
+Second, it’s challenging to accurately reward each action. Without ground truth labels, it’s difficult to verify the correctness for each intermediate step
+$s_{i}$
+and the final answer in
+$s_{d}$
+. Majority voting in self-consistency requires most traces to be correct, which is often not the case for SLMs. Methods like RAP
+(Hao et al.,
+2023
+)
+use self-rewarding, but our study shows SLMs perform near-random self-rewarding (Appendix
+A.1
+). Training a reward model, as in M
+∗
+(Kang et al.,
+2024
+)
+, can address this challenge but faces difficulties in collecting training data and generalizing across various tasks.
+Overview
+.
+To address these challenges, this section introduces our methodology, rStar, which decomposes reasoning into solution generation and mutual verification in Fig.
+2
+. To tackle the first challenge, we introduce a richer set of human-like reasoning actions that allows for thorough space exploration across diverse reasoning tasks. To address the second challenge, we design an SLM-tailored reward function to evaluate intermediate steps, avoiding reliance on their often unreliable self-evaluations. Moreover, we use another SLM as a discriminator to augment the MCTS process, mutually verifying the correctness of each trajectory with the generator SLM.
+3.2
+Self-generating Reasoning Trajectory with MCTS Rollout
+Figure 3:
+An example to illustrate the process of self-generator. Highlighted nodes from top to bottom constitute a complete reasoning trace.
+Given a question, MCTS augments the target SLM to explore a rich, human-like reasoning action space and generate the next steps based on the current state.
+A Rich Set of Human-like Reasoning Actions
+. At the core of MCTS generation lies the action space, which defines the scope of tree exploration. Most MCTS-based methods use a single action type to build the tree. For instance, in RAP, the action is to propose the next sub-question, whereas in AlphaMath
+(Chen et al.,
+2024a
+)
+and MindStar
+(Kang et al.,
+2024
+)
+, the action is to generate the next reasoning step.
+However, relying on a single action type can easily lead to ineffective space exploration.
+To address this, we revisit how humans approach reasoning.
+Different people solve problems using diverse actions: some break into sub-questions, others solve it directly, and some might rephrase the problem to focus on key conditions. Moreover, people adjust their approach based on current states, choosing different actions as needed. Inspired by this human reasoning process, we introduce a richer set of 5 actions to maximize the SLM’s potential for correctly solving complex reasoning problems.
+$\diamond$
+A1
+: Propose a one-step thought
+. This action prompts the LLM to generate the next one-step thought for a given question, by considering the existing reasoning steps. Unlike the CoT, which generates complete thoughts, this approach simplifies the reasoning process and allows the LLM to perform better decision making
+(Yao et al.,
+2024
+; Besta et al.,
+2024
+)
+.
+$\diamond$
+A2
+: Propose the remaining thought steps.
+Instead of generating only one step thought per state, this action aligns with standard CoT, enabling “fast thinking” to solve simple question in fewer steps. Given the already generated reasoning steps, it prompts the LLM to directly produce the remaining steps until reaching the final answer.
+$\diamond$
+A3
+: Propose next sub-question along with its answer.
+This action is inspired by
+least-to-most prompting
+(Zhou et al.,
+2022
+)
+, which breaks down a complex problem into a series of simpler sub-questions and solves them sequentially. Following RAP’s implementation, we prompt the LLM to ask and then answer the next sub-question.
+$\diamond$
+A4
+: Answer the sub-question again.
+Considering that a sub-question might not be answered correctly by
+A3
+, we propose this action to re-answer it. To improve accuracy, this action prompts the LLM to use few-shot CoT. Note that the original answer generated by
+A3
+did not use a CoT-like prompt but instead followed the least-to-most problem decomposition prompt
+(Zhou et al.,
+2022
+)
+.
+$\diamond$
+A5
+: Rephrase the question/sub-question.
+When analyzing incorrect cases, we found that many of them are due to the LLM misunderstanding the question. For example, it might miss a specific condition provided in the question. Therefore, we propose a new action to rephrase the question more simply. Specifically, we prompt the LLM to clearly list all conditions given in the problem statement.
+Table 1:
+Ablation study on the effectiveness of our rich action space: we evaluate LLaMA3-8B on 200 sampled GSM8K questions.
+| Action Space | Accuracy |
+| --- | --- |
+| $A_{3}$ (i.e., RAP) | 70.5 |
+| $A_{3}$ + $A_{5}$ | 72.5 |
+| $A_{3}$ + $A_{4}$ + $A_{5}$ | 73.5 |
+| $A_{2}$ + $A_{3}$ + $A_{4}$ + $A_{5}$ | 74.0 |
+| All ($A_{1}$ + $A_{2}$ + $A_{3}$ + $A_{4}$ + $A_{5}$) | 75.0 |
+The above 5 actions define a highly diverse action space
+$\{A_{1},A_{2},A_{3},A_{4},A_{5}\}$
+.
+At each step
+$i$
+, MCTS selects an action
+$a_{i}$
+from this space. We then use this action
+$a_{i}$
+to prompt the LLM to generate the next reasoning step
+$s_{i}$
+, based on the current state, which is the previous generated trajectory
+$x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{i-1}$
+. Note that certain actions require orders. For example,
+A4
+can only happen after
+A3
+, and
+A5
+can only happen after the root question. As shown in Table
+1
+, each action plays a crucial role in improving the final reasoning accuracy.
+Reward Function.
+Another critical component in MCTS is the reward function, which evaluates the value of each action and directs the tree expansion.
+We design a simple yet effective reward function for SLMs. First, we exclude self-rewarding techniques for any intermediate nodes due to the limited capabilities of SLMs. Second, to ensure generalization across different reasoning tasks, we avoid introducing external supervision (e.g., tools or trained value models). Our approach draws inspiration from AlphaGo
+(Silver et al.,
+2017
+)
+, where we score each intermediate node based on its contribution to the final correct answer. Consequently, actions that frequently lead to correct answers receive higher rewards, making them more likely to be selected in future MCTS tree expansions.
+We define
+$Q(s,a)$
+as the reward value for node
+$s$
+generated under action
+$a$
+.
+Initially, all unexplored nodes are assigned
+$Q(s_{i},a_{i})=0$
+, leading to random tree expansions. Upon reaching the first terminal node
+$n_{d}$
+, we compute a reward score
+$Q(s_{d},a_{d})$
+based on whether it reaches the correct answer.
+This score is then back-propagated to each intermediate node along the trajectory
+$\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{d}$
+. Specifically, for each
+$s_{i}$
+(for
+$i=1,2,...,d-1$
+), its
+$Q$
+value is updated as follows:
+$Q(s_{i},a_{i})=Q(s_{i},a_{i})+Q(s_{d},a_{d})$
+. To compute the
+$Q(s_{d},a_{d})$
+for the terminal node, we use the likelihood (confidence) of self-consistency majority voting as the reward value.
+Figure 4:
+The prompt example for mutual reasoning consistency.
+Solution Generation with MCTS Rollout
+. We now describe how our MCTS generates candidate reasoning trajectories. Starting from the initial root node
+$s_{0}$
+, we perform multiple searches consisting of
+selection
+,
+expansion
+,
+simulations
+and
+back-propagation
+. Specifically, the simulation is performed using the default
+rollout
+policy, and to achieve more accurate reward estimation, we perform multiple rollouts. To balance the exploration and exploitation, we use the well-known Upper Confidence Bounds applied to Trees (UCT)
+(Kocsis & Szepesvári,
+2006
+)
+to select each node. This selection process is mathematically represented as:
+$$\text{UCT}(s,a)=\frac{Q(s,a)}{N(s,a)}+c\sqrt{\frac{\ln N_{parent}(s)}{N(s,a)}}.$$
+where
+$N(s,a)$
+is the number of times node
+$s$
+has been visited in previous iterations, and
+$N_{parent}(s)$
+represents the visiting count of the parent node of
+$s$
+.
+$Q(s,a)$
+is the estimated reward value and will be updated through back-propagation.
+$c$
+is a constant that balances exploitation and exploration.
+Once the search reaches a terminal node, either a terminal state or a predetermined maximum tree depth
+$d$
+, we obtain a trajectory from the root to terminal node. We collect all trajectories from the rollout iterations as candidate solutions. The next section explains how we verify each of them.
+3.3
+Reasoning Trajectory Selection with Mutual Consistency
+In traditional MCTS, typically only one trajectory is selected as the final solution based on a specific metric, such as choosing the path with the highest reward from the rollout iterations. Unfortunately, after trying various existing methods, we found it challenging to define a single metric that reliably selects the trajectory containing the correct answer.
+Therefore, we collect all trajectories and propose mutual reasoning consistency for answer selection.
+Mutual Reasoning Consistency by Discriminator SLM
+2
+. As shown in Fig.
+2
+, in addition to the target SLM
+$M$
+, we introduce another SLM
+$\hat{M}$
+to serve as a discriminator, providing external unsupervised feedback for each candidate trajectory.
+Specifically, for
+$\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{d}$
+, we mask the reasoning steps starting from a randomly sampled step
+$i$
+(
+$i<d$
+). We then provide the earlier reasoning trajectory
+$\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{i-1}$
+as a prompt to
+$\hat{M}$
+to complete the remaining steps for the question. Due to the provision of the earlier
+$i-1$
+reasoning steps as a hint, we reduce the difficulty, thereby increasing the likelihood that SLM
+$\hat{M}$
+can provide the correct answer.
+As shown in Fig.
+4
+, we compare whether the answer completed by
+$\hat{M}$
+matches the original trajectory
+$\mathbf{t}$
+. If they are consistent, we consider
+$\mathbf{t}$
+as a validated trajectory for final selection.
+We provide an intuitive explanation to illustrate the rationale behind our approach. Consider students solving a problem without a teacher’s feedback. A student (SLM
+1
+) unsure of their solution might ask a peer (SLM
+2
+) to review their reasoning. If the peer, given the same initial steps, arrives at the same answer, the student gains confidence in their solution. This peer verification process reflects the mutual reasoning consistency we aim to achieve.
+Final Trajectory Selection by SLM
+1
+. After applying mutual reasoning consistency to all candidate trajectories, we return to the target SLM
+$M$
+to select the final trajectory from the validated ones. We compute each trajectory’s final score by multiplying its reward with the terminal node’s confidence score achieved from rollouts. The trajectory with the highest final score is chosen as the solution.
+Table 2:
+rStar greatly improves reasoning accuracy across various SLMs and tasks. rStar (generator@maj): uses majority voting for answer verification to show the MCTS generator’s effectiveness.
+Method
+LLaMA2-7B
+Mistral-7B
+LLaMA3-8B
+LLaMA3-8B-Instruct
+Phi3-mini-4k
+GSM8K
+Zero-shot CoT
+1.44
+17.89
+22.66
+68.38
+20.17
+Few-shot CoT
+12.51
+36.46
+47.23
+74.53
+83.45
+SC@maj8
+15.31
+42.91
+54.21
+78.39
+86.35
+SC@maj64
+20.77
+52.84
+64.37
+83.24
+88.02
+SC@maj128
+23.05
+57.25
+67.55
+84.69
+88.68
+ToT
+12.96
+38.89
+36.01
+69.07
+79.68
+RAP
+24.34
+56.25
+57.99
+80.59
+81.88
+rStar (generator @maj)
+27.22
+64.59
+74.38
+88.70
+90.44
+rStar
+63.91
+81.88
+85.52
+91.13
+90.67
+GSM-Hard
+Zero-shot CoT
+0.83
+5.16
+6.44
+14.94
+33.73
+Few-shot CoT
+3.71
+13.57
+13.80
+25.63
+40.63
+SC@maj8
+4.39
+17.36
+18.20
+28.51
+42.00
+SC@maj64
+6.52
+22.59
+23.73
+30.33
+44.80
+SC@maj128
+6.89
+25.01
+25.47
+31.16
+45.56
+ToT
+2.35
+11.47
+10.61
+19.64
+32.68
+RAP
+7.28
+22.52
+18.95
+29.64
+40.94
+rStar (generator @maj)
+8.64
+29.26
+26.76
+33.35
+46.55
+rStar
+18.57
+37.91
+32.97
+37.53
+46.55
+SVAMP
+Zero-shot CoT
+8.90
+26.10
+40.20
+70.90
+84.70
+Few-shot CoT
+48.10
+72.80
+76.90
+89.20
+92.80
+SC@maj8
+49.90
+74.60
+79.10
+89.20
+93.50
+SC@maj64
+54.10
+76.70
+80.70
+90.50
+93.30
+SC@maj128
+54.50
+76.60
+80.80
+90.60
+93.70
+ToT
+33.40
+56.30
+62.20
+79.80
+84.90
+RAP
+41.00
+71.80
+73.10
+85.70
+91.50
+rStar (generator @maj)
+60.30
+83.10
+86.20
+91.89
+93.80
+rStar
+74.90
+86.40
+90.00
+94.29
+94.10
+StrategyQA
+Zero-shot CoT
+52.67
+57.20
+41.48
+57.21
+54.68
+Few-shot CoT
+58.82
+65.65
+64.05
+68.41
+63.61
+SC@maj8
+59.10
+65.50
+63.76
+68.26
+64.34
+SC@maj64
+58.51
+63.61
+63.46
+67.39
+62.74
+SC@maj128
+58.37
+62.01
+63.31
+66.67
+59.53
+ToT
+45.27
+55.75
+57.64
+60.41
+40.47
+RAP
+59.68
+64.48
+63.32
+68.71
+60.26
+rStar (generator @maj)
+61.57
+69.43
+65.50
+71.47
+65.50
+rStar
+67.25
+70.31
+67.69
+71.57
+67.25
+4
+Experiments
+4.1
+Setup
+Models and Datasets
+. rStar is a general approach applicable to various LLMs and reasoning tasks. We evaluate 5 SLMs: Phi3-mini (3.8B)
+(Abdin et al.,
+2024
+)
+, LLaMA2-7B, Mistral-7B
+(Jiang et al.,
+2023
+)
+, LLaMA3-8B, and LLaMA3-8B-Instruct
+(Meta,
+2024
+)
+. We test across 5 reasoning tasks, including 4 mathematical tasks (GSM8K
+(Cobbe et al.,
+2021
+)
+, GSM-Hard
+(Gao et al.,
+2022
+)
+, MATH
+(Hendrycks et al.,
+2021
+)
+, SVAMP
+(Patel et al.,
+2021
+)
+) and one commonsense reasoning task (StrategyQA
+(Geva et al.,
+2021
+)
+).
+Implementation Details.
+In the trajectory self-generation stage, we augment each target SLM with our MCTS, performing 32 rollouts. Except for MATH, where we set the depth
+$d$
+to 8, all other tasks have a
+$d$
+=5. Actions
+$A_{1}$
+and
+$A_{3}$
+have a maximum of 5 nodes per depth, while the other actions have a default node count of 1. In the trajectory discrimination stage, we use Phi3-mini-4k as the discriminator, which has only 3.8B parameters, for effective inference. Moreover, the discriminator performs inference in a parallelized manner, making the verification process highly efficient.
+Notably, when Phi3 is the target SLM, it performs self-discrimination. For a given trajectory, we randomly split it between 20% and 80% of its steps, providing the first half of the steps as input to the discriminator SLM, which then completes the remaining steps.
+Detailed prompts are available in appendix
+A.3
+.
+4.2
+Main Results
+Baselines
+. We compare rStar against three strong baseline types:
+(i)
+single-round CoT prompting
+, including zero-shot CoT
+(Kojima et al.,
+2022
+)
+and few-shot CoT
+(Wei et al.,
+2022
+)
+;
+(ii)
+multi-round CoT prompting
+using the widely adopted self-consistency (SC) method
+(Wang et al.,
+2023
+)
+. We sample answers 8, 64, and 128 times, employing majority voting for answer selection; and
+(iii)
+multi-round self-improving approaches
+. For this, we select ToT
+(Yao et al.,
+2024
+)
+and RAP
+(Hao et al.,
+2023
+)
+as baselines, using BFS and MCTS for tree search, respectively. Note that the action in ToT corresponds to our action
+$A_{1}$
+, while RAP corresponds to our action
+$A_{3}$
+. For the answer selection, we follow their original implementations.
+Results on diverse reasoning benchmarks
+. We start by evaluating the effectiveness of rStar on general reasoning benchmarks. Table
+2
+compares its accuracy with state-of-the-art baselines on diverse SLMs and reasoning datasets. To demonstrate the effectiveness of our generator, we also provide the accuracy of rStar (gen. @maj), which does not apply our discriminator and uses majority voting for answer verification. We highlight three key observations:
+(1)
+SLMs empowered with rStar demonstrate highly capable problem-solving abilities. For example, LLaMA2-7B originally had an accuracy of only 12.51% on GSM8K using few-shot CoT. However, with improvements from rStar, its accuracy increased to 63.91%, nearly matching the accuracy achieved with fine-tuning as shown in Fig.
+1
+. Similarly, Mistral with rStar can even outperform fine-tuned MetaMath by +4.18%. This improvement shows that SLMs already have strong reasoning capabilities but need guidance to generate and select the correct solutions.
+(2)
+rStar consistently improves the reasoning accuracy of various evaluated SLMs across different tasks to a state-of-the-art level. In contrast, none of the baseline approaches consistently perform well across all four benchmarks. For example, while SC excels in three mathematical tasks, it is less effective on the logical reasoning task of StrategyQA. Specifically, SC with more sampling can even lower the score on StrategyQA. RAP performs better than SC on StrategyQA but falls short compared to SC on most mathematical reasoning tasks.
+(3)
+Even without our proposed discriminator for reasoning trajectory verification, our MCTS generator demonstrates greater effectiveness in improving reasoning accuracy for SLMs compared to existing multi-round inference baselines. For example, rStar (generator @maj) achieves up to 2.88%–16.39% higher accuracy than RAP, 10.60%–38.37% higher accuracy than ToT, and 1.69%–7.34% higher accuracy than SC on the GSM8K dataset.
+Table 3:
+Reasoning performance comparison on the challenging MATH-500 dataset. Due to the extensive LaTeX syntax in the dataset, which is challenging for pre-trained LLMs in instruction following, we evaluate only on LLaMA3-8B-instruct and Phi3-Mini-4k-Instruct.
+Method
+LLaMA3-8b-Instruct
+Phi3-mini-4k
+Zeroshot CoT
+5.80
+3.60
+Fewshot CoT
+17.80
+32.20
+SC@maj8
+30.00
+40.40
+SC@maj64
+33.00
+45.20
+SC@maj128
+33.80
+45.60
+ToT
+13.60
+18.20
+RAP
+18.80
+27.80
+rStar (generator @maj)
+38.30
+48.40
+rStar
+42.94
+48.60
+Figure 5:
+Performance comparison on the GSM8K dataset under different numbers of rollouts. rStar can significantly improve reasoning accuracy with just 2 rollouts.
+Results on challenging mathematical dataset
+. We also evaluate the effectiveness of rStar on more challenging mathematical datasets. In particular, we select the GSM-Hard and MATH datasets. Following
+(Wang et al.,
+2024b
+; Lightman et al.,
+2023
+)
+, we use MATH-500, a subset of representative problems from the MATH dataset, to speed up the evaluation. As shown in Table
+2
+and Table
+3
+, rStar
+is capable of significantly improving the reasoning accuracy of SLMs on these challenging mathematical datasets.
+Remarkably, when compared to SOTA baselines, we observe significant improvements of up to +12.9% and +9.14% on GSM-Hard and MATH-500, respectively.
+4.3
+Ablation Study
+Table 4:
+Ablation study on the effectiveness of our MCTS generator. Ours+self-eval: we apply self-evaluation to prompt model for self-rewarding each intermediate action in our generator.
+Generator
+LLaMA3-8B
+LLaMA3-8B-Instruct
+GSM8K
+StrategyQA
+GSM8K
+StrategyQA
+Answer verification
+Answer verification
+Answer verification
+Answer verification
+Maj
+Ours
+Maj
+Ours
+Maj
+Ours
+Maj
+Ours
+RAP
+56.56
+57.31
+62.30
+64.63
+81.35
+84.69
+69.43
+70.60
+SC (@128)
+67.55
+85.06
+63.31
+65.65
+84.69
+89.99
+66.67
+68.56
+Ours+Self-eval
+70.28
+82.18
+65.07
+66.22
+88.07
+89.92
+69.28
+69.43
+Ours
+74.38
+85.52
+65.50
+67.69
+88.70
+91.13
+71.47
+71.57
+Table 5:
+Ablation study on discriminator effectiveness. We evaluate accuracy on GSM8K.
+Left
+: Our discriminator consistently outperforms others in verifying solution trajectories generated by different generators.
+Right
+: The ablation study on the choice of discriminator model.
+Discriminator
+LLaMA3-8B
+LLaMA3-8B-Instruct
+Generator
+Generator
+SC
+Ours
+SC
+Ours
+Maj
+67.55
+74.38
+84.69
+88.70
+Self-verification
+74.00
+75.52
+83.02
+86.63
+Ours
+85.06
+85.52
+89.99
+91.13
+Model
+Discriminator SLM
+Accuracy
+LLaMA3-8B-Instruct
+Maj
+88.70
+LLaMA3-8B-Instruct
+88.78
+LLaMA3.1-8B-Instruct
+89.52
+Phi3-Mini-Instruct
+91.13
+GPT-4 (2024-05-01)
+92.57
+Effectiveness under different rollouts
+. rStar uses a rollout policy for MCTS tree expansion. More rollouts generate more candidate solution trajectories but increase inference cost. In Fig.
+5
+, we compare the accuracy of SC, RAP, and our rStar across different rollouts on GSM8K. For SC, we sample solutions based on each number of rollouts and use majority voting to select the answer. We highlight two key observations:
+(1)
+Even with just 2 rollouts, rStar significantly improves reasoning accuracy for SLMs, demonstrating its effectiveness;
+(2)
+Both rStar and SC benefit from more rollouts, whereas RAP tends to saturate and even decline after 4 rollouts on LLaMA3-8B-Instruct. One reason is that the single-type action space in RAP limits the effective MCTS exploration.
+The effectiveness of MCTS generator
+. We compare our MCTS generator with three baselines: (i) the MCTS generator used in RAP; (ii) SC with 128 randomly sampled solutions; and (iii) our generator with Self-evaluation, a popular technique that self-evaluates the reward score for each action. Baseline (iii) specifically evaluates the effectiveness of our reward function.
+To isolate the impact of answer verification methods, each generator is evaluated under both majority voting and our discriminator for trajectory selection. As shown in Table
+4
+, our generator consistently outperforms the baseline generators across different answer verification methods. Moreover, we demonstrate the effectiveness of our SLM-tailored reward function, as self-evaluation reduces our generator’s accuracy.
+The effectiveness of discriminator
+. We set up two experiments for evaluation. First, we compare our discrimination approach with two baselines: the majority voting and self-verification
+(Weng et al.,
+2023
+)
+. Specifically, we follow the key idea in
+Weng et al. (
+2023
+)
+to prompt the SLM (i.e., generator SLM) to self-verify the correctness of each trajectory. To demonstrate the generalization ability of our discriminator, we used candidate solutions from different generators for evaluation.
+As shown in Table
+5
+(Left), our discriminator significantly improves reasoning accuracy when performing answer verification on trajectories generated by different generators. Similar to the previous self-evaluation experiment, self-verification on SLMs is ineffective.
+Second, we study the impact of discriminator model selection. Our current discriminator models are all Phi3-Mini-Instruct. We tested various LLMs, both stronger and weaker, as discriminators for LLaMA3-8B-Instruct. As shown in Table
+5
+(Right), the choice of discriminator model generally does not affect the effectiveness of our mutual reasoning consistency for answer verification. Notably, using the powerful GPT-4 as the discriminator only slightly improves performance (91.13% to 92.57%), demonstrating that mutual reasoning consistency can effectively verify answers using SLMs.
+5
+Conclusion
+In this work, we present rStar, a generator-discriminator self-play approach that significantly grows the reasoning capabilities of SLMs at inference time. Our approach reveals that SLMs, such as LLaMA2-7B, already exhibit strong reasoning capabilities prior to domain specialized supervised fine-tuning. rStar achieves state-of-the-art performance across five SLMs and five diverse reasoning tasks, substantially outperforming existing multi-round prompting and self-improvement approaches. Furthermore, we conduct extensive ablation studies and analysis, contributing to the development of more advanced SLM self-improved reasoning.
+References
+Abdin et al. (2024)
+Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah,
+Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl,
+et al.
+Phi-3 technical report: A highly capable language model locally on
+your phone.
+arXiv preprint arXiv:2404.14219
+, 2024.
+Besta et al. (2024)
+Maciej Besta, Nils Blach, Ales Kubicek, Robert Gerstenberger, Michal
+Podstawski, Lukas Gianinazzi, Joanna Gajda, Tomasz Lehmann, Hubert
+Niewiadomski, Piotr Nyczyk, et al.
+Graph of thoughts: Solving elaborate problems with large language
+models.
+In
+Proceedings of the AAAI Conference on Artificial
+Intelligence
+, volume 38, pp.  17682–17690, 2024.
+Brown et al. (2024)
+Bradley Brown, Jordan Juravsky, Ryan Ehrlich, Ronald Clark, Quoc V Le,
+Christopher Ré, and Azalia Mirhoseini.
+Large language monkeys: Scaling inference compute with repeated
+sampling.
+arXiv preprint arXiv:2407.21787
+, 2024.
+Chen et al. (2024a)
+Guoxin Chen, Minpeng Liao, Chengxi Li, and Kai Fan.
+Alphamath almost zero: process supervision without process,
+2024a.
+Chen et al. (2022)
+Wenhu Chen, Xueguang Ma, Xinyi Wang, and William W Cohen.
+Program of thoughts prompting: Disentangling computation from
+reasoning for numerical reasoning tasks.
+arXiv preprint arXiv:2211.12588
+, 2022.
+Chen et al. (2024b)
+Zixiang Chen, Yihe Deng, Huizhuo Yuan, Kaixuan Ji, and Quanquan Gu.
+Self-play fine-tuning converts weak language models to strong
+language models.
+arXiv preprint arXiv:2401.01335
+, 2024b.
+Cobbe et al. (2021)
+Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz
+Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano,
+et al.
+Training verifiers to solve math word problems.
+arXiv preprint arXiv:2110.14168
+, 2021.
+Ding et al. (2023)
+Ruomeng Ding, Chaoyun Zhang, Lu Wang, Yong Xu, Minghua Ma, Wei Zhang, Si Qin,
+Saravan Rajmohan, Qingwei Lin, and Dongmei Zhang.
+Everything of thoughts: Defying the law of penrose triangle for
+thought generation.
+arXiv preprint arXiv:2311.04254
+, 2023.
+Feng et al. (2023)
+Xidong Feng, Ziyu Wan, Muning Wen, Ying Wen, Weinan Zhang, and Jun Wang.
+Alphazero-like tree-search can guide large language model decoding
+and training.
+arXiv preprint arXiv:2309.17179
+, 2023.
+Forsman (2024)
+Anton Forsman.
+Analyzing the performance of self-refine on different large language
+models.
+2024.
+URL
+https://github.com/anforsm/self-refine/blob/main/report.pdf
+.
+Gao et al. (2022)
+Luyu Gao, Aman Madaan, Shuyan Zhou, Uri Alon, Pengfei Liu, Yiming Yang, Jamie
+Callan, and Graham Neubig.
+Pal: Program-aided language models.
+arXiv preprint arXiv:2211.10435
+, 2022.
+Gero et al. (2023)
+Zelalem Gero, Chandan Singh, Hao Cheng, Tristan Naumann, Michel Galley,
+Jianfeng Gao, and Hoifung Poon.
+Self-verification improves few-shot clinical information extraction.
+arXiv preprint arXiv:2306.00024
+, 2023.
+Geva et al. (2021)
+Mor Geva, Daniel Khashabi, Elad Segal, Tushar Khot, Dan Roth, and Jonathan
+Berant.
+Did aristotle use a laptop? a question answering benchmark with
+implicit reasoning strategies.
+Transactions of the Association for Computational Linguistics
+,
+9:346–361, 2021.
+URL
+https://huggingface.co/datasets/ChilleD/StrategyQA
+.
+Gou et al. (2023)
+Zhibin Gou, Zhihong Shao, Yeyun Gong, Yujiu Yang, Minlie Huang, Nan Duan,
+Weizhu Chen, et al.
+Tora: A tool-integrated reasoning agent for mathematical problem
+solving.
+arXiv preprint arXiv:2309.17452
+, 2023.
+Hao et al. (2023)
+Shibo Hao, Yi Gu, Haodi Ma, Joshua Jiahua Hong, Zhen Wang, Daisy Zhe Wang, and
+Zhiting Hu.
+Reasoning with language model is planning with world model.
+arXiv preprint arXiv:2305.14992
+, 2023.
+Hendrycks et al. (2021)
+Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric
+Tang, Dawn Song, and Jacob Steinhardt.
+Measuring mathematical problem solving with the math dataset.
+arXiv preprint arXiv:2103.03874
+, 2021.
+Huang et al. (2023)
+Jie Huang, Xinyun Chen, Swaroop Mishra, Huaixiu Steven Zheng, Adams Wei Yu,
+Xinying Song, and Denny Zhou.
+Large language models cannot self-correct reasoning yet.
+arXiv preprint arXiv:2310.01798
+, 2023.
+Jiang et al. (2023)
+Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford,
+Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel,
+Guillaume Lample, Lucile Saulnier, et al.
+Mistral 7b.
+arXiv preprint arXiv:2310.06825
+, 2023.
+Kang et al. (2024)
+Jikun Kang, Xin Zhe Li, Xi Chen, Amirreza Kazemi, and Boxing Chen.
+Mindstar: Enhancing math reasoning in pre-trained llms at inference
+time.
+arXiv preprint arXiv:2405.16265
+, 2024.
+Khot et al. (2022)
+Tushar Khot, Harsh Trivedi, Matthew Finlayson, Yao Fu, Kyle Richardson, Peter
+Clark, and Ashish Sabharwal.
+Decomposed prompting: A modular approach for solving complex tasks.
+arXiv preprint arXiv:2210.02406
+, 2022.
+Kocsis & Szepesvári (2006)
+Levente Kocsis and Csaba Szepesvári.
+Bandit based monte-carlo planning.
+volume 2006, pp.  282–293, 09 2006.
+ISBN 978-3-540-45375-8.
+doi:
+10.1007/11871842_29
+.
+Kojima et al. (2022)
+Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke
+Iwasawa.
+Large language models are zero-shot reasoners.
+Advances in neural information processing systems
+,
+35:22199–22213, 2022.
+Li et al. (2024)
+Chen Li, Weiqi Wang, Jingcheng Hu, Yixuan Wei, Nanning Zheng, Han Hu, Zheng
+Zhang, and Houwen Peng.
+Common 7b language models already possess strong math capabilities.
+arXiv preprint arXiv:2403.04706
+, 2024.
+Lightman et al. (2023)
+Hunter Lightman, Vineet Kosaraju, Yura Burda, Harri Edwards, Bowen Baker, Teddy
+Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe.
+Let’s verify step by step.
+arXiv preprint arXiv:2305.20050
+, 2023.
+Madaan et al. (2024)
+Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah
+Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al.
+Self-refine: Iterative refinement with self-feedback.
+Advances in Neural Information Processing Systems
+, 36, 2024.
+Meta (2024)
+Meta.
+Introducing meta llama3: The most capable openly available llm to
+date, 2024.
+URL
+https://ai.meta.com/blog/meta-llama-3/
+.
+Patel et al. (2021)
+Arkil Patel, Satwik Bhattamishra, and Navin Goyal.
+Are nlp models really able to solve simple math word problems?
+In
+Proceedings of the 2021 Conference of the North American
+Chapter of the Association for Computational Linguistics: Human Language
+Technologies
+, pp.  2080–2094, 2021.
+Roy & Roth (2015)
+Subhro Roy and Dan Roth.
+Solving General Arithmetic Word Problems.
+In
+Proc. of the Conference on Empirical Methods in Natural
+Language Processing (EMNLP)
+, 2015.
+URL
+http://cogcomp.org/papers/arithmetic.pdf
+.
+Silver et al. (2017)
+David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew
+Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore
+Graepel, et al.
+Mastering chess and shogi by self-play with a general reinforcement
+learning algorithm.
+arXiv preprint arXiv:1712.01815
+, 2017.
+Snell et al. (2024)
+Charlie Snell, Jaehoon Lee, Kelvin Xu, and Aviral Kumar.
+Scaling llm test-time compute optimally can be more effective than
+scaling model parameters, 2024.
+URL
+https://arxiv.org/abs/2408.03314
+.
+Valmeekam et al. (2022)
+Karthik Valmeekam, Alberto Olmo, Sarath Sreedharan, and Subbarao Kambhampati.
+Large language models still can’t plan (a benchmark for LLMs on
+planning and reasoning about change).
+In
+NeurIPS 2022 Foundation Models for Decision Making
+Workshop
+, 2022.
+URL
+https://openreview.net/forum?id=wUU-7XTL5XO
+.
+Wang et al. (2024a)
+Ke Wang, Houxing Ren, Aojun Zhou, Zimu Lu, Sichun Luo, Weikang Shi, Renrui
+Zhang, Linqi Song, Mingjie Zhan, and Hongsheng Li.
+Mathcoder: Seamless code integration in LLMs for enhanced
+mathematical reasoning.
+In
+The Twelfth International Conference on Learning
+Representations
+, 2024a.
+URL
+https://openreview.net/forum?id=z8TW0ttBPp
+.
+Wang et al. (2024b)
+Peiyi Wang, Lei Li, Zhihong Shao, R. X. Xu, Damai Dai, Yifei Li, Deli Chen,
+Y. Wu, and Zhifang Sui.
+Math-shepherd: Verify and reinforce llms step-by-step without human
+annotations, 2024b.
+Wang et al. (2023)
+Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc V Le, Ed H. Chi, Sharan Narang,
+Aakanksha Chowdhery, and Denny Zhou.
+Self-consistency improves chain of thought reasoning in language
+models.
+In
+The Eleventh International Conference on Learning
+Representations
+, 2023.
+URL
+https://openreview.net/forum?id=1PL1NIMMrw
+.
+Wei et al. (2022)
+Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V
+Le, Denny Zhou, et al.
+Chain-of-thought prompting elicits reasoning in large language
+models.
+Advances in Neural Information Processing Systems
+,
+35:24824–24837, 2022.
+Weng et al. (2023)
+Yixuan Weng, Minjun Zhu, Fei Xia, Bin Li, Shizhu He, Shengping Liu, Bin Sun,
+Kang Liu, and Jun Zhao.
+Large language models are better reasoners with self-verification.
+2023.
+Wu et al. (2024)
+Zhenyu Wu, Qingkai Zeng, Zhihan Zhang, Zhaoxuan Tan, Chao Shen, and Meng Jiang.
+Large language models can self-correct with minimal effort.
+arXiv preprint arXiv:2405.14092
+, 2024.
+Yao et al. (2024)
+Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and
+Karthik Narasimhan.
+Tree of thoughts: Deliberate problem solving with large language
+models.
+Advances in Neural Information Processing Systems
+, 36, 2024.
+Zhang et al. (2024)
+Di Zhang, Jiatong Li, Xiaoshui Huang, Dongzhan Zhou, Yuqiang Li, and Wanli
+Ouyang.
+Accessing gpt-4 level mathematical olympiad solutions via monte carlo
+tree self-refine with llama-3 8b.
+arXiv preprint arXiv:2406.07394
+, 2024.
+Zheng et al. (2023)
+Huaixiu Steven Zheng, Swaroop Mishra, Xinyun Chen, Heng-Tze Cheng, Ed H Chi,
+Quoc V Le, and Denny Zhou.
+Take a step back: Evoking reasoning via abstraction in large language
+models.
+arXiv preprint arXiv:2310.06117
+, 2023.
+Zhou et al. (2023)
+Aojun Zhou, Ke Wang, Zimu Lu, Weikang Shi, Sichun Luo, Zipeng Qin, Shaoqing Lu,
+Anya Jia, Linqi Song, Mingjie Zhan, et al.
+Solving challenging math word problems using gpt-4 code interpreter
+with code-based self-verification.
+arXiv preprint arXiv:2308.07921
+, 2023.
+Zhou et al. (2022)
+Denny Zhou, Nathanael Schärli, Le Hou, Jason Wei, Nathan Scales, Xuezhi
+Wang, Dale Schuurmans, Claire Cui, Olivier Bousquet, Quoc V Le, et al.
+Least-to-most prompting enables complex reasoning in large language
+models.
+In
+The Eleventh International Conference on Learning
+Representations
+, 2022.
+Zhou et al. (2024)
+Pei Zhou, Jay Pujara, Xiang Ren, Xinyun Chen, Heng-Tze Cheng, Quoc V Le, Ed H
+Chi, Denny Zhou, Swaroop Mishra, and Huaixiu Steven Zheng.
+Self-discover: Large language models self-compose reasoning
+structures.
+arXiv preprint arXiv:2402.03620
+, 2024.
+Appendix A
+Appendix
+A.1
+Experiments to evaluate the self-rewarding in SLMs
+Table 6:
+Analysis on the effectiveness of SLMs’ self-rewarding. The original
+$r_{1}$
+is a self-evaluation of the helpfulness of the new proposed subquestion, while
+$r_{2}$
+measures the confidence in answering the subquestion through self-consistency majority voting. Results show that replacing the self-evaluated
+$r_{1}$
+with random values does not significantly impact the final reasoning performance.
+Method
+LLaMA2-7B
+Mistral
+GSM8K
+RAP
+24.34
+56.25
+RAP + random
+$r_{1}$
+22.90
+55.50
+RAP + random
+$r_{2}$
+22.67
+49.66
+Multiarith
+RAP
+57.22
+91.11
+RAP + random
+$r_{1}$
+52.78
+90.56
+RAP + random
+$r_{2}$
+47.22
+81.11
+Ablation study on self-rewarding in RAP
+. RAP rewards both intermediate and terminal nodes. For each node generated by its action, it combines two scores,
+$r_{1}$
+and
+$r_{2}$
+, to determine the final reward score. Formally,
+$r = r_{1} \times r_{2}$
+.
+$r_{1}$
+is a self-evaluation score that evaluates the LLM’s own estimation of the helpfulness of the current node. Specifically, it prompts the LLM with the question "
+Is the new question useful
+?".
+$r_{2}$
+is the confidence of correctly answering the proposed new question, measured by self-consistency majority voting.
+To evaluate the effectiveness of self-rewarding in RAP, we replace
+$r_{1}$
+and
+$r_{2}$
+with random values sampled from $(0, 1)$ and re-run RAP on LLaMA2-7B and Mistral-7B. We select a challenging dataset, GSM8K, and an easy mathematical reasoning dataset, Multiarith
+(Roy & Roth,
+2015
+)
+, for evaluation.
+Table
+6
+compares the results with original RAP. We can see that replacing
+$r_{1}$
+with random values has minimal impact on RAP’s performance across different SLMs and datasets. However, replacing
+$r_{2}$
+with random values results in a noticeable drop in accuracy on Mistral and Multiarith. This indicates that self-evaluation
+$r_{1}$
+has minimal effect, suggesting that LLaMA2-7B and Mistral are essentially performing near-random self-evaluations.
+A.2
+Discussions
+Discussions on the importance of generator and discriminator
+. In our experiments, we found that on certain SLMs, the discriminator yields more significant improvement than the generator. For instance, on LLaMA2-7B, rStar (generator @maj) can improve accuracy by +4.17% on GSM8K, while our discriminator can further boost accuracy by +36.69%. However, both the generator and discriminator are crucial to final performance. The generator must effectively produce the correct solution first, and the discriminator must then provide reasonable feedback to select the correct solution.
+The importance of the generator and discriminator varies based on the SLM’s solution generation effectiveness. For LLaMA2-7B, the proportion of correct solution trajectories is low, leading to a lower majority voting score. In this case, the discriminator is crucial for identifying the correct solution, significantly boosting accuracy. Conversely, for stronger models like LLaMA3-8B-instruct, our generator produces a higher proportion of correct solutions. Therefore, improving the generator results in greater accuracy improvements.
+Table 7:
+Inference costs of rStar on GSM8K. We show the average number of inferences and generated tokens required to solve a question.
+LLaMA2-7B
+Mistral
+Avg. calls
+166.81
+148.90
+Avg. generated tokens
+367.1k
+348.6k
+Inference cost
+. rStar grows SLMs’ reasoning capabilities at inference time. The primary inference cost arises from our MCTS self-generator. Table
+7
+shows the average number of inferences and tokens generated for solving a GSM8K question after 32 rollouts.
+On LLaMA2-7B and Mistral, this averages 166 and 148 model calls to solve a question, respectively. Currently, completing the 32 rollouts for the entire GSM8K test set takes about 4.5 days on a single A100 GPU per model. These costs can be significantly reduced by distributing tasks across multiple GPUs or batching model calls within each rollout.
+A.3
+Prompt examples in MCTS
+Action 1. Propose a one-step thought.
+### Instruction:
+There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
+### Response:
+Let’s think step by step.
+Step 1: Identify the initial number of trees. The problem states there are 15 trees in the grove.
+Step 2: Identify the final number of trees. The problem states there will be 21 trees after the workers are done planting.
+Step 3: Subtract the initial number of trees from the final number of trees to find out how many trees were planted.
+Step 4: Therefore, the grove workers planted 21 (final number of trees) - 15 (initial number of trees) = 6 trees today.
+Step 5: The answer is 6.
+### Instruction:
+If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
+### Response:
+Let’s think step by step.
+Step 1: Start with the number of cars that are already in the parking lot, which is 3 cars.
+Step 2: Add the number of cars that arrive, which is 2 cars.
+Step 3: Add the numbers together. there are 3 cars + 2 cars = 5 cars in the parking lot.
+Step 4: The answer is 5.
+### Instruction:
+Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
+### Response:
+Let’s think step by step.
+Step 1: Add the number of chocolates Leah and her sister had initially. Leah had 32 chocolates and her sister had 42 chocolates. So, they had 32 + 42 = 74 chocolates in total.
+Step 2: Subtract the number of chocolates they ate from the total number of chocolates they had. They ate 35 chocolates. So, they have 74 - 35 = 39 chocolates left.
+Step 3: The answer is 39.
+### Instruction:
+Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
+### Response:
+Let’s think step by step.
+Step 1: Identify the initial number of lollipops. The problem states Jason had 20 lollipops.
+Step 2: Identify the final number of lollipops. The problem states Jason now has 12 lollipops.
+Step 3: Subtract the final number of lollipops from the initial number to find out how many lollipops Jason gave to Denny.
+Step 4: Therefore, Jason gave Denny 20 (initial number of lollipops) - 12 (final number of lollipops) = 8 lollipops.
+Step 5: The answer is 8.
+### Instruction:
+{user question}
+### Response:
+Let’s think step by step.
+Action 2: Propose the remaining thought steps /A4: Answer the sub-question again.
+### Instruction:
+There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
+### Response:
+Let’s think step by step. There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is: 6.
+### Instruction:
+If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
+### Response:
+Let’s think step by step. There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is: 5.
+### Instruction:
+Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
+### Response:
+Let’s think step by step. Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is: 39.
+### Instruction:
+Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
+### Response:
+Let’s think step by step. Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is: 8.
+### Instruction:
+Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?
+### Response:
+Let’s think step by step. Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is: 9.
+### Instruction:
+There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
+### Response:
+Let’s think step by step. There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is: 29.
+### Instruction:
+Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?
+### Response:
+Let’s think step by step. Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is: 33.
+### Instruction:
+Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
+### Response:
+Let’s think step by step. Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is: 8.
+### Instruction:
+{user question}
+### Response:
+Action 3: Propose next sub-question along with its answer.
+Given a question, please decompose it into sub-questions. For each sub-question, please answer it in a complete sentence, ending with "The answer is <a numeric answer>". When the original question is answerable, please start the subquestion with "Now we can answer the question: <original question>".
+Question 1: Four years ago, Kody was only half as old as Mohamed. If Mohamed is currently twice as 30 years old, how old is Kody?
+Question 1.1: How old is Mohamed currently?
+Answer 1.1: Mohamed is twice as old as 30 years, which means he is 30 * 2 = 60 years old.
+Question 1.2: What was Kody’s age four years ago, given that it was half of Mohamed’s age at that time?
+Answer 1.2: Four years ago, Mohamed was 60 - 4 = 56 years old, so Kody was half of that, which is 56 / 2 = 28 years old.
+Question 1.3: Now we can answer the question: How old is Kody?
+Answer 1.3: Kody is currently 28 + 4 = 32 years old. The answer is 32.
+Question 2: On a moonless night, three fireflies danced in the evening breeze. They were joined by four less than a dozen more fireflies before two of the fireflies flew away. How many fireflies remained?
+Question 2.1: How many fireflies joined?
+Answer 2.1: The fireflies were joined by four less than a dozen more fireflies, which are 12 - 4 = 8 fireflies. The answer is 8.
+Question 2.2: Now we can answer the question: How many fireflies remained?
+Answer 2.2: Three fireflies were dancing originally. They were joined by 8 fireflies before two of them flew away. So there were 3 + 8 - 2 = 9 remaining. The answer is 9.
+Question 3: Ali has four $10 bills and six $20 bills that he saved after working for Mr. James on his farm. Ali gives her sister half of the total money he has and uses 3/5 of the remaining amount of money to buy dinner. Calculate the amount of money he has after buying the dinner.
+Question 3.1: How much money does Ali have after giving half of his total money to his sister?
+Answer 3.1: Ali initially has four $10 bills and six $20 bills, totaling 4 * 10 + 6 * 20 = 160 dollars. Giving half of this to his sister leaves him with 160 / 2 = 80 dollars. The answer is 80.
+Question 3.2: How much money does Ali spend on dinner?
+Answer 3.2: Ali uses 3/5 of his remaining money, which is 80 dollars, to buy dinner. Therefore, he spends 80 * 3/5 = 48 dollars on dinner. The answer is 48.
+Question 3.3: Now we can answer the question: How much money does Ali have after buying the dinner?
+Answer 3.3: After buying the dinner, Ali has 80 - 48 = 32 dollars left. The answer is 32.
+Question 4: A car is driving through a tunnel with many turns. After a while, the car must travel through a ring that requires a total of 4 right-hand turns. After the 1st turn, it travels 5 meters. After the 2nd turn, it travels 8 meters. After the 3rd turn, it travels a little further and at the 4th turn, it immediately exits the tunnel. If the car has driven a total of 23 meters around the ring, how far did it have to travel after the 3rd turn?
+Question 4.1: How far did the car travel except for the 3rd turn?
+Answer 4.1: It travels 5 meters after the 1st, 8 meters after the 2nd, and 0 meters after the 4th turn. It’s a total of 5 + 8 + 0 = 13 meters. The answer is 13.
+Question 4.2: Now we can answer the question: How far did the car have to travel after the 3rd turn?
+Answer 4.2: The car has driven a total of 23 meters around the ring. It travels 13 meters except for the 3rd turn. So it has to travel 23 - 13 = 10 meters after the 3rd turn. The answer is 10.
+Question 5: {user question}
+Action 5: Rephrase the question/sub-question.
+You are an AI assistant to help me rephrase questions by splitting the question context into conditions. In your rephrased question, remember to fully express the information in the original question.
+Original Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
+Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Olivia starts with $23. Condition 2: She buys five bagels, each costing $3. Question: How much money does Olivia have remaining after her purchase?
+Original Question: Michael had 58 golf balls. On Tuesday, he lost 23 golf balls. On Wednesday, he lost 2 more. How many golf balls did he have at the end of Wednesday?
+Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Michael initially has 58 golf balls. Condition 2: On Tuesday, he loses 23 golf balls. Condition 3: On Wednesday, he loses 2 additional golf balls. Question: What is the total number of golf balls Michael has left at the end of Wednesday?
+Original Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
+Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Angelo and Melanie need to study 2 textbook chapters and 4 worksheets. Condition 2: They allocate 3 hours per textbook chapter and 1.5 hours per worksheet. Condition 3: Their daily study limit is 4 hours, with a 10-minute break every hour, three 10-minute snack breaks, and a 30-minute lunch break each day. Question: Over the next week, for how many days should they plan to study to cover all their materials?
+Original Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
+Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Leah has 32 chocolates. Condition 2: Her sister has 42 chocolates. Condition 3: Together, they consume 35 chocolates. Question: How many chocolates remain between them after they have eaten some?
+Original Question: There were nine computers in the server room. Five more computers were installed each day, from Monday to Thursday. How many computers are now in the server room?
+Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Initially, there are nine computers in the server room. Condition 2: Each day, from Monday to Thursday, five additional computers are installed. Question: What is the total number of computers in the server room after these installations?
+Original Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
+Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Jason starts with 20 lollipops. Condition 2: After giving some lollipops to Denny, Jason has 12 lollipops left. Question: How many lollipops did Jason give to Denny?
+Original Question: Sam bought a dozen boxes, each with 30 highlighter pens inside, for $10 each box. He rearranged five of these boxes into packages of six highlighters each and sold them for $3 per package. He sold the rest of the highlighters separately at the rate of three pens for $2. How much profit did he make in total, in dollars?
+Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Sam purchases a dozen boxes of highlighters, with each box containing 30 pens, at $10 per box. Condition 2: He repackages five boxes into packages of six highlighters, selling each package for $3. Condition 3: He sells the remaining highlighters at a rate of three for $2. Question: What is Sam’s total profit from these transactions?
+Original Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
+Rephrased Question: Given a list of conditions, please answer the question. Condition 1: Initially, there are 15 trees in the grove. Condition 2: Grove workers will add more trees to the grove today. Condition 3: After planting, the total number of trees in the grove will increase to 21. Question: How many trees did the grove workers plant today?
+Original Question: {user question}
+Rephrased Question:
+◄
+Feeling
+lucky?
+Conversion
+report
+Report
+an issue
+View original
+on arXiv
+►
\ No newline at end of file
diff --git a/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers.md b/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3ac1ad4988e055862a44bb360bd4693cd8112ea
--- /dev/null
+++ b/research/notes/240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers.md
@@ -0,0 +1,191 @@
+---
+title: '[2408.06195] Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers'
+id: 240806195-mutual-reasoning-makes-smaller-llms-stronger-problem-solvers
+tags:
+- deepread
+created: '2026-06-10T00:39:56.384617Z'
+source: https://arxiv.org/abs/2408.06195
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:39:56.384488Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2408.06195] Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers
+Computer Science > Computation and Language
+arXiv:2408.06195
+(cs)
+[Submitted on 12 Aug 2024]
+Title:
+Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers
+Authors:
+Zhenting Qi
+,
+Mingyuan Ma
+,
+Jiahang Xu
+,
+Li Lyna Zhang
+,
+Fan Yang
+,
+Mao Yang
+View a PDF of the paper titled Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers, by Zhenting Qi and 5 other authors
+View PDF
+HTML (experimental)
+Abstract:
+This paper introduces rStar, a self-play mutual reasoning approach that significantly improves reasoning capabilities of small language models (SLMs) without fine-tuning or superior models. rStar decouples reasoning into a self-play mutual generation-discrimination process. First, a target SLM augments the Monte Carlo Tree Search (MCTS) with a rich set of human-like reasoning actions to construct higher quality reasoning trajectories. Next, another SLM, with capabilities similar to the target SLM, acts as a discriminator to verify each trajectory generated by the target SLM. The mutually agreed reasoning trajectories are considered mutual consistent, thus are more likely to be correct. Extensive experiments across five SLMs demonstrate rStar can effectively solve diverse reasoning problems, including GSM8K, GSM-Hard, MATH, SVAMP, and StrategyQA. Remarkably, rStar boosts GSM8K accuracy from 12.51% to 63.91% for LLaMA2-7B, from 36.46% to 81.88% for Mistral-7B, from 74.53% to 91.13% for LLaMA3-8B-Instruct. Code will be available at
+this https URL
+.
+Subjects:
+Computation and Language (cs.CL)
+Cite as:
+arXiv:2408.06195
+[cs.CL]
+(or
+arXiv:2408.06195v1
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2408.06195
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Li Lyna Zhang [
+view email
+]
+[v1]
+Mon, 12 Aug 2024 14:42:13 UTC (1,140 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Mutual Reasoning Makes Smaller LLMs Stronger Problem-Solvers, by Zhenting Qi and 5 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2024-08
+Change to browse by:
+cs
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/241020285-swe-search-enhancing-software-agents-with-monte-carlo-tree-search-and-2.md b/research/notes/241020285-swe-search-enhancing-software-agents-with-monte-carlo-tree-search-and-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..37659dcfa3e528844394e6fccbf0d433cdf4502f
--- /dev/null
+++ b/research/notes/241020285-swe-search-enhancing-software-agents-with-monte-carlo-tree-search-and-2.md
@@ -0,0 +1,2144 @@
+---
+title: '[2410.20285] SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search
+  and Iterative Refinement'
+id: 241020285-swe-search-enhancing-software-agents-with-monte-carlo-tree-search-and-2
+tags:
+- deepread
+created: '2026-06-10T00:41:20.614811Z'
+source: https://ar5iv.labs.arxiv.org/html/2410.20285
+source_domain: ar5iv.labs.arxiv.org
+fetched_at: '2026-06-10T00:41:20.614654Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2410.20285] SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement
+SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement
+Antonis Antoniades
+1∗
+, Albert Örwall
+2
+,
+Kexun Zhang
+3
+,
+Yuxi Xie
+4
+, Anirudh Goyal
+5
+, William Wang
+1
+1
+University of California, Santa Barbara,
+2
+Moatless AI,
+3
+Carnegie Mellon University,
+4
+National University of Singapore,
+5
+Mila
+Denotes equal contribution.
+Correspondence to:
+antonis@ucsb.edu
+,
+albert@moatless.ai
+.
+Code:
+github.com/aorwall/moatless-tree-search
+Abstract
+Software engineers operating in complex and dynamic environments must continuously adapt to evolving requirements, learn iteratively from experience, and reconsider their approaches based on new insights. However, current large language model (LLM)-based software agents often rely on rigid processes and tend to repeat ineffective actions without the capacity to evaluate their performance or adapt their strategies over time. To address these challenges, we propose SWE-Search, a multi-agent framework that integrates Monte Carlo Tree Search (MCTS) with a self-improvement mechanism to enhance software agents’ performance on repository-level software tasks. SWE-Search extends traditional MCTS by incorporating a hybrid value function that leverages LLMs for both numerical value estimation and qualitative evaluation. This enables self-feedback loops where agents iteratively refine their strategies based on both quantitative numerical evaluations and qualitative natural language assessments of pursued trajectories. The framework includes a SWE-Agent for adaptive exploration, a Value Agent for iterative feedback, and a Discriminator Agent that facilitates multi-agent debate for collaborative decision-making. Applied to the SWE-bench benchmark, our approach demonstrates a 23% relative improvement in performance across five models compared to standard open-source agents without MCTS. Our analysis reveals how performance scales with increased search depth and identifies key factors that facilitate effective self-evaluation in software agents. This work highlights the potential of self-evaluation driven search techniques to enhance agent reasoning and planning in complex, dynamic software engineering environments.
+1
+Introduction
+Software engineering is a complex and iterative process involving exploration, problem-solving, and decision-making under uncertainty. Tasks such as debugging, feature development, and code refactoring require continuous assessment of different approaches, frequent backtracking, and the incorporation of new information. While machine learning has made progress in automating parts of this workflow
+(Li et al.,
+2022
+; OpenAI et al.,
+2024
+; Ouyang et al.,
+2022
+; Yang et al.,
+2024b
+)
+, replicating the adaptive and strategic behavior of human engineers remains a significant challenge. This is due to the inherently non-linear and iterative nature of software engineering, where engineers dynamically explore various solutions, refine strategies based on feedback, and collaborate to identify the most effective path forward. Current large language model (LLM)-based software agents
+(Xia et al.,
+2024
+; Zhang et al.,
+2024d
+)
+, while powerful, often struggle with complex, long-horizon tasks that require adaptive strategies and flexible reassessment over time. These agents can become trapped in repetitive patterns, limiting their effectiveness in tackling more intricate software engineering problems.
+To address these challenges, we introduce
+SWE-Search
+, a multi-agent system that replicates the adaptability, iterative learning, and collaborative decision-making of human engineers. SWE-Search is designed to address three critical needs in software engineering:
+Flexible Exploration and Adaptation
+: Engineering problems often require exploring multiple approaches and adapting strategies based on evolving information
+(Li et al.,
+2022
+)
+. SWE-Search’s SWE-Agent operates in a flexible state space, allowing it to fluidly transition between actions such as planning, searching, and editing. This design mirrors the way engineers backtrack and adjust their approach dynamically, ensuring the agent can revise its course when faced with new challenges or information, and points towards the direction of more general, open-ended systems
+(Wang et al.,
+2023
+; Ma et al.,
+2024a
+; Lu et al.,
+2024b
+; Faldor et al.,
+2024
+; Hu et al.,
+2024
+; Lu et al.,
+2024a
+)
+.
+Iterative Learning through Feedback
+: Effective engineering relies heavily on continuous testing and refinement. To replicate this, SWE-Search integrates a Monte Carlo Tree Search (MCTS)
+(Silver et al.,
+2016b
+)
+planning module paired with a Value Agent. The MCTS module balances exploration and exploitation to guide the agent through complex solution spaces. The Value Agent augments this process by providing both utility estimates and qualitative feedback, allowing the agent to iteratively improve its decision-making based on past experiences, similar to how engineers refine their work through feedback and debugging.
+Collaborative Decision-Making
+: Complex problems often benefit from diverse perspectives
+(Khan et al.,
+2024
+; Amayuelas et al.,
+2024
+; Du et al.,
+2023
+; Zhang et al.,
+2024c
+)
+. In SWE-Search, once a set of potential solutions is generated, the Discriminator Agent facilitates a multi-agent debate. Each agent advocates for different solutions by presenting arguments, which are critically evaluated by a judge agent. This process mirrors real-world engineering collaboration, where teams deliberate to refine and select the most robust solutions.
+The architecture of SWE-Search is designed to automate software engineering tasks through these adaptive, feedback-driven, and collaborative processes. The SWE-Agent serves as the system’s problem solver, operating in a dynamic environment where it can backtrack and adapt its actions as necessary. The MCTS Planning Module efficiently guides exploration and exploitation, ensuring that the agent balances the need for innovation with the need to focus on promising solutions. The Value Agent provides continual feedback, offering both quantitative assessments and qualitative insights, helping the agent refine its strategy iteratively. Finally, the Discriminator Agent ensures that the final decision is rigorously vetted through a multi-agent debate, simulating the collaborative decision-making processes commonly found in engineering teams.
+We evaluate SWE-Search on the SWE-bench benchmark, a comprehensive dataset from real-world open-source repositories. SWE-bench tests agents’ ability to resolve software issues by generating code patches that fix failing tests. SWE-Search demonstrates a
+$23\%$
+relative performance improvement across five models compared to standard open-source agents, highlighting the effectiveness of strategic search and iterative self-evaluation. Through detailed analysis, we explore how performance scales with increased search depth and identify key factors that enhance self-assessment in software agents. Our work demonstrates the potential of MCTS and iterative learning to improve agent reasoning and planning in dynamic, complex domains like software engineering, introducing a new paradigm for autonomous software development.
+2
+Related Work
+Search methods
+Various search approaches have been applied to Large Language Models (LLMs) to facilitate System 2
+(Kahneman,
+2011
+; Saha et al.,
+2024
+; Pan et al.,
+2023
+; Bounsi et al.,
+2024
+)
+thinking in non-linear reasoning structures. A critical feature of these approaches is their ability to backtrack. Unlike greedy processes
+(Black,
+2005
+)
+, search algorithms explore multiple branches at each step, potentially escaping paths that lead to dead ends. These methods differ in their strategies for exploring and memorizing possible choices, and in their heuristics for switching between them. Breadth-first search
+(Moore,
+1959
+)
+maintains all possible search paths, incurring significant memory and computational costs. Depth-first search
+(Cormen et al.,
+2009
+)
+, in contrast, prioritizes the most promising path in a more greedy manner. When applied to LLMs, these methods demonstrate a trade-off between diversity and quality in text generation
+(Yao et al.,
+2023
+)
+. The A$^{*}$ algorithm
+(Hart et al.,
+1968
+)
+combines aspects of breadth-first and greedy search to find optimal solutions using a predetermined evaluation function. In this work, we adopt Monte Carlo Tree Search (MCTS)
+(Silver et al.,
+2016b
+)
+, an advanced search algorithm that conducts statistical tree search without requiring dedicated evaluation heuristics for each state. MCTS has achieved impressive results in complex strategy games
+(Silver et al.,
+2016a
+)
+, protein folding
+(Jumper et al.,
+2021
+)
+, and algorithm discovery
+(Fawzi et al.,
+2022
+)
+.
+Software Agents
+Software agents are designed to perform autonomous actions within large codebases. Given a repository-level task, these agents typically locate relevant files and code segments before implementing necessary changes. We focus on the SWE-bench task
+(Jimenez et al.,
+2024
+)
+, which involves resolving real-world GitHub issues. Among the agents with disclosed technical details on SWE-bench,
+Yang et al. (
+2024b
+)
+introduced the concept of agent-computer interfaces with SWE-agent. OpenDevin
+(Wang et al.,
+2024b
+)
+presents a collection of community-driven agents, including CodeAct
+(Wang et al.,
+2024a
+)
+. The Agentless approach demonstrated competitive performance using a simple two-step process of localization and repair. AutoCodeRover
+(Zhang et al.,
+2024d
+)
+incorporated advanced code tools such as abstract syntax trees and spectrum-based fault localization. The Alibaba Lingma Agent
+(Ma et al.,
+2024b
+)
+introduced a search-based approach for repository exploration, followed by a structured editing phase. While effective, it constitutes a more hand-designed solution specifically designed to interface with the search functionality of their agent.
+3
+Methodology
+SWE-Search is a multi-agent system designed to tackle complex software engineering tasks by integrating dynamic planning, value estimation, and deliberative decision-making. The core motivation behind this method is to emulate the sophisticated, iterative workflows of human software engineers, where exploration, planning, and collaboration are crucial to solving intricate problems. By leveraging the strengths of Monte Carlo Tree Search (MCTS) for planning, a Value Agent for utility estimation and feedback, and a Discriminator Agent for final decision-making through debate, SWE-Search provides a comprehensive, adaptive framework capable of navigating and solving real-world software engineering challenges.
+SWE-Search consists of four primary components that work in synergy:
+SWE-Search Framework and Action Agent
+: Building on the moatless-tools framework
+(Örwall,
+2024
+)
+, SWE-Search operates in a dynamic code environment with a flexible state-space and a git-like commit tree structure. This design facilitates efficient backtracking to previous states, enabling the Action Agent to explore diverse solution trajectories. The adaptable state-space enhances the system’s ability to exploit the MCTS module effectively.
+Search Algorithm
+: The core of SWE-Search’s exploration strategy is based on a Monte Carlo Tree Search (MCTS) which uses a heuristic-based selection process similar to AlphaZero
+(Silver et al.,
+2016a
+)
+, specifically tailored for software engineering tasks. This modified MCTS algorithm effectively balances exploration and exploitation, helping the agent explore a diverse set of solutions and converge quickly on the most promising strategies.
+Value (Function) Agent
+: To approximate the utility of each observation, we employ an LLM-based value function, which in addition to outputting a value, also generates an explanation in natural language. This explanation can be leveraged to improve subsequent actions from parent nodes, enabling iterative self-improvement of the search process.
+Discriminator Agent
+: In the final stage of SWE-Search, the Discriminator Agent evaluates the solutions generated by the search process. Inspired by multi-agent debate frameworks
+Du et al. (
+2023
+); Khan et al. (
+2024
+); Amayuelas et al. (
+2024
+)
+, this agent engages in a structured debate, where multiple agents argue for or against the proposed solutions. The debate process not only surfaces diverse perspectives but also leads to a more rigorously justified final decision.
+This system architecture combines the strengths of dynamic action selection, strategic planning, and collaborative deliberation, creating a comprehensive tool capable of handling the complexity and iterative nature of software engineering tasks.
+3.1
+Problem Formulation
+Figure 1:
+SWE-Search Overview.
+Tree search.
+Each state is represented as a node from which the agent can expand from, and each corresponding action is presented as an edge.
+Evaluation.
+Uses all relevant context including trajectory information, file context, and executed tests, to provide a quantitative value estimation and qualitative explanation in natural language.
+Expansion.
+Nodes can be expanded using value function feedback from future actions.
+The task of the SWE agent can be formalized as a tuple $\mathcal{M}=(\mathcal{S},\mathcal{C},\mathcal{A},\mathcal{V},\mathcal{P},p_{0},\rho)$. Here, $\mathcal{S}$ represents the state space, encompassing all possible states such as the current context of the files the agent is working on and the overall status of the codebase. The context space, denoted as $\mathcal{C}$, includes metadata about the repository and the initial problem description. The value function $\mathcal{V}$ assigns a utility score to each state-action pair $O(a,t)$, guiding the agent’s decisions.
+The environment’s dynamics are defined by a context-dependent transition function $\mathcal{P}:\mathcal{S}\times\mathcal{A}\times\mathcal{C}\rightarrow\Delta(\mathcal{S})$, which models the evolution of the repository’s state after each action. The initial state distribution, $p_{0}:\mathcal{C}\rightarrow\Delta(\mathcal{S})$, specifies how the initial state depends on the given context, while $\rho\in\Delta(\mathcal{C})$ defines the distribution over contexts.
+Given an initial context $c\sim\rho$ and an initial state $s_{0}\sim p_{0}(\cdot\mid c)$, the SWE agent executes its policy $\pi:\mathcal{S}\times\mathcal{C}\rightarrow\Delta(\mathcal{A})$, which selects actions based on the current state and context. At each time step $t$, the agent takes an action $a_{t}\sim\pi(s_{t},c)$ and receives a corresponding reward $\mathcal{R}(s_{t},a_{t},c)$. The environment then transitions to a new state $s_{t+1}\sim\mathcal{P}(\cdot\mid s_{t},a_{t},c)$, and the agent continues to observe this updated state. Over time, this process generates a trajectory $\tau:=\{s_{t},a_{t},r_{t}\}_{t=0}^{T}$ as the agent interacts with the environment.
+The agent’s objective is to maximize the cumulative reward over the trajectory, which is captured by the value function $v(s_{t},a_{t},\{s_{i}\}_{i=0}^{t-1},\{a_{i}\}_{i=0}^{t-1})$. This value function depends not only on the current state and action but also on the history of previous states and actions, which deviates from the assumptions of a Markovian process. Formally, the agent seeks to maximize the expected cumulative reward, defined as:
+$$\max_{\pi}V^{T}(\rho)=\max_{\pi}\mathbb{E}_{\tau}\left[\sum_{t=0}^{T}\mathcal{R}(s_{t},a_{t},c)\mid c\sim\rho;\pi\right].$$
+This optimization captures the agent’s (in-context) process, as it adjusts its policy $\pi$ to achieve the highest expected return across multiple trajectories, considering both current and historical information.
+3.2
+SWE-Search Framework and Action Agent
+The SWE-Search Action Agent builds on the moatless-tools framework (Örwall, 2024). Its action space, $\mathcal{A}$, is organized as a two-tier hierarchy, comprising both action types and their corresponding specific actions. Formally, this can be expressed as $\mathcal{A}=\{(t,a)\mid t\in\mathcal{T},a\in\mathcal{A}_{t}\}$, where $\mathcal{T}$ represents the set of action types (e.g., Search, Plan, Edit), and $\mathcal{A}_{t}$ is the set of possible actions corresponding to each type $t$. These actions range from tool invocations and code modifications to the generation of structured text. To enhance the agent’s effectiveness in search-driven tasks, we introduced the following modifications:
+One key modification we implemented is the expansion of the
+Plan
+state, allowing it to transition flexibly to any other state, rather than being limited to transitioning only to
+Edit
+. This change is motivated by the need to enable more dynamic and adaptive problem-solving behaviors within the agent. In the context of software engineering, rigid state transitions can be overly restrictive, forcing the agent into predetermined pathways that may not always align with the complexities of real-world scenarios. For instance, during code modification tasks, an agent might recognize mid-process that further planning, additional searches, or different types of analysis are necessary before proceeding with edits. Restricting transitions only to editing would artificially constrain the agent, potentially leading it to suboptimal actions or causing it to become stuck in unproductive loops. By allowing transitions to any state, we empower the agent to adapt to new information as it arises (
+Fig.
+2
+), exploring a wider variety of trajectories. This enhanced flexibility reflects the iterative and often non-linear nature of real software engineering workflows, where engineers frequently revisit planning, testing, and research phases before committing to edits.
+Second, the agent is empowered to execute any tests within the codebase at its discretion, as well as to create and implement new tests. The results of these tests are incorporated into both the value function and the agent’s subsequent decision-making process. It is crucial to highlight that the tests required to resolve a given instance (i.e., fail-to-pass tests) are not explicitly revealed to the agent. However, the agent can leverage any pre-existing tests within the repository, simulating the behavior of a real-world software engineer.
+Footnote 1: This approach aligns with the practices of other SWE agents, and has been validated by the authors of SWE-bench, who confirmed its legitimacy as long as the fail-to-pass tests remain concealed from the model.
+3.3
+Value (Function) Agent
+The role of the Value Agent extends beyond simply estimating the expected utility of a given state-action pair $O_{n}(s_{n},a_{n})$. In addition to calculating the value $v_{n}$, the Value Agent generates a written explanation, denoted as $\varepsilon$. This explanation serves a dual purpose: it provides transparency into the decision-making process and functions as feedback for the Action Agent, which can leverage this explanation when re-expanding from the parent node of $O_{n}$ (see Figure 1, hindsight feedback). This approach enables the system to iteratively refine its decision-making process, mirroring how a human software engineer continuously re-evaluates their approach based on new information to improve their problem-solving strategy.
+The input to the value function consists of all state-action pairs up to and including the current state being evaluated, alongside specific instructions on how to assess the state. This allows the Value Agent to contextualize the decision within the trajectory, accounting for the sequence of actions and states leading up to the present. The final output of the value function can be formalized as:
+$$(v_{t},\varepsilon_{t})=V(s_{t},a_{t},\{s_{i}\}_{i=0\ldots t-1},\{a_{i}\}_{i=0\ldots t-1}) \tag{1}$$
+Here, $v_{t}$ represents the expected utility of the current state-action pair, while $\varepsilon_{t}$ is the accompanying explanation.
+In practice, the Value Agent is tasked with analyzing the entire trajectory leading up to the current state-action pair, providing not only the required utility estimate $v_{t}$, but also a detailed explanation $\varepsilon_{t}$. This explanation is critical for the agent’s overall performance, as it offers insight into the reasoning behind utility estimates, which in turn informs the Action Agent’s future decisions. We have observed that one of the key factors driving the effectiveness of the Value Agent lies in the clarity and specificity of these explanations. A well-articulated explanation can illuminate the strengths and limitations of different state types (e.g., Search, Edit, Plan), helping the Action Agent better understand which types of states are more promising or risky to pursue.
+By providing detailed feedback on the potential utility of different actions and contextualizing them within the broader trajectory, the Value Agent enables more informed and strategic decision-making by the Action Agent. This integration of both quantitative and qualitative feedback leads to improved performance and more adaptive behavior throughout the task (Fig. 4a).
+3.4
+Search Algorithm
+Our search tree is structured with nodes representing states $\mathcal{S}_{t}$ and edges representing actions $\mathcal{A}_{t}$. The search algorithm employed is a modified Monte Carlo Tree Search (MCTS), specifically adapted for the tasks of the SWE-Agent. Unlike prior approaches for web agents that utilize language models in the selection process
+Koh et al. (
+2024
+); Zhang et al. (
+2024b
+)
+, we deliberately choose not to rely on language models for node selection. Instead, we adopt a more straightforward heuristic-based selection function, similar to the approach used in AlphaZero
+Silver et al. (
+2016a
+;
+2018
+)
+. This decision is driven by the need for interpretability, efficiency, and the focus on tasks where heuristic-based exploration suffices to guide the agent effectively through complex software engineering environments.
+At the core of our algorithm is a modified Upper Confidence Bound for Trees (UCT) selection criterion
+Kocsis & Szepesvári (
+2006
+)
+, which determines the next node to expand. This criterion balances exploitation of known high-reward actions with exploration of less-visited states. We introduce additional terms to encourage strategic exploration early in the search process, and to penalize over-exploration at later stages when convergence on the optimal solution is desired. The modified UCT function is expressed as:
+$$UCT(s,a)=\mathit{exploitation}+\mathit{exploration}+\mathit{early\_depth\_bonus}-\mathit{late\_depth\_penalty} \tag{2}$$
+This can be expressed more formally as:
+$$UCT(s,a)=V(s,a)+C\sqrt{\frac{\ln N(s)}{N(s,a)}}+\alpha e^{-\beta(d-1)}-\gamma\sqrt{d} \tag{3}$$
+$V(s,a)$ is the value estimate of the state-action pair, $N(s,a)$ is the number of times the state-action pair $(s,a)$ has been visited, $N(s)$ is the visit count of state $s$, $d$ is the depth of the node in the search tree, and $C$, $\alpha$, $\beta$, and $\gamma$ are constants that control the balance between exploration, exploitation, and depth-dependent rewards and penalties.
+This formulation is inspired by the way software engineers explore potential solutions to a task. In practice, an engineer’s search process can be broken down into the following key phases, which our algorithm mirrors:
+Early Exploration: Initially, an engineer explores a wide variety of potential approaches to fully understand the problem and identify promising strategies. This is encouraged in our algorithm by the $\mathit{early\_depth\_bonus}$, represented by the term $\alpha e^{-\beta(d-1)}$, which rewards exploration at shallow depths, simulating the early phases of wide exploration.
+Convergence and Exploitation: As the engineer gains more information and narrows down the options, the focus shifts to exploiting the most effective solution paths. This transition is handled by the standard UCT exploitation term $V(s,a)$ and is further reinforced by the $\mathit{late\_depth\_penalty}$ ($-\gamma\sqrt{d}$), which discourages over-exploration as the agent delves deeper into the search tree.
+Quick Abandonment of Poor Strategies
+: Software engineers are also adept at abandoning poor strategies when new information indicates that a particular approach is not viable. We capture this behavior by implementing a simple heuristic rule that abandons nodes associated with consecutive low rewards, ensuring that the agent does not waste resources on unproductive trajectories.
+At each step, the node with the highest UCT value is selected for expansion, formalized as:
+$$s^{*}=\operatorname*{arg\,max}_{(s,a)}UCT(s,a) \tag{4}$$
+This approach effectively mimics the decision-making process of a software engineer, who balances exploration of potential strategies with a focus on converging towards the optimal solution, while remaining flexible enough to backtrack when necessary. By incorporating heuristic feedback and depth-based adjustments, the algorithm avoids getting stuck in unproductive paths and enhances the agent’s ability to identify high-reward strategies with minimal computational overhead (Appendix 6).
+3.4.1
+Discriminator Agent
+The final stage of SWE-Search involves the Discriminator Agent, whose role is to evaluate the candidate solutions generated by the search process and select the one most likely to resolve the issue at hand. This module accepts up to five final solutions produced by the search and engages in a multi-agent debate to determine the most promising option. Drawing inspiration from recent work on persuasive multi-agent debates
+(Khan et al.,
+2024
+; Amayuelas et al.,
+2024
+)
+, the Discriminator leverages the collective reasoning of multiple agents to ensure a more robust final selection. Configuration and hyperparameter details can be found in
+Table
+2
+.
+In this stage, agents are presented with the original problem statement and candidate solutions. They engage in a structured debate to determine the most effective solution, supporting their choices with logical reasoning and evidence from the search process. This debate encourages a thorough exploration of trade-offs between solutions, potentially uncovering strengths or weaknesses not evident during individual searches. Finally, a judge agent evaluates the arguments and selects the solution deemed most likely to resolve the issue. This process simulates the collaborative decision-making in software engineering teams, where diverse perspectives lead to a more thorough evaluation of candidate solutions, ultimately increasing the likelihood of identifying the most optimal outcome.
+The discriminator process not only enhances the robustness of the final solution but also adds transparency, as the reasoning behind the choice is clearly articulated and evaluated. This ensures that the selected solution is well-reasoned and thoroughly vetted before implementation.
+Figure 2:
+Hindsight feedback error correction.
+Instance sympy__sympy-15678, SWE-Search with Qwen2.5-72B-Instruct. Initially, the Action Agent performs edits and runs tests, which pass. It prematurely concludes the search. Without actually knowing the proposed solution does not resolve the issue, the Value Agent identifies potentially missed tests and assigns a low reward. Upon re-expansion using the Value Agent’s feedback, new tests fail, prompting the Action Agent to make additional edits, which result in a preferred solution which ultimately resolves the issue.
+4
+Experiments
+Benchmark
+For our experiments, we utilize SWE-bench Lite, a curated subset of the official SWE-bench, containing 300 instances. This dataset is specifically designed to be self-contained and focuses primarily on evaluating functional bug fixes, providing a controlled environment to assess the performance of our system.
+Evaluation Metrics
+We use two metrics: resolve rate (
+Pass@1
+) and
+Pass@5
+. Resolve rate is the percentage of issues successfully resolved, measuring overall effectiveness. Pass@5 is the percentage of issues where a correct solution is found within five attempts. This allows us to assess the efficiency of the search in identifying successful bug fixes within a limited number of iterations.
+Baselines
+Software agents leverage diverse tools, architectures, and models, leading to variability in their performance on subsets of the SWE-bench Lite dataset
+(Zhang et al.,
+2024a
+)
+. For comparison, we build upon the moatless-tools framework
+(Örwall,
+2024
+)
+, a high-performing open-source agent commonly used in research settings
+(Chowdhury et al.,
+2024
+)
+. To isolate the impact of our search approach, we adapt moatless-tools as our baseline, referred to as moatless-adapted. This allows us to fairly compare the performance of SWE-Search against moatless-adapted across various models, including two closed-source models (GPT-4o, GPT-4o-mini) and three open-source models (Qwen2.5-72B-Instruct
+(Yang et al.,
+2024a
+)
+, Llama-3.1-70B-Instruct
+(Dubey et al.,
+2024
+)
+, and DeepSeek-V2.5
+(DeepSeek-AI et al.,
+2024
+)
+). We also reference official moatless-tools GPT-4o results on SWE-bench Lite to ensure a fair and consistent comparison.
+Implementation Details
+For consistency, we use identical prompts across all models. In SWE-Search, we limit each node to a maximum of three expansions and cap the total search iterations at 100. Further details on model hyperparameters can be found in
+Appendix,
+2
+.
+Table 1:
+Resolve Rate Comparison, SWE-bench Lite
+Model
+Moatless-v1
+Moatless-adapted
+SWE-Search
+% $\Delta$
+GPT-4o
+24.3
+25.7
+31.0
++17
+GPT-4o-mini
+–
+13.0
+17.0
++24
+Qwen-2.5-72b-Instruct
+–
+18.0
+24.7
++27
+Deepseek-V2.5
+–
+16.3
+21.0
++22
+Llama-3.1-70b-Instruct
+–
+13.6
+17.7
++23
+Mean % $\Delta$
++23
+4.1
+Experimental Results
+4.1.1
+SWE-Search Surpasses all Corresponding Base Agents and Enables Smaller, Open Source Models to Approach GPT-4o
+On average, SWE-Search outperforms the baseline agent across all five models, achieving a 23% relative improvement
+(Table
+1
+)
+. Notably, SWE-Search with Qwen-2.5-72B-Instruct exceeds the performance of GPT-4o using the original Moatless-v1 framework, and closely matches its performance when compared with the Moatless-adapted agent, with only a slight difference ($\Delta=-1\%$). Interestingly, all five models demonstrate significant improvement when utilizing the proposed approach, with consistent gains across different models.
+4.1.2
+Search Enables Agents to Make Better Use of More Flexibility
+To prevent goal divergence, most agents, including moatless-tools, rely on strict transition rules, where state transitions follow predetermined sequences (e.g., Search $\rightarrow$ Identify, Plan $\rightarrow$ Edit). In moatless-adapted, we introduce a more flexible transition logic that allows a Plan state to transition into any other state type. This added flexibility has both advantages and drawbacks. On the positive side, it enables the agent to autonomously correct its trajectory without external feedback, particularly when the necessary adjustments span only a limited portion of the task. However, this increased flexibility also introduces the risk of the agent becoming trapped in infinite loops. Without a high-level control mechanism to detect and mitigate these situations, the agent may fail to recover from such loops. This trade-off is evident in the modest performance difference between Moatless-v1 and moatless-adapted, with a slight performance improvement of only 1.4% (Table 1).
+4.1.3
+Impact of Hindsight Feedback on Agent Performance
+One key advantage of utilizing LLMs as general value functions is their dual ability to provide both quantitative value estimates and qualitative assessments in natural language. These qualitative insights can significantly enhance the agent’s action generation and search process by offering detailed feedback on potential errors or overlooked aspects of the task. In practice, feedback was also crucial in eliciting diversity in the actions taken by the agent, as without it, the agent would often take very similar actions when re-expanding from a parent node.
+As shown in
+Figure
+2
+, this mechanism plays a critical role in improving the agent’s performance. During the initial expansion, the agent prematurely concludes that the task is complete. However, the value function correctly identifies gaps in the test coverage, specifically in addressing potential corner cases, and assigns a low reward. This feedback prompts the agent to re-expand the parent state, leading to the introduction of new tests, which subsequently fail. The agent then performs a series of edits (summarized in the figure for brevity), ultimately resolving the task correctly. Empirically, we observe that the instances unresolved by moatless-adapted but successfully solved by SWE-Search are often attributed to this search-and-feedback loop, where iterative feedback drives the agent toward a correct solution.
+4.2
+Importance of Comprehensive State Information for Value Function Performance
+Model
+Pass@1
+Pass@5
+GPT-4o
+31.0
+34.0
+GPT-4o-mini
+17.0
+22.3
+Qwen-2.5-72b-Instruct
+24.7
+25.7
+Deepseek-V2.5
+21.0
+23.3
+Llama-3.1-70b-Instruct
+21.0
+22.3
+Figure 3:
+SWE-bench SWE-Search results
+The effectiveness of SWE-Search hinges on the value function’s ability to accurately differentiate between desirable and undesirable states, and to provide actionable feedback that drives improvement. However, our experiments revealed that the value function sometimes failed to recognize critical decision points in the search tree. It frequently misinterpreted the purpose of certain actions, leading to the undervaluation of effective strategies by assigning low rewards. As shown in
+Figure
+4
+a
+, before the introduction of state-specific value prompts, the agent consistently assigned low rewards even when the Action Agent correctly identified the need for additional context, such as locating relevant files. This issue persisted despite the agent successfully identifying the files later. By implementing state-specific prompts across core state clusters (Searching, Planning, Editing), the value function became significantly more adept at interpreting the intent behind actions and evaluating their outcomes within each state. For further details on experiments distinguishing between effective and ineffective states, refer to
+Appendix
+8
+.
+Figure 4:
+(a) Importance of state-specific value prompts.
+On the left and right are the respective Value Agents’ outputs with and without state-specific prompts. While the action in both cases is effective in finding the right file, the non-state-specific scenario does not recognize this and assigns a low reward. On the contrary, the state-specific prompt correctly assigns a high reward to this state.
+(b) Performance scaling with search depth across different language models.
+The graph shows the number of issues resolved as a function of the number of transitions (search iterations) for all models used.
+Scaling SWE agents with Inference-time Compute
+The success of large language models (LLMs) has traditionally been attributed to the expansion of training data and model size, i.e., training-time compute
+(Wei et al.,
+2022
+; Chung et al.,
+2022
+)
+. Recently, researchers have started exploring how different methods scale with inference-time
+(OpenAI,
+2024
+; Snell et al.,
+2024
+; Dubey et al.,
+2024
+)
+. Here, we study the performance of software engineering agents through increased inference-time compute. As shown in
+Figure
+4
+b
+, increasing search iterations leads to a consistent rise in the number of resolved issues. To ensure experimental feasibility across the 300 instances in the SWE-bench Lite dataset, we applied conservative parameters (maximum iterations $=100$, maximum expansions per node $=3$). Approaches like SWE-Search enable the allocation of greater resources to specific challenges, such as addressing critical software vulnerabilities
+(Rigaki et al.,
+2024
+; Fang et al.,
+2024
+)
+, offering a scalable solution to complex tasks.
+Figure 5:
+(a) Value Function vs. Discriminator Comparison.
+Comparison of value function vs. discriminator ability to discern the final solution that resolved the issue when there is one. The discriminator performs better across all models except GPT-4o-mini. DeepSeek-V2.5 had the smallest disparity between the two methods, suggesting an ability to act as a well-calibrated value function.
+(b) Model-Specific Issue Resolution.
+Venn diagram of resolved issues by model. Each model can solve a handful of unique instances.
+Convergence of Value Function and Discriminator to Right Solution
+The search process can yield multiple proposed solutions. Ideally, the mean trajectory value of the proposed solution that resolves the issue will always be the highest, which would yield the ideal performance of the agent
+(Table
+3
+)
+. In practice, the value function successfully converged on the correct solution 73% of the time on average across the five models. The discriminator module performed even better, increasing the proportion of correct solutions selected to 84%. While in typical large action spaces, Monte Carlo Tree Search (MCTS) is run for thousands of iterations
+(Silver et al.,
+2016b
+)
+, the value function’s success rate remains impressive given the computational constraints. However, SWE-Search could further benefit from enhanced methods for identifying the correct solutions more consistently, allowing it to fully reach its potential.
+Different Models can Resolve Vastly Different Issue Subsets
+When comparing the resolved instances across the five models, we observed significant diversity in the subsets of issues each model successfully solved. As shown in
+Figure
+5
+, each model managed to resolve at least one unique instance. Notably, a surprising number of issues (33) were solved by other models but not by GPT-4o. This suggests that model diversity could play an important role, at least in the short term, in enhancing the performance of SWE-agents.
+5
+Discussion and Conclusion
+In this paper, we introduced SWE-Search, a general framework that integrates Monte Carlo Tree Search (MCTS) and qualitative feedback to enhance the performance of software engineering agents. The proposed approach demonstrated improvements over different baseline models, highlighting the potential of search-based methods in software engineering tasks.
+One of the key advantages of search-based approaches, as demonstrated in our work, is their ability to scale performance with increased inference-time compute. This flexibility allows the system to adapt to problems that require higher computational resources, such as discovering software vulnerabilities or even generating large codebases from scratch. Future research should focus on two main directions: (a) investigating how search agents scale with computational resources, and (b) expanding the application of software agent search to a broader range of complex use cases.
+Given that search techniques like MCTS closely resemble the problem-solving processes of human software engineers, we expect these methods to become increasingly prevalent in agent-driven systems. As the nature of software engineering tasks evolves, system architectures will need to become more fluid and adaptable, fully leveraging the potential of search-based techniques. This evolution will likely lead to the development of larger, more general agentic systems capable of tackling a wide array of software engineering challenges.
+References
+Amayuelas et al. (2024)
+Alfonso Amayuelas, Xianjun Yang, Antonis Antoniades, Wenyue Hua, Liangming Pan, and William Wang.
+Multiagent collaboration attack: Investigating adversarial attacks in large language model collaborations via debate, 2024.
+URL
+https://arxiv.org/abs/2406.14711
+.
+Black (2005)
+Paul E. Black.
+greedy algorithm, feb 2005.
+URL
+https://www.nist.gov/dads/HTML/greedyalgo.html
+.
+Accessed: TODAY.
+Bounsi et al. (2024)
+Wilfried Bounsi, Borja Ibarz, Andrew Dudzik, Jessica B. Hamrick, Larisa Markeeva, Alex Vitvitskyi, Razvan Pascanu, and Petar Veličković.
+Transformers meet neural algorithmic reasoners, 2024.
+URL
+https://arxiv.org/abs/2406.09308
+.
+Chowdhury et al. (2024)
+Neil Chowdhury, James Aung, Chan Jun Shern, Oliver Jaffe, Dane Sherburn, Giulio Starace, Evan Mays, Rachel Dias, Marwan Aljubeh, Mia Glaese, Carlos E. Jimenez, John Yang, Kevin Liu, and Aleksander Madry.
+Introducing SWE-bench verified, August 2024.
+URL
+https://openai.com/research/introducing-swe-bench-verified
+.
+OpenAI Blog.
+Chung et al. (2022)
+Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Alex Castro-Ros, Marie Pellat, Kevin Robinson, Dasha Valter, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei.
+Scaling instruction-finetuned language models, 2022.
+URL
+https://arxiv.org/abs/2210.11416
+.
+Cormen et al. (2009)
+Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein.
+Introduction to Algorithms, Third Edition
+.
+The MIT Press, 3rd edition, 2009.
+ISBN 0262033844.
+DeepSeek-AI et al. (2024)
+DeepSeek-AI, Qihao Zhu, Daya Guo, Zhihong Shao, Dejian Yang, Peiyi Wang, Runxin Xu, Y. Wu, Yukun Li, Huazuo Gao, Shirong Ma, Wangding Zeng, Xiao Bi, Zihui Gu, Hanwei Xu, Damai Dai, Kai Dong, Liyue Zhang, Yishi Piao, Zhibin Gou, Zhenda Xie, Zhewen Hao, Bingxuan Wang, Junxiao Song, Deli Chen, Xin Xie, Kang Guan, Yuxiang You, Aixin Liu, Qiushi Du, Wenjun Gao, Xuan Lu, Qinyu Chen, Yaohui Wang, Chengqi Deng, Jiashi Li, Chenggang Zhao, Chong Ruan, Fuli Luo, and Wenfeng Liang.
+Deepseek-coder-v2: Breaking the barrier of closed-source models in code intelligence, 2024.
+URL
+https://arxiv.org/abs/2406.11931
+.
+Du et al. (2023)
+Yilun Du, Shuang Li, Antonio Torralba, Joshua B. Tenenbaum, and Igor Mordatch.
+Improving factuality and reasoning in language models through multiagent debate, 2023.
+URL
+https://arxiv.org/abs/2305.14325
+.
+Dubey et al. (2024)
+Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, Anirudh Goyal, Anthony Hartshorn, Aobo Yang, Archi Mitra, Archie Sravankumar, Artem Korenev, Arthur Hinsvark, Arun Rao, Aston Zhang, Aurelien Rodriguez, Austen Gregerson, Ava Spataru, Baptiste Roziere, Bethany Biron, Binh Tang, Bobbie Chern, Charlotte Caucheteux, Chaya Nayak, Chloe Bi, Chris Marra, Chris McConnell, Christian Keller, Christophe Touret, Chunyang Wu, Corinne Wong, Cristian Canton Ferrer, Cyrus Nikolaidis, Damien Allonsius, Daniel Song, Danielle Pintz, Danny Livshits, David Esiobu, Dhruv Choudhary, Dhruv Mahajan, Diego Garcia-Olano, Diego Perino, Dieuwke Hupkes, Egor Lakomkin, Ehab AlBadawy, Elina Lobanova, Emily Dinan, Eric Michael Smith, Filip Radenovic, Frank Zhang, Gabriel Synnaeve, Gabrielle Lee, Georgia Lewis Anderson, Graeme Nail, Gregoire Mialon, Guan Pang, Guillem Cucurell, Hailey Nguyen, Hannah Korevaar, Hu Xu, Hugo Touvron, Iliyan Zarov,
+Imanol Arrieta Ibarra, Isabel Kloumann, Ishan Misra, Ivan Evtimov, Jade Copet, Jaewon Lee, Jan Geffert, Jana Vranes, Jason Park, Jay Mahadeokar, Jeet Shah, Jelmer van der Linde, Jennifer Billock, Jenny Hong, Jenya Lee, Jeremy Fu, Jianfeng Chi, Jianyu Huang, Jiawen Liu, Jie Wang, Jiecao Yu, Joanna Bitton, Joe Spisak, Jongsoo Park, Joseph Rocca, Joshua Johnstun, Joshua Saxe, Junteng Jia, Kalyan Vasuden Alwala, Kartikeya Upasani, Kate Plawiak, Ke Li, Kenneth Heafield, Kevin Stone, Khalid El-Arini, Krithika Iyer, Kshitiz Malik, Kuenley Chiu, Kunal Bhalla, Lauren Rantala-Yeary, Laurens van der Maaten, Lawrence Chen, Liang Tan, Liz Jenkins, Louis Martin, Lovish Madaan, Lubo Malo, Lukas Blecher, Lukas Landzaat, Luke de Oliveira, Madeline Muzzi, Mahesh Pasupuleti, Mannat Singh, Manohar Paluri, Marcin Kardas, Mathew Oldham, Mathieu Rita, Maya Pavlova, Melanie Kambadur, Mike Lewis, Min Si, Mitesh Kumar Singh, Mona Hassan, Naman Goyal, Narjes Torabi, Nikolay Bashlykov, Nikolay Bogoychev, Niladri Chatterji, Olivier
+Duchenne, Onur Çelebi, Patrick Alrassy, Pengchuan Zhang, Pengwei Li, Petar Vasic, Peter Weng, Prajjwal Bhargava, Pratik Dubal, Praveen Krishnan, Punit Singh Koura, Puxin Xu, Qing He, Qingxiao Dong, Ragavan Srinivasan, Raj Ganapathy, Ramon Calderer, Ricardo Silveira Cabral, Robert Stojnic, Roberta Raileanu, Rohit Girdhar, Rohit Patel, Romain Sauvestre, Ronnie Polidoro, Roshan Sumbaly, Ross Taylor, Ruan Silva, Rui Hou, Rui Wang, Saghar Hosseini, Sahana Chennabasappa, Sanjay Singh, Sean Bell, Seohyun Sonia Kim, Sergey Edunov, Shaoliang Nie, Sharan Narang, Sharath Raparthy, Sheng Shen, Shengye Wan, Shruti Bhosale, Shun Zhang, Simon Vandenhende, Soumya Batra, Spencer Whitman, Sten Sootla, Stephane Collot, Suchin Gururangan, Sydney Borodinsky, Tamar Herman, Tara Fowler, Tarek Sheasha, Thomas Georgiou, Thomas Scialom, Tobias Speckbacher, Todor Mihaylov, Tong Xiao, Ujjwal Karn, Vedanuj Goswami, Vibhor Gupta, Vignesh Ramanathan, Viktor Kerkez, Vincent Gonguet, Virginie Do, Vish Vogeti, Vladan Petrovic, Weiwei Chu,
+Wenhan Xiong, Wenyin Fu, Whitney Meers, Xavier Martinet, Xiaodong Wang, Xiaoqing Ellen Tan, Xinfeng Xie, Xuchao Jia, Xuewei Wang, Yaelle Goldschlag, Yashesh Gaur, Yasmine Babaei, Yi Wen, Yiwen Song, Yuchen Zhang, Yue Li, Yuning Mao, Zacharie Delpierre Coudert, Zheng Yan, Zhengxing Chen, Zoe Papakipos, Aaditya Singh, Aaron Grattafiori, Abha Jain, Adam Kelsey, Adam Shajnfeld, Adithya Gangidi, Adolfo Victoria, Ahuva Goldstand, Ajay Menon, Ajay Sharma, Alex Boesenberg, Alex Vaughan, Alexei Baevski, Allie Feinstein, Amanda Kallet, Amit Sangani, Anam Yunus, Andrei Lupu, Andres Alvarado, Andrew Caples, Andrew Gu, Andrew Ho, Andrew Poulton, Andrew Ryan, Ankit Ramchandani, Annie Franco, Aparajita Saraf, Arkabandhu Chowdhury, Ashley Gabriel, Ashwin Bharambe, Assaf Eisenman, Azadeh Yazdan, Beau James, Ben Maurer, Benjamin Leonhardi, Bernie Huang, Beth Loyd, Beto De Paola, Bhargavi Paranjape, Bing Liu, Bo Wu, Boyu Ni, Braden Hancock, Bram Wasti, Brandon Spence, Brani Stojkovic, Brian Gamido, Britt Montalvo, Carl
+Parker, Carly Burton, Catalina Mejia, Changhan Wang, Changkyu Kim, Chao Zhou, Chester Hu, Ching-Hsiang Chu, Chris Cai, Chris Tindal, Christoph Feichtenhofer, Damon Civin, Dana Beaty, Daniel Kreymer, Daniel Li, Danny Wyatt, David Adkins, David Xu, Davide Testuggine, Delia David, Devi Parikh, Diana Liskovich, Didem Foss, Dingkang Wang, Duc Le, Dustin Holland, Edward Dowling, Eissa Jamil, Elaine Montgomery, Eleonora Presani, Emily Hahn, Emily Wood, Erik Brinkman, Esteban Arcaute, Evan Dunbar, Evan Smothers, Fei Sun, Felix Kreuk, Feng Tian, Firat Ozgenel, Francesco Caggioni, Francisco Guzmán, Frank Kanayet, Frank Seide, Gabriela Medina Florez, Gabriella Schwarz, Gada Badeer, Georgia Swee, Gil Halpern, Govind Thattai, Grant Herman, Grigory Sizov, Guangyi, Zhang, Guna Lakshminarayanan, Hamid Shojanazeri, Han Zou, Hannah Wang, Hanwen Zha, Haroun Habeeb, Harrison Rudolph, Helen Suk, Henry Aspegren, Hunter Goldman, Ibrahim Damlaj, Igor Molybog, Igor Tufanov, Irina-Elena Veliche, Itai Gat, Jake Weissman, James
+Geboski, James Kohli, Japhet Asher, Jean-Baptiste Gaya, Jeff Marcus, Jeff Tang, Jennifer Chan, Jenny Zhen, Jeremy Reizenstein, Jeremy Teboul, Jessica Zhong, Jian Jin, Jingyi Yang, Joe Cummings, Jon Carvill, Jon Shepard, Jonathan McPhie, Jonathan Torres, Josh Ginsburg, Junjie Wang, Kai Wu, Kam Hou U, Karan Saxena, Karthik Prasad, Kartikay Khandelwal, Katayoun Zand, Kathy Matosich, Kaushik Veeraraghavan, Kelly Michelena, Keqian Li, Kun Huang, Kunal Chawla, Kushal Lakhotia, Kyle Huang, Lailin Chen, Lakshya Garg, Lavender A, Leandro Silva, Lee Bell, Lei Zhang, Liangpeng Guo, Licheng Yu, Liron Moshkovich, Luca Wehrstedt, Madian Khabsa, Manav Avalani, Manish Bhatt, Maria Tsimpoukelli, Martynas Mankus, Matan Hasson, Matthew Lennie, Matthias Reso, Maxim Groshev, Maxim Naumov, Maya Lathi, Meghan Keneally, Michael L. Seltzer, Michal Valko, Michelle Restrepo, Mihir Patel, Mik Vyatskov, Mikayel Samvelyan, Mike Clark, Mike Macey, Mike Wang, Miquel Jubert Hermoso, Mo Metanat, Mohammad Rastegari, Munish Bansal, Nandhini
+Santhanam, Natascha Parks, Natasha White, Navyata Bawa, Nayan Singhal, Nick Egebo, Nicolas Usunier, Nikolay Pavlovich Laptev, Ning Dong, Ning Zhang, Norman Cheng, Oleg Chernoguz, Olivia Hart, Omkar Salpekar, Ozlem Kalinli, Parkin Kent, Parth Parekh, Paul Saab, Pavan Balaji, Pedro Rittner, Philip Bontrager, Pierre Roux, Piotr Dollar, Polina Zvyagina, Prashant Ratanchandani, Pritish Yuvraj, Qian Liang, Rachad Alao, Rachel Rodriguez, Rafi Ayub, Raghotham Murthy, Raghu Nayani, Rahul Mitra, Raymond Li, Rebekkah Hogan, Robin Battey, Rocky Wang, Rohan Maheswari, Russ Howes, Ruty Rinott, Sai Jayesh Bondu, Samyak Datta, Sara Chugh, Sara Hunt, Sargun Dhillon, Sasha Sidorov, Satadru Pan, Saurabh Verma, Seiji Yamamoto, Sharadh Ramaswamy, Shaun Lindsay, Shaun Lindsay, Sheng Feng, Shenghao Lin, Shengxin Cindy Zha, Shiva Shankar, Shuqiang Zhang, Shuqiang Zhang, Sinong Wang, Sneha Agarwal, Soji Sajuyigbe, Soumith Chintala, Stephanie Max, Stephen Chen, Steve Kehoe, Steve Satterfield, Sudarshan Govindaprasad, Sumit Gupta,
+Sungmin Cho, Sunny Virk, Suraj Subramanian, Sy Choudhury, Sydney Goldman, Tal Remez, Tamar Glaser, Tamara Best, Thilo Kohler, Thomas Robinson, Tianhe Li, Tianjun Zhang, Tim Matthews, Timothy Chou, Tzook Shaked, Varun Vontimitta, Victoria Ajayi, Victoria Montanez, Vijai Mohan, Vinay Satish Kumar, Vishal Mangla, Vítor Albiero, Vlad Ionescu, Vlad Poenaru, Vlad Tiberiu Mihailescu, Vladimir Ivanov, Wei Li, Wenchen Wang, Wenwen Jiang, Wes Bouaziz, Will Constable, Xiaocheng Tang, Xiaofang Wang, Xiaojian Wu, Xiaolan Wang, Xide Xia, Xilun Wu, Xinbo Gao, Yanjun Chen, Ye Hu, Ye Jia, Ye Qi, Yenda Li, Yilin Zhang, Ying Zhang, Yossi Adi, Youngjin Nam, Yu, Wang, Yuchen Hao, Yundi Qian, Yuzi He, Zach Rait, Zachary DeVito, Zef Rosnbrick, Zhaoduo Wen, Zhenyu Yang, and Zhiwei Zhao.
+The llama 3 herd of models, 2024.
+URL
+https://arxiv.org/abs/2407.21783
+.
+Faldor et al. (2024)
+Maxence Faldor, Jenny Zhang, Antoine Cully, and Jeff Clune.
+Omni-epic: Open-endedness via models of human notions of interestingness with environments programmed in code, 2024.
+URL
+https://arxiv.org/abs/2405.15568
+.
+Fang et al. (2024)
+Richard Fang, Rohan Bindu, Akul Gupta, Qiusi Zhan, and Daniel Kang.
+Teams of llm agents can exploit zero-day vulnerabilities, 2024.
+URL
+https://arxiv.org/abs/2406.01637
+.
+Fawzi et al. (2022)
+A. Fawzi, M. Balog, A. Huang, T. Hubert, B. Romera-Paredes, M. Barekatain, A. Novikov, F. J. R. Ruiz, J. Schrittwieser, G. Swirszcz, D. Silver, D. Hassabis, and P. Kohli.
+Discovering faster matrix multiplication algorithms with reinforcement learning.
+Nature
+, 610(7930):47–53, 2022.
+doi:
+10.1038/s41586-022-05172-4
+.
+Hart et al. (1968)
+Peter E. Hart, Nils J. Nilsson, and Bertram Raphael.
+A formal basis for the heuristic determination of minimum cost paths.
+IEEE Trans. Syst. Sci. Cybern.
+, 4(2):100–107, 1968.
+doi:
+10.1109/TSSC.1968.300136
+.
+URL
+https://doi.org/10.1109/TSSC.1968.300136
+.
+Hu et al. (2024)
+Shengran Hu, Cong Lu, and Jeff Clune.
+Automated design of agentic systems, 2024.
+URL
+https://arxiv.org/abs/2408.08435
+.
+Jimenez et al. (2024)
+Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan.
+Swe-bench: Can language models resolve real-world github issues?, 2024.
+URL
+https://arxiv.org/abs/2310.06770
+.
+Jumper et al. (2021)
+J. Jumper, R. Evans, A. Pritzel, T. Green, M. Figurnov, O. Ronneberger, K. Tunyasuvunakool, R. Bates, A. Žídek, A. Potapenko, A. Bridgland, C. Meyer, S. A. A. Kohl, A. J. Ballard, A. Cowie, B. Romera-Paredes, S. Nikolov, R. Jain, J. Adler, T. Back, S. Petersen, D. Reiman, E. Clancy, M. Zielinski, M. Steinegger, M. Pacholska, T. Berghammer, S. Bodenstein, D. Silver, O. Vinyals, A. W. Senior, K. Kavukcuoglu, P. Kohli, and D. Hassabis.
+Highly accurate protein structure prediction with AlphaFold.
+Nature
+, 596(7873):583–589, 2021.
+doi:
+10.1038/s41586-021-03819-2
+.
+Kahneman (2011)
+Daniel Kahneman.
+Thinking, fast and slow
+.
+Farrar, Straus and Giroux, New York, NY, US, 2011.
+ISBN 978-0-374-27563-1.
+Khan et al. (2024)
+Akbir Khan, John Hughes, Dan Valentine, Laura Ruis, Kshitij Sachan, Ansh Radhakrishnan, Edward Grefenstette, Samuel R. Bowman, Tim Rocktäschel, and Ethan Perez.
+Debating with more persuasive llms leads to more truthful answers, 2024.
+URL
+https://arxiv.org/abs/2402.06782
+.
+Kocsis & Szepesvári (2006)
+Levente Kocsis and Csaba Szepesvári.
+Bandit based monte-carlo planning.
+In Johannes Fürnkranz, Tobias Scheffer, and Myra Spiliopoulou (eds.),
+Machine Learning: ECML 2006
+, pp.  282–293, Berlin, Heidelberg, 2006. Springer Berlin Heidelberg.
+ISBN 978-3-540-46056-5.
+Koh et al. (2024)
+Jing Yu Koh, Stephen McAleer, Daniel Fried, and Ruslan Salakhutdinov.
+Tree search for language model agents, 2024.
+URL
+https://arxiv.org/abs/2407.01476
+.
+Li et al. (2022)
+Yujia Li, David Choi, Junyoung Chung, Nate Kushman, Julian Schrittwieser, Rémi Leblond, Tom Eccles, James Keeling, Felix Gimeno, Agustin Dal Lago, Thomas Hubert, Peter Choy, Cyprien de Masson d’Autume, Igor Babuschkin, Xinyun Chen, Po-Sen Huang, Johannes Welbl, Sven Gowal, Alexey Cherepanov, James Molloy, Daniel J. Mankowitz, Esme Sutherland Robson, Pushmeet Kohli, Nando de Freitas, Koray Kavukcuoglu, and Oriol Vinyals.
+Competition-level code generation with alphacode.
+Science
+, 378(6624):1092–1097, 2022.
+doi:
+10.1126/science.abq1158
+.
+URL
+https://www.science.org/doi/abs/10.1126/science.abq1158
+.
+Lu et al. (2024a)
+Chris Lu, Cong Lu, Robert Tjarko Lange, Jakob Foerster, Jeff Clune, and David Ha.
+The ai scientist: Towards fully automated open-ended scientific discovery, 2024a.
+URL
+https://arxiv.org/abs/2408.06292
+.
+Lu et al. (2024b)
+Cong Lu, Shengran Hu, and Jeff Clune.
+Intelligent go-explore: Standing on the shoulders of giant foundation models, 2024b.
+URL
+https://arxiv.org/abs/2405.15143
+.
+Ma et al. (2024a)
+Yecheng Jason Ma, William Liang, Guanzhi Wang, De-An Huang, Osbert Bastani, Dinesh Jayaraman, Yuke Zhu, Linxi Fan, and Anima Anandkumar.
+Eureka: Human-level reward design via coding large language models, 2024a.
+URL
+https://arxiv.org/abs/2310.12931
+.
+Ma et al. (2024b)
+Yingwei Ma, Qingping Yang, Rongyu Cao, Binhua Li, Fei Huang, and Yongbin Li.
+How to understand whole software repository?, 2024b.
+URL
+https://arxiv.org/abs/2406.01422
+.
+Moore (1959)
+E.F. Moore.
+The Shortest Path Through a Maze
+.
+Bell Telephone System. Technical publications. monograph. Bell Telephone System., 1959.
+URL
+https://books.google.com/books?id=IVZBHAAACAAJ
+.
+OpenAI (2024)
+OpenAI.
+OpenAI o1 System Card, September 2024.
+URL
+https://openai.com/research/o1-system-card
+.
+Online report.
+OpenAI et al. (2024)
+OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo
+Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan
+Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O’Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez,
+Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia
+Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph.
+Gpt-4 technical report, 2024.
+URL
+https://arxiv.org/abs/2303.08774
+.
+Örwall (2024)
+Albert Örwall.
+Moatless tools, jun 2024.
+URL
+https://github.com/aorwall/moatless-tools
+.
+Accessed: 2024-07-16.
+Ouyang et al. (2022)
+Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe.
+Training language models to follow instructions with human feedback, 2022.
+URL
+https://arxiv.org/abs/2203.02155
+.
+Pan et al. (2023)
+Liangming Pan, Alon Albalak, Xinyi Wang, and William Yang Wang.
+Logic-lm: Empowering large language models with symbolic solvers for faithful logical reasoning, 2023.
+URL
+https://arxiv.org/abs/2305.12295
+.
+Rigaki et al. (2024)
+Maria Rigaki, Carlos Catania, and Sebastian Garcia.
+Hackphyr: A local fine-tuned llm agent for network security environments, 2024.
+URL
+https://arxiv.org/abs/2409.11276
+.
+Saha et al. (2024)
+Swarnadeep Saha, Archiki Prasad, Justin Chih-Yao Chen, Peter Hase, Elias Stengel-Eskin, and Mohit Bansal.
+System-1.x: Learning to balance fast and slow planning with language models, 2024.
+URL
+https://arxiv.org/abs/2407.14414
+.
+Silver et al. (2016a)
+David Silver, Aja Huang, Chris J. Maddison, Arthur Guez, Laurent Sifre, George van den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, Sander Dieleman, Dominik Grewe, John Nham, Nal Kalchbrenner, Ilya Sutskever, Timothy Lillicrap, Madeleine Leach, Koray Kavukcuoglu, Thore Graepel, and Demis Hassabis.
+Mastering the game of go with deep neural networks and tree search.
+Nature
+, 529(7587):484–489, 1 2016a.
+ISSN 1476-4687.
+doi:
+10.1038/nature16961
+.
+URL
+https://doi.org/10.1038/nature16961
+.
+Silver et al. (2016b)
+David Silver, Aja Huang, Chris J. Maddison, Arthur Guez, Laurent Sifre, George van den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Vedavyas Panneershelvam, Marc Lanctot, Sander Dieleman, Dominik Grewe, John Nham, Nal Kalchbrenner, Ilya Sutskever, Timothy P. Lillicrap, Madeleine Leach, Koray Kavukcuoglu, Thore Graepel, and Demis Hassabis.
+Mastering the game of go with deep neural networks and tree search.
+Nat.
+, 529(7587):484–489, 2016b.
+doi:
+10.1038/NATURE16961
+.
+URL
+https://doi.org/10.1038/nature16961
+.
+Silver et al. (2018)
+David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore Graepel, Timothy Lillicrap, Karen Simonyan, and Demis Hassabis.
+A general reinforcement learning algorithm that masters chess, shogi, and go through self-play.
+Science
+, 362(6419):1140–1144, 2018.
+doi:
+10.1126/science.aar6404
+.
+URL
+https://www.science.org/doi/abs/10.1126/science.aar6404
+.
+Snell et al. (2024)
+Charlie Snell, Jaehoon Lee, Kelvin Xu, and Aviral Kumar.
+Scaling llm test-time compute optimally can be more effective than scaling model parameters, 2024.
+URL
+https://arxiv.org/abs/2408.03314
+.
+Wang et al. (2023)
+Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, and Anima Anandkumar.
+Voyager: An open-ended embodied agent with large language models, 2023.
+URL
+https://arxiv.org/abs/2305.16291
+.
+Wang et al. (2024a)
+Xingyao Wang, Yangyi Chen, Lifan Yuan, Yizhe Zhang, Yunzhu Li, Hao Peng, and Heng Ji.
+Executable code actions elicit better llm agents, 2024a.
+URL
+https://arxiv.org/abs/2402.01030
+.
+Wang et al. (2024b)
+Xingyao Wang, Boxuan Li, Yufan Song, Frank F. Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, Hoang H. Tran, Fuqiang Li, Ren Ma, Mingzhang Zheng, Bill Qian, Yanjun Shao, Niklas Muennighoff, Yizhe Zhang, Binyuan Hui, Junyang Lin, Robert Brennan, Hao Peng, Heng Ji, and Graham Neubig.
+Opendevin: An open platform for ai software developers as generalist agents, 2024b.
+URL
+https://arxiv.org/abs/2407.16741
+.
+Wei et al. (2022)
+Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, et al.
+Emergent abilities of large language models.
+arXiv preprint arXiv:2206.07682
+, 2022.
+Xia et al. (2024)
+Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang.
+Agentless: Demystifying llm-based software engineering agents, 2024.
+URL
+https://arxiv.org/abs/2407.01489
+.
+Yang et al. (2024a)
+An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, Guanting Dong, Haoran Wei, Huan Lin, Jialong Tang, Jialin Wang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Ma, Jianxin Yang, Jin Xu, Jingren Zhou, Jinze Bai, Jinzheng He, Junyang Lin, Kai Dang, Keming Lu, Keqin Chen, Kexin Yang, Mei Li, Mingfeng Xue, Na Ni, Pei Zhang, Peng Wang, Ru Peng, Rui Men, Ruize Gao, Runji Lin, Shijie Wang, Shuai Bai, Sinan Tan, Tianhang Zhu, Tianhao Li, Tianyu Liu, Wenbin Ge, Xiaodong Deng, Xiaohuan Zhou, Xingzhang Ren, Xinyu Zhang, Xipin Wei, Xuancheng Ren, Xuejing Liu, Yang Fan, Yang Yao, Yichang Zhang, Yu Wan, Yunfei Chu, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, Zhifang Guo, and Zhihao Fan.
+Qwen2 technical report, 2024a.
+URL
+https://arxiv.org/abs/2407.10671
+.
+Yang et al. (2024b)
+John Yang, Carlos E. Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, Karthik Narasimhan, and Ofir Press.
+Swe-agent: Agent-computer interfaces enable automated software engineering, 2024b.
+URL
+https://arxiv.org/abs/2405.15793
+.
+Yao et al. (2023)
+Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan.
+Tree of thoughts: Deliberate problem solving with large language models.
+In Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz Hardt, and Sergey Levine (eds.),
+Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023
+, 2023.
+URL
+http://papers.nips.cc/paper_files/paper/2023/hash/271db9922b8d1f4dd7aaef84ed5ac703-Abstract-Conference.html
+.
+Zhang et al. (2024a)
+Kexun Zhang, Weiran Yao, Zuxin Liu, Yihao Feng, Zhiwei Liu, Rithesh Murthy, Tian Lan, Lei Li, Renze Lou, Jiacheng Xu, Bo Pang, Yingbo Zhou, Shelby Heinecke, Silvio Savarese, Huan Wang, and Caiming Xiong.
+Diversity empowers intelligence: Integrating expertise of software engineering agents, 2024a.
+URL
+https://arxiv.org/abs/2408.07060
+.
+Zhang et al. (2024b)
+Yao Zhang, Zijian Ma, Yunpu Ma, Zhen Han, Yu Wu, and Volker Tresp.
+Webpilot: A versatile and autonomous multi-agent system for web task execution with strategic exploration, 2024b.
+URL
+https://arxiv.org/abs/2408.15978
+.
+Zhang et al. (2024c)
+Yiqun Zhang, Xiaocui Yang, Shi Feng, Daling Wang, Yifei Zhang, and Kaisong Song.
+Can llms beat humans in debating? a dynamic multi-agent framework for competitive debate, 2024c.
+URL
+https://arxiv.org/abs/2408.04472
+.
+Zhang et al. (2024d)
+Yuntong Zhang, Haifeng Ruan, Zhiyu Fan, and Abhik Roychoudhury.
+Autocoderover: Autonomous program improvement, 2024d.
+URL
+https://arxiv.org/abs/2404.05427
+.
+Appendix A
+Reproducibility
+All models and data used in our work are publicly available. We additionally provide hyperparameter details in
+Appendix
+2
+. The code will be released as a public repository upon publication.
+Appendix B
+Additional Implementation Details
+Moatless-adapted is an extended version of the moatless-tools library with support for a tree structure, the ability to revert to earlier versions of the codebase, and the capability to run tests.
+The standard implementation of moatless-tools is based on a finite state machine structure where a state holds information about file context and properties set in the configuration or from previous states. It can then transition to a new state when an action is executed. The request that initiates the action is created by an LLM. This follows a linear structure where one state can transition to another state. In moatless-adapted, this model is extended so that a state can expand by using actions to create more states. The connections between states are then represented in a tree structure with nodes.
+Each state has a file context associated with it. This file context will be included in the prompt sent to an LLM. To limit the size of the prompt, files are divided into “spans,” where a span could be, for example, a section of code (e.g., imports), a class, or a function. These are identified by span IDs. Thus, the LLM sees a limited part of the code at a time but can request more context by searching for or adding files and spans. The file context therefore changes over time, and a specific state of file context is linked to a specific state.
+In the standard implementation of moatless-tools, changes to the codebase are made linearly, and each change is saved directly to the file system. In moatless-adapted, however, there is a need to be able to revert to earlier states and thus return to a previous version of the codebase. To handle this, the code is stored in a git repository where each change is committed, and each state has a reference to a commit as well as the current patch of the diff from the initial commit that existed before starting. This way, one can go back to an earlier state by specifying the state ID, and the commit that was current at that time will be checked out.
+The test files present in the file context are run each time the Plan state is initiated, and the test results are provided to the state. The tests are then run in Docker images built via the SWE-bench library. To use this approach in a benchmark where a larger number of instances should be able to run simultaneously, a solution is used where these images are run as pods in a Kubernetes cluster. Moatless-tools communicates with the testbed by applying patches and running commands via an API. When a new instance starts, a pod is created which is then reset at each run, applying the current patch and running tests according to the test command specified in the SWE-bench library. It’s important to add here that the agent is not aware of the
+PASS_TO_PASS
+or
+FAIL_TO_PASS
+tests in the SWE-bench harness, but only knows how to run the tests. This corresponds to a real engineering environment where each project can have its own test commands.
+Appendix C
+MCTS Hyperparameters
+The Monte Carlo Tree Search (MCTS) algorithm used in this study employs several hyperparameters.
+Table 2:
+MCTS Hyperparameters
+Hyperparameter
+Description
+Default
+c_param
+UCT exploration parameter
+1.41
+max_expansions
+Max children per node
+5
+max_iterations
+Max MCTS iterations
+100
+provide_feedback
+Enable feedback
+True
+best_first
+Use best-first strategy
+True
+value_function_temperature
+Value function temperature
+0.2
+max_depth
+Max tree depth
+20
+UCT Score Calculation Parameters
+exploration_weight
+UCT exploration weight
+1.0
+depth_weight
+Depth penalty weight
+0.8
+depth_bonus_factor
+Depth bonus factor
+200.0
+high_value_threshold
+High-value node threshold
+55.0
+low_value_threshold
+Low-value node threshold
+50.0
+very_high_value_threshold
+Very high-value threshold
+75.0
+high_value_leaf_bonus_constant
+High-value leaf bonus
+20.0
+high_value_bad_children_bonus_constant
+High-value bad children bonus
+20.0
+high_value_child_penalty_constant
+High-value child penalty
+5.0
+Action Model Parameters
+action_model_temperature
+Action model temperature
+0.2
+Discriminator Parameters
+number_of_agents
+Number of Discriminator Agents
+5
+number_of_round
+Number of debate rounds
+3
+discriminator_temperature
+Discriminator temperature
+1.0
+These hyperparameters can be adjusted to fine-tune the MCTS algorithm’s performance for specific problem domains or computational constraints. The values listed here are the defaults as defined in the
+TreeSearchSettings
+class and the MCTS implementation.
+Appendix D
+Ability of MCTS to Escape Unproductive Loops vs. Baseline
+Figure 6:
+Avoiding Repetitive Actions, django__django__10914.
+We found that the base agent can often get stuck performing repetitive actions
+that do not bring it closer to solving the issue, and which commonly lead to unresolvable dead-ends. In this example, the base agent
+was stuck implementing wrong tests which continuously returned errors. In contrast, when this happens in
+SWE-Search, the Value Agent recognizes this, terminating these trajectories quickly,
+as happens in Node 73 (orange).
+Appendix E
+Model Instance Resolution Uniqueness
+To understand the complementary strengths of different models in resolving software issues, we analyzed how unique their resolved issue subsets were. Figure
+7
+illustrates the resolution patterns for each model across five of the codebases in SWE-bench-lite.
+Figure 7:
+Unique Issue Resolution Patterns Across Models and Libraries.
+Each column represents a different Python repository, and each row within a column represents a specific issue. Colored blocks indicate successful resolution by the corresponding model (see legend). White spaces denote unresolved issues. This visualization highlights the diverse problem-solving capabilities of different models across various software domains, demonstrating that no single model dominates across all issues and libraries.
+Appendix F
+Ability of Value Function to Discern Successful Trajectories
+Before implementing SWE-Search, we conducted a general study across many models to evaluate the models’ ability to differentiate states which led to resolved vs. unresolved issues. Figure
+8
+shows the results of this study. We found that in general, models assigned higher rewards to states which eventually led to resolved issues. Of particular interest was the Deepseek model, which seemed to identify critical errors in trajectories effectively. This was also observed in the final agent (see Fig.
+5
+a).
+Figure 8:
+Average State Reward Comparison Across Models.
+This graph compares the average state rewards assigned by different language models for resolved (green) and unresolved (red) issues. Error bars indicate standard deviation. Most models consistently assign higher rewards to states leading to resolved issues, with one exception (the model's name is missing from the source text). The “Average” column represents the mean across all models, demonstrating a clear distinction between resolved and unresolved states.
+Appendix G
+Value Function Prompts
+◄
+Feeling
+lucky?
+Conversion
+report
+Report
+an issue
+View original
+on arXiv
+►
\ No newline at end of file
diff --git a/research/notes/241221139-training-software-engineering-agents-and-verifiers-with-swe-gym.md b/research/notes/241221139-training-software-engineering-agents-and-verifiers-with-swe-gym.md
new file mode 100644
index 0000000000000000000000000000000000000000..0cd5f107eee0dd474a43b16695912551d6efcfa5
--- /dev/null
+++ b/research/notes/241221139-training-software-engineering-agents-and-verifiers-with-swe-gym.md
@@ -0,0 +1,200 @@
+---
+title: '[2412.21139] Training Software Engineering Agents and Verifiers with SWE-Gym'
+id: 241221139-training-software-engineering-agents-and-verifiers-with-swe-gym
+tags:
+- deepread
+created: '2026-06-10T00:22:57.639430Z'
+source: https://arxiv.org/abs/2412.21139
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:22:57.639309Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2412.21139] Training Software Engineering Agents and Verifiers with SWE-Gym
+Computer Science > Software Engineering
+arXiv:2412.21139
+(cs)
+[Submitted on 30 Dec 2024 (
+v1
+), last revised 6 Jun 2025 (this version, v2)]
+Title:
+Training Software Engineering Agents and Verifiers with SWE-Gym
+Authors:
+Jiayi Pan
+,
+Xingyao Wang
+,
+Graham Neubig
+,
+Navdeep Jaitly
+,
+Heng Ji
+,
+Alane Suhr
+,
+Yizhe Zhang
+View a PDF of the paper titled Training Software Engineering Agents and Verifiers with SWE-Gym, by Jiayi Pan and 6 other authors
+View PDF
+HTML (experimental)
+Abstract:
+We present SWE-Gym, the first environment for training real-world software engineering (SWE) agents. SWE-Gym contains 2,438 real-world Python task instances, each comprising a codebase with an executable runtime environment, unit tests, and a task specified in natural language. We use SWE-Gym to train language model based SWE agents, achieving up to 19% absolute gains in resolve rate on the popular SWE-Bench Verified and Lite test sets. We also experiment with inference-time scaling through verifiers trained on agent trajectories sampled from SWE-Gym. When combined with our fine-tuned SWE agents, we achieve 32.0% and 26.0% on SWE-Bench Verified and Lite, respectively, reflecting a new state-of-the-art for open-weight SWE agents. To facilitate further research, we publicly release SWE-Gym, models, and agent trajectories.
+Comments:
+Accepted at ICML 2025. Code at
+this https URL
+Subjects:
+Software Engineering (cs.SE)
+; Computation and Language (cs.CL)
+Cite as:
+arXiv:2412.21139
+[cs.SE]
+(or
+arXiv:2412.21139v2
+[cs.SE]
+for this version)
+https://doi.org/10.48550/arXiv.2412.21139
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Jiayi Pan [
+view email
+]
+[v1]
+Mon, 30 Dec 2024 18:15:39 UTC (156 KB)
+[v2]
+Fri, 6 Jun 2025 07:53:20 UTC (295 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Training Software Engineering Agents and Verifiers with SWE-Gym, by Jiayi Pan and 6 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.SE
+< prev
+|
+next >
+new
+|
+recent
+|
+2024-12
+Change to browse by:
+cs
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250104519-rstar-math-small-llms-can-master-math-reasoning-with-self-evolved-deep.md b/research/notes/250104519-rstar-math-small-llms-can-master-math-reasoning-with-self-evolved-deep.md
new file mode 100644
index 0000000000000000000000000000000000000000..ca10638d8e6004c747f968d001164579d7058a06
--- /dev/null
+++ b/research/notes/250104519-rstar-math-small-llms-can-master-math-reasoning-with-self-evolved-deep.md
@@ -0,0 +1,196 @@
+---
+title: '[2501.04519] rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved
+  Deep Thinking'
+id: 250104519-rstar-math-small-llms-can-master-math-reasoning-with-self-evolved-deep
+tags:
+- deepread
+created: '2026-06-10T00:40:01.597011Z'
+source: https://arxiv.org/abs/2501.04519
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:40:01.596873Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2501.04519] rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking
+Computer Science > Computation and Language
+arXiv:2501.04519
+(cs)
+[Submitted on 8 Jan 2025]
+Title:
+rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking
+Authors:
+Xinyu Guan
+,
+Li Lyna Zhang
+,
+Yifei Liu
+,
+Ning Shang
+,
+Youran Sun
+,
+Yi Zhu
+,
+Fan Yang
+,
+Mao Yang
+View a PDF of the paper titled rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking, by Xinyu Guan and 7 other authors
+View PDF
+HTML (experimental)
+Abstract:
+We present rStar-Math to demonstrate that small language models (SLMs) can rival or even surpass the math reasoning capability of OpenAI o1, without distillation from superior models. rStar-Math achieves this by exercising "deep thinking" through Monte Carlo Tree Search (MCTS), where a math policy SLM performs test-time search guided by an SLM-based process reward model. rStar-Math introduces three innovations to tackle the challenges in training the two SLMs: (1) a novel code-augmented CoT data synthesis method, which performs extensive MCTS rollouts to generate step-by-step verified reasoning trajectories used to train the policy SLM; (2) a novel process reward model training method that avoids naïve step-level score annotation, yielding a more effective process preference model (PPM); (3) a self-evolution recipe in which the policy SLM and PPM are built from scratch and iteratively evolved to improve reasoning capabilities. Through 4 rounds of self-evolution with millions of synthesized solutions for 747k math problems, rStar-Math boosts SLMs' math reasoning to state-of-the-art levels. On the MATH benchmark, it improves Qwen2.5-Math-7B from 58.8% to 90.0% and Phi3-mini-3.8B from 41.4% to 86.4%, surpassing o1-preview by +4.5% and +0.9%. On the USA Math Olympiad (AIME), rStar-Math solves an average of 53.3% (8/15) of problems, ranking among the top 20% of the brightest high school math students. Code and data will be available at
+this https URL
+.
+Subjects:
+Computation and Language (cs.CL)
+Cite as:
+arXiv:2501.04519
+[cs.CL]
+(or
+arXiv:2501.04519v1
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2501.04519
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Li Lyna Zhang [
+view email
+]
+[v1]
+Wed, 8 Jan 2025 14:12:57 UTC (632 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking, by Xinyu Guan and 7 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-01
+Change to browse by:
+cs
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250104519-sysname-small-llms-can-master-math-reasoning-with-self-evolved-deep-th.md b/research/notes/250104519-sysname-small-llms-can-master-math-reasoning-with-self-evolved-deep-th.md
new file mode 100644
index 0000000000000000000000000000000000000000..f13686520ee09bfc2ef3e5b0ebfa3bb8870455fa
--- /dev/null
+++ b/research/notes/250104519-sysname-small-llms-can-master-math-reasoning-with-self-evolved-deep-th.md
@@ -0,0 +1,3557 @@
+---
+title: '[2501.04519] \sysname: Small LLMs Can Master Math Reasoning with Self-Evolved
+  Deep Thinking'
+id: 250104519-sysname-small-llms-can-master-math-reasoning-with-self-evolved-deep-th
+tags:
+- deepread
+created: '2026-06-10T00:40:46.873514Z'
+source: https://ar5iv.labs.arxiv.org/html/2501.04519
+source_domain: ar5iv.labs.arxiv.org
+fetched_at: '2026-06-10T00:40:46.873327Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2501.04519] rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking
+rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking
+Xinyu Guan
+∗
+Li Lyna Zhang
+∗⋄
+Yifei Liu
+Ning Shang   Youran Sun    Yi Zhu    Fan Yang    Mao Yang
+Microsoft Research Asia
+Abstract
+We present
+rStar-Math
+to demonstrate that small language models (SLMs) can rival or even surpass the math reasoning capability of OpenAI o1, without distillation from superior models.
+rStar-Math
+achieves this by exercising “deep thinking” through Monte Carlo Tree Search (MCTS), where a math
+policy SLM
+performs test-time search guided by an SLM-based
+process reward model
+.
+rStar-Math
+introduces three innovations to tackle the challenges in training the two SLMs:
+(1)
+a novel code-augmented CoT data synthesis method, which performs extensive MCTS rollouts to generate
+step-by-step verified reasoning trajectories
+used to train the policy SLM;
+(2)
+a novel process reward model training method that avoids naïve step-level score annotation, yielding a more effective
+process preference model (PPM)
+;
+(3)
+a
+self-evolution recipe
+in which the policy SLM and PPM are built from scratch and iteratively evolved to improve reasoning capabilities.
+Through 4 rounds of self-evolution with millions of synthesized solutions for 747k math problems,
+rStar-Math
+boosts SLMs’ math reasoning to state-of-the-art levels. On the MATH benchmark, it improves Qwen2.5-Math-7B from 58.8% to 90.0% and Phi3-mini-3.8B from 41.4% to 86.4%, surpassing o1-preview by +4.5% and +0.9%. On the USA Math Olympiad (AIME),
+rStar-Math
+solves an average of 53.3% (8/15) of problems, ranking among the top 20% of the brightest high school math students. Code and data will be available at
+https://github.com/microsoft/rStar
+.
+| Task (pass@1 Acc) | rStar-Math (Qwen-7B) | rStar-Math (Qwen-1.5B) | rStar-Math (Phi3-mini) | OpenAI o1-preview | OpenAI o1-mini | QWQ 32B-preview | GPT-4o | DeepSeek-V3 |
+|---|---|---|---|---|---|---|---|---|
+| MATH | 90.0 | 88.6 | 86.4 | 85.5 | 90.0 | 90.6 | 76.6 | 90.2 |
+| AIME 2024 | 53.3 | 46.7 | 43.3 | 44.6 | 56.7 | 50.0 | 9.3 | 39.2 |
+| Olympiad Bench | 65.6 | 64.6 | 60.3 | - | 65.3 | 61.2 | 43.3 | 55.4 |
+| College Math | 60.5 | 59.3 | 59.1 | - | 57.8 | 55.8 | 48.5 | 58.9 |
+| Omni-Math | 50.5 | 48.5 | 46.0 | 52.5 | 60.5 | 49.6 | 30.5 | 35.9 |
+
+Table 1: rStar-Math enables frontier math reasoning in SLMs via deep thinking over 64 trajectories.
+$*$ Equal contribution.
+$\diamond$ Project leader; correspondence to lzhani@microsoft.com
+$\S$ Xinyu Guan and Youran Sun did this work during the internship at MSRA. Xinyu Guan (2001gxy@gmail.com) is with Peking University, Youran Sun is with Tsinghua University.
+1
+Introduction
+Recent studies have demonstrated that large language models (LLMs) are capable of tackling mathematical problems
+(Team,
+2024a
+; Yang et al.,
+2024
+; OpenAI,
+2024
+; Liu et al.,
+2024
+)
+. However, the conventional approach of having LLMs generate complete solutions in a single inference – akin to System 1 thinking
+(Daniel,
+2011
+)
+– often yields fast but error-prone results
+(Valmeekam et al.,
+2023
+; OpenAI,
+2023
+)
+. In response, test-time compute scaling
+(Snell et al.,
+2024
+; Qi et al.,
+2024
+)
+suggests a paradigm shift toward a System 2-style thinking, which emulates human reasoning through a slower and deeper thought process. In this paradigm, an LLM serves as a policy model to generate multiple math reasoning steps, which are then evaluated by another LLM acting as a reward model
+(OpenAI,
+2024
+)
+. The steps and solutions deemed more likely to be correct are selected. The process repeats iteratively and ultimately derives the final answer.
+In the test-time compute paradigm, the key is to train a powerful policy model that generates promising solution steps and a reliable reward model that accurately evaluates them, both of which depend on
+high-quality
+training data. Unfortunately, it is well-known that off-the-shelf high-quality math reasoning data is scarce, and synthesizing high-quality math data faces fundamental challenges.
+For the policy model, it is challenging to distinguish erroneous reasoning steps from the correct ones, complicating the elimination of low-quality data. It is worth noting that in math reasoning, a correct final answer does not ensure the correctness of the entire reasoning trace
+(Lanham et al.,
+2023
+)
+. Incorrect intermediate steps significantly decrease data quality.
+As for the reward model, process reward modeling (PRM) shows a great potential by providing fine-grained feedback on intermediate steps
+(Lightman et al.,
+2023
+)
+. However, the training data is even scarcer in this regard: accurate step-by-step feedback requires intense human labeling efforts and is impractical to scale, while those automatic annotation attempts show limited gains due to noisy reward scores
+(Luo et al.,
+2024
+; Wang et al.,
+2024c
+; Chen et al.,
+2024
+)
+.
+Due to the above challenges, existing distill-based data synthesis approaches to training policy models, e.g., scaling up GPT4-distilled CoT data
+(Tang et al.,
+2024
+; Huang et al.,
+2024
+)
+, have shown diminishing returns and cannot exceed the capability of their teacher model; meanwhile, as of today, training reliable PRMs for math reasoning remains an open question.
+Figure 1:
+The overview of
+\sysname
+.
+In this work, we introduce
+\sysname
+, a self-evolvable System 2-style reasoning approach that achieves the state-of-the-art math reasoning, rivaling and sometimes even surpassing OpenAI o1 on challenging math competition benchmarks with a model size as small as 7 billion. Unlike solutions relying on superior LLMs for data synthesis,
+\sysname
+leverages smaller language models (SLMs) with Monte Carlo Tree Search (MCTS) to establish a self-evolutionary process, iteratively generating higher-quality training data. To achieve self-evolution,
+\sysname
+introduces three key innovations.
+First, a novel code-augmented CoT data synthesis method, which performs
+extensive
+MCTS rollouts to generate
+step-by-step verified reasoning trajectories
+with
+self-annotated MCTS Q-values
+. Specifically, math problem-solving is decomposed into multi-step generation within MCTS. At each step, the SLM serving as the policy model samples candidate nodes, each generating a one-step CoT and the corresponding Python code. To verify the generation quality, only nodes with successful Python code execution are retained, thus mitigating errors in intermediate steps. Moreover, extensive MCTS rollouts automatically assign a Q-value to each intermediate step based on its contribution: steps contributing to more trajectories that lead to the correct answer are given higher Q-values and considered higher quality. This ensures that the reasoning trajectories generated by SLMs consist of correct, high-quality intermediate steps.
+Second, a novel method that trains an SLM acting as a
+process preference model
+, i.e., a PPM to implement the desired PRM, that reliably predicts a reward label for each math reasoning step. The PPM leverages the fact that, although Q-values are still not precise enough to score each reasoning step despite using extensive MCTS rollouts, the Q-values can reliably distinguish positive (correct) steps from negative (irrelevant/incorrect) ones. Thus the training method constructs preference pairs for each step based on Q-values and uses a pairwise ranking loss
+(Ouyang et al.,
+2022
+)
+to optimize PPM’s score prediction for each reasoning step, achieving reliable labeling. This approach avoids conventional methods that directly use Q-values as reward labels
+(Luo et al.,
+2024
+; Chen et al.,
+2024
+)
+, which are inherently noisy and imprecise in stepwise reward assignment.
+Finally, a four-round self-evolution recipe that progressively builds both a frontier policy model and PPM from scratch. We begin by curating a dataset of 747k math word problems from publicly available sources. In each round, we use the latest policy model and PPM to perform MCTS, generating increasingly high-quality training data using the above two methods to train a stronger policy model and PPM for next round. Each round achieves progressive refinement: (1) a stronger policy SLM, (2) a more reliable PPM, (3) generating better reasoning trajectories via PPM-augmented MCTS, and (4) improving training data coverage to tackle more challenging and even competition-level math problems.
+Extensive experiments across four SLMs (1.5B-7B) and seven math reasoning tasks demonstrate the effectiveness of
+\sysname
+. Remarkably,
+\sysname
+improves all four SLMs, matching or even surpassing OpenAI o1 on challenging math benchmarks. On MATH benchmark, with 8 search trajectories,
+\sysname
+boosts Qwen2.5-Math-7B from 58.8% to 89.4% and Qwen2.5-Math-1.5B from 51.2% to 87.8%. With 64 trajectories, the scores rise to 90% and 88.4%, outperforming o1-preview by 4.5% and 2.6% and matching o1-mini’s 90%. On the Olympiad-level AIME 2024,
+\sysname
+solves on average 53.3% (8/15) of the problems, exceeding o1-preview by 8.7% and all other open-sourced LLMs. We further conduct comprehensive experiments to verify the superiority of step-by-step verified reasoning trajectories over state-of-the-art data synthesis baselines, as well as the PPM’s effectiveness compared to outcome reward models and Q value-based PRMs. Finally, we present key findings from
+\sysname
+deep thinking, including the intrinsic self-reflection capability and PPM’s preference for theorem-applications intermediate steps.
+2
+Related Works
+Math Data Synthesis
+. Advancements in LLM math reasoning have largely relied on curating high-quality CoT data, with most leading approaches being GPT-distilled, using frontier models like GPT-4 for synthesis
+(Wang et al.,
+2024b
+; Gou et al.,
+2023
+; Luo et al.,
+2023
+)
+. Notable works include NuminaMath
+(Jia LI and Polu,
+2024a
+)
+and
+MetaMath
+(Yu et al.,
+2023b
+)
+. While effective, this limits reasoning to the capabilities of the teacher LLM.
+Hard problems that the teacher LLM cannot solve are excluded in the training set.
+Even solvable problems may contain error-prone intermediate steps, which are hard to detect. Although rejection sampling methods
+(Yuan et al.,
+2023
+; Brown et al.,
+2024
+)
+can improve data quality,
+they do not guarantee correct intermediate steps. As a result, scaling up CoT data has diminishing returns, with gains nearing saturation—e.g., OpenMathInstruct-2
+(Toshniwal et al.,
+2024
+)
+only sees a 3.9% boost on MATH despite an 8× increase in dataset size.
+Scaling Test-time Compute
+has introduced new scaling laws, allowing LLMs to improve performance across by generating multiple samples and using reward models for best-solution selection
+(Snell et al.,
+2024
+; Wu et al.,
+2024
+; Brown et al.,
+2024
+)
+. Various test-time search methods have been proposed
+(Kang et al.,
+2024
+; Wang et al.,
+2024a
+)
+, including random sampling
+(Wang et al.,
+2023
+)
+and tree-search methods
+(Yao et al.,
+2024
+; Hao et al.,
+2023
+; Zhang et al.,
+2024b
+; Qi et al.,
+2024
+)
+like MCTS. However, open-source methods for scaling test-time computation have shown limited gains in math reasoning, often due to policy LLM or reward model limitations.
+\sysname
+addresses this by iteratively evolving the policy LLM and reward model, achieving System 2 mathematical reasoning performance comparable to OpenAI o1
+(OpenAI,
+2024
+)
+.
+Reward Models
+are crucial for effective System 2 reasoning but are challenging to obtain. Recent works include LLM-as-a-Judge for verification
+(Zheng et al.,
+2023
+; Qi et al.,
+2024
+)
+and specialized reward models like Outcome Reward Model
+(Yang et al.,
+2024
+; Yu et al.,
+2023a
+)
+and Process Reward Model (PRM)
+(Lightman et al.,
+2024
+)
+. While PRMs offer promising dense, step-level reward signals for
+complex reasoning
+(Luo et al.,
+2024
+; Wang et al.,
+2024c
+)
+, collecting step-level annotations remains an obstacle. While
+Kang et al. (
+2024
+); Wang et al. (
+2024a
+)
+rely on costly human-annotated datasets like PRM800k
+(Lightman et al.,
+2024
+)
+,
+recent approaches
+(Wang et al.,
+2024c
+; Luo et al.,
+2024
+)
+explore automated annotation via Monte Carlo Sampling or MCTS. However, they struggle to generate precise reward scores, which limits performance gains.
+\sysname
+introduces a novel process preference reward (PPM) that eliminates the need for accurate step-level reward score annotation.
+3
+Methodology
+3.1
+Design Choices
+MCTS for Effective System 2 Reasoning
+.
+We aim to train a math policy SLM and a process reward model (PRM), and integrate both within Monte Carlo Tree Search (MCTS) for System 2 deep thinking. MCTS is chosen for two key reasons. First, it breaks down complex math problems into simpler single-step generation tasks, reducing the difficulty for the policy SLM compared to other System 2 methods like Best-of-N
+(Brown et al.,
+2024
+)
+or self-consistency
+(Wang et al.,
+2023
+)
+, which require generating full solutions in one inference.
+Second, the step-by-step generation in MCTS naturally yields step-level training data for both models. Standard MCTS rollouts automatically assign a Q-value to each step based on its contribution to the final correct answer, obviating the need for human-generated step-level annotations for process reward model training.
+Ideally, advanced LLMs such as GPT-4 could be integrated within MCTS to generate training data. However, this approach faces two key challenges. First, even these powerful models struggle to consistently solve difficult problems, such as Olympiad-level mathematics. Consequently, the resulting training data would primarily consist of simpler solvable problems, limiting its diversity and quality. Second, annotating per-step Q-values demands extensive MCTS rollouts; insufficient tree exploration can lead to spurious Q-value assignments, such as overestimating suboptimal steps. Given that each rollout involves multiple single-step generations and these models are computationally expensive, increasing rollouts significantly raises inference costs.
+Overview
+. To this end, we explore using two 7B SLMs (a policy SLM and a PRM) to generate higher-quality training data, with their smaller size allowing for extensive MCTS rollouts on accessible hardware (e.g., 4$\times$40GB A100 GPUs). However, self-generating data presents greater challenges for SLMs, due to their weaker capabilities.
+SLMs frequently fail to generate correct solutions, and even when the final answer is correct, the intermediate steps are often flawed or of poor quality. Moreover, SLMs solve fewer challenging problems compared to advanced models like GPT-4.
+This section introduces our methodology, as illustrated in Fig.
+1
+. To mitigate errors and low-quality intermediate steps, we introduce a code-augmented CoT synthetic method, which performs extensive MCTS rollouts to generate step-by-step verified reasoning trajectories, annotated with Q-values. To further improve SLM performance on challenging problems, we introduce a four-round self-evolution recipe. In each round, both the policy SLM and the reward model are updated to stronger versions, progressively tackling more difficult problems and generating higher-quality training data. Finally, we present a novel process reward model training approach that eliminates the need for precise per-step reward annotations, yielding the more
+effective process preference model (PPM).
+3.2
+Step-by-Step Verified Reasoning Trajectory
+We start by introducing our method for generating step-by-step verified reasoning trajectories with per-step Q-value annotations. Given a problem $x$ and a policy model $M$, we run the standard MCTS to incrementally construct a search tree for step-by-step solution exploration. As shown in Fig. 1(a), the root node represents question $x$, while child nodes correspond to intermediate steps $s$ generated by $M$. A root-to-leaf path ending at terminal node $s_{d}$ forms a trajectory $\mathbf{t}=x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{d}$, with each step $s_{i}$ assigned a Q-value $Q(s_{i})$.
+From the search tree $\mathcal{T}$, we extract solution trajectories $\mathbb{T}=\{\mathbf{t}^{1},\mathbf{t}^{2},...,\mathbf{t}^{n}\}(n\geq 1)$. Our goal is to select high-quality trajectories from $\mathcal{T}$ to construct the training set. For this purpose, we introduce a code-augmented CoT synthesis method to filter out low-quality generations and perform extensive rollouts to improve the reliability of Q-value accuracy.
+Code-augmented CoT Generation
+. Prior MCTS approaches primarily generate natural language (NL) CoTs
+(Qi et al.,
+2024
+; Zhang et al.,
+2024a
+)
+. However, LLMs often suffer from hallucination, producing incorrect or irrelevant steps yet still arrive at the correct answer by chance
+(Lanham et al.,
+2023
+)
+. These flawed steps are challenging to detect and eliminate. To address this, we propose a novel code execution augmented CoT. As shown in Fig.
+2
+, the policy model generates a one-step NL CoT alongside its corresponding Python code, where the NL CoT is embedded as a Python comment. Only generations with successfully executed Python code are retained as valid candidates.
+Figure 2:
+An example of Code-augmented CoT.
+Specifically, starting from the initial root node $x$, we perform multiple MCTS iterations through selection, expansion, rollout, and back-propagation. At step $i$, we collect the latest reasoning trajectory $x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{i-1}$ as the current state. Based on this state, we prompt (see Appendix A.3) the policy model to generate $n$ candidates $s_{i,0},...,s_{i,n-1}$ for step $i$. Python code execution is then employed to filter valid nodes. As shown in Fig. 2, each generation $s_{i,j}$ is concatenated with the code from all previous steps, forming $s_{1}\oplus s_{2}\oplus...\oplus s_{i-1}\oplus s_{i,j}$. Candidates that execute successfully are retained as valid nodes and scored by the PPM, which assigns a Q-value $q(s_{i})$.
+Then, we use the well-known Upper Confidence bounds for Trees (UCT)
+(Kocsis and Szepesvári,
+2006
+)
+to select the best node among the
+n
+n
+candidates. This selection process is mathematically represented as:
+$$\text{UCT}(s)=Q(s)+c\sqrt{\frac{\ln N_{\text{parent}}(s)}{N(s)}};\quad\text{where}\quad Q(s)=\frac{q(s)}{N(s)}\tag{1}$$
+where $N(s)$ denotes the number of visits to node $s$, and $N_{\text{parent}}(s)$ is the visit count of $s$’s parent node. The predicted reward $q(s)$ is provided by the PPM and will be updated through back-propagation. $c$ is a constant that balances exploitation and exploration.
+Extensive Rollouts for Q-value Annotation
+. Accurate Q-value $Q(s)$ annotation in Eq. 1 is crucial for guiding MCTS node selection towards correct problem-solving paths and identifying high-quality steps within trajectories.
+To improve Q-value reliability, we draw inspiration from Go players, who retrospectively evaluate the reward of each move based on game outcomes. Although initial estimates may be imprecise, repeated gameplay refines these evaluations over time. Similarly, in each rollout, we update the Q-value of each step based on its contribution to achieving the correct final answer. After extensive MCTS rollouts, steps consistently leading to correct answers achieve higher Q-values, occasional successes yield moderate Q-values, and consistently incorrect steps receive low Q-values. Specifically, we introduce two self-annotation methods to obtain these step-level Q-values. Fig.
+1
+(c) shows the detailed setting in the four rounds of self-evolution.
+Terminal-guided annotation
+. During the first two rounds, when the PPM is unavailable or insufficiently accurate, we use terminal-guided annotation. Formally, let $q(s_{i})^{k}$ denote the q value for step $s_{i}$ after back-propagation in the $k^{th}$ rollout. Following AlphaGo (Silver et al., 2017) and rStar (Qi et al., 2024), we score each intermediate node based on its contribution to the final correct answer:
+$$q(s_{i})^{k}=q(s_{i})^{k-1}+q(s_{d})^{k};\tag{2}$$
+where the initial q value $q(s_{i})^{0}=0$ in the first rollout. If this step frequently leads to a correct answer, its $q$ value will increase; otherwise, it decreases. Terminal nodes are scored as $q(s_{d})=1$ for correct answers and $q(s_{d})=-1$ otherwise, as shown in Fig. 1.
+PRM-augmented annotation
+. Starting from the third round, we use PPM to score each step for more effective generation. Compared to terminal-guided annotation, which requires multiple rollouts for a meaningful $q$ value, PPM directly predicts a non-zero initial $q$ value. PPM-augmented MCTS also helps the policy model to generate higher-quality steps, guiding solutions towards correct paths. Formally, for step $s_{i}$, PPM predicts an initial $q(s_{i})^{0}$ value based on the partial trajectory:
+$$q(s_{i})^{0}=PPM(x\oplus s_{1}\oplus s_{2}\oplus...\oplus s_{i-1}\oplus s_{i})\tag{3}$$
+This $q$ value will be updated based on terminal node’s $q(s_{d})$ value through MCTS back-propagation in Eq. 2. For terminal node $s_{d}$, we do not use PRM for scoring during training data generation. Instead, we assign a more accurate score based on ground truth labels as terminal-guided rewarding.
+3.3
+Process Preference Model
+Process reward models, which provide granular step-level reward signals, are highly desirable for solving challenging math problems. However, obtaining high-quality step-level training data remains an open challenge. Existing methods rely on human annotations
+(Lightman et al.,
+2023
+)
+or MCTS-generated scores
+(Zhang et al.,
+2024a
+; Chen et al.,
+2024
+)
+to assign a score for each step. These scores then serve as training targets, with methods such as MSE loss
+(Chen et al.,
+2024
+)
+or pointwise loss
+(Wang et al.,
+2024c
+; Luo et al.,
+2024
+; Zhang et al.,
+2024a
+)
+used to minimize the difference between predicted and labeled scores.
+As a result, the precision of these annotated step-level reward scores directly determines the effectiveness of the resulting process reward model.
+Unfortunately, precise per-step scoring remains an unsolved challenge. Although our extensive MCTS rollouts improve the reliability of Q-values, precisely evaluating fine-grained step quality presents a major obstacle. For instance, among a set of correct steps, it is difficult to rank them as best, second-best, or average and then assign precise scores. Similarly, among incorrect steps, differentiating the worst from moderately poor steps poses analogous challenges. Even expert human annotation struggles with consistency, particularly at scale, leading to inherent noise in training labels.
+We introduce a novel training method that trains a process preference model (PPM) by constructing step-level positive-negative preference pairs. As shown in Fig.
+1
+(b), instead of using Q-values as direct reward labels, we use them to select steps from MCTS tree for preference pair construction. For each step, we select two candidates with the highest Q-values as positive steps and two with the lowest as negative steps. Critically, the selected positive steps must lead to a correct final answer, while negative steps must lead to incorrect answers. For intermediate steps (except the final answer step), the positive and negative pairs share the same preceding steps. For the final answer step, where identical reasoning trajectories rarely yield different final answers, we relax this restriction.
+We select two correct trajectories with the highest average Q-values as positive examples and two incorrect trajectories with the lowest average Q-values as negative examples. Following
+(Ouyang et al.,
+2022
+)
+, we define our loss function using the standard Bradley-Terry model with a pairwise ranking loss:
+$$\mathcal{L}_{ppm}(\theta)=-\frac{1}{2\times 2}E_{(x,y_{i}^{pos},y_{i}^{neg}\in\mathbb{D})}[\log(\sigma(r_{\theta}(x,y_{i}^{pos})-r_{\theta}(x,y_{i}^{neg})))]\tag{4}$$
+$$\text{when $i$ is not final answer step},\quad y_{i}^{pos}=s_{1}\oplus...\oplus s_{i-1}\oplus s_{i}^{pos};\quad y_{i}^{neg}=s_{1}\oplus...\oplus s_{i-1}\oplus s_{i}^{neg}\tag{5}$$
+Here, $r_{\theta}(x,y_{i})$ denotes the output of the PPM, where $x$ is the problem and $y$ is the trajectory from the first step to the $i^{th}$ step.
+3.4
+Self-Evolved Deep Thinking
+3.4.1
+Training with Step-by-Step Verified Reasoning Trajectory
+Math Problems Collection
+. We collect a large dataset of 747k math word problems with final answer ground-truth labels, primarily from NuminaMath
+(Jia LI and Polu,
+2024a
+)
+and MetaMath
+(Yu et al.,
+2023b
+)
+. Notably, only competition-level problems (e.g., Olympiads and AIME/AMC) from NuminaMath are included, as we observe that grade-school-level problems do not significantly improve LLM complex math reasoning. To augment the limited competition-level problems, we follow
+(Li et al.,
+2024
+)
+and use GPT-4 to synthesize new problems based on the seed problems in 7.5k MATH train set and 3.6k AMC-AIME training split. However, GPT-4 often generated unsolvable problems or incorrect solutions for challenging seed problems. To filter these, we prompt GPT-4 to generate 10 solutions per problem, retaining only those with at least 3 consistent solutions.
+Reasoning Trajectories Collection
+. Instead of using the original solutions in the 747k math dataset, we conduct extensive MCTS rollouts (Sec.
+3.2
+) to generate higher-quality step-by-step verified reasoning trajectories. In each self-evolution round, we perform 16 rollouts per math problem, which leads to 16 reasoning trajectories. Problems are then categorized by difficulty based on the correct ratio of the generated trajectories:
+easy
+(all solutions are correct),
+medium
+(a mix of correct and incorrect solutions) and
+hard
+(all solutions are incorrect). For
+hard
+problems with no correct trajectories, an additional MCTS with 16 rollouts is performed. After that, all step-by-step trajectories and their annotated Q-values are collected and filtered to train the policy SLM and process preference model.
+Supervised Fine-tuning the Policy SLM
+. Through extensive experiments, we find that selecting high-quality reasoning trajectories is the key for fine-tuning a frontier math LLM. While methods such as GPT-distillation and Best-of-N can include low-quality or erroneous intermediate steps, a more effective approach ensures that every step in the trajectory is of high quality. To achieve this, we use per-step Q-values to select optimal trajectories from MCTS rollouts. Specifically, for each math problem, we select the top-2 trajectories with the highest average Q-values among those leading to correct answers as SFT training data.
+Training PPM
+. The PPM is initialized from the fine-tuned policy model, with its next-token prediction head replaced by a scalar-value head consisting of a linear layer and a tanh function to constrain outputs to the range [-1, 1]. We filter out math problems where all solution trajectories are fully correct or incorrect. For problems with mixed outcomes, we select two positive and two negative examples for each step based on Q-values, which are used as preference pairs for training data.
+3.4.2
+Recipe for Self-Evolution
+Table 2:
+Percentage of the 747k math problems correctly solved in each round. Only problems that have correct solutions are included in the training set. The first round uses DeepSeek-Coder-V2-Instruct as the policy LLM, while later rounds use our fine-tuned 7B policy SLM.
+#
+models in MCTS
+GSM-level
+MATH-level
+Olympiad-level
+All
+Round 1
+DeepSeek-Coder-V2-Instruct
+96.61%
+67.36%
+20.99%
+60.17%
+Round 2
+policy SLM-r1
+97.88%
+67.40%
+56.04%
+66.60%
+Round 3
+policy SLM-r2, PPM-r2
+98.15%
+88.69%
+62.16%
+77.86%
+Round 4
+policy SLM-r3, PPM-r3
+98.15%
+94.53%
+80.58%
+90.25%
+Table 3:
+Pass@1 accuracy of the resulting policy SLM in each round, showing continuous improvement until surpassing the bootstrap model.
+Round#
+MATH
+AIME 2024
+AMC 2023
+Olympiad Bench
+College Math
+GSM8K
+GaokaoEn 2023
+DeepSeek-Coder-V2-Instruct
+(bootstrap model)
+75.3
+13.3
+57.5
+37.6
+46.2
+94.9
+64.7
+Base (Qwen2.5-Math-7B)
+58.8
+0.0
+22.5
+21.8
+41.6
+91.6
+51.7
+\hdashline
+policy SLM-r1
+69.6
+3.3
+30.0
+34.7
+44.5
+88.4
+57.4
+policy SLM-r2
+73.6
+10.0
+35.0
+39.0
+45.7
+89.1
+59.7
+policy SLM-r3
+75.8
+16.7
+45.0
+44.1
+49.6
+89.3
+62.8
+policy SLM-r4
+78.4
+26.7
+47.5
+47.1
+52.5
+89.7
+65.7
+Table 4:
+The quality of PPM consistently improves across rounds. The policy model has been fixed with policy SLM-r1 for a fair comparison.
+Round#
+MATH
+AIME 2024
+AMC 2023
+Olympiad Bench
+College Math
+GSM8K
+GaokaoEn 2023
+PPM-r1
+75.2
+10.0
+57.5
+35.7
+45.4
+90.9
+60.3
+PPM-r2
+84.1
+26.7
+75.0
+52.7
+54.2
+93.3
+73.0
+PPM-r3
+85.2
+33.3
+77.5
+59.5
+55.6
+93.9
+76.6
+PPM-r4
+87.0
+43.3
+77.5
+61.5
+56.8
+94.2
+77.8
+Due to the weaker capabilities of SLMs, we perform four rounds of MCTS deep thinking to progressively generate higher-quality data and expand the training set with more challenging math problems. Each round uses MCTS to generate step-by-step verified reasoning trajectories, which are then used to train the new policy SLM and PPM. The new models are then applied in the next round to generate higher-quality training data. Fig.
+1
+(c) and Table
+2
+detail the models used for data generation in each round, along with the identifiers of the trained policy model and PPM. Next, we outline the details and specific improvements targeted in each round.
+Round 1: Bootstrapping an initial strong policy SLM-r1
+. To enable SLMs to self-generate reasonably good training data, we perform a bootstrap round to fine-tune an initial strong policy model, denoted as SLM-r1.
+As shown in Table
+2
+, we run MCTS with DeepSeek-Coder-V2-Instruct (236B) to collect the SFT data. With no available reward model in this round, we use terminal-guided annotation for Q-values and limit MCTS to 8 rollouts for efficiency. For correct solutions, the top-2 trajectories with the highest average Q-values are selected as SFT data. We also train PPM-r1, but the limited rollouts yield unreliable Q-values, affecting the effectiveness of PPM-r1 (Table
+4
+).
+Round 2: Training a reliable PPM-r2
+. In this round, with the policy model updated to the 7B SLM-r1, we conduct extensive MCTS rollouts for more reliable Q-value annotation and train the first reliable reward model, PPM-r2. Specifically, we perform 16 MCTS rollouts per problem. The resulting step-by-step verified reasoning trajectories show significant improvements in both quality and Q-value precision. As shown in Table
+4
+, PPM-r2 is notably more effective than in the bootstrap round. Moreover, the policy SLM-r2 also continues to improve as expected (Table
+3
+).
+Round 3: PPM-augmented MCTS to significantly improve data quality
+. With the reliable PPM-r2, we perform PPM-augmented MCTS in this round to generate data, leading to significantly higher-quality trajectories that cover more math and Olympiad-level problems in the training set (Table
+2
+). The generated reasoning trajectories and self-annotated Q-values are then used to train the new policy SLM-r3 and PPM-r3, both of which show significant improvements.
+Round 4: Solving challenging math problems
+. After the third round, while grade school and MATH problems achieve high success rates, only 62.16% of Olympiad-level problems are included in the training set. This is
+NOT
+solely due to weak reasoning abilities in our SLMs, as many Olympiad problems remain unsolved by GPT-4 or o1. To improve coverage, we adopt a straightforward strategy. For unsolved problems after 16 MCTS rollouts, we perform an additional 64 rollouts, and if needed, increase to 128. We also conduct multiple MCTS tree expansions with different random seeds. This boosts the success rate of Olympiad-level problems to 80.58%.
+After four rounds of self-evolution, 90.25% of the 747k math problems are successfully incorporated into the training set, as shown in Table
+2
+. Among the remaining unsolved problems, a significant portion consists of synthetic questions. We manually review a random sample of 20 problems and find that 19 are incorrectly labeled with wrong answers. Based on this, we conclude that the remaining unsolved problems are of low quality and thus terminate the self-evolution at round 4.
+4
+Evaluation
+4.1
+Setup
+Evaluation Datasets
+. We evaluate
+\sysname
+on diverse mathematical benchmarks. In addition to the widely-used GSM8K
+(Cobbe et al.,
+2021
+)
+, we include challenging benchmarks from multiple domains:
+(i)
+competition and Olympiad-level benchmarks, such as MATH-500
+(Lightman et al.,
+2023
+)
+, AIME 2024
+(AI-MO,
+2024a
+)
+, AMC 2023
+(AI-MO,
+2024b
+)
+and Olympiad Bench
+(He et al.,
+2024
+)
+. Specifically, AIME is an exam designed to challenge the brightest high school math students in America, with the 2024 dataset comprising 30 problems from the AIME I and II exams;
+(ii)
+college-level math problems from College Math
+(Tang et al.,
+2024
+)
+and
+(iii)
+out-of-domain math benchmark: GaoKao (Chinese
+College Entrance Exam) En 2023
+(Liao et al.,
+2024
+)
+.
+Base Models and Setup
+.
+\sysname
+is a general approach applicable to various LLMs. To show its effectiveness and generalizability, we use SLMs of different sizes as the base policy models:
+Qwen2.5-Math-1.5B
+(Qwen,
+2024b
+)
+, Phi3-mini-Instruct (3B)
+(Microsoft,
+2024
+; Abdin et al.,
+2024
+)
+, Qwen2-Math-7B
+(Qwen,
+2024a
+)
+and Qwen2.5-Math-7B
+(Qwen,
+2024c
+)
+. Among these, Phi3-mini-Instruct is a general-purpose SLM without specialization in math reasoning.
+Due to limited GPU resources, we performed 4 rounds of self-evolution exclusively on Qwen2.5-Math-7B, yielding 4 evolved policy SLMs (Table
+3
+) and 4 PPMs (Table
+4
+). For the other 3 policy LLMs, we fine-tune them using step-by-step verified trajectories generated from Qwen2.5-Math-7B’s 4th round. The final PPM from this round is then used as the reward model for the 3 policy SLMs.
+Baselines
+.
+\sysname
+is a System 2 method. We compare it against three strong baselines representing both System 1 and System 2 approaches:
+(i)
+Frontier LLMs
+, including GPT-4o, the latest Claude, OpenAI o1-preview and o1-mini.
+We measure their accuracy on AMC 2023, Olympiad Bench, College Math, Gaokao and GSM8K, while accuracy numbers for other benchmarks are taken from public technical reports
+(Team,
+2024a
+)
+.
+(ii)
+Open-sourced superior reasoning models
+, including DeepSeek-Coder-v2-Instruct, Mathstral
+(Team,
+2024b
+)
+, NuminaMath-72B
+(Jia LI and Polu,
+2024a
+)
+, and LLaMA3.1
+(Dubey et al.,
+2024
+)
+, which represent the current mainstream System 1 approaches for improving LLM math reasoning.
+(iii)
+Both System 1 and System 2 performance of the base models trained by the original model teams
+, including Instruct versions (e.g., Qwen2.5-Math-7B-Instruct) and Best-of-N (e.g., Qwen2.5-Math-72B-Instruct+Qwen2.5-Math-RM-72B). Notably, the reward model used for the three Qwen base models is a 72B ORM, significantly larger than our 7B PPM.
+Evaluation Metric
+. We report Pass@1 accuracy for all baselines. For System 2 baselines, we use default evaluation settings, such as default thinking time for o1-mini and o1-preview. For Qwen models with Best-of-N, we re-evaluate MATH-500, AIME/AMC accuracy; other benchmark results are from their technical reports. For a fair comparison,
+\sysname
+runs MCTS to generate the same number of solutions as Qwen. Specifically, we generate 16 trajectories for AIME/AMC and 8 for other benchmarks, using PPM to select the best solution. We also report performance with increased test-time computation using 64 trajectories, denoted as
+\sysname
+64
+.
+Table 5:
+The results of
+\sysname
+and other frontier LLMs on the most challenging math benchmarks.
+\sysname
+64
+shows the Pass@1 accuracy achieved when sampling 64 trajectories.
+Competition and College Level
+OOD
+Model
+Method
+MATH
+AIME
+2024
+AMC
+2023
+Olympiad
+Bench
+College
+Math
+GSM8K
+Gaokao
+En 2023
+Frontier LLMs
+GPT-4o
+System 1
+76.6
+9.3
+47.5
+43.3
+48.5
+92.9
+67.5
+Claude3.5-Sonnet
+System 1
+78.3
+16.0
+-
+-
+-
+96.4
+-
+GPT-o1-preview
+-
+85.5
+44.6
+90.0
+-
+-
+-
+-
+GPT-o1-mini
+-
+90.0
+56.7
+95.0
+65.3
+57.8
+94.8
+78.4
+Open-Sourced Reasoning LLMs
+DeepSeek-Coder-V2-Instruct
+System 1
+75.3
+13.3
+57.5
+37.6
+46.2
+94.9
+64.7
+Mathstral-7B-v0.1
+System 1
+57.8
+0.0
+37.5
+21.5
+33.7
+84.9
+46.0
+NuminaMath-72B-CoT
+System 1
+64.0
+3.3
+70.0
+32.6
+39.7
+90.8
+58.4
+LLaMA3.1-8B-Instruct
+System 1
+51.4
+6.7
+25.0
+15.4
+33.8
+76.6
+38.4
+LLaMA3.1-70B-Instruct
+System 1
+65.4
+23.3
+50.0
+27.7
+42.5
+94.1
+54.0
+Qwen2.5-Math-72B-Instruct
+System 1
+85.6
+30.0
+70.0
+49.0
+49.5
+95.9
+71.9
+Qwen2.5-Math-72B-Instruct+72B ORM
+System 2
+85.8
+36.7
+72.5
+54.5
+50.6
+96.4
+76.9
+General Base Model: Phi3-mini-Instruct (3.8B)
+Phi3-mini-Instruct (base model)
+System 1
+41.4
+3.33
+7.5
+12.3
+33.1
+85.7
+37.1
+\sysname
+(3.8B SLM+7B PPM)
+System 2
+85.4
+40.0
+77.5
+59.3
+58.0
+94.5
+77.1
+\sysname
+64
+(3.8B SLM+7B PPM)
+System 2
+86.4
+43.3
+80.0
+60.3
+59.1
+94.7
+77.7
+Math-Specialized Base Model: Qwen2.5-Math-1.5B
+Qwen2.5-Math-1.5B (base model)
+System 1
+51.2
+0.0
+22.5
+16.7
+38.4
+74.6
+46.5
+Qwen2.5-Math-1.5B-Instruct
+System 1
+60.0
+10.0
+60.0
+38.1
+47.7
+84.8
+65.5
+Qwen2.5-Math-1.5B-Instruct+72B ORM
+System 2
+83.4
+20.0
+72.5
+47.3
+50.2
+94.1
+73.0
+\sysname
+(1.5B SLM+7B PPM)
+System 2
+87.8
+46.7
+80.0
+63.5
+59.0
+94.3
+77.7
+\sysname
+64
+(1.5B SLM+7B PPM)
+System 2
+88.6
+46.7
+85.0
+64.6
+59.3
+94.8
+79.5
+Math-Specialized Base Model: Qwen2-Math-7B
+Qwen2-Math-7B (base model)
+System 1
+53.4
+3.3
+25.0
+17.3
+39.4
+80.4
+47.3
+Qwen2-Math-7B-Instruct
+System 1
+73.2
+13.3
+62.5
+38.2
+45.9
+89.9
+62.1
+Qwen2-Math-7B-Instruct+72B ORM
+System 2
+83.4
+23.3
+62.5
+47.6
+47.9
+95.1
+71.9
+\sysname
+(7B SLM+7B PPM)
+System 2
+88.2
+43.3
+80.0
+63.1
+58.4
+94.6
+78.2
+\sysname
+64
+(7B SLM+7B PPM)
+System 2
+88.6
+46.7
+85.0
+63.4
+59.3
+94.8
+79.2
+Math-Specialized Base Model: Qwen2.5-Math-7B
+Qwen2.5-Math-7B (base model)
+System 1
+58.8
+0.0
+22.5
+21.8
+41.6
+91.6
+51.7
+Qwen2.5-Math-7B-Instruct
+System 1
+82.6
+6.0
+62.5
+41.6
+46.8
+95.2
+66.8
+Qwen2.5-Math-7B-Instruct+72B ORM
+System 2
+88.4
+26.7
+75.0
+49.9
+49.6
+97.9
+75.1
+\sysname
+(7B SLM+7B PPM)
+System 2
+89.4
+50.0
+87.5
+65.3
+59.0
+95.0
+80.5
+\sysname
+64
+(7B SLM+7B PPM)
+System 2
+90.0
+53.3
+87.5
+65.6
+60.5
+95.2
+81.3
+4.2
+Main Results
+Results on diverse challenging math benchmarks
+. Table
+5
+shows the results of
+\sysname
+compared with state-of-the-art reasoning models. We highlight three key observations:
+(1)
+\sysname
+significantly improves SLMs' math reasoning capabilities, achieving performance comparable to or surpassing OpenAI o1 with substantially smaller model size (1.5B-7B). For example, Qwen2.5-Math-7B, originally at 58.8% accuracy on MATH, improved dramatically to 90.0% with
+\sysname
+, outperforming o1-preview and Claude 3.5 Sonnet while matching o1-mini. On the College Math benchmark,
+\sysname
+exceeds o1-mini by 2.7%. On AIME 2024,
+\sysname
+scored 53.3%, ranking just below o1-mini, with the 7B model solving 8/15 problems in both AIME I and II, placing in the top 20% of the brightest high school math students.
+Notably, 8 of the unsolved problems were geometry-based, requiring visual understanding, a capability
+\sysname
+currently does not support.
+(2)
+Despite using smaller policy models (1.5B-7B) and reward models (7B),
+\sysname
+significantly outperforms state-of-the-art System 2 baselines. Compared to Qwen Best-of-N baselines, which use the same base models (Qwen2-Math-7B, Qwen2.5-Math-1.5B/7B) but a 10
+$\times$
+larger reward model (Qwen2.5-Math-RM-72B),
+\sysname
+consistently improves the reasoning accuracy of all base models to state-of-the-art levels. Even against Best-of-N with a 10
+$\times$
+larger Qwen2.5-Math-72B-Instruct policy model,
+\sysname
+surpasses it on all benchmarks except GSM8K, using the same number of sampled solutions.
+(3)
+Beyond well-known benchmarks like MATH, GSM8K, and AIME, which may risk over-optimization,
+\sysname
+shows strong generalizability on other challenging math benchmarks, including Olympiad Bench, College Math, and the Chinese College Entrance Math Exam (Gaokao), setting new state-of-the-art scores. As discussed in Sec.
+3.4
+, our training set is primarily sourced from public datasets, with no specific optimizations for these benchmarks.
+Figure 3:
+Reasoning performance under scaling up the test-time compute.
+Scaling up test-time computation
+.
+\sysname
+uses MCTS to augment the policy model, searching solutions guided by the PPM. By increasing test-time computation, it explores more trajectories, potentially improving performance.
+In Fig.
+3
+, we show the impact of test-time compute scaling by comparing the accuracy of the official Qwen Best-of-N across different numbers of sampled trajectories on four challenging math benchmarks. Sampling only one trajectory corresponds to the policy LLM’s Pass@1 accuracy, indicating a fallback to System 1 reasoning. We highlight two key observations:
+(1)
+With only 4 trajectories,
+\sysname
+significantly outperforms Best-of-N baselines, exceeding o1-preview and approaching o1-mini, demonstrating its effectiveness.
+(2)
+Scaling test-time compute improves reasoning accuracy across all benchmarks, though with varying trends. On Math, AIME, and Olympiad Bench,
+\sysname
+shows saturation or slow improvement at 64 trajectories, while on College Math, performance continues to improve steadily.
+4.3
+Ablation Study and Analysis
+We ablate the effectiveness of our three innovations. For System 2-style inference, Pass@1 accuracy is measured with 16 trajectories for AIME and AMC, and 8 for other benchmarks.
+Table 6:
+The continuously improved math reasoning capabilities through
+\sysname
+self-evolved deep thinking. Starting from round 2, the 7B base model powered by
+\sysname
+surpasses GPT-4o.
+Round#
+MATH
+AIME 2024
+AMC 2023
+Olympiad Bench
+College Math
+GSM8K
+GaokaoEn 2023
+GPT-4o
+76.6
+9.3
+47.5
+43.3
+48.5
+92.9
+67.5
+Base 7B model
+58.8
+0.0
+22.5
+21.8
+41.6
+91.6
+51.7
+\sysname
+Round 1
+75.2
+10.0
+57.5
+35.7
+45.4
+90.9
+60.3
+\sysname
+Round 2
+86.6
+43.3
+75.0
+59.4
+55.6
+94.0
+76.4
+\sysname
+Round 3
+87.0
+46.7
+80.0
+61.6
+56.5
+94.2
+77.1
+\sysname
+Round 4
+89.4
+50.0
+87.5
+65.3
+59.0
+95.0
+80.5
+The effectiveness of self-evolution
+. The impressive results in Table
+5
+are achieved after 4 rounds of
+\sysname
+self-evolved deep thinking. Table
+6
+shows the math reasoning performance in each round, demonstrating a continuous improvement in accuracy.
+In round 1, the main improvement comes from applying SFT to the base model. Round 2 brings a significant boost with the application of a stronger PPM in MCTS, which unlocks the full potential of System 2 deep reasoning. Notably, starting from round 2,
+\sysname
+outperforms GPT-4o. Rounds 3 and 4 show further improvements, driven by stronger System 2 reasoning through better policy SLMs and PPMs.
+The effectiveness of step-by-step verified reasoning trajectory
+.
+\sysname
+generates step-by-step verified reasoning trajectories, which eliminate erroneous intermediate steps and further expand the training set with more challenging problems. To evaluate its effectiveness, we use the data generated from round 4 as SFT training data and compare it against
+three strong baselines:
+(i)
+GPT-distillation, which includes open-sourced CoT solutions synthesized using GPT-4, such as MetaMath
+(Yu et al.,
+2023b
+)
+, NuminaMath-CoT
+(Jia LI and Polu,
+2024b
+)
+;
+(ii)
+Random sampling from self-generation,
+which uses the same policy model (i.e., policy SLM-r3) to randomly generate trajectories;
+(iii)
+Rejection sampling, where 32 trajectories are randomly sampled from the policy model, with high-quality solutions ranked by our trained ORM (appendix
+A.1
+). For fairness, we select two correct trajectories for each math problem in baseline (ii) and (iii). All SFT experiments use the same training recipe.
+Table 7:
+Ablation study on the effectiveness of our step-by-step verified reasoning trajectories as the SFT dataset. We report the SFT accuracy of Qwen2.5-Math-7B fine-tuned with different datasets.
+Dataset
+MATH
+AIME
+AMC
+Olympiad Bench
+College Math
+GSM8K
+GaokaoEn 2023
+GPT-4o
+-
+76.6
+9.3
+47.5
+43.3
+48.5
+92.9
+67.5
+GPT4-distillation
+(Open-sourced)
+MetaMath
+55.2
+3.33
+32.5
+19.1
+39.2
+85.1
+43.6
+NuminaMath-CoT
+69.6
+10.0
+50.0
+37.2
+43.4
+89.8
+59.5
+Self-generation
+by policy SLM-r3
+Random sample
+72.4
+10.0
+45.0
+41.0
+48.0
+87.5
+57.1
+Rejection sampling
+73.4
+13.3
+47.5
+44.7
+50.8
+89.3
+61.7
+Step-by-step verified (ours)
+78.4
+26.7
+47.5
+47.1
+52.5
+89.7
+65.7
+Table
+7
+shows the math reasoning accuracy of Qwen2.5-Math-7B fine-tuned on different datasets. We highlight two observations:
+(i)
+Fine-tuning with our step-by-step verified trajectories significantly outperforms all other baselines. This is primarily due to our PPM-augmented MCTS for code-augmented CoT synthesis, which provides denser verification during math solution generation. It proves more effective than both random sampling, which lacks verification, and rejection sampling, where ORM provides only sparse verification.
+(ii)
+Even randomly sampled code-augmented CoT solutions from our SLM yield comparable or better performance than GPT-4 synthesized NuminaMath and MetaMath datasets.
+This indicates that our policy SLMs, after rounds of self-evolution, can generate high-quality math solutions. These results demonstrate the huge potential of our method to self-generate higher-quality reasoning data without relying on advanced LLM distillation.
+The effectiveness of PPM
+. We train both a strong ORM and Q-value score-based PRM (PQM) for comparison. To ensure a fair evaluation, we use the highest-quality training data: the step-by-step verified trajectories generated in round 4, with selected math problems matching those used for PPM training. Similar to PPM, we use step-level Q-values to select positive and negative trajectories for each math problem.
+The ORM is trained using a pairwise ranking loss
+(Ouyang et al.,
+2022
+)
+, while the PQM follows
+(Chen et al.,
+2024
+; Zhang et al.,
+2024a
+)
+to use Q-values as reward labels and optimize with MSE loss. Detailed training settings are provided in Appendix
+A.1
+.
+Table 8:
+Ablation study on the reward model. Process reward models (PQM and PPM) outperform ORM, with PPM pushing the frontier of math reasoning capabilities.
+RM
+Inference
+MATH
+AIME
+AMC
+Olympiad Bench
+College Math
+GSM8K
+GaokaoEn
+o1-mini
+-
+90.0
+56.7
+95.0
+65.3
+55.6
+94.8
+78.6
+ORM
+Best-of-N
+82.6
+26.7
+65.0
+55.1
+55.5
+92.3
+72.5
+PQM
+MCTS
+88.2
+46.7
+85.0
+62.9
+57.6
+94.6
+79.5
+PPM
+MCTS
+89.4
+50.0
+87.5
+65.3
+59.0
+95.0
+80.5
+Table
+8
+compares the performance of ORM, PQM, and PPM for System 2 reasoning using our final round policy model. ORM provides reward signals only at the end of problem solving, so we use the Best-of-N method, while PQM and PPM leverage MCTS-driven search. As shown in Table
+8
+, both PQM and PPM outperform ORM by providing denser step-level reward signals, leading to higher accuracy on complex math reasoning tasks. However, PQM struggles on more challenging benchmarks, such as MATH and Olympiad Bench, due to the inherent imprecision of Q-values.
+In contrast, PPM constructs step-level preference data for training, enabling our 7B policy model to achieve comparable or superior performance to o1-mini across all benchmarks.
+5
+Findings and Discussions
+Figure 4:
+An example of intrinsic self-reflection during
+\sysname
+deep thinking.
+The emergence of intrinsic self-reflection capability
+. A key breakthrough in OpenAI o1 is its intrinsic self-reflection capability. When the model makes an error, it recognizes the mistake and can self-correct with a correct answer
+(Noam Brown and Lightman,
+2024
+)
+. Yet it has consistently
+been found to be largely ineffective in open-sourced LLMs. The community has actively explored various approaches, including self-correction
+(Huang et al.,
+2023
+; Kumar et al.,
+2024
+)
+, self-reflection
+(Renze and Guven,
+2024
+; Shinn et al.,
+2024
+)
+, to explicitly train or prompt LLMs to develop such capability.
+In our experiments, we unexpectedly observe that our MCTS-driven deep thinking exhibits self-reflection during problem-solving. As shown in Fig.
+4
+, the model initially formalizes an equation using
+SymPy
+in the first three steps, which would lead to an incorrect answer (left branch). Interestingly, in the fourth step (right branch), the policy model recognizes the low quality of its earlier steps and refrains from continuing along the initial problem-solving path. Instead, it backtracks and resolves the problem using a new, simpler approach, ultimately arriving at the correct answer. An additional example of self-correction is provided in Appendix
+A.2
+. Notably, no self-reflection training data or prompt was included, suggesting that advanced System 2 reasoning can foster intrinsic self-reflection.
+Figure 5:
+Pass@1 accuracy of policy models and their accuracy after applying System 2 reasoning with various reward models, showing that reward models primarily determine the final performance.
+PPM shapes the reasoning boundary in System 2 deep thinking
+. Both the policy and reward models are crucial for System 2 deep reasoning. Our experiments show that once the policy model attains a reasonably strong capability level,
+(see Appendix
+A.1
+), the PPM becomes the key determinant of the upper performance limit.
+Fig.
+5
+summarizes the accuracy of policy models of different sizes, as well as the improvements achieved with reward models. Despite variations in Pass@1 accuracy due to differences in training strategies, datasets, and model scales, the reward model proves to be the dominant factor in System 2 reasoning. For instance, although the SFT accuracy of
+\sysname
+-7B is lower than Qwen2.5-Math-72B-Instruct, pairing it with our 7B PPM allows
+\sysname
+to outperform the 72B policy model with Qwen 72B ORM. Moreover, despite varying Pass@1 accuracy across our three policy SLM sizes, the final reasoning accuracy converges after applying the PPM.
+PPM spots theorem-application steps
+. When solving challenging math problems, identifying and applying relevant theorems or key conclusions often form the cornerstone of successful problem-solving
+(Xin et al.,
+2024
+)
+. In our experiments, we find that during
+\sysname
+problem-solving, our PPM effectively identifies critical theorem-application intermediate steps within policy model’s deep thinking process. These steps are predicted with high reward scores, guiding the policy model to generate the correct solution. Appendix
+A.2
+provides examples where the PPM successfully identifies key theorems such as Fermat’s little theorem
+(Weisstein,
+a
+)
+, Vieta’s formulas
+(Weisstein,
+b
+)
+, the AM-GM inequality
+(
+amg,
+)
+, the Pythagorean theorem
+(
+pyt,
+)
+, and the Shoelace Theorem
+(
+sho,
+)
+, etc.
+Generalization discussions
+.
+\sysname
+offers a general methodology for improving LLM reasoning applicable to various domains. First,
+\sysname
+can generalize to more challenging math tasks, such as theorem proving, though its current focus is on word problems due to dataset limitations. Nonetheless,
+\sysname
+demonstrates the potential to prove mathematical statements. As shown in Appendix
+A.2
+, it successfully proves an Olympiad-level problem involving Fermat’s Little Theorem, providing a step-by-step correct proof through its deep reasoning process. Second,
+\sysname
+can generalize to other domains, such as code and commonsense reasoning. Notably, synthesizing step-by-step verified training trajectories for general reasoning requires a mechanism to provide feedback on whether a given trajectory reaches the desired output at the end of MCTS rollout. For instance, in code reasoning, this could involve designing extensive test cases; in general reasoning, feedback could be obtained through human labeling or mutual verification with another LLM
+(Qi et al.,
+2024
+)
+.
+6
+Conclusion
+In this work, we present
+\sysname
+, a self-evolved System 2 deep thinking approach that significantly boosts the math reasoning capabilities of small LLMs, achieving state-of-the-art OpenAI o1-level performance. Our approach demonstrates that SLMs can self-generate high-quality training data for frontier-level math reasoning. Extensive experiments across four different-sized SLMs and challenging math benchmarks demonstrate the superiority of
+\sysname
+, achieving leading results while outperforming existing math reasoning LLMs and Best-of-N baselines. We also reveal key findings, including the emergence of self-reflection and the effectiveness of the PPM in identifying critical intermediate steps, such as theorem-application steps. Finally,
+\sysname
+can achieve further improvements by collecting more challenging math problems; we leave this as future work.
+Acknowledgement
+In the early stages of this work, we faced significant challenges due to limited GPU resources and restricted access to the GPT-4 API. We are deeply grateful to Qiufeng Yin and Chengmin Chi for their assistance in collecting math problems and providing GPT-4 resources for new math problem synthesis. Special thanks go to my colleagues, Lingxiao Ma, Ying Cao, Baotong Lu, Jing Liu, Jiahang Xu, Chengruidong Zhang, Siyuan Wang, Gaokai Zhang, Yujian Li, and Yang Wang, for generously sharing their GPU quotas with us.
+References
+[1]
+Inequality of arithmetic and geometric means.
+URL
+https://artofproblemsolving.com/wiki/index.php/AM-GM_Inequality
+.
+[2]
+Pythagorean theorem.
+URL
+https://en.wikipedia.org/wiki/Pythagorean_theorem
+.
+[3]
+Shoelace theorem.
+URL
+https://artofproblemsolving.com/wiki/index.php/Shoelace_Theorem
+.
+Abdin et al. [2024]
+Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah,
+Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl,
+et al.
+Phi-3 technical report: A highly capable language model locally on
+your phone.
+arXiv preprint arXiv:2404.14219
+, 2024.
+AI-MO [2024a]
+AI-MO.
+Aime 2024, 2024a.
+URL
+https://huggingface.co/datasets/AI-MO/aimo-validation-aime
+.
+AI-MO [2024b]
+AI-MO.
+Amc 2023, 2024b.
+URL
+https://huggingface.co/datasets/AI-MO/aimo-validation-amc
+.
+Brown et al. [2024]
+Bradley Brown, Jordan Juravsky, Ryan Ehrlich, Ronald Clark, Quoc V Le,
+Christopher Ré, and Azalia Mirhoseini.
+Large language monkeys: Scaling inference compute with repeated
+sampling.
+arXiv preprint arXiv:2407.21787
+, 2024.
+Chen et al. [2024]
+Guoxin Chen, Minpeng Liao, Chengxi Li, and Kai Fan.
+Alphamath almost zero: process supervision without process, 2024.
+Cobbe et al. [2021]
+Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz
+Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano,
+et al.
+Training verifiers to solve math word problems.
+arXiv preprint arXiv:2110.14168
+, 2021.
+Daniel [2011]
+Kahneman Daniel.
+Thinking, fast and slow.
+Macmillan
+, 2011.
+Dubey et al. [2024]
+Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad
+Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan,
+et al.
+The llama 3 herd of models.
+arXiv preprint arXiv:2407.21783
+, 2024.
+Gou et al. [2023]
+Zhibin Gou, Zhihong Shao, Yeyun Gong, Yujiu Yang, Minlie Huang, Nan Duan,
+Weizhu Chen, et al.
+Tora: A tool-integrated reasoning agent for mathematical problem
+solving.
+arXiv preprint arXiv:2309.17452
+, 2023.
+Hao et al. [2023]
+Shibo Hao, Yi Gu, Haodi Ma, Joshua Jiahua Hong, Zhen Wang, Daisy Zhe Wang, and
+Zhiting Hu.
+Reasoning with language model is planning with world model.
+arXiv preprint arXiv:2305.14992
+, 2023.
+He et al. [2024]
+Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen,
+Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, et al.
+Olympiadbench: A challenging benchmark for promoting agi with
+olympiad-level bilingual multimodal scientific problems.
+arXiv preprint arXiv:2402.14008
+, 2024.
+Huang et al. [2023]
+Jie Huang, Xinyun Chen, Swaroop Mishra, Huaixiu Steven Zheng, Adams Wei Yu,
+Xinying Song, and Denny Zhou.
+Large language models cannot self-correct reasoning yet.
+arXiv preprint arXiv:2310.01798
+, 2023.
+Huang et al. [2024]
+Zhen Huang, Haoyang Zou, Xuefeng Li, Yixiu Liu, Yuxiang Zheng, Ethan Chern,
+Shijie Xia, Yiwei Qin, Weizhe Yuan, and Pengfei Liu.
+O1 replication journey – part 2: Surpassing o1-preview through
+simple distillation big progress or bitter lesson?
+Github
+, 2024.
+URL
+https://github.com/GAIR-NLP/O1-Journey
+.
+Jia LI and Polu [2024a]
+Lewis Tunstall Ben Lipkin Roman Soletskyi Shengyi Costa Huang Kashif Rasul
+Longhui Yu Albert Jiang Ziju Shen Zihan Qin Bin Dong Li Zhou Yann Fleureau
+Guillaume Lample Jia LI, Edward Beeching and Stanislas Polu.
+Numinamath.
+[https://github.com/project-numina/aimo-progress-prize](https://github.com/project-numina/aimo-progress-prize/blob/main/report/numina_dataset.pdf)
+,
+2024a.
+Jia LI and Polu [2024b]
+Lewis Tunstall Ben Lipkin Roman Soletskyi Shengyi Costa Huang Kashif Rasul
+Longhui Yu Albert Jiang Ziju Shen Zihan Qin Bin Dong Li Zhou Yann Fleureau
+Guillaume Lample Jia LI, Edward Beeching and Stanislas Polu.
+Numinamath cot, 2024b.
+URL
+https://huggingface.co/datasets/AI-MO/NuminaMath-CoT
+.
+Kang et al. [2024]
+Jikun Kang, Xin Zhe Li, Xi Chen, Amirreza Kazemi, and Boxing Chen.
+Mindstar: Enhancing math reasoning in pre-trained llms at inference
+time.
+arXiv preprint arXiv:2405.16265
+, 2024.
+Kocsis and Szepesvári [2006]
+Levente Kocsis and Csaba Szepesvári.
+Bandit based monte-carlo planning.
+volume 2006, pages 282–293, 09 2006.
+ISBN 978-3-540-45375-8.
+doi:
+10.1007/11871842_29
+.
+Kumar et al. [2024]
+Aviral Kumar, Vincent Zhuang, Rishabh Agarwal, Yi Su, John D Co-Reyes, Avi
+Singh, Kate Baumli, Shariq Iqbal, Colton Bishop, Rebecca Roelofs, et al.
+Training language models to self-correct via reinforcement learning.
+arXiv preprint arXiv:2409.12917
+, 2024.
+Lanham et al. [2023]
+Tamera Lanham, Anna Chen, Ansh Radhakrishnan, Benoit Steiner, Carson Denison,
+Danny Hernandez, Dustin Li, Esin Durmus, Evan Hubinger, Jackson Kernion,
+et al.
+Measuring faithfulness in chain-of-thought reasoning.
+arXiv preprint arXiv:2307.13702
+, 2023.
+Li et al. [2024]
+Chen Li, Weiqi Wang, Jingcheng Hu, Yixuan Wei, Nanning Zheng, Han Hu, Zheng
+Zhang, and Houwen Peng.
+Common 7b language models already possess strong math capabilities.
+arXiv preprint arXiv:2403.04706
+, 2024.
+Liao et al. [2024]
+Minpeng Liao, Wei Luo, Chengxi Li, Jing Wu, and Kai Fan.
+Mario: Math reasoning with code interpreter output–a reproducible
+pipeline.
+arXiv preprint arXiv:2401.08190
+, 2024.
+Lightman et al. [2023]
+Hunter Lightman, Vineet Kosaraju, Yura Burda, Harri Edwards, Bowen Baker, Teddy
+Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe.
+Let’s verify step by step.
+arXiv preprint arXiv:2305.20050
+, 2023.
+Lightman et al. [2024]
+Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker,
+Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe.
+Let’s verify step by step.
+In
+The Twelfth International Conference on Learning
+Representations
+, 2024.
+URL
+https://openreview.net/forum?id=v8L0pN6EOi
+.
+Liu et al. [2024]
+Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang
+Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, et al.
+Deepseek-v3 technical report.
+arXiv preprint arXiv:2412.19437
+, 2024.
+Luo et al. [2023]
+Haipeng Luo, Qingfeng Sun, Can Xu, Pu Zhao, Jianguang Lou, Chongyang Tao, Xiubo
+Geng, Qingwei Lin, Shifeng Chen, and Dongmei Zhang.
+Wizardmath: Empowering mathematical reasoning for large language
+models via reinforced evol-instruct.
+arXiv preprint arXiv:2308.09583
+, 2023.
+Luo et al. [2024]
+Liangchen Luo, Yinxiao Liu, Rosanne Liu, Samrat Phatale, Harsh Lara, Yunxuan
+Li, Lei Shu, Yun Zhu, Lei Meng, Jiao Sun, et al.
+Improve mathematical reasoning in language models by automated
+process supervision.
+arXiv preprint arXiv:2406.06592
+, 2024.
+Microsoft [2024]
+Microsoft.
+Phi-3-mini-4k-instruct, 2024.
+URL
+https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
+.
+Noam Brown and Lightman [2024]
+Ilge Akkaya Noam Brown and Hunter Lightman.
+Openai’s noam brown, ilge akkaya and hunter lightman on o1 and
+teaching llms to reason better, 2024.
+URL
+https://www.youtube.com/watch?v=jPluSXJpdrA
+.
+OpenAI [2023]
+OpenAI.
+Gpt-4 technical report.
+2023.
+OpenAI [2024]
+OpenAI.
+Openai o1 system card.
+preprint
+, 2024.
+Ouyang et al. [2022]
+Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela
+Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al.
+Training language models to follow instructions with human feedback.
+Advances in Neural Information Processing Systems
+,
+35:27730–27744, 2022.
+Qi et al. [2024]
+Zhenting Qi, Mingyuan Ma, Jiahang Xu, Li Lyna Zhang, Fan Yang, and Mao Yang.
+Mutual reasoning makes smaller llms stronger problem-solvers.
+arXiv preprint arXiv:2408.06195
+, 2024.
+Qwen [2024a]
+Qwen.
+Qwen2-math-7b, 2024a.
+URL
+https://huggingface.co/Qwen/Qwen2-Math-7B
+.
+Qwen [2024b]
+Qwen.
+Qwen2.5-math-1.5b, 2024b.
+URL
+https://huggingface.co/Qwen/Qwen2.5-Math-1.5B
+.
+Qwen [2024c]
+Qwen.
+Qwen2.5-math-7b, 2024c.
+URL
+https://huggingface.co/Qwen/Qwen2.5-Math-7B
+.
+Renze and Guven [2024]
+Matthew Renze and Erhan Guven.
+Self-reflection in llm agents: Effects on problem-solving
+performance.
+arXiv preprint arXiv:2405.06682
+, 2024.
+Shinn et al. [2024]
+Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu
+Yao.
+Reflexion: Language agents with verbal reinforcement learning.
+Advances in Neural Information Processing Systems
+, 36, 2024.
+Silver et al. [2017]
+David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew
+Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore
+Graepel, et al.
+Mastering chess and shogi by self-play with a general reinforcement
+learning algorithm.
+arXiv preprint arXiv:1712.01815
+, 2017.
+Snell et al. [2024]
+Charlie Snell, Jaehoon Lee, Kelvin Xu, and Aviral Kumar.
+Scaling llm test-time compute optimally can be more effective than
+scaling model parameters.
+arXiv preprint arXiv:2408.03314
+, 2024.
+Tang et al. [2024]
+Zhengyang Tang, Xingxing Zhang, Benyou Wang, and Furu Wei.
+Mathscale: Scaling instruction tuning for mathematical reasoning.
+arXiv preprint arXiv:2403.02884
+, 2024.
+Team [2024a]
+Qwen Team.
+Qwq: Reflect deeply on the boundaries of the unknown, November
+2024a.
+URL
+https://qwenlm.github.io/blog/qwq-32b-preview/
+.
+Team [2024b]
+The Mistral AI Team.
+Mathstral-7b-v0.1, 2024b.
+URL
+https://huggingface.co/mistralai/Mathstral-7B-v0.1
+.
+Toshniwal et al. [2024]
+Shubham Toshniwal, Wei Du, Ivan Moshkov, Branislav Kisacanin, Alexan
+Ayrapetyan, and Igor Gitman.
+Openmathinstruct-2: Accelerating ai for math with massive open-source
+instruction data.
+arXiv preprint arXiv:2410.01560
+, 2024.
+Valmeekam et al. [2023]
+Karthik Valmeekam, Sarath Sreedharan, Matthew Marquez, Alberto Olmo, and
+Subbarao Kambhampati.
+On the planning abilities of large language models (a critical
+investigation with a proposed benchmark).
+arXiv preprint arXiv:2302.06706
+, 2023.
+Wang et al. [2024a]
+Chaojie Wang, Yanchen Deng, Zhiyi Lv, Shuicheng Yan, and An Bo.
+Q*: Improving multi-step reasoning for llms with deliberative
+planning, 2024a.
+Wang et al. [2024b]
+Ke Wang, Houxing Ren, Aojun Zhou, Zimu Lu, Sichun Luo, Weikang Shi, Renrui
+Zhang, Linqi Song, Mingjie Zhan, and Hongsheng Li.
+Mathcoder: Seamless code integration in LLMs for enhanced
+mathematical reasoning.
+In
+The Twelfth International Conference on Learning
+Representations
+, 2024b.
+URL
+https://openreview.net/forum?id=z8TW0ttBPp
+.
+Wang et al. [2024c]
+Peiyi Wang, Lei Li, Zhihong Shao, R. X. Xu, Damai Dai, Yifei Li, Deli Chen,
+Y. Wu, and Zhifang Sui.
+Math-shepherd: Verify and reinforce llms step-by-step without human
+annotations, 2024c.
+Wang et al. [2023]
+Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc V Le, Ed H. Chi, Sharan Narang,
+Aakanksha Chowdhery, and Denny Zhou.
+Self-consistency improves chain of thought reasoning in language
+models.
+In
+The Eleventh International Conference on Learning
+Representations
+, 2023.
+URL
+https://openreview.net/forum?id=1PL1NIMMrw
+.
+Weisstein [a]
+Eric W. Weisstein.
+Fermat’s little theorem, a.
+URL
+https://mathworld.wolfram.com/FermatsLittleTheorem.html
+.
+Weisstein [b]
+Eric W. Weisstein.
+Vieta’s formulas, from mathworld—a wolfram web resource,
+b.
+URL
+http://mathworld.wolfram.com/Tree.html
+.
+Wu et al. [2024]
+Yangzhen Wu, Zhiqing Sun, Shanda Li, Sean Welleck, and Yiming Yang.
+An empirical analysis of compute-optimal inference for
+problem-solving with language models.
+arXiv preprint arXiv:2408.00724
+, 2024.
+Xin et al. [2024]
+Huajian Xin, Daya Guo, Zhihong Shao, Zhizhou Ren, Qihao Zhu, Bo Liu, Chong
+Ruan, Wenda Li, and Xiaodan Liang.
+Deepseek-prover: Advancing theorem proving in llms through
+large-scale synthetic data.
+arXiv preprint arXiv:2405.14333
+, 2024.
+Yang et al. [2024]
+An Yang, Beichen Zhang, Binyuan Hui, Bofei Gao, Bowen Yu, Chengpeng Li,
+Dayiheng Liu, Jianhong Tu, Jingren Zhou, Junyang Lin, et al.
+Qwen2.5-math technical report: Toward mathematical expert model via
+self-improvement.
+arXiv preprint arXiv:2409.12122
+, 2024.
+Yao et al. [2024]
+Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and
+Karthik Narasimhan.
+Tree of thoughts: Deliberate problem solving with large language
+models.
+Advances in Neural Information Processing Systems
+, 36, 2024.
+Yu et al. [2023a]
+Fei Yu, Anningzhe Gao, and Benyou Wang.
+Outcome-supervised verifiers for planning in mathematical reasoning.
+arXiv preprint arXiv:2311.09724
+, 2023a.
+Yu et al. [2023b]
+Longhui Yu, Weisen Jiang, Han Shi, Jincheng Yu, Zhengying Liu, Yu Zhang,
+James T Kwok, Zhenguo Li, Adrian Weller, and Weiyang Liu.
+Metamath: Bootstrap your own mathematical questions for large
+language models.
+arXiv preprint arXiv:2309.12284
+, 2023b.
+Yuan et al. [2023]
+Zheng Yuan, Hongyi Yuan, Chengpeng Li, Guanting Dong, Keming Lu, Chuanqi Tan,
+Chang Zhou, and Jingren Zhou.
+Scaling relationship on learning mathematical reasoning with large
+language models.
+arXiv preprint arXiv:2308.01825
+, 2023.
+Zhang et al. [2024a]
+Dan Zhang, Sining Zhoubian, Ziniu Hu, Yisong Yue, Yuxiao Dong, and Jie Tang.
+Rest-mcts*: Llm self-training via process reward guided tree search.
+arXiv preprint arXiv:2406.03816
+, 2024a.
+Zhang et al. [2024b]
+Di Zhang, Jiatong Li, Xiaoshui Huang, Dongzhan Zhou, Yuqiang Li, and Wanli
+Ouyang.
+Accessing gpt-4 level mathematical olympiad solutions via monte carlo
+tree self-refine with llama-3 8b.
+arXiv preprint arXiv:2406.07394
+, 2024b.
+Zheng et al. [2023]
+Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao
+Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, Hao Zhang, Joseph E.
+Gonzalez, and Ion Stoica.
+Judging LLM-as-a-judge with MT-bench and chatbot arena.
+In
+Thirty-seventh Conference on Neural Information Processing
+Systems Datasets and Benchmarks Track
+, 2023.
+Appendix A
+Appendix
+A.1
+Additional Experiments and Details
+Data Generation Details. As detailed in Sec. 3.4, each round starts by self-generating step-by-step verified trajectories for 747k math word problems. The maximum tree depth $d$ is set to 16, with 16 MCTS rollouts conducted per problem by default. At each step, we explore up to 8 candidate nodes, and the constant $c$ in Eq. 1 is set to 2 to promote greater exploration. In the bootstrap round, due to the large size of the initial policy model (236B), we used smaller parameters: 8 rollouts and 5 candidate nodes per step. To improve the accuracy of solving challenging problems in round 4, we increase the number of candidate nodes to 16 and conduct 2 MCTS tree expansions per problem using different random seeds. Detailed prompts are available in Appendix A.3.
+Training Details
+. In each round, we collect step-by-step verified trajectories to fine-tune the policy LLM and train the PPM. To reduce noise
+in synthetic math problems (e.g., incorrect ground-truth answers labeled by GPT-4), we remove synthetic problems with trajectories achieving less than 50% accuracy. Based on our extensive experiments, the policy LLM is fine-tuned from the initial base model in each round, rather than training incrementally on the model from the previous round.
+All policy SLMs are trained for 2 epochs with a sequence length of 4096 tokens and a batch size of 128. We use AdamW optimizer with a linear learning rate scheduler, setting the initial learning rate to 7e-6 for Qwen models, and a cosine scheduler with an initial learning rate of 5e-6 for Phi3-mini-Instruct.
+The PPM is trained for 1 epoch with a batch size of 512 and an initial learning rate of 7e-6.
+Training the ORM and PQM
+. The Outcome Reward Model (ORM) and the Q-value-based Process Reward Model (PQM) share the same model architecture and training parameters with our PPM. To train the ORM, we collect trajectories from math problems containing both correct and incorrect solutions. Specifically, the two trajectories with the highest average Q-values are selected as positive examples, while the two with the lowest are chosen as negative examples. Following Qwen2.5-Math
+(Yang et al.,
+2024
+)
+, we adopt the pairwise ranking loss
+(Ouyang et al.,
+2022
+)
+to optimize the ORM. To train the PQM, we follow
+Chen et al. (
+2024
+)
+to use step-level Q-values as reward labels. Let
+$\mathbf{x}=x\oplus s_{1}\oplus s_{2}\oplus\dots\oplus s_{d}$
+be the trajectory, with annotated Q-values
+$\mathbf{Q}=(Q(s_{1}),Q(s_{2}),\dots,Q(s_{d}))$
+and predicted Q-values
+$\mathbf{Q^{\prime}}=(Q^{\prime}(s_{1}),Q^{\prime}(s_{2}),\dots,Q^{\prime}(s_{d}))$
+for each step. To stabilize PQM training, we treat each trajectory as a single training sample and predict Q-values for all steps simultaneously, rather than splitting it into individual per-step samples. Specifically, to predict the Q-value
+$Q^{\prime}(s_{i})$ for step $s_{i}$, PQM takes the trajectory from the question up to step $s_{i}$ (i.e., $x\oplus s_{1}\oplus s_{2}\oplus\dots\oplus s_{i}$
+) as input and outputs a value between -1 and 1. We use a mean squared error (MSE) loss for PQM training:
+$$\mathcal{L}_{prm}(\mathbf{x})=\|\mathbf{Q}-\mathbf{Q^{\prime}}\|^{2}\qquad(6)$$
+Self-evolution Inference Costs.
+In the initial bootstrap round, we use DeepSeek-Coder-v2-Instruct (236B) as the policy model, using 10 nodes of 8×80GB H100 GPUs with 8 MCTS rollouts. This required approximately two weeks to finish the data generation. For rounds 2–4, using our fine-tuned 7B SLM as the policy model, data generation was performed on 15 nodes of 4×40GB A100 GPUs,
+with each round completed in three days. In the final round, to include more challenging problems, we increased the number of MCTS rollouts to 64, extending the data generation time to one week.
+Table 9:
+Inference costs of rStar-Math. We show the average number of generated tokens required to generate a trajectory for a given question.
+MATH
+AIME 2024
+AMC 2023
+Olympiad Bench
+College Math
+GSM8K
+GaokaoEn 2023
+5453
+15693
+14544
+7889
+4503
+3299
+6375
+Inference Setting
+. In our evaluation, we run multiple MCTS to generate candidate solution trajectories. For each problem, we generate 32 candidate nodes at each step and use the PPM to score each node. Since the PPM effectively provides step-level quality evaluations, we limit MCTS to just 4 rollouts per step to update the Q-values. After completing MCTS, the trajectory with the highest PPM score is selected as the final answer. Table
+9
+presents the average number of tokens generated to produce a trajectory in MCTS.
+Table 10:
+Pass@1 (greedy) accuracy of our fine-tuned policy models for Phi3-mini, Qwen2.5-Math-1.5B, Qwen2-Math-7B and Qwen2.5-Math-7B.
+Model
+MATH
+AIME 2024
+AMC 2023
+Olympiad Bench
+College Math
+GSM8K
+GaokaoEn 2023
+General Base Model: Phi3-mini-Instruct (3.8B)
+Phi3-mini-Instruct
+41.4
+3.33
+7.5
+12.3
+33.1
+85.7
+37.1
+Our policy model
+68.0
+10.0
+37.5
+36.6
+48.7
+87.9
+53.2
+Math-Specialized Base Model: Qwen2.5-Math-1.5B
+Qwen2.5-Math-1.5B
+51.2
+0.0
+22.5
+16.7
+38.4
+74.6
+46.5
+Qwen2.5-Math-1.5B-Instruct
+60.0
+10.0
+60.0
+38.1
+47.7
+84.8
+65.5
+Our policy model
+74.8
+13.3
+47.5
+42.5
+50.1
+83.1
+58.7
+Math-Specialized Base Model: Qwen2-Math-7B
+Qwen2-Math-7B
+53.4
+3.3
+25.0
+17.3
+39.4
+80.4
+47.3
+Qwen2-Math-7B-Instruct
+73.2
+13.3
+62.5
+38.2
+45.9
+89.9
+62.1
+Our policy model
+73.8
+16.7
+45.0
+43.9
+52.0
+88.3
+65.2
+Math-Specialized Base Model: Qwen2.5-Math-7B
+Qwen2.5-Math-7B
+58.8
+0.0
+22.5
+21.8
+41.6
+91.6
+51.7
+Qwen2.5-Math-7B-Instruct
+82.6
+6.0
+62.5
+41.6
+46.8
+95.2
+66.8
+Our policy model
+78.4
+26.7
+47.5
+47.1
+52.5
+89.7
+65.7
+Figure 6:
+Pass@N accuracy with random sampling from different policy models. Compared to the official Qwen instruct version, our policy model exhibits a stronger ability to sample correct solutions.
+Figure 7:
+Pass@N accuracy with PPM-augmented MCTS. Under the same PPM guidance, the four policy models of varying sizes demonstrate convergent capabilities in sampling correct solutions.
+Pass@N. Table 10 compares the math reasoning performance of our policy models with the instruct versions developed by the original model team. Our policy models do not consistently outperform the instruct versions. For example, on the Qwen2.5-Math-7B base model, Qwen2.5-Math-7B-Instruct achieves 4.2% higher accuracy on the MATH benchmark. However, in the System 2 deep thinking paradigm, pass@1 accuracy alone does not fully reflect the reasoning capabilities of the policy model. To provide a more comprehensive evaluation, Fig. 6 and Fig. 7 present the pass@N accuracy. In this metric, the policy model generates $N$ solutions under two settings: random sampling (Fig. 6) and PPM-augmented MCTS deep thinking (Fig. 7). A problem is considered solved if at least one solution is correct.
+As shown in Figure
+6
+, our policy model achieves comparable Pass@64 accuracy with Qwen2.5-Math-7B-Instruct on MATH and significantly outperforms it on other benchmarks.
+Figure
+7
+further compares the pass@N accuracy of our four policy models (different sizes) after MCTS deep thinking under the same PPM guidance. We can observe that after generating 64 trajectories, the pass@N accuracy of different policy models becomes comparable.
+A.2
+Examples of Reasoning Trajectories
+Self-correction Example
+Question: The coordinates of a parallelogram are (5, 3), (6, 8), (7, 4) and $(x,y)$ and $x>7$. What is the value of $x+y$?
+Our trajectory
+:
+<code>
+# Import necessary libraries
+⬇
+import
+numpy
+as
+np
+<end_of_step>
+# Define the coordinates of the parallelogram
+⬇
+A
+=
+np
+.
+array
+([5,
+3])
+B
+=
+np
+.
+array
+([6,
+8])
+C
+=
+np
+.
+array
+([7,
+4])
+<end_of_step>
+# Calculate the vector differences to find the missing coordinate D
+# In a parallelogram, the vector difference between two adjacent sides is equal to the vector difference between the other two adjacent sides
+# We can use the formula: D = A + (C - B)
+⬇
+D
+=
+A
++
+(
+C
+-
+B
+)
+<end_of_step>
+# Since the coordinates provided are (5, 3), (6, 8), (7, 4) and (x, y), we can infer that the vector from B to C should be the same as the vector from A to the missing point (x, y)
+# This means: (x - 5, y - 3) = (1, -4) because the vector from B to C is (7 - 6, 4 - 8) = (1, -4)
+# Solve for x and y
+⬇
+x
+=
+5
++
+1
+y
+=
+3
++
+(-4)
+<end_of_step>
+# Verify that the point (x, y) exists and satisfies the condition x > 7
+# Since x = 6 is not greater than 7, we need to use the correct formula that maintains the parallelogram property and x > 7
+# Correcting the formula to find the other possible D [
+Self-correction!
+]
+⬇
+D
+=
+C
++
+(
+B
+-
+A
+)
+<end_of_step>
+# Solve for x and y again
+⬇
+x
+=
+7
++
+(6
+-
+5)
+y
+=
+4
++
+(8
+-
+3)
+<end_of_step>
+# Verify that the point (x, y) exists and satisfies the condition x > 7
+# Since x = 8 is greater than 7, which satisfies the condition given, we can now find the sum of x and y
+⬇
+x_plus_y
+=
+x
++
+y
+<end_of_step>
+# Now print the final answer
+⬇
+print
+(
+x_plus_y
+)
+# This should print the value of x + y
+<end_of_code>
+<output>17<end_of_output>
+<answer>The value of x + y is
+17
+<end_of_answer>
+Math statement proof using Fermat’s little theorem
+Question: If ( pt+a ) is a perfect square for some value of ( t ), then ( $a^{\frac{p-1}{2}}$ - 1 ) is divisible by ( p ) (where ( p ) is a prime number).
+Our trajectory for proving
+:
+<code>
+# Step 1: Define the variables
+⬇
+from
+sympy
+import
+symbols
+,
+isprime
+,
+sqrt
+,
+mod_inverse
+# Define the variables
+⬇
+p
+,
+t
+,
+a
+=
+symbols
+(’
+p
+t
+a
+’)\\
+<end_of_step>
+# Step 2: Assume p*t + a is a perfect square
+# This means there exists an integer k such that p*t + a = k^2
+# Rearrange to find a in terms of p, t, and k
+# a = k^2 - p*t
+<end_of_step>
+# Step 3: Use Fermat’s Little Theorem
+# Fermat’s Little Theorem states that if p is a prime and a is an integer not divisible by p, then a^(p-1) ≡ 1 (mod p)
+# We need to show that a^((p-1)/2) - 1 is divisible by p
+<end_of_step>
+# Step 4: Substitute a = k^2 - p*t into Fermat’s Little Theorem
+# a^((p-1)/2) ≡ (k^2 - p*t)^((p-1)/2) (mod p)
+# Since p*t is divisible by p, (k^2 - p*t)^((p-1)/2) ≡ k^(p-1) (mod p)
+# By Fermat’s Little Theorem, k^(p-1) ≡ 1 (mod p)
+<end_of_step>
+# Step 5: Therefore, a^((p-1)/2) ≡ 1 (mod p)
+# This implies a^((p-1)/2) - 1 ≡ 0 (mod p)
+# Hence, a^((p-1)/2) - 1 is divisible by p
+<end_of_step>
+# Now print the final answer
+print("The expression a^((p-1)/2) - 1 is divisible by p.")
+<end_of_code>
+<output>
+The expression $a^{((p-1)/2)}$ - 1 is divisible by p.
+The statement is true.
+<end_of_output>
+Problem-solving with application of the Vieta’s formulas and the AM-GM inequality
+Question: Let $a$ be a positive real number such that all the roots of $x^{3}+ax^{2}+ax+1=0$ are real. Find the smallest possible value of $a$.
+Our trajectory for proving
+:
+<code>
+# Step 1: Define the polynomial equation
+from sympy import symbols, solve, Eq
+# Define the variable and the parameter
+x, a = symbols(’x a’)
+<end_of_step>
+[PPM Score: -0.46430]
+# Step 2: Define the polynomial equation
+polynomial = x**3 + a*x**2 + a*x + 1
+<end_of_step>
+[PPM Score: 0.53937]
+#
+Step 3: Use Vieta’s formulas to relate the coefficients to the roots
+# Since the polynomial is monic (leading coefficient is 1), the sum of the roots (taken one at a time) is -a
+# The sum of the products of the roots taken two at a time is a
+# The product of the roots is -1
+# Since all roots are real, we can denote them as r1, r2, r3
+# From Vieta’s formulas:
+# r1 + r2 + r3 = -a
+# r1*r2 + r2*r3 + r3*r1 = a
+# r1*r2*r3 = -1
+<end_of_step>
+[PPM Score:
+0.9989]
+# Step 4: Use the AM-GM inequality to find the minimum value of a
+# By AM-GM, (r1*r2 + r2*r3 + r3*r1) / 3 >= (r1*r2*r3)^(2/3)
+# Substituting the known values:
+# a / 3 >= (-1)^(2/3)
+# a / 3 >= 1
+# a >= 3
+<end_of_step>
+[PPM Score:
+0.9999]
+# Step 5: Verify if a = 3 is achievable
+# If a = 3, then r1 + r2 + r3 = -3 and r1*r2 + r2*r3 + r3*r1 = 3
+# We need to check if there exist real roots that satisfy these conditions
+# Consider the case where r1 = r2 = r3 = -1
+# Then r1 + r2 + r3 = -3 and r1*r2 + r2*r3 + r3*r1 = 3, and r1*r2*r3 = -1
+# Thus, a = 3 is achievable with real roots
+<end_of_step>
+[PPM Score: 1.0]
+# Step 6: Print the final answer
+print(3)
+<end_of_code>
+<output>3<end_of_output>
+<answer>The smallest possible value of a is
+3
+.
+<end_of_answer>
+Problem-solving with application the Shoelace Theorem
+Question: In the diagram, $D$ and $E$ are the midpoints of $\overline{AB}$ and $\overline{BC}$ respectively. Determine the area of quadrilateral $DBEF$. [asy]
+size(180); defaultpen(linewidth(.7pt)+fontsize(10pt));
+pair A, B, C, D, E, F;
+A=(0,6);
+B=(0,0);
+C=(8,0);
+D=(0,3);
+E=(4,0);
+F=(8/3,2);
+draw(E--A--C--D);
+draw((-1,0)--(10,0), EndArrow);
+draw((0,-1)--(0,8), EndArrow);
+label("$A(0,6)$", A, W);
+label("$B(0,0)$", B, SW);
+label("$C(8,0)$", C, S);
+label("$D$", D, W);
+label("$E$", E, S);
+label("$F$", F, SW);
+label("$x$", (10,0), dir(0));
+label("$y$", (0,8), dir(90));
+[/asy]
+Our trajectory for proving
+:
+<code>
+# Step 1: Define the coordinates of points A, B, C, D, and E
+⬇
+A
+=
+(0,
+6)
+B
+=
+(0,
+0)
+C
+=
+(8,
+0)
+D
+=
+(0,
+3)
+#
+Midpoint
+of
+AB
+E
+=
+(4,
+0)
+#
+Midpoint
+of
+BC
+<end_of_step>
+[PPM Score: -0.76639]
+# Step 2: Define the coordinates of point F using the section formula (known to be (8/3, 2))
+⬇
+F
+=
+(8/3,
+2)
+<end_of_step>
+[PPM Score: 0.8429]
+#
+Step 3: Use the Shoelace Theorem to find the area of quadrilateral DBEF
+# Shoelace Theorem formula: Area = 0.5 * |x1y2 + x2y3 + x3y4 + x4y1 - (y1x2 + y2x3 + y3x4 + y4x1)|
+# Coordinates of D, B, E, F are (0, 3), (0, 0), (4, 0), (8/3, 2) respectively
+⬇
+x1
+,
+y1
+=
+D
+x2
+,
+y2
+=
+B
+x3
+,
+y3
+=
+E
+x4
+,
+y4
+=
+F
+area
+=
+0.5
+*
+abs
+(
+x1
+*
+y2
++
+x2
+*
+y3
++
+x3
+*
+y4
++
+x4
+*
+y1
+-
+(
+y1
+*
+x2
++
+y2
+*
+x3
++
+y3
+*
+x4
++
+y4
+*
+x1
+))
+<end_of_step>
+[PPM Score:
+0.9972]
+# Step 4: Print the area of quadrilateral DBEF
+⬇
+print
+(
+area
+)
+<end_of_code>
+<output>8.0<end_of_output>
+<answer>
+8
+<end_of_answer>
+A.3
+Prompt Examples
+MCTS Prompt for DeepSeek-V2-Coder-Instruct (bootstrap round)
+You are a powerful agent with broad math knowledge and great python programming skills. You need to use python interpreter to do accurate calculation on math equations.
+!!! Remember:
+1. Use code solve the problem step by step. The solution should include three parts: <code>, <output>, and <answer>.
+2. All calculations should be done in python code. Provide concise reasoning and thinking in the comments of the code.
+3. The most related python packages include ‘math‘, ‘sympy‘, ‘scipy‘, and ‘numpy‘.
+4. Please use the following template:
+Question: the input question
+<code>Construct the code step by step. Use <end_of_step> to indicate the end of each step. Ensure your code can execute correctly(excluding <end_of_step>) and print the answer. Avoid undefined variables (NameError), unimported packages, or formatting errors (SyntaxError, TypeError). In the last step of the code, print the final answer and add a comment: Now print the final answer.<end_of_code>
+<output>Execute the code in using the Python interpreter and display the printed results.<end_of_output>
+<answer>The concise answer without verbose context, put your final answer’s numerical part (without unit, only focus on the numerical part if it’s a choice question) in \boxed.<end_of_answer> Now! It’s your turn.
+Question:
+{input}
+The following are 2 demonstration examples:
+Question: Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
+<code>
+# Step 1: Calculate the total weight lifted with two 20-pound weights
+total_weight_20 = 2 * 20 * 12
+<end_of_step>
+# Step 2: Calculate the weight lifted per repetition with two 15-pound weights
+weight_per_rep_15 = 2 * 15
+<end_of_step>
+# Step 3: Calculate the number of repetitions needed to lift the same total weight with two 15-pound weights
+reps_needed = total_weight_20 / weight_per_rep_15
+<end_of_step>
+# Now print the final answer
+print(reps_needed)
+<end_of_code>
+<output>16.0 <end_of_output> <answer>From the result, we can see that Terrell must lift the 15-pound weights \boxed{16} times to lift the same total weight.
+<end_of_answer>,
+Question: Find the value of $x$ that satisfies $\frac{\sqrt{3x+5}}{\sqrt{6x+5}}=\frac{\sqrt{5}}{3}$. Express your answer as a common fraction.
+<code>
+from sympy import symbols, Eq, solve, sqrt
+# Define the variable x
+x = symbols(’x’)
+<end_of_step>
+# Define the equation
+equation = Eq(sqrt(3*x + 5) / sqrt(6*x + 5), sqrt(5) / 3)
+<end_of_step>
+# Solve the equation for x
+solution = solve(equation, x)
+<end_of_step>
+# Now print the final answer
+print(solution)
+<end_of_code>
+<output>[20/3]
+<end_of_output> <answer>From the result, we can see that the value of x is $\boxed{\frac{20}{3}}$
+<end_of_answer>
+MCTS Prompt for our policy SLM
+MCTS prompt for next step generation
+:
+<|user|>:
+{input question}
+<|assistant|>: Let’s think step by step and solve the problem with code.
+{trajectory of previous steps}
+SFT Prompt Example
+:
+<|user|>:
+How many numbers can you get by multiplying two or more distinct members of the set $\{1,2,3,5,11\}$ together?
+<|assistant|>: Let’s think step by step and solve the problem with code.
+<code>
+# Step 1: Create the set of numbers
+⬇
+numbers
+=
+{1,
+2,
+3,
+5,
+11}
+<end_of_step>
+# Step 2: Define a function to generate all possible products of two or more distinct members of the set
+⬇
+from
+itertools
+import
+combinations
+def
+generate_products
+(
+numbers
+):
+products
+=
+set
+()
+for
+r
+in
+range
+(2,
+len
+(
+numbers
+)
++
+1):
+#
+For
+combinations
+of
+2,
+3,
+4,
+and
+5
+elements
+for
+combo
+in
+combinations
+(
+numbers
+,
+r
+):
+product
+=
+1
+for
+num
+in
+combo
+:
+product
+*=
+num
+products
+.
+add
+(
+product
+)
+return
+products
+<end_of_step>
+# Step 3: Calculate the products
+⬇
+unique_products
+=
+generate_products
+(
+numbers
+)
+<end_of_step>
+# Step 4: Calculate the number of unique products
+⬇
+num_unique_products
+=
+len
+(
+unique_products
+)
+<end_of_step>
+# Now print the final answer
+⬇
+print
+(
+num_unique_products
+)
+<end_of_code>
+<output>15<end_of_output>
+<answer>From the result, we can see that there are
+15
+unique products.
+<end_of_answer>
+◄
+Feeling
+lucky?
+Conversion
+report
+Report
+an issue
+View original
+on arXiv
+►
\ No newline at end of file
diff --git a/research/notes/250109136-agentic-retrieval-augmented-generation-a-survey-on-agentic-rag.md b/research/notes/250109136-agentic-retrieval-augmented-generation-a-survey-on-agentic-rag.md
new file mode 100644
index 0000000000000000000000000000000000000000..fa9fcabb47acd0ab4c4b2421dfea5563cbe1bab6
--- /dev/null
+++ b/research/notes/250109136-agentic-retrieval-augmented-generation-a-survey-on-agentic-rag.md
@@ -0,0 +1,203 @@
+---
+title: '[2501.09136] Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG'
+id: 250109136-agentic-retrieval-augmented-generation-a-survey-on-agentic-rag
+tags:
+- deepread
+created: '2026-06-10T00:24:47.557837Z'
+source: https://arxiv.org/abs/2501.09136
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:24:47.557707Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2501.09136] Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG
+Computer Science > Artificial Intelligence
+arXiv:2501.09136
+(cs)
+[Submitted on 15 Jan 2025 (
+v1
+), last revised 1 Apr 2026 (this version, v4)]
+Title:
+Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG
+Authors:
+Aditi Singh
+,
+Abul Ehtesham
+,
+Saket Kumar
+,
+Tala Talaei Khoei
+,
+Athanasios V. Vasilakos
+View a PDF of the paper titled Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG, by Aditi Singh and 4 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Large Language Models (LLMs) have advanced artificial intelligence by enabling human-like text generation and natural language understanding. However, their reliance on static training data limits their ability to respond to dynamic, real-time queries, resulting in outdated or inaccurate outputs. Retrieval-Augmented Generation (RAG) has emerged as a solution, enhancing LLMs by integrating real-time data retrieval to provide contextually relevant and up-to-date responses. Despite its promise, traditional RAG systems are constrained by static workflows and lack the adaptability required for multi-step reasoning and complex task management. Agentic Retrieval-Augmented Generation (Agentic RAG) transcends these limitations by embedding autonomous AI agents into the RAG pipeline. These agents leverage agentic design patterns — reflection, planning, tool use, and multi-agent collaboration — to dynamically manage retrieval strategies, iteratively refine contextual understanding, and adapt workflows through operational structures ranging from sequential steps to adaptive collaboration. This integration enables Agentic RAG systems to deliver flexibility, scalability, and context-awareness across diverse applications. This paper presents an analytical survey of Agentic RAG systems. It traces the evolution of RAG paradigms, introduces a principled taxonomy of Agentic RAG architectures based on agent cardinality, control structure, autonomy, and knowledge representation, and provides a comparative analysis of design trade-offs across existing frameworks. The survey examines applications in healthcare, finance, education, and enterprise document processing, and distills practical lessons for system designers and practitioners. Finally, it identifies key open research challenges related to evaluation, coordination, memory management, efficiency, and governance, outlining directions for future research.
+Subjects:
+Artificial Intelligence (cs.AI)
+; Computation and Language (cs.CL); Information Retrieval (cs.IR)
+Cite as:
+arXiv:2501.09136
+[cs.AI]
+(or
+arXiv:2501.09136v4
+[cs.AI]
+for this version)
+https://doi.org/10.48550/arXiv.2501.09136
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Abul Ehtesham [
+view email
+]
+[v1]
+Wed, 15 Jan 2025 20:40:25 UTC (20,962 KB)
+[v2]
+Mon, 3 Feb 2025 04:01:36 UTC (22,453 KB)
+[v3]
+Tue, 4 Feb 2025 04:48:00 UTC (22,430 KB)
+[v4]
+Wed, 1 Apr 2026 15:51:06 UTC (13,996 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Agentic Retrieval-Augmented Generation: A Survey on Agentic RAG, by Aditi Singh and 4 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.AI
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-01
+Change to browse by:
+cs
+cs.CL
+cs.IR
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250109891-evolving-deeper-llm-thinking.md b/research/notes/250109891-evolving-deeper-llm-thinking.md
new file mode 100644
index 0000000000000000000000000000000000000000..7de7575973d75805c55cb6a083e71e8857aceda5
--- /dev/null
+++ b/research/notes/250109891-evolving-deeper-llm-thinking.md
@@ -0,0 +1,196 @@
+---
+title: '[2501.09891] Evolving Deeper LLM Thinking'
+id: 250109891-evolving-deeper-llm-thinking
+tags:
+- deepread
+created: '2026-06-10T00:24:58.674469Z'
+source: https://arxiv.org/abs/2501.09891
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:24:58.674344Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2501.09891] Evolving Deeper LLM Thinking
+Computer Science > Artificial Intelligence
+arXiv:2501.09891
+(cs)
+[Submitted on 17 Jan 2025]
+Title:
+Evolving Deeper LLM Thinking
+Authors:
+Kuang-Huei Lee
+,
+Ian Fischer
+,
+Yueh-Hua Wu
+,
+Dave Marwood
+,
+Shumeet Baluja
+,
+Dale Schuurmans
+,
+Xinyun Chen
+View a PDF of the paper titled Evolving Deeper LLM Thinking, by Kuang-Huei Lee and 6 other authors
+View PDF
+HTML (experimental)
+Abstract:
+We explore an evolutionary search strategy for scaling inference time compute in Large Language Models. The proposed approach, Mind Evolution, uses a language model to generate, recombine and refine candidate responses. The proposed approach avoids the need to formalize the underlying inference problem whenever a solution evaluator is available. Controlling for inference cost, we find that Mind Evolution significantly outperforms other inference strategies such as Best-of-N and Sequential Revision in natural language planning tasks. In the TravelPlanner and Natural Plan benchmarks, Mind Evolution solves more than 98% of the problem instances using Gemini 1.5 Pro without the use of a formal solver.
+Subjects:
+Artificial Intelligence (cs.AI)
+Cite as:
+arXiv:2501.09891
+[cs.AI]
+(or
+arXiv:2501.09891v1
+[cs.AI]
+for this version)
+https://doi.org/10.48550/arXiv.2501.09891
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Dale Schuurmans [
+view email
+]
+[v1]
+Fri, 17 Jan 2025 00:41:44 UTC (3,183 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Evolving Deeper LLM Thinking, by Kuang-Huei Lee and 6 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.AI
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-01
+Change to browse by:
+cs
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250112599-kimi-k15-scaling-reinforcement-learning-with-llms.md b/research/notes/250112599-kimi-k15-scaling-reinforcement-learning-with-llms.md
new file mode 100644
index 0000000000000000000000000000000000000000..ed24746bf2c90b5eecf0bc53b02df7665e84ad5f
--- /dev/null
+++ b/research/notes/250112599-kimi-k15-scaling-reinforcement-learning-with-llms.md
@@ -0,0 +1,386 @@
+---
+title: '[2501.12599] Kimi k1.5: Scaling Reinforcement Learning with LLMs'
+id: 250112599-kimi-k15-scaling-reinforcement-learning-with-llms
+tags:
+- deepread
+created: '2026-06-10T00:24:53.655467Z'
+source: https://arxiv.org/abs/2501.12599
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:24:53.655188Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2501.12599] Kimi k1.5: Scaling Reinforcement Learning with LLMs
+Computer Science > Artificial Intelligence
+arXiv:2501.12599
+(cs)
+[Submitted on 22 Jan 2025 (
+v1
+), last revised 3 Jun 2025 (this version, v4)]
+Title:
+Kimi k1.5: Scaling Reinforcement Learning with LLMs
+Authors:
+Kimi Team
+,
+Angang Du
+,
+Bofei Gao
+,
+Bowei Xing
+,
+Changjiu Jiang
+,
+Cheng Chen
+,
+Cheng Li
+,
+Chenjun Xiao
+,
+Chenzhuang Du
+,
+Chonghua Liao
+,
+Chuning Tang
+,
+Congcong Wang
+,
+Dehao Zhang
+,
+Enming Yuan
+,
+Enzhe Lu
+,
+Fengxiang Tang
+,
+Flood Sung
+,
+Guangda Wei
+,
+Guokun Lai
+,
+Haiqing Guo
+,
+Han Zhu
+,
+Hao Ding
+,
+Hao Hu
+,
+Hao Yang
+,
+Hao Zhang
+,
+Haotian Yao
+,
+Haotian Zhao
+,
+Haoyu Lu
+,
+Haoze Li
+,
+Haozhen Yu
+,
+Hongcheng Gao
+,
+Huabin Zheng
+,
+Huan Yuan
+,
+Jia Chen
+,
+Jianhang Guo
+,
+Jianlin Su
+,
+Jianzhou Wang
+,
+Jie Zhao
+,
+Jin Zhang
+,
+Jingyuan Liu
+,
+Junjie Yan
+,
+Junyan Wu
+,
+Lidong Shi
+,
+Ling Ye
+,
+Longhui Yu
+,
+Mengnan Dong
+,
+Neo Zhang
+,
+Ningchen Ma
+,
+Qiwei Pan
+,
+Qucheng Gong
+,
+Shaowei Liu
+,
+Shengling Ma
+,
+Shupeng Wei
+,
+Sihan Cao
+,
+Siying Huang
+,
+Tao Jiang
+,
+Weihao Gao
+,
+Weimin Xiong
+,
+Weiran He
+,
+Weixiao Huang
+,
+Weixin Xu
+,
+Wenhao Wu
+,
+Wenyang He
+,
+Xianghui Wei
+,
+Xianqing Jia
+,
+Xingzhe Wu
+,
+Xinran Xu
+,
+Xinxing Zu
+,
+Xinyu Zhou
+,
+Xuehai Pan
+,
+Y. Charles
+,
+Yang Li
+,
+Yangyang Hu
+,
+Yangyang Liu
+,
+Yanru Chen
+,
+Yejie Wang
+,
+Yibo Liu
+,
+Yidao Qin
+,
+Yifeng Liu
+,
+Ying Yang
+,
+Yiping Bao
+,
+Yulun Du
+,
+Yuxin Wu
+,
+Yuzhi Wang
+,
+Zaida Zhou
+,
+Zhaoji Wang
+,
+Zhaowei Li
+,
+Zhen Zhu
+,
+Zheng Zhang
+,
+Zhexu Wang
+,
+Zhilin Yang
+,
+Zhiqi Huang
+,
+Zihao Huang
+,
+Ziyao Xu
+,
+Zonghan Yang
+,
+Zongyu Lin
+View a PDF of the paper titled Kimi k1.5: Scaling Reinforcement Learning with LLMs, by Kimi Team and 95 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Language model pretraining with next token prediction has proved effective for scaling compute but is limited to the amount of available training data. Scaling reinforcement learning (RL) unlocks a new axis for the continued improvement of artificial intelligence, with the promise that large language models (LLMs) can scale their training data by learning to explore with rewards. However, prior published work has not produced competitive results. In light of this, we report on the training practice of Kimi k1.5, our latest multi-modal LLM trained with RL, including its RL training techniques, multi-modal data recipes, and infrastructure optimization. Long context scaling and improved policy optimization methods are key ingredients of our approach, which establishes a simplistic, effective RL framework without relying on more complex techniques such as Monte Carlo tree search, value functions, and process reward models. Notably, our system achieves state-of-the-art reasoning performance across multiple benchmarks and modalities -- e.g., 77.5 on AIME, 96.2 on MATH 500, 94-th percentile on Codeforces, 74.9 on MathVista -- matching OpenAI's o1. Moreover, we present effective long2short methods that use long-CoT techniques to improve short-CoT models, yielding state-of-the-art short-CoT reasoning results -- e.g., 60.8 on AIME, 94.6 on MATH500, 47.3 on LiveCodeBench -- outperforming existing short-CoT models such as GPT-4o and Claude Sonnet 3.5 by a large margin (up to +550%).
+Comments:
+25 pages
+Subjects:
+Artificial Intelligence (cs.AI)
+; Machine Learning (cs.LG)
+Cite as:
+arXiv:2501.12599
+[cs.AI]
+(or
+arXiv:2501.12599v4
+[cs.AI]
+for this version)
+https://doi.org/10.48550/arXiv.2501.12599
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Flood Sung [
+view email
+]
+[v1]
+Wed, 22 Jan 2025 02:48:14 UTC (614 KB)
+[v2]
+Wed, 5 Mar 2025 02:16:32 UTC (614 KB)
+[v3]
+Wed, 28 May 2025 03:57:30 UTC (614 KB)
+[v4]
+Tue, 3 Jun 2025 02:14:54 UTC (603 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Kimi k1.5: Scaling Reinforcement Learning with LLMs, by Kimi Team and 95 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.AI
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-01
+Change to browse by:
+cs
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250118512-streaming-diloco-with-overlapping-communication-towards-a-distributed.md b/research/notes/250118512-streaming-diloco-with-overlapping-communication-towards-a-distributed.md
new file mode 100644
index 0000000000000000000000000000000000000000..026758bd2ae72d030117a4ff8ea83ff8ca218f65
--- /dev/null
+++ b/research/notes/250118512-streaming-diloco-with-overlapping-communication-towards-a-distributed.md
@@ -0,0 +1,206 @@
+---
+title: '[2501.18512] Streaming DiLoCo with overlapping communication: Towards a Distributed
+  Free Lunch'
+id: 250118512-streaming-diloco-with-overlapping-communication-towards-a-distributed
+tags:
+- deepread
+created: '2026-06-10T00:30:21.211856Z'
+source: https://arxiv.org/abs/2501.18512
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:21.211709Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2501.18512] Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch
+Computer Science > Computation and Language
+arXiv:2501.18512
+(cs)
+[Submitted on 30 Jan 2025]
+Title:
+Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch
+Authors:
+Arthur Douillard
+,
+Yanislav Donchev
+,
+Keith Rush
+,
+Satyen Kale
+,
+Zachary Charles
+,
+Zachary Garrett
+,
+Gabriel Teston
+,
+Dave Lacey
+,
+Ross McIlroy
+,
+Jiajun Shen
+,
+Alexandre Ramé
+,
+Arthur Szlam
+,
+Marc'Aurelio Ranzato
+,
+Paul Barham
+View a PDF of the paper titled Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch, by Arthur Douillard and Yanislav Donchev and Keith Rush and Satyen Kale and Zachary Charles and Zachary Garrett and Gabriel Teston and Dave Lacey and Ross McIlroy and Jiajun Shen and Alexandre Ramé and Arthur Szlam and Marc'Aurelio Ranzato and Paul Barham
+View PDF
+HTML (experimental)
+Abstract:
+Training of large language models (LLMs) is typically distributed across a large number of accelerators to reduce training time. Since internal states and parameter gradients need to be exchanged at each and every single gradient step, all devices need to be co-located using low-latency high-bandwidth communication links to support the required high volume of exchanged bits. Recently, distributed algorithms like DiLoCo have relaxed such co-location constraint: accelerators can be grouped into "workers", where synchronizations between workers only occur infrequently. This in turn means that workers can afford being connected by lower bandwidth communication links without affecting learning quality. However, in these methods, communication across workers still requires the same peak bandwidth as before, as the synchronizations require all parameters to be exchanged across all workers. In this paper, we improve DiLoCo in three ways. First, we synchronize only subsets of parameters in sequence, rather than all at once, which greatly reduces peak bandwidth. Second, we allow workers to continue training while synchronizing, which decreases wall clock time. Third, we quantize the data exchanged by workers, which further reduces bandwidth across workers. By properly combining these modifications, we show experimentally that we can distribute training of billion-scale parameters and reach similar quality as before, but reducing required bandwidth by two orders of magnitude.
+Subjects:
+Computation and Language (cs.CL)
+Cite as:
+arXiv:2501.18512
+[cs.CL]
+(or
+arXiv:2501.18512v1
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2501.18512
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Arthur Douillard [
+view email
+]
+[v1]
+Thu, 30 Jan 2025 17:23:50 UTC (3,278 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch, by Arthur Douillard and Yanislav Donchev and Keith Rush and Satyen Kale and Zachary Charles and Zachary Garrett and Gabriel Teston and Dave Lacey and Ross McIlroy and Jiajun Shen and Alexandre Ramé and Arthur Szlam and Marc'Aurelio Ranzato and Paul Barham
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-01
+Change to browse by:
+cs
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250118639-a-comprehensive-survey-of-the-lean-4-theorem-prover-architecture-appli.md b/research/notes/250118639-a-comprehensive-survey-of-the-lean-4-theorem-prover-architecture-appli.md
new file mode 100644
index 0000000000000000000000000000000000000000..e14838a352d8126b4f6d4560de606bc94f63d325
--- /dev/null
+++ b/research/notes/250118639-a-comprehensive-survey-of-the-lean-4-theorem-prover-architecture-appli.md
@@ -0,0 +1,179 @@
+---
+title: '[2501.18639] A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture,
+  Applications, and Advances'
+id: 250118639-a-comprehensive-survey-of-the-lean-4-theorem-prover-architecture-appli
+tags:
+- deepread
+created: '2026-06-10T00:25:14.929249Z'
+source: https://arxiv.org/abs/2501.18639
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:25:14.929112Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2501.18639] A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, Applications, and Advances
+Computer Science > Logic in Computer Science
+arXiv:2501.18639
+(cs)
+[Submitted on 28 Jan 2025]
+Title:
+A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, Applications, and Advances
+Authors:
+Xichen Tang
+View a PDF of the paper titled A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, Applications, and Advances, by Xichen Tang
+View PDF
+Abstract:
+This comprehensive survey examines Lean 4, a state-of-the-art interactive theorem prover and functional programming language. We analyze its architectural design, type system, metaprogramming capabilities, and practical applications in formal verification and mathematics. Through detailed comparisons with other proof assistants and extensive case studies, we demonstrate Lean 4's unique advantages in proof automation, performance, and usability. The paper also explores recent developments in its ecosystem, including libraries, tools, and educational applications, providing insights into its growing impact on formal methods and mathematical formalization.
+Subjects:
+Logic in Computer Science (cs.LO)
+; Programming Languages (cs.PL)
+Cite as:
+arXiv:2501.18639
+[cs.LO]
+(or
+arXiv:2501.18639v1
+[cs.LO]
+for this version)
+https://doi.org/10.48550/arXiv.2501.18639
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Xichen Tang [
+view email
+]
+[v1]
+Tue, 28 Jan 2025 17:15:54 UTC (2,729 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled A Comprehensive Survey of the Lean 4 Theorem Prover: Architecture, Applications, and Advances, by Xichen Tang
+View PDF
+view license
+Current browse context:
+cs.LO
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-01
+Change to browse by:
+cs
+cs.PL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250202047-amasquad-a-benchmark-for-amharic-extractive-question-answering.md b/research/notes/250202047-amasquad-a-benchmark-for-amharic-extractive-question-answering.md
new file mode 100644
index 0000000000000000000000000000000000000000..7d8254e71f933483236ebbec2430872f47677dbc
--- /dev/null
+++ b/research/notes/250202047-amasquad-a-benchmark-for-amharic-extractive-question-answering.md
@@ -0,0 +1,183 @@
+---
+title: '[2502.02047] AmaSQuAD: A Benchmark for Amharic Extractive Question Answering'
+id: 250202047-amasquad-a-benchmark-for-amharic-extractive-question-answering
+tags:
+- deepread
+created: '2026-06-10T00:24:12.341628Z'
+source: https://arxiv.org/abs/2502.02047
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:24:12.341471Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2502.02047] AmaSQuAD: A Benchmark for Amharic Extractive Question Answering
+Computer Science > Computation and Language
+arXiv:2502.02047
+(cs)
+[Submitted on 4 Feb 2025]
+Title:
+AmaSQuAD: A Benchmark for Amharic Extractive Question Answering
+Authors:
+Nebiyou Daniel Hailemariam
+,
+Blessed Guda
+,
+Tsegazeab Tefferi
+View a PDF of the paper titled AmaSQuAD: A Benchmark for Amharic Extractive Question Answering, by Nebiyou Daniel Hailemariam and 2 other authors
+View PDF
+HTML (experimental)
+Abstract:
+This research presents a novel framework for translating extractive question-answering datasets into low-resource languages, as demonstrated by the creation of the AmaSQuAD dataset, a translation of SQuAD 2.0 into Amharic. The methodology addresses challenges related to misalignment between translated questions and answers, as well as the presence of multiple answer instances in the translated context. For this purpose, we used cosine similarity utilizing embeddings from a fine-tuned BERT-based model for Amharic and Longest Common Subsequence (LCS). Additionally, we fine-tune the XLM-R model on the AmaSQuAD synthetic dataset for Amharic Question-Answering. The results show an improvement in baseline performance, with the fine-tuned model achieving an increase in the F1 score from 36.55% to 44.41% and 50.01% to 57.5% on the AmaSQuAD development dataset. Moreover, the model demonstrates improvement on the human-curated AmQA dataset, increasing the F1 score from 67.80% to 68.80% and the exact match score from 52.50% to 52.66%. The AmaSQuAD dataset is publicly available Datasets
+Subjects:
+Computation and Language (cs.CL)
+Cite as:
+arXiv:2502.02047
+[cs.CL]
+(or
+arXiv:2502.02047v1
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2502.02047
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Blessed Guda [
+view email
+]
+[v1]
+Tue, 4 Feb 2025 06:27:39 UTC (778 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled AmaSQuAD: A Benchmark for Amharic Extractive Question Answering, by Nebiyou Daniel Hailemariam and 2 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-02
+Change to browse by:
+cs
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250210915-first-passage-times-with-fast-immigration.md b/research/notes/250210915-first-passage-times-with-fast-immigration.md
new file mode 100644
index 0000000000000000000000000000000000000000..84fa2f8790935e24d7af9191ce27d6f1809eb9ca
--- /dev/null
+++ b/research/notes/250210915-first-passage-times-with-fast-immigration.md
@@ -0,0 +1,186 @@
+---
+title: '[2502.10915] First passage times with fast immigration'
+id: 250210915-first-passage-times-with-fast-immigration
+tags:
+- deepread
+created: '2026-06-10T00:41:13.957308Z'
+source: https://arxiv.org/abs/2502.10915
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:41:13.957158Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2502.10915] First passage times with fast immigration
+Mathematics > Probability
+arXiv:2502.10915
+(math)
+[Submitted on 15 Feb 2025]
+Title:
+First passage times with fast immigration
+Authors:
+Hwai-Ray Tung
+,
+Sean D Lawley
+View a PDF of the paper titled First passage times with fast immigration, by Hwai-Ray Tung and Sean D Lawley
+View PDF
+HTML (experimental)
+Abstract:
+Many scientific questions can be framed as asking for a first passage time (FPT), which generically describes the time it takes a random "searcher" to find a "target." The important timescale in a variety of biophysical systems is the time it takes the fastest searcher(s) to find a target out of many searchers. Previous work on such fastest FPTs assumes that all searchers are initially present in the domain, which makes the problem amenable to extreme value theory. In this paper, we consider an alternative model in which searchers progressively enter the domain at a constant "immigration" rate. In the fast immigration rate limit, we determine the probability distribution and moments of the $k$-th fastest FPT. Our rigorous theory applies to many models of stochastic motion, including random walks on discrete networks and diffusion on continuous state spaces. Mathematically, our analysis involves studying the extrema of an infinite sequence of random variables which are both not independent and not identically distributed. Our results constitute a rare instance in which extreme value statistics can be determined exactly for strongly correlated random variables.
+Comments:
+25 pages, 1 figure
+Subjects:
+Probability (math.PR)
+MSC
+classes:
+60G70, 92B99, 60J60
+Cite as:
+arXiv:2502.10915
+[math.PR]
+(or
+arXiv:2502.10915v1
+[math.PR]
+for this version)
+https://doi.org/10.48550/arXiv.2502.10915
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Sean Lawley [
+view email
+]
+[v1]
+Sat, 15 Feb 2025 21:52:42 UTC (6,228 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled First passage times with fast immigration, by Hwai-Ray Tung and Sean D Lawley
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+math.PR
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-02
+Change to browse by:
+math
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250212996-eager-updates-for-overlapped-communication-and-computation-in-diloco.md b/research/notes/250212996-eager-updates-for-overlapped-communication-and-computation-in-diloco.md
new file mode 100644
index 0000000000000000000000000000000000000000..4a2bd5dac4cb648310ff364af65fe303b8cf985f
--- /dev/null
+++ b/research/notes/250212996-eager-updates-for-overlapped-communication-and-computation-in-diloco.md
@@ -0,0 +1,187 @@
+---
+title: '[2502.12996] Eager Updates For Overlapped Communication and Computation in
+  DiLoCo'
+id: 250212996-eager-updates-for-overlapped-communication-and-computation-in-diloco
+tags:
+- deepread
+created: '2026-06-10T00:35:23.460079Z'
+source: https://arxiv.org/abs/2502.12996
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:35:23.459912Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2502.12996] Eager Updates For Overlapped Communication and Computation in DiLoCo
+Computer Science > Computation and Language
+arXiv:2502.12996
+(cs)
+[Submitted on 18 Feb 2025]
+Title:
+Eager Updates For Overlapped Communication and Computation in DiLoCo
+Authors:
+Satyen Kale
+,
+Arthur Douillard
+,
+Yanislav Donchev
+View a PDF of the paper titled Eager Updates For Overlapped Communication and Computation in DiLoCo, by Satyen Kale and Arthur Douillard and Yanislav Donchev
+View PDF
+HTML (experimental)
+Abstract:
+Distributed optimization methods such as DiLoCo have been shown to be effective in training very large models across multiple distributed workers, such as datacenters. These methods split updates into two parts: an inner optimization phase, where the workers independently execute multiple optimization steps on their own local data, and an outer optimization step, where the inner updates are synchronized. While such approaches require orders of magnitude less communication than standard data-parallel training, in settings where the workers are datacenters, even the limited communication requirements of these approaches can still cause significant slow downs due to the blocking necessary at each outer optimization step. In this paper, we investigate techniques to mitigate this issue by overlapping communication with computation in a manner that allows the outer optimization step to fully overlap with the inner optimization phase. We show that a particular variant, dubbed eager updates, provides competitive performance with standard DiLoCo in settings with low bandwidth between workers.
+Comments:
+arXiv admin note: text overlap with
+arXiv:2501.18512
+Subjects:
+Computation and Language (cs.CL)
+Cite as:
+arXiv:2502.12996
+[cs.CL]
+(or
+arXiv:2502.12996v1
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2502.12996
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Arthur Douillard [
+view email
+]
+[v1]
+Tue, 18 Feb 2025 16:16:14 UTC (1,257 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Eager Updates For Overlapped Communication and Computation in DiLoCo, by Satyen Kale and Arthur Douillard and Yanislav Donchev
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-02
+Change to browse by:
+cs
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250314476-dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md b/research/notes/250314476-dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md
new file mode 100644
index 0000000000000000000000000000000000000000..71e5619fd9e5debae2cb95c5088d3a09958fc334
--- /dev/null
+++ b/research/notes/250314476-dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md
@@ -0,0 +1,261 @@
+---
+title: '[2503.14476] DAPO: An Open-Source LLM Reinforcement Learning System at Scale'
+id: 250314476-dapo-an-open-source-llm-reinforcement-learning-system-at-scale
+tags:
+- deepread
+created: '2026-06-09T23:20:59.609829Z'
+source: https://arxiv.org/abs/2503.14476
+source_domain: arxiv.org
+fetched_at: '2026-06-09T23:20:59.609711Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2503.14476] DAPO: An Open-Source LLM Reinforcement Learning System at Scale
+Computer Science > Machine Learning
+arXiv:2503.14476
+(cs)
+[Submitted on 18 Mar 2025 (
+v1
+), last revised 20 May 2025 (this version, v2)]
+Title:
+DAPO: An Open-Source LLM Reinforcement Learning System at Scale
+Authors:
+Qiying Yu
+,
+Zheng Zhang
+,
+Ruofei Zhu
+,
+Yufeng Yuan
+,
+Xiaochen Zuo
+,
+Yu Yue
+,
+Weinan Dai
+,
+Tiantian Fan
+,
+Gaohong Liu
+,
+Lingjun Liu
+,
+Xin Liu
+,
+Haibin Lin
+,
+Zhiqi Lin
+,
+Bole Ma
+,
+Guangming Sheng
+,
+Yuxuan Tong
+,
+Chi Zhang
+,
+Mofan Zhang
+,
+Wang Zhang
+,
+Hang Zhu
+,
+Jinhua Zhu
+,
+Jiaze Chen
+,
+Jiangjie Chen
+,
+Chengyi Wang
+,
+Hongli Yu
+,
+Yuxuan Song
+,
+Xiangpeng Wei
+,
+Hao Zhou
+,
+Jingjing Liu
+,
+Wei-Ying Ma
+,
+Ya-Qin Zhang
+,
+Lin Yan
+,
+Mu Qiao
+,
+Yonghui Wu
+,
+Mingxuan Wang
+View a PDF of the paper titled DAPO: An Open-Source LLM Reinforcement Learning System at Scale, by Qiying Yu and 34 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Inference scaling empowers LLMs with unprecedented reasoning ability, with reinforcement learning as the core technique to elicit complex reasoning. However, key technical details of state-of-the-art reasoning LLMs are concealed (such as in OpenAI o1 blog and DeepSeek R1 technical report), thus the community still struggles to reproduce their RL training results. We propose the $\textbf{D}$ecoupled Clip and $\textbf{D}$ynamic s$\textbf{A}$mpling $\textbf{P}$olicy $\textbf{O}$ptimization ($\textbf{DAPO}$) algorithm, and fully open-source a state-of-the-art large-scale RL system that achieves 50 points on AIME 2024 using Qwen2.5-32B base model. Unlike previous works that withhold training details, we introduce four key techniques of our algorithm that make large-scale LLM RL a success. In addition, we open-source our training code, which is built on the verl framework, along with a carefully curated and processed dataset. These components of our open-source system enhance reproducibility and support future research in large-scale LLM RL.
+Comments:
+Project Page:
+this https URL
+Subjects:
+Machine Learning (cs.LG)
+; Computation and Language (cs.CL)
+Cite as:
+arXiv:2503.14476
+[cs.LG]
+(or
+arXiv:2503.14476v2
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2503.14476
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Qiying Yu [
+view email
+]
+[v1]
+Tue, 18 Mar 2025 17:49:06 UTC (4,369 KB)
+[v2]
+Tue, 20 May 2025 01:37:34 UTC (4,369 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled DAPO: An Open-Source LLM Reinforcement Learning System at Scale, by Qiying Yu and 34 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-03
+Change to browse by:
+cs
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250320783-understanding-r1-zero-like-training-a-critical-perspective.md b/research/notes/250320783-understanding-r1-zero-like-training-a-critical-perspective.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad82ac91352c5cad8ba49239dc61530b1f423c78
--- /dev/null
+++ b/research/notes/250320783-understanding-r1-zero-like-training-a-critical-perspective.md
@@ -0,0 +1,212 @@
+---
+title: '[2503.20783] Understanding R1-Zero-Like Training: A Critical Perspective'
+id: 250320783-understanding-r1-zero-like-training-a-critical-perspective
+tags:
+- deepread
+created: '2026-06-09T23:20:56.927541Z'
+source: https://arxiv.org/abs/2503.20783
+source_domain: arxiv.org
+fetched_at: '2026-06-09T23:20:56.927395Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2503.20783] Understanding R1-Zero-Like Training: A Critical Perspective
+Computer Science > Machine Learning
+arXiv:2503.20783
+(cs)
+[Submitted on 26 Mar 2025 (
+v1
+), last revised 6 Oct 2025 (this version, v2)]
+Title:
+Understanding R1-Zero-Like Training: A Critical Perspective
+Authors:
+Zichen Liu
+,
+Changyu Chen
+,
+Wenjun Li
+,
+Penghui Qi
+,
+Tianyu Pang
+,
+Chao Du
+,
+Wee Sun Lee
+,
+Min Lin
+View a PDF of the paper titled Understanding R1-Zero-Like Training: A Critical Perspective, by Zichen Liu and 7 other authors
+View PDF
+HTML (experimental)
+Abstract:
+DeepSeek-R1-Zero has shown that reinforcement learning (RL) at scale can directly enhance the reasoning capabilities of LLMs without supervised fine-tuning. In this work, we critically examine R1-Zero-like training by analyzing its two core components: base models and RL. We investigate a wide range of base models, including DeepSeek-V3-Base, to understand how pretraining characteristics influence RL performance. Our analysis reveals that DeepSeek-V3-Base already exhibits ''Aha moment'', while Qwen2.5 base models demonstrate strong reasoning capabilities even without prompt templates, suggesting potential pretraining biases. Additionally, we identify an optimization bias in Group Relative Policy Optimization (GRPO), which artificially increases response length (especially for incorrect outputs) during training. To address this, we introduce Dr. GRPO, an unbiased optimization method that improves token efficiency while maintaining reasoning performance. Leveraging these insights, we present a minimalist R1-Zero recipe that achieves 43.3% accuracy on AIME 2024 with a 7B base model, establishing a new state-of-the-art. Our code is available at
+this https URL
+.
+Subjects:
+Machine Learning (cs.LG)
+; Artificial Intelligence (cs.AI); Computation and Language (cs.CL)
+Cite as:
+arXiv:2503.20783
+[cs.LG]
+(or
+arXiv:2503.20783v2
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2503.20783
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Zichen Liu [
+view email
+]
+[v1]
+Wed, 26 Mar 2025 17:59:14 UTC (2,551 KB)
+[v2]
+Mon, 6 Oct 2025 09:30:03 UTC (1,366 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Understanding R1-Zero-Like Training: A Critical Perspective, by Zichen Liu and 7 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-03
+Change to browse by:
+cs
+cs.AI
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250407164-r2e-gym-procedural-environments-and-hybrid-verifiers-for-scaling-open.md b/research/notes/250407164-r2e-gym-procedural-environments-and-hybrid-verifiers-for-scaling-open.md
new file mode 100644
index 0000000000000000000000000000000000000000..817c39210d5046c9fdffb0d0883e87a171f7acc8
--- /dev/null
+++ b/research/notes/250407164-r2e-gym-procedural-environments-and-hybrid-verifiers-for-scaling-open.md
@@ -0,0 +1,196 @@
+---
+title: '[2504.07164] R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling
+  Open-Weights SWE Agents'
+id: 250407164-r2e-gym-procedural-environments-and-hybrid-verifiers-for-scaling-open
+tags:
+- deepread
+created: '2026-06-10T00:23:07.985532Z'
+source: https://arxiv.org/abs/2504.07164
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:07.985299Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2504.07164] R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents
+Computer Science > Software Engineering
+arXiv:2504.07164
+(cs)
+[Submitted on 9 Apr 2025]
+Title:
+R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents
+Authors:
+Naman Jain
+,
+Jaskirat Singh
+,
+Manish Shetty
+,
+Liang Zheng
+,
+Koushik Sen
+,
+Ion Stoica
+View a PDF of the paper titled R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents, by Naman Jain and 5 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Improving open-source models on real-world SWE tasks (solving GITHUB issues) faces two key challenges: 1) scalable curation of execution environments to train these models, and, 2) optimal scaling of test-time compute. We introduce AgentGym, the largest procedurally-curated executable gym environment for training real-world SWE-agents, consisting of more than 8.7K tasks. AgentGym is powered by two main contributions: 1) SYNGEN: a synthetic data curation recipe that enables scalable curation of executable environments using test-generation and back-translation directly from commits, thereby reducing reliance on human-written issues or unit tests. We show that this enables more scalable training leading to pass@1 performance of 34.4% on SWE-Bench Verified benchmark with our 32B model. 2) Hybrid Test-time Scaling: we provide an in-depth analysis of two test-time scaling axes; execution-based and execution-free verifiers, demonstrating that they exhibit complementary strengths and limitations. Test-based verifiers suffer from low distinguishability, while execution-free verifiers are biased and often rely on stylistic features. Surprisingly, we find that while each approach individually saturates around 42-43%, significantly higher gains can be obtained by leveraging their complementary strengths. Overall, our approach achieves 51% on the SWE-Bench Verified benchmark, reflecting a new state-of-the-art for open-weight SWE-agents and for the first time showing competitive performance with proprietary models such as o1, o1-preview and sonnet-3.5-v2 (with tools). We will open-source our environments, models, and agent trajectories.
+Comments:
+Website:
+this https URL
+Subjects:
+Software Engineering (cs.SE)
+; Computation and Language (cs.CL); Machine Learning (cs.LG)
+Cite as:
+arXiv:2504.07164
+[cs.SE]
+(or
+arXiv:2504.07164v1
+[cs.SE]
+for this version)
+https://doi.org/10.48550/arXiv.2504.07164
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Naman Jain [
+view email
+]
+[v1]
+Wed, 9 Apr 2025 17:55:19 UTC (14,303 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents, by Naman Jain and 5 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.SE
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-04
+Change to browse by:
+cs
+cs.CL
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250421798-swe-smith-scaling-data-for-software-engineering-agents.md b/research/notes/250421798-swe-smith-scaling-data-for-software-engineering-agents.md
new file mode 100644
index 0000000000000000000000000000000000000000..2fe3df9c277804fcdbddf281eafad0802d1854d0
--- /dev/null
+++ b/research/notes/250421798-swe-smith-scaling-data-for-software-engineering-agents.md
@@ -0,0 +1,209 @@
+---
+title: '[2504.21798] SWE-smith: Scaling Data for Software Engineering Agents'
+id: 250421798-swe-smith-scaling-data-for-software-engineering-agents
+tags:
+- deepread
+created: '2026-06-10T00:22:56.761688Z'
+source: https://arxiv.org/abs/2504.21798
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:22:56.761560Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2504.21798] SWE-smith: Scaling Data for Software Engineering Agents
+Computer Science > Software Engineering
+arXiv:2504.21798
+(cs)
+[Submitted on 30 Apr 2025 (
+v1
+), last revised 21 May 2025 (this version, v2)]
+Title:
+SWE-smith: Scaling Data for Software Engineering Agents
+Authors:
+John Yang
+,
+Kilian Lieret
+,
+Carlos E. Jimenez
+,
+Alexander Wettig
+,
+Kabir Khandpur
+,
+Yanzhe Zhang
+,
+Binyuan Hui
+,
+Ofir Press
+,
+Ludwig Schmidt
+,
+Diyi Yang
+View a PDF of the paper titled SWE-smith: Scaling Data for Software Engineering Agents, by John Yang and 9 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Despite recent progress in Language Models (LMs) for software engineering, collecting training data remains a significant pain point. Existing datasets are small, with at most 1,000s of training instances from 11 or fewer GitHub repositories. The procedures to curate such datasets are often complex, necessitating hundreds of hours of human labor; companion execution environments also take up several terabytes of storage, severely limiting their scalability and usability. To address this pain point, we introduce SWE-smith, a novel pipeline for generating software engineering training data at scale. Given any Python codebase, SWE-smith constructs a corresponding execution environment, then automatically synthesizes 100s to 1,000s of task instances that break existing test(s) in the codebase. Using SWE-smith, we create a dataset of 50k instances sourced from 128 GitHub repositories, an order of magnitude larger than all previous works. We train SWE-agent-LM-32B, achieving 40.2% Pass@1 resolve rate on the SWE-bench Verified benchmark, state of the art among open source models. We open source SWE-smith (collection procedure, task instances, trajectories, models) to lower the barrier of entry for research in LM systems for automated software engineering. All assets available at
+https://swesmith.com
+.
+Comments:
+All assets available at
+https://swesmith.com
+Subjects:
+Software Engineering (cs.SE)
+; Artificial Intelligence (cs.AI); Computation and Language (cs.CL)
+Cite as:
+arXiv:2504.21798
+[cs.SE]
+(or
+arXiv:2504.21798v2
+[cs.SE]
+for this version)
+https://doi.org/10.48550/arXiv.2504.21798
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: John Yang B [
+view email
+]
+[v1]
+Wed, 30 Apr 2025 16:56:06 UTC (2,670 KB)
+[v2]
+Wed, 21 May 2025 17:21:45 UTC (2,691 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled SWE-smith: Scaling Data for Software Engineering Agents, by John Yang and 9 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.SE
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-04
+Change to browse by:
+cs
+cs.AI
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250613585-minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attent.md b/research/notes/250613585-minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attent.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8e713af37963f190b5ef4771167db4b4cc4758c
--- /dev/null
+++ b/research/notes/250613585-minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attent.md
@@ -0,0 +1,446 @@
+---
+title: '[2506.13585] MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning
+  Attention'
+id: 250613585-minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attent
+tags:
+- deepread
+created: '2026-06-09T23:28:26.873770Z'
+source: https://arxiv.org/abs/2506.13585
+source_domain: arxiv.org
+fetched_at: '2026-06-09T23:28:26.873612Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2506.13585] MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention
+Computer Science > Computation and Language
+arXiv:2506.13585
+(cs)
+[Submitted on 16 Jun 2025]
+Title:
+MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention
+Authors:
+MiniMax
+:
+Aili Chen
+,
+Aonian Li
+,
+Bangwei Gong
+,
+Binyang Jiang
+,
+Bo Fei
+,
+Bo Yang
+,
+Boji Shan
+,
+Changqing Yu
+,
+Chao Wang
+,
+Cheng Zhu
+,
+Chengjun Xiao
+,
+Chengyu Du
+,
+Chi Zhang
+,
+Chu Qiao
+,
+Chunhao Zhang
+,
+Chunhui Du
+,
+Congchao Guo
+,
+Da Chen
+,
+Deming Ding
+,
+Dianjun Sun
+,
+Dong Li
+,
+Enwei Jiao
+,
+Haigang Zhou
+,
+Haimo Zhang
+,
+Han Ding
+,
+Haohai Sun
+,
+Haoyu Feng
+,
+Huaiguang Cai
+,
+Haichao Zhu
+,
+Jian Sun
+,
+Jiaqi Zhuang
+,
+Jiaren Cai
+,
+Jiayuan Song
+,
+Jin Zhu
+,
+Jingyang Li
+,
+Jinhao Tian
+,
+Jinli Liu
+,
+Junhao Xu
+,
+Junjie Yan
+,
+Junteng Liu
+,
+Junxian He
+,
+Kaiyi Feng
+,
+Ke Yang
+,
+Kecheng Xiao
+,
+Le Han
+,
+Leyang Wang
+,
+Lianfei Yu
+,
+Liheng Feng
+,
+Lin Li
+,
+Lin Zheng
+,
+Linge Du
+,
+Lingyu Yang
+,
+Lunbin Zeng
+,
+Minghui Yu
+,
+Mingliang Tao
+,
+Mingyuan Chi
+,
+Mozhi Zhang
+,
+Mujie Lin
+,
+Nan Hu
+,
+Nongyu Di
+,
+Peng Gao
+,
+Pengfei Li
+,
+Pengyu Zhao
+,
+Qibing Ren
+,
+Qidi Xu
+,
+Qile Li
+,
+Qin Wang
+,
+Rong Tian
+,
+Ruitao Leng
+,
+Shaoxiang Chen
+,
+Shaoyu Chen
+,
+Shengmin Shi
+,
+Shitong Weng
+,
+Shuchang Guan
+,
+Shuqi Yu
+,
+Sichen Li
+,
+Songquan Zhu
+,
+Tengfei Li
+,
+Tianchi Cai
+,
+Tianrun Liang
+,
+Weiyu Cheng
+,
+Weize Kong
+,
+Wenkai Li
+,
+Xiancai Chen
+,
+Xiangjun Song
+,
+Xiao Luo
+,
+Xiao Su
+,
+Xiaobo Li
+,
+Xiaodong Han
+,
+Xinzhu Hou
+,
+Xuan Lu
+,
+Xun Zou
+,
+Xuyang Shen
+,
+Yan Gong
+,
+Yan Ma
+,
+Yang Wang
+,
+Yiqi Shi
+,
+Yiran Zhong
+,
+Yonghong Duan
+,
+Yongxiang Fu
+,
+Yongyi Hu
+,
+Yu Gao
+,
+Yuanxiang Fan
+,
+Yufeng Yang
+,
+Yuhao Li
+,
+Yulin Hu
+,
+Yunan Huang
+,
+Yunji Li
+,
+Yunzhi Xu
+,
+Yuxin Mao
+,
+Yuxuan Shi
+,
+Yuze Wenren
+,
+Zehan Li
+,
+Zelin Li
+,
+Zhanxu Tian
+,
+Zhengmao Zhu
+,
+Zhenhua Fan
+,
+Zhenzhen Wu
+,
+Zhichao Xu
+,
+Zhihang Yu
+,
+Zhiheng Lyu
+,
+Zhuo Jiang
+,
+Zibo Gao
+,
+Zijia Wu
+,
+Zijian Song
+,
+Zijun Sun
+et al. (27 additional authors not shown)
+You must enable JavaScript to view entire author list.
+View a PDF of the paper titled MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention, by MiniMax: Aili Chen and 125 other authors
+View PDF
+HTML (experimental)
+Abstract:
+We introduce MiniMax-M1, the world's first open-weight, large-scale hybrid-attention reasoning model. MiniMax-M1 is powered by a hybrid Mixture-of-Experts (MoE) architecture combined with a lightning attention mechanism. The model is developed based on our previous MiniMax-Text-01 model, which contains a total of 456 billion parameters with 45.9 billion parameters activated per token. The M1 model natively supports a context length of 1 million tokens, 8x the context size of DeepSeek R1. Furthermore, the lightning attention mechanism in MiniMax-M1 enables efficient scaling of test-time compute. These properties make M1 particularly suitable for complex tasks that require processing long inputs and thinking extensively. MiniMax-M1 is trained using large-scale reinforcement learning (RL) on diverse problems including sandbox-based, real-world software engineering environments. In addition to M1's inherent efficiency advantage for RL training, we propose CISPO, a novel RL algorithm to further enhance RL efficiency. CISPO clips importance sampling weights rather than token updates, outperforming other competitive RL variants. Combining hybrid-attention and CISPO enables MiniMax-M1's full RL training on 512 H800 GPUs to complete in only three weeks, with a rental cost of just $534,700. We release two versions of MiniMax-M1 models with 40K and 80K thinking budgets respectively, where the 40K model represents an intermediate phase of the 80K training. Experiments on standard benchmarks show that our models are comparable or superior to strong open-weight models such as the original DeepSeek-R1 and Qwen3-235B, with particular strengths in complex software engineering, tool utilization, and long-context tasks. We publicly release MiniMax-M1 at
+https://github.com/MiniMax-AI/MiniMax-M1
+.
+Comments:
+A technical report from MiniMax. The authors are listed in alphabetical order. We open-source our MiniMax-M1 at
+https://github.com/MiniMax-AI/MiniMax-M1
+Subjects:
+Computation and Language (cs.CL)
+; Machine Learning (cs.LG)
+Cite as:
+arXiv:2506.13585
+[cs.CL]
+(or
+arXiv:2506.13585v1
+[cs.CL]
+for this version)
+https://doi.org/10.48550/arXiv.2506.13585
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Junteng Liu [
+view email
+]
+[v1]
+Mon, 16 Jun 2025 15:08:02 UTC (1,207 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention, by MiniMax: Aili Chen and 125 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.CL
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-06
+Change to browse by:
+cs
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250621263-dilocox-a-low-communication-large-scale-training-framework-for-decentr.md b/research/notes/250621263-dilocox-a-low-communication-large-scale-training-framework-for-decentr.md
new file mode 100644
index 0000000000000000000000000000000000000000..a7109dbe08d66cfe05e501b2a36a5fcf19e254ae
--- /dev/null
+++ b/research/notes/250621263-dilocox-a-low-communication-large-scale-training-framework-for-decentr.md
@@ -0,0 +1,204 @@
+---
+title: '[2506.21263] DiLoCoX: A Low-Communication Large-Scale Training Framework for
+  Decentralized Cluster'
+id: 250621263-dilocox-a-low-communication-large-scale-training-framework-for-decentr
+tags:
+- deepread
+created: '2026-06-10T00:35:45.127464Z'
+source: https://arxiv.org/abs/2506.21263
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:35:45.127344Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2506.21263] DiLoCoX: A Low-Communication Large-Scale Training Framework for Decentralized Cluster
+Computer Science > Machine Learning
+arXiv:2506.21263
+(cs)
+[Submitted on 26 Jun 2025]
+Title:
+DiLoCoX: A Low-Communication Large-Scale Training Framework for Decentralized Cluster
+Authors:
+Ji Qi
+,
+WenPeng Zhu
+,
+Li Li
+,
+Ming Wu
+,
+YingJun Wu
+,
+Wu He
+,
+Xun Gao
+,
+Jason Zeng
+,
+Michael Heinrich
+View a PDF of the paper titled DiLoCoX: A Low-Communication Large-Scale Training Framework for Decentralized Cluster, by Ji Qi and 8 other authors
+View PDF
+HTML (experimental)
+Abstract:
+The distributed training of foundation models, particularly large language models (LLMs), demands a high level of communication. Consequently, it is highly dependent on a centralized cluster with fast and reliable interconnects. Can we conduct training on slow networks and thereby unleash the power of decentralized clusters when dealing with models exceeding 100 billion parameters? In this paper, we propose DiLoCoX, a low-communication large-scale decentralized cluster training framework. It combines Pipeline Parallelism with Dual Optimizer Policy, One-Step-Delay Overlap of Communication and Local Training, and an Adaptive Gradient Compression Scheme. This combination significantly improves the scale of parameters and the speed of model pre-training. We justify the benefits of one-step-delay overlap of communication and local training, as well as the adaptive gradient compression scheme, through a theoretical analysis of convergence. Empirically, we demonstrate that DiLoCoX is capable of pre-training a 107B foundation model over a 1Gbps network. Compared to vanilla AllReduce, DiLoCoX can achieve a 357x speedup in distributed training while maintaining negligible degradation in model convergence. To the best of our knowledge, this is the first decentralized training framework successfully applied to models with over 100 billion parameters.
+Subjects:
+Machine Learning (cs.LG)
+; Artificial Intelligence (cs.AI); Computation and Language (cs.CL)
+Cite as:
+arXiv:2506.21263
+[cs.LG]
+(or
+arXiv:2506.21263v1
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2506.21263
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Wu He [
+view email
+]
+[v1]
+Thu, 26 Jun 2025 13:45:04 UTC (320 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled DiLoCoX: A Low-Communication Large-Scale Training Framework for Decentralized Cluster, by Ji Qi and 8 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-06
+Change to browse by:
+cs
+cs.AI
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/250718071-group-sequence-policy-optimization.md b/research/notes/250718071-group-sequence-policy-optimization.md
new file mode 100644
index 0000000000000000000000000000000000000000..dfe10a119fc94aff88a182cb8a3c4a9145494635
--- /dev/null
+++ b/research/notes/250718071-group-sequence-policy-optimization.md
@@ -0,0 +1,213 @@
+---
+title: '[2507.18071] Group Sequence Policy Optimization'
+id: 250718071-group-sequence-policy-optimization
+tags:
+- deepread
+created: '2026-06-09T23:21:02.328678Z'
+source: https://arxiv.org/abs/2507.18071
+source_domain: arxiv.org
+fetched_at: '2026-06-09T23:21:02.328533Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2507.18071] Group Sequence Policy Optimization
+Computer Science > Machine Learning
+arXiv:2507.18071
+(cs)
+[Submitted on 24 Jul 2025 (
+v1
+), last revised 28 Jul 2025 (this version, v2)]
+Title:
+Group Sequence Policy Optimization
+Authors:
+Chujie Zheng
+,
+Shixuan Liu
+,
+Mingze Li
+,
+Xiong-Hui Chen
+,
+Bowen Yu
+,
+Chang Gao
+,
+Kai Dang
+,
+Yuqiong Liu
+,
+Rui Men
+,
+An Yang
+,
+Jingren Zhou
+,
+Junyang Lin
+View a PDF of the paper titled Group Sequence Policy Optimization, by Chujie Zheng and 11 other authors
+View PDF
+HTML (experimental)
+Abstract:
+This paper introduces Group Sequence Policy Optimization (GSPO), our stable, efficient, and performant reinforcement learning algorithm for training large language models. Unlike previous algorithms that adopt token-level importance ratios, GSPO defines the importance ratio based on sequence likelihood and performs sequence-level clipping, rewarding, and optimization. We demonstrate that GSPO achieves superior training efficiency and performance compared to the GRPO algorithm, notably stabilizes Mixture-of-Experts (MoE) RL training, and has the potential for simplifying the design of RL infrastructure. These merits of GSPO have contributed to the remarkable improvements in the latest Qwen3 models.
+Subjects:
+Machine Learning (cs.LG)
+; Artificial Intelligence (cs.AI); Computation and Language (cs.CL)
+Cite as:
+arXiv:2507.18071
+[cs.LG]
+(or
+arXiv:2507.18071v2
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2507.18071
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Chujie Zheng [
+view email
+]
+[v1]
+Thu, 24 Jul 2025 03:50:32 UTC (259 KB)
+[v2]
+Mon, 28 Jul 2025 11:11:33 UTC (259 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Group Sequence Policy Optimization, by Chujie Zheng and 11 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-07
+Change to browse by:
+cs
+cs.AI
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/251221852-a-comedy-of-estimators-on-kl-regularization-in-rl-training-of-llms.md b/research/notes/251221852-a-comedy-of-estimators-on-kl-regularization-in-rl-training-of-llms.md
new file mode 100644
index 0000000000000000000000000000000000000000..43b0acf1f2acb1c0baf7b9ae884163cd87fafd3f
--- /dev/null
+++ b/research/notes/251221852-a-comedy-of-estimators-on-kl-regularization-in-rl-training-of-llms.md
@@ -0,0 +1,215 @@
+---
+title: '[2512.21852] A Comedy of Estimators: On KL Regularization in RL Training of
+  LLMs'
+id: 251221852-a-comedy-of-estimators-on-kl-regularization-in-rl-training-of-llms
+tags:
+- deepread
+created: '2026-06-09T23:21:27.833089Z'
+source: https://arxiv.org/abs/2512.21852
+source_domain: arxiv.org
+fetched_at: '2026-06-09T23:21:27.832936Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2512.21852] A Comedy of Estimators: On KL Regularization in RL Training of LLMs
+Computer Science > Machine Learning
+arXiv:2512.21852
+(cs)
+[Submitted on 26 Dec 2025 (
+v1
+), last revised 18 Mar 2026 (this version, v3)]
+Title:
+A Comedy of Estimators: On KL Regularization in RL Training of LLMs
+Authors:
+Vedant Shah
+,
+Johan Obando-Ceron
+,
+Vineet Jain
+,
+Brian Bartoldson
+,
+Bhavya Kailkhura
+,
+Sarthak Mittal
+,
+Glen Berseth
+,
+Pablo Samuel Castro
+,
+Yoshua Bengio
+,
+Nikolay Malkin
+,
+Moksh Jain
+,
+Siddarth Venkatraman
+,
+Aaron Courville
+View a PDF of the paper titled A Comedy of Estimators: On KL Regularization in RL Training of LLMs, by Vedant Shah and 12 other authors
+View PDF
+Abstract:
+The reasoning performance of large language models (LLMs) can be substantially improved by training them with reinforcement learning (RL). The RL objective for LLM training involves a regularization term, which is the reverse Kullback-Leibler (KL) divergence between the trained policy and the reference policy. Since computing the KL divergence exactly is intractable, various estimators are used in practice to estimate it from on-policy samples. Despite its wide adoption, including in several open-source libraries, there is no systematic study analyzing the numerous ways of incorporating KL estimators in the objective and their effect on the downstream performance of RL-trained models. Recent works show that prevailing practices for incorporating KL regularization do not provide correct gradients for stated objectives, creating a discrepancy between the objective and its implementation. In this paper, we further analyze these practices and study the gradients of several estimator configurations, revealing how design choices shape gradient bias. We substantiate these findings with empirical observations by RL fine-tuning `Qwen2.5-7B`, `Llama-3.1-8B-Instruct` and `Qwen3-4B-Instruct-2507` with different configurations and evaluating their performance on both in- and out-of-distribution tasks. Through our analysis, we observe that, in on-policy settings: (1) estimator configurations with biased gradients can result in training instabilities; and (2) using estimator configurations resulting in unbiased gradients leads to better performance on in-domain as well as out-of-domain tasks. We also investigate the performance resulting from different KL configurations in off-policy settings and observe that KL regularization can help stabilize off-policy RL training resulting from asynchronous setups.
+Subjects:
+Machine Learning (cs.LG)
+; Artificial Intelligence (cs.AI)
+Cite as:
+arXiv:2512.21852
+[cs.LG]
+(or
+arXiv:2512.21852v3
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2512.21852
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Johan Obando-Ceron [
+view email
+]
+[v1]
+Fri, 26 Dec 2025 04:20:58 UTC (209 KB)
+[v2]
+Tue, 6 Jan 2026 15:07:53 UTC (209 KB)
+[v3]
+Wed, 18 Mar 2026 00:41:09 UTC (210 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled A Comedy of Estimators: On KL Regularization in RL Training of LLMs, by Vedant Shah and 12 other authors
+View PDF
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2025-12
+Change to browse by:
+cs
+cs.AI
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/260118734-self-distilled-reasoner-on-policy-self-distillation-for-large-language.md b/research/notes/260118734-self-distilled-reasoner-on-policy-self-distillation-for-large-language.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a17774bb0c9f7af60645329fcd662f8d7b0325c
--- /dev/null
+++ b/research/notes/260118734-self-distilled-reasoner-on-policy-self-distillation-for-large-language.md
@@ -0,0 +1,210 @@
+---
+title: '[2601.18734] Self-Distilled Reasoner: On-Policy Self-Distillation for Large
+  Language Models'
+id: 260118734-self-distilled-reasoner-on-policy-self-distillation-for-large-language
+tags:
+- deepread
+created: '2026-06-09T23:55:21.280271Z'
+source: https://arxiv.org/abs/2601.18734
+source_domain: arxiv.org
+fetched_at: '2026-06-09T23:55:21.280139Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2601.18734] Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models
+Computer Science > Machine Learning
+arXiv:2601.18734
+(cs)
+[Submitted on 26 Jan 2026 (
+v1
+), last revised 20 Mar 2026 (this version, v3)]
+Title:
+Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models
+Authors:
+Siyan Zhao
+,
+Zhihui Xie
+,
+Mengchen Liu
+,
+Jing Huang
+,
+Guan Pang
+,
+Feiyu Chen
+,
+Aditya Grover
+View a PDF of the paper titled Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models, by Siyan Zhao and 6 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Knowledge distillation improves large language model (LLM) reasoning by compressing the knowledge of a teacher LLM to train smaller LLMs. On-policy distillation advances this approach by having the student sample its own trajectories while a teacher LLM provides dense token-level supervision, addressing the distribution mismatch between training and inference in off-policy distillation methods. However, on-policy distillation typically requires a separate, often larger, teacher LLM and does not explicitly leverage ground-truth solutions available in reasoning datasets. Inspired by the intuition that a sufficiently capable LLM can rationalize external privileged reasoning traces and teach its weaker self, we introduce On-Policy Self-Distillation (OPSD), a learning algorithm where a single LLM acts as both teacher and student with different contexts. The teacher policy conditions on privileged information (e.g., verified reasoning traces) while the student policy sees only the question; training minimizes the per-token divergence between these distributions over the student's own rollouts. We demonstrate the efficacy of our method on multiple mathematical reasoning benchmarks, achieving superior token efficiency compared to reinforcement learning methods and better performance over off-policy distillation methods. Code repo:
+this https URL
+.
+Comments:
+code is released here:
+this https URL
+Subjects:
+Machine Learning (cs.LG)
+; Computation and Language (cs.CL)
+Cite as:
+arXiv:2601.18734
+[cs.LG]
+(or
+arXiv:2601.18734v3
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2601.18734
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Siyan Zhao [
+view email
+]
+[v1]
+Mon, 26 Jan 2026 17:56:50 UTC (265 KB)
+[v2]
+Thu, 5 Mar 2026 18:19:57 UTC (282 KB)
+[v3]
+Fri, 20 Mar 2026 15:40:19 UTC (268 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models, by Siyan Zhao and 6 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2026-01
+Change to browse by:
+cs
+cs.CL
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/260120802-reinforcement-learning-via-self-distillation.md b/research/notes/260120802-reinforcement-learning-via-self-distillation.md
new file mode 100644
index 0000000000000000000000000000000000000000..47ce2afec19a82bb4e8ab691d304b9160d0a21c9
--- /dev/null
+++ b/research/notes/260120802-reinforcement-learning-via-self-distillation.md
@@ -0,0 +1,215 @@
+---
+title: '[2601.20802] Reinforcement Learning via Self-Distillation'
+id: 260120802-reinforcement-learning-via-self-distillation
+tags:
+- deepread
+created: '2026-06-09T23:54:49.976930Z'
+source: https://arxiv.org/abs/2601.20802
+source_domain: arxiv.org
+fetched_at: '2026-06-09T23:54:49.976791Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2601.20802] Reinforcement Learning via Self-Distillation
+Computer Science > Machine Learning
+arXiv:2601.20802
+(cs)
+[Submitted on 28 Jan 2026 (
+v1
+), last revised 16 Feb 2026 (this version, v2)]
+Title:
+Reinforcement Learning via Self-Distillation
+Authors:
+Jonas Hübotter
+,
+Frederike Lübeck
+,
+Lejs Behric
+,
+Anton Baumann
+,
+Marco Bagatella
+,
+Daniel Marta
+,
+Ido Hakimi
+,
+Idan Shenfeld
+,
+Thomas Kleine Buening
+,
+Carlos Guestrin
+,
+Andreas Krause
+View a PDF of the paper titled Reinforcement Learning via Self-Distillation, by Jonas Hübotter and 10 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Large language models are increasingly post-trained with reinforcement learning in verifiable domains such as code and math. Yet, current methods for reinforcement learning with verifiable rewards (RLVR) learn only from a scalar outcome reward per attempt, creating a severe credit-assignment bottleneck. Many verifiable environments actually provide rich textual feedback, such as runtime errors or judge evaluations, that explain why an attempt failed. We formalize this setting as reinforcement learning with rich feedback and introduce Self-Distillation Policy Optimization (SDPO), which converts tokenized feedback into a dense learning signal without any external teacher or explicit reward model. SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy. In this way, SDPO leverages the model's ability to retrospectively identify its own mistakes in-context. Across scientific reasoning, tool use, and competitive programming on LiveCodeBench v6, SDPO improves sample efficiency and final accuracy over strong RLVR baselines. Notably, SDPO also outperforms baselines in standard RLVR environments that only return scalar feedback by using successful rollouts as implicit feedback for failed attempts. Finally, applying SDPO to individual questions at test time accelerates discovery on difficult binary-reward tasks, achieving the same discovery probability as best-of-k sampling or multi-turn conversations with 3x fewer attempts.
+Subjects:
+Machine Learning (cs.LG)
+; Artificial Intelligence (cs.AI)
+Cite as:
+arXiv:2601.20802
+[cs.LG]
+(or
+arXiv:2601.20802v2
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2601.20802
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Jonas Hübotter [
+view email
+]
+[v1]
+Wed, 28 Jan 2026 17:45:12 UTC (1,009 KB)
+[v2]
+Mon, 16 Feb 2026 14:49:34 UTC (2,122 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Reinforcement Learning via Self-Distillation, by Jonas Hübotter and 10 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2026-01
+Change to browse by:
+cs
+cs.AI
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+Links to Code Toggle
+Papers with Code
+(
+What is Papers with Code?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/260324477-composer-2-technical-report.md b/research/notes/260324477-composer-2-technical-report.md
new file mode 100644
index 0000000000000000000000000000000000000000..bcdcb9dc360b5126c19fa99c24e5c178e06b08ab
--- /dev/null
+++ b/research/notes/260324477-composer-2-technical-report.md
@@ -0,0 +1,293 @@
+---
+title: '[2603.24477] Composer 2 Technical Report'
+id: 260324477-composer-2-technical-report
+tags:
+- deepread
+created: '2026-06-10T00:23:01.085397Z'
+source: https://arxiv.org/abs/2603.24477
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:01.085279Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2603.24477] Composer 2 Technical Report
+Computer Science > Software Engineering
+arXiv:2603.24477
+(cs)
+[Submitted on 25 Mar 2026 (
+v1
+), last revised 26 Mar 2026 (this version, v2)]
+Title:
+Composer 2 Technical Report
+Authors:
+Cursor Research
+:
+Aaron Chan
+,
+Ahmed Shalaby
+,
+Alexander Wettig
+,
+Aman Sanger
+,
+Andrew Zhai
+,
+Anurag Ajay
+,
+Ashvin Nair
+,
+Charlie Snell
+,
+Chen Lu
+,
+Chen Shen
+,
+Emily Jia
+,
+Federico Cassano
+,
+Hanpeng Liu
+,
+Haoyu Chen
+,
+Henry Wildermuth
+,
+Jacob Jackson
+,
+Janet Li
+,
+Jediah Katz
+,
+Jiajun Yao
+,
+Joey Hejna
+,
+Josh Warner
+,
+Julius Vering
+,
+Kevin Frans
+,
+Lee Danilek
+,
+Less Wright
+,
+Lujing Cen
+,
+Luke Melas-Kyriazi
+,
+Michael Truell
+,
+Michiel de Jong
+,
+Naman Jain
+,
+Nate Schmidt
+,
+Nathan Wang
+,
+Niklas Muennighoff
+,
+Oleg Rybkin
+,
+Paul Loh
+,
+Phillip Kravtsov
+,
+Rishabh Yadav
+,
+Sahil Shah
+,
+Sam Kottler
+,
+Alexander M Rush
+,
+Shengtong Zhang
+,
+Shomil Jain
+,
+Sriram Sankar
+,
+Stefan Heule
+,
+Stuart H. Sul
+,
+Sualeh Asif
+,
+Victor Rong
+,
+Wanqi Zhu
+,
+William Lin
+,
+Yuchen Wu
+,
+Yuri Volkov
+,
+Yury Zemlyanskiy
+,
+Zack Holbrook
+,
+Zhiyuan Zhang
+View a PDF of the paper titled Composer 2 Technical Report, by Cursor Research: Aaron Chan and 53 other authors
+View PDF
+HTML (experimental)
+Abstract:
+Composer 2 is a specialized model designed for agentic software engineering. The model demonstrates strong long-term planning and coding intelligence while maintaining the ability to efficiently solve problems for interactive use. The model is trained in two phases: first, continued pretraining to improve the model's knowledge and latent coding ability, followed by large-scale reinforcement learning to improve end-to-end coding performance through stronger reasoning, accurate multi-step execution, and coherence on long-horizon realistic coding problems. We develop infrastructure to support training in the same Cursor harness that is used by the deployed model, with equivalent tools and structure, and use environments that match real problems closely. To measure the ability of the model on increasingly difficult tasks, we introduce a benchmark derived from real software engineering problems in large codebases including our own. Composer 2 is a frontier-level coding model and demonstrates a process for training strong domain-specialized models. On our CursorBench evaluations the model achieves a major improvement in accuracy compared to previous Composer models (61.3). On public benchmarks the model scores 61.7 on Terminal-Bench and 73.7 on SWE-bench Multilingual in our harness, comparable to state-of-the-art systems.
+Subjects:
+Software Engineering (cs.SE)
+; Machine Learning (cs.LG)
+Cite as:
+arXiv:2603.24477
+[cs.SE]
+(or
+arXiv:2603.24477v2
+[cs.SE]
+for this version)
+https://doi.org/10.48550/arXiv.2603.24477
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Alexander M. Rush [
+view email
+]
+[v1]
+Wed, 25 Mar 2026 16:18:37 UTC (1,616 KB)
+[v2]
+Thu, 26 Mar 2026 01:57:05 UTC (1,605 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Composer 2 Technical Report, by Cursor Research: Aaron Chan and 53 other authors
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.SE
+< prev
+|
+next >
+new
+|
+recent
+|
+2026-03
+Change to browse by:
+cs
+cs.LG
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/260505029-the-predictive-causal-gap-an-impossibility-theorem-and-large-scale-neu.md b/research/notes/260505029-the-predictive-causal-gap-an-impossibility-theorem-and-large-scale-neu.md
new file mode 100644
index 0000000000000000000000000000000000000000..d8f12f6c4b436fde9a6712c939a8807b3bd135a2
--- /dev/null
+++ b/research/notes/260505029-the-predictive-causal-gap-an-impossibility-theorem-and-large-scale-neu.md
@@ -0,0 +1,190 @@
+---
+title: '[2605.05029] The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale
+  Neural Evidence'
+id: 260505029-the-predictive-causal-gap-an-impossibility-theorem-and-large-scale-neu
+tags:
+- deepread
+created: '2026-06-10T00:31:38.772752Z'
+source: https://arxiv.org/abs/2605.05029
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:31:38.772451Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2605.05029] The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale Neural Evidence
+Computer Science > Machine Learning
+arXiv:2605.05029
+(cs)
+[Submitted on 6 May 2026]
+Title:
+The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale Neural Evidence
+Authors:
+Kejun Liu
+View a PDF of the paper titled The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale Neural Evidence, by Kejun Liu
+View PDF
+HTML (experimental)
+Abstract:
+We report a systematic failure mode in predictive representation learning. Across 2695 neural network configurations trained to predict linear-Gaussian dynamics, the optimal encoder tracks the environment rather than the system it is meant to model. The mean causal fidelity -- the fraction of encoder sensitivity allocated to system degrees of freedom -- is 0.49, and only 2.5% of configurations exceed 0.70. The failure intensifies with dimension: at N=100, the optimal encoder becomes causally blind (fidelity ~10^{-8}) while achieving 92% lower prediction error than the causal representation. We prove this is not an optimization artifact but a structural property of the predictive objective: when environment modes are slower or less noisy than system modes, every minimizer of the population risk encodes the former. The set of dynamics exhibiting this predictive-causal gap is open and of positive measure in parameter space. In a nonlinear Duffing-GRU sweep, unconstrained predictors learn environment-dominant representations in 55% of tasks (95% CI 41--68%) versus 24% under operational grounding (p=2.3e-3); the median out-of-distribution MSE inflation under environment shift is 1.82x versus 1.00x. Operational grounding -- restricting the loss to system observables -- partially suppresses the gap, but causal fidelity is never recovered without an explicit system-environment boundary. The results identify the predictive-causal gap as a structural limit of learning, with implications for self-supervised representation learning, world models, and the scaling paradigm.
+Comments:
+15 pages, 5 figures, 3 tables. Supplemental Material included (Sections S1-S10)
+Subjects:
+Machine Learning (cs.LG)
+MSC
+classes:
+68T07, 62M45, 37M10, 81S22
+Cite as:
+arXiv:2605.05029
+[cs.LG]
+(or
+arXiv:2605.05029v1
+[cs.LG]
+for this version)
+https://doi.org/10.48550/arXiv.2605.05029
+Focus to learn more
+arXiv-issued DOI via DataCite
+Submission history
+From: Kejun Liu [
+view email
+]
+[v1]
+Wed, 6 May 2026 15:25:37 UTC (138 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled The Predictive-Causal Gap: An Impossibility Theorem and Large-Scale Neural Evidence, by Kejun Liu
+View PDF
+HTML (experimental)
+TeX Source
+view license
+Current browse context:
+cs.LG
+< prev
+|
+next >
+new
+|
+recent
+|
+2026-05
+Change to browse by:
+cs
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+IArxiv recommender toggle
+IArxiv Recommender
+(
+What is IArxiv?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/260607412-socratic-swe-self-evolving-coding-agents-via-trace-derived-agent-skill-2.md b/research/notes/260607412-socratic-swe-self-evolving-coding-agents-via-trace-derived-agent-skill-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..1df86bea99a7f8d79f3b8d2cb5e0a7fa94333288
--- /dev/null
+++ b/research/notes/260607412-socratic-swe-self-evolving-coding-agents-via-trace-derived-agent-skill-2.md
@@ -0,0 +1,196 @@
+---
+title: '[2606.07412] Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent
+  Skills'
+id: 260607412-socratic-swe-self-evolving-coding-agents-via-trace-derived-agent-skill-2
+tags:
+- deepread
+created: '2026-06-10T00:41:23.035686Z'
+source: https://ar5iv.labs.arxiv.org/html/2606.07412
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:41:23.035515Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+[2606.07412] Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent Skills
+Computer Science > Software Engineering
+arXiv:2606.07412
+(cs)
+[Submitted on 5 Jun 2026]
+Title:
+Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent Skills
+Authors:
+Chuan Xiao
+,
+Zhengbo Jiao
+,
+Shaobo Wang
+,
+Wei Wang
+,
+Bing Zhao
+,
+Hu Wei
+,
+Linfeng Zhang
+,
+Lin Qu
+View a PDF of the paper titled Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent Skills, by Chuan Xiao and 7 other authors
+View PDF
+Abstract:
+LLM-driven software engineering agents have become a central testbed for real-world language-model capability, yet their training remains limited by the availability of high-quality SWE tasks. Existing synthetic data methods typically create tasks through fixed mutation or bug-injection procedures, making the resulting distributions largely independent of the agent's own weaknesses and training progress. We introduce Socratic-SWE, a closed-loop self-evolution framework that reuses the agent's historical solving traces as a source of training signal. Rather than treating traces only as evidence for reward computation, Socratic-SWE distills them into structured agent skills that summarize recurring failures and effective repair patterns. These skills then guide the generation of targeted repair tasks in real repositories. Candidate tasks are checked through execution-based validation and scored with a solver-gradient alignment reward, so that the retained tasks are both verifiable and useful for improving the Solver. The updated Solver produces new traces, enabling the task curriculum to adapt over successive rounds. Across SWE-bench Verified, SWE-bench Lite, SWE-bench Pro, and Terminal-Bench 2.0, Socratic-SWE consistently improves over self-evolving baselines under the same compute budget, reaching 50.40% on SWE-bench Verified after three iterations. These results suggest that solving traces can serve as a scalable substrate for self-evolving SWE agents.
+Comments:
+21 pages, 5 figures. Under review
+Subjects:
+Software Engineering (cs.SE)
+; Artificial Intelligence (cs.AI)
+Cite as:
+arXiv:2606.07412
+[cs.SE]
+(or
+arXiv:2606.07412v1
+[cs.SE]
+for this version)
+https://doi.org/10.48550/arXiv.2606.07412
+Focus to learn more
+arXiv-issued DOI via DataCite (pending registration)
+Submission history
+From: Zhengbo Jiao [
+view email
+]
+[v1]
+Fri, 5 Jun 2026 16:00:17 UTC (755 KB)
+Full-text links:
+Access Paper:
+View a PDF of the paper titled Socratic-SWE: Self-Evolving Coding Agents via Trace-Derived Agent Skills, by Chuan Xiao and 7 other authors
+View PDF
+TeX Source
+view license
+Current browse context:
+cs.SE
+< prev
+|
+next >
+new
+|
+recent
+|
+2026-06
+Change to browse by:
+cs
+cs.AI
+References & Citations
+NASA ADS
+Google Scholar
+Semantic Scholar
+export BibTeX citation
+Loading...
+BibTeX formatted citation
+×
+loading...
+Data provided by:
+Bookmark
+Bibliographic Tools
+Bibliographic and Citation Tools
+Bibliographic Explorer Toggle
+Bibliographic Explorer
+(
+What is the Explorer?
+)
+Connected Papers Toggle
+Connected Papers
+(
+What is Connected Papers?
+)
+Litmaps Toggle
+Litmaps
+(
+What is Litmaps?
+)
+scite.ai Toggle
+scite Smart Citations
+(
+What are Smart Citations?
+)
+Code, Data, Media
+Code, Data and Media Associated with this Article
+alphaXiv Toggle
+alphaXiv
+(
+What is alphaXiv?
+)
+Links to Code Toggle
+CatalyzeX Code Finder for Papers
+(
+What is CatalyzeX?
+)
+DagsHub Toggle
+DagsHub
+(
+What is DagsHub?
+)
+GotitPub Toggle
+Gotit.pub
+(
+What is GotitPub?
+)
+Huggingface Toggle
+Hugging Face
+(
+What is Huggingface?
+)
+ScienceCast Toggle
+ScienceCast
+(
+What is ScienceCast?
+)
+Demos
+Demos
+Replicate Toggle
+Replicate
+(
+What is Replicate?
+)
+Spaces Toggle
+Hugging Face Spaces
+(
+What is Spaces?
+)
+Spaces Toggle
+TXYZ.AI
+(
+What is TXYZ.AI?
+)
+Related Papers
+Recommenders and Search Tools
+Link to Influence Flower
+Influence Flower
+(
+What are Influence Flowers?
+)
+Core recommender toggle
+CORE Recommender
+(
+What is CORE?
+)
+Author
+Venue
+Institution
+Topic
+About arXivLabs
+arXivLabs: experimental projects with community collaborators
+arXivLabs is a framework that allows collaborators to develop and share new arXiv features directly on our website.
+Both individuals and organizations that work with arXivLabs have embraced and accepted our values of openness, community, excellence, and user data privacy. arXiv is committed to these values and only works with partners that adhere to them.
+Have an idea for a project that will add value for arXiv's community?
+Learn more about arXivLabs
+.
+Which authors of this paper are endorsers?
+|
+Disable MathJax
+(
+What is MathJax?
+)
\ No newline at end of file
diff --git a/research/notes/bugs-scaling-data-for-software-engineering-agents.md b/research/notes/bugs-scaling-data-for-software-engineering-agents.md
new file mode 100644
index 0000000000000000000000000000000000000000..d2a2e43ad1199bbf7b28c64633fc8ce209677183
--- /dev/null
+++ b/research/notes/bugs-scaling-data-for-software-engineering-agents.md
@@ -0,0 +1,8283 @@
+---
+title: '\bugs: Scaling Data for Software Engineering Agents'
+id: bugs-scaling-data-for-software-engineering-agents
+tags:
+- deepread
+created: '2026-06-10T00:23:06.342374Z'
+source: https://arxiv.org/html/2504.21798
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:06.342106Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+SWE-smith: Scaling Data for Software Engineering Agents
+John Yang<sup>1</sup>, Kilian Lieret<sup>2</sup>, Carlos E. Jimenez<sup>2</sup>, Alexander Wettig<sup>2</sup>, Kabir Khandpur<sup>3</sup>, Yanzhe Zhang<sup>1</sup>, Binyuan Hui<sup>4</sup>, Ofir Press<sup>2</sup>, Ludwig Schmidt<sup>1</sup>, Diyi Yang<sup>1</sup>
+<sup>1</sup> Stanford University, <sup>2</sup> Princeton University, <sup>3</sup> Independent, <sup>4</sup> Alibaba Qwen
+Abstract
+Despite recent progress in Language Models (LMs) for software engineering, collecting training data remains a significant pain point.
+Existing datasets are small, with at most 1,000s of training instances from 11 or fewer GitHub repositories.
+The procedures to curate such datasets are often complex, necessitating hundreds of hours of human labor; companion execution environments also take up several terabytes of storage, severely limiting their scalability and usability.
+To address this pain point, we introduce SWE-smith, a novel pipeline for generating software engineering training data at scale.
+Given any Python codebase, SWE-smith constructs a corresponding execution environment, then automatically synthesizes 100s to 1,000s of task instances that break existing test(s) in the codebase.
+Using SWE-smith, we create a dataset of 50k instances sourced from 128 GitHub repositories, an order of magnitude larger than all previous works.
+We train SWE-agent-LM-32B, achieving 40.2% Pass@1 resolve rate on the SWE-bench Verified benchmark, state of the art among open source models.
+We open source SWE-smith (collection procedure, task instances, trajectories, models) to lower the barrier of entry for research in LM systems for automated software engineering.
+All assets available at https://swesmith.com.
+1
+Introduction
+Figure 1: Scaling task instances (left) and performance (right) for SWE-agents with SWE-smith.
+Using SWE-smith, we can create 100s to 1000s of instances for any Python codebase, enabling us to train SWE-agent-LM-32B which achieves 40.2% on SWE-bench Verified.
+Language Model (LM) agents, such as SWE-agent (Yang et al., 2024a) or OpenHands (Wang et al., 2024), have made remarkable progress towards automating software engineering (SE) tasks, as tracked by benchmarks such as SWE-bench (Jimenez et al., 2024b).
+However, the most effective agents still rely on proprietary LMs, as building open source LMs for SE remains bottlenecked by the lack of large-scale, high-quality training data.
+To ensure that open research remains relevant in this field, it is critical to develop infrastructure for collecting software engineering training data at scale.
+The current open-source ecosystem offers two kinds of data sources to train LMs on SE tasks.
+One simple approach is to crawl pull requests (PRs) and issues from GitHub repositories.
+However, without execution environments or tests, these instances offer no reliable way of validating generated solutions,
+and LMs are limited to learning from the surface form of code (Xie et al., 2025a) or via rewards based on superficial string similarity (Wei et al., 2025).
+In contrast, SWE-bench provides reliable validation by running unit tests against proposed solutions.
+Another line of work has simply extended the SWE-bench collection strategy to a new set of repositories for training purposes (Pan et al., 2024).
+This produces flexible environments for training and distilling LM agents, since we can generate agent trajectories and filter them based on the unit test results.
+However, the scalability of this approach is severely limited by the challenges associated with SWE-bench’s collection strategy.
+SWE-bench’s filtering process leaves only a small number of PRs that not only resolve a Github issue, but also make meaningful changes to unit tests.
+Also, setting up execution environments for each instance requires a substantial amount of human intervention.
+In this paper, we introduce the SWE-smith toolkit, which marries the flexible execution environments of SWE-bench with scalable instance collection (Figure 1).
+SWE-smith features several techniques to automatically synthesize bugs in existing GitHub repositories,
+such as (1) generating errant rewrites of functions with an LM, (2) procedurally modifying the abstract syntax tree (AST) of functions, (3) undoing PRs, and (4) combining bugs.
+Our key insight is that execution-based validation can not only validate proposed solutions,
+but also identify bug candidates which cause substantial software regression (i.e., break tests).
+Figure 2: SWE-smith creates training data for software engineering agents by crafting bugs into real codebases.
+Given a codebase, we employ several strategies to create task instances that break existing tests.
+Using SWE-smith, we create 50k+ task instances with execution environments from 128 real world repositories.
+In a nutshell, SWE-smith puts forth the following task creation workflow, as shown in Figure 2.
+Given a codebase, we automatically set up a corresponding environment using SWE-agent (Yang et al., 2024a).
+Within this environment, we then use the aforementioned techniques to synthesize 100s to 1,000s of task instances.
+Finally, we craft realistic issue descriptions automatically with LMs.
+SWE-smith’s design significantly reduces the amount of human labor and storage required for constructing execution environments.
+Using SWE-smith, we create a dataset of 50k task instances across 128 real-world GitHub repositories.
+Using the SWE-smith dataset, we achieve a new open-weight state of the art result on SWE-bench Verified.
+Using the SWE-smith task instances, we generate 5,016 expert trajectories with Claude 3.7 Sonnet and fine-tune Qwen 2.5 Coder Instruct 32B.
+The resulting LM, SWE-agent-LM-32B, achieves 40.2% (+33.4%) on SWE-bench Verified in a single attempt, without inference-time scaling.
+This sets a new state of the art for open-weight models.
+The scale and diversity of the SWE-smith dataset enables us to begin establishing truths and investigate interesting phenomena about developing SWE-agents.
+Training on more instances, bug types, and repositories helps.
+LM generated issue text approximates real ones effectively.
+Using SWE-smith, we find that it’s possible to optimize LMs to perform well for specific repositories while only suffering minor generalization loss.
+We release SWE-smith as an open-source toolkit — including instances, environments, and trajectories — to catalyze the development of stronger open-source LM agents.
+2
+SWE-smith: Software Task Generation at Scale
+The core principle of SWE-smith’s collection strategy is to define an execution environment first, and then synthesize task instances within the environment.
+Conceptually, this is a simple inversion of SWE-bench’s approach, which instead prioritizes identifying task instances, and then attempts to build an environment for each.
+In this section, we describe the procedure in detail and show how, in practice, SWE-smith scales significantly better in terms of repositories, task instances, and storage.
+2.1
+Collection
+Building execution environments for repositories with passing tests.
+Given a repository, we run SWE-agent (Yang et al., 2024a) on the latest commit for at most 100 steps, instructing it to install the codebase and run the test suite.
+We then manually verify the installation and testing instructions, check if more than 80% of existing tests pass, and finally create a Docker image for the repository.
+We target repositories for the 5,000 most downloaded packages listed in the Python Package Index (PyPI) as of November 18, 2024, sort the PyPI packages by GitHub stars, and then remove any PyPI package with less than 1,000 stars, as well as all 12 SWE-bench test repositories from consideration.
+More in §A.2.
+Creating task instance candidates.
+Per repository, we employ four different strategies to create candidates.
+As shown in Figure 2, each strategy takes in a repository as input, then produces task instance candidates represented as `.diff` files.
+Extensive details in §B.
+• LM Generation: Per repository, we identify all programmatic entities (functions, classes), then take two approaches: (1) provide an LM with the function and prompt it to introduce errant modifications (henceforth referred to as “LM Modify”), and (2) given only the function header and docstring, ask the LM to rewrite it (“LM Rewrite”). More in §B.1.
+• Procedural Modification: Per function, we acquire an abstract syntax tree (AST) representation of the code, then randomly perform one or more transformations (e.g., remove a conditional/loop, change an operator, +11 more. See Table 8). More in §B.2.
+• Combine Bugs: LM generation and Procedural Modification task instances exclusively edit one function or class. To create more complex tasks that require editing multiple portions of the codebase, we devise a “Patch Combination” strategy that creates a task instance by aggregating candidates from the same file(s) or module(s). More in §B.3.
+• Invert PRs (or “PR Mirror”): Per repository, we collect all PRs that modify Python files. Per PR, we attempt to undo its revisions in the current version of the repository. To achieve this, we provide an LM with the PR’s code changes (a `.diff` plaintext) and prompt it to rewrite each affected file such that the PR edits are reverted. Unlike SWE-bench, we do not check out the PR’s base commit, as the install specifications determined in the previous step may not be compatible with older versions of the repo. More in §B.4.
+Execution-based validation of candidates.
+We apply each candidate patch to the corresponding repository, run the test suite, and only keep patches that break one or more existing, passing tests (referred to as Fail-to-Pass or F2P test(s)).
+For efficiency purposes, we also limit testing runtime to two minutes; bug candidates that cause test runtimes in excess of this time limit are discarded.
+Minor additional details in §A.3.
+Generating problem statements.
+The issue text associated with a bug can significantly alter the difficulty and feasibility of the task instance.
+Detailed descriptions of “expected” vs. “observed” behavior or bug-reproduction code in issue text heavily affect an agent’s capacity to localize bugs or iterate on proposed solutions.
+We explore several techniques covered fully in §D, and ultimately settle on a simple strategy.
+Per task instance, we provide an LM with the `.diff` patch, source code of a random F2P test, and execution output from running the repository’s test suite with the bug patch applied.
+We prompt the LM for GitHub issue-style text that includes reproduction code based on the F2P test.
+What human labor remains?
+The steps requiring manual effort are (1) parsing the correct installation setup procedures from the agent trajectory (∼7 min per repository), and (2) implementing the parser for test outputs (∼1 min per repository).
+Step two requires very little time because parsers can be reused for repositories with the same testing infrastructure (e.g., `pytest`).
+SWE-smith removes the need for manual efforts to determine installation specifications for multiple versions of a codebase across time, the most costly step of SWE-bench collection.
+Creating SWE-smith took one author ∼20h of human labor.
+2.2
+Features
+We apply SWE-smith to 128 Python repositories, generating a total of 50k instances.
+Table 1 captures the key statistics.
+On average, we generate 381 task instances per repository, with as many as 2277 for pandas-dev/pandas.
+We summarize the distribution of task instances per repository in Figure 1, where repositories are grouped into one of six general categories.
+SWE-smith took $1360 to create ($1000 to generate bugs, $160 for automatic repository installation with SWE-agent, $200 to generate issues for 10K bugs).
+Generating an issue costs 2.54¢ on average.
+More dataset analyses in §C.
+Figure: Distribution of instances per repo for 128 repos grouped into 6 categories.
+| Bug Type | Yield % | # Insts | Cost | F2P | Lines |
+|---|---|---|---|---|---|
+| Combine | 96.9% | 10,092 | 0.00¢ | 15 | 11 |
+| LM Modify | 56.0% | 17,887 | 0.38¢ | 4 | 3 |
+| LM Rewrite | 35.0% | 4,173 | 3.93¢ | 4 | 24 |
+| PR Mirror | 33.8% | 2,344 | 5.53¢ | 3 | 14 |
+| Procedural | 40.2% | 15,641 | 0.00¢ | 7 | 5 |
+| Total | 50.1% | 50,137 | 2.32¢ | 6 | 5 |
+Table 1: Summary of SWE-smith statistics.
+“Yield %” is the % of candidates generated by a strategy that break 1+ tests.
+“Cost” is the average cost to generate one candidate.
+“F2P” (Fail to Pass tests), “Lines [Edited]” are median values.
+Bug generation strategies vary in cost and yield rate.
+Of methods relying on LMs, PR Mirrors are more expensive because the task entails rewriting entire files, as opposed to individual functions for LM Modify and LM Rewrite.
+Yield rates are limited by either lack of test coverage for the change or because the bug candidate did not actually introduce relevant issues.
+For example, for LM Rewrite, the LM is asked to re-implement the function; it is not explicitly asked for bugs.
+When requested outright (LM Modify), the yield is higher.
+| Dataset | # Tasks | # Repos | Exec? | Source | Env. Size |
+|---|---|---|---|---|---|
+| R2E (Jain et al., 2024) | 0.25k | 137 | | Synth | 270 GBs |
+| R2E-gym (Subset) (Jain et al., 2025) | 4.6k | 10 | | Synth | 4 TBs |
+| SWE-bench-extra (Badertdinov et al., 2024) | 6.38k | 2k | | Real | - |
+| SWE-bench-train (Jimenez et al., 2024b) | 19k | 37 | | Real | - |
+| SWE-fixer (Xie et al., 2025a) | 115k | 856 | | Real | - |
+| SWE-gym (Pan et al., 2024) | 2.4k | 11 | | Real | 6 TBs |
+| SWE-smith | 50k | 128 | | Both | 295 GBs |
+Table 2: Comparison of open source training datasets for software engineering tasks.
+Relative to existing datasets, SWE-smith has multiple times the number of task instances, repositories, and environments at a fraction of prior storage costs.
+SWE-fixer and SWE-bench-train task instances do not have execution environments, so “Env. Size” is blank.
+How difficult are SWE-smith task instances?
+To determine whether task instances produced by SWE-smith are realistic and challenging, we train a Qwen 2.5 32B model on 1,699 human-annotated (task, label) pairs from Chowdhury et al. (2024) to rate tasks as (easy, medium, hard).
+To quantify difficulty, each difficulty label corresponds to values of 1/5/9.
+The model achieves 75.3% test accuracy.
+We then rate difficulty of task instances from both SWE-smith and prior SWE-bench style datasets (Chowdhury et al., 2024; Jimenez et al., 2024b; Pan et al., 2024; Yang et al., 2024b).
+SWE-smith task instances span a broad range of difficulties, similar to SWE-bench and SWE-gym.
+The average difficulty score for SWE-smith (5.27–5.72 across bug generation strategies) is comparable to SWE-bench (5.01) and SWE-gym (5.62). This suggests SWE-smith enables realistic and appropriately challenging evaluation.
+We discuss why bug strategies yield different levels of difficulty and visualize difficulty per dataset in §E.
+Scaling execution environments.
+Unlike SWE-bench which creates a Docker image per task instance, SWE-smith leverages a simpler design where tasks from the same repository share the same environment, reducing storage overhead significantly, as shown in Table 2.
+This approach not only makes scaling task instances more affordable, but also renders SWE-smith more accessible and maintainable than existing datasets.
+We estimate that creating a similar quantity of task instances (50k) using SWE-bench would require 50 to 150 TBs of storage for environments, a 500x difference.
+Extended discussion in §C.1.
+3
+Experiments
+To explore the utility of SWE-smith for training software engineering agents, we use rejection sampling fine-tuning (Yuan et al., 2023) as the primary procedure for improving a base LM with SWE-smith.
+Our experiment workflow is as follows.
+First, we curate a subset of SWE-smith task instances.
+Next, we run an agent system with an expert model on this subset.
+At this step, the trajectory corresponding to each run is recorded.
+Then, we fine-tune the base (or “student”) model on the trajectories corresponding to resolved instances.
+Finally, we evaluate the agent system run with the student model on a separate, test split.
+Models.
+For expert models, we use claude-3-7-sonnet-20250219 (Anthropic, 2025).
+For fair comparisons with prior works (Pan et al., 2024), we also use claude-3-5-sonnet-20240620 and gpt-4o-2024-08-06.
+We use the Qwen-2.5-Coder-Instruct (Hui et al., 2024) 7B and 32B series as the base models.
+Training and hyperparameter details are in §F.1.
+Agent system.
+We use SWE-agent (Yang et al., 2024a), an agent system for solving GitHub issues.
+SWE-agent provides a base LM with an Agent Computer Interface (ACI) that enables more effective interactions with a codebase.
+At each turn, SWE-agent prompts an LM to generate a ReAct (Yao et al., 2023b) style (thought, action) pair, where the action either edits a file or executes a shell command.
+We choose SWE-agent because, at the time of writing, SWE-agent with Claude 3.7 Sonnet is the top open source solution on SWE-bench.
+When generating trajectories with expert models, we run SWE-agent for at most 75 steps and a $2.00 cost limit.
+For inference of student models, we impose the same 75 step maximum and fix temperature at 0.0.
+Full configuration details are in §F.1.
+Evaluation metrics.
+We evaluate on the SWE-bench Lite and Verified (Chowdhury et al., 2024) subsets.
+SWE-bench evaluates AI systems on their ability to solve software issues from 12 real world GitHub repositories.
+The Lite split is a subset of 300 instances, curated to be an easier evaluation set that’s less costly to run.
+The Verified split is a human-curated subset of 500 instances, selected for clearer problem statements and more reliable evaluation.
+To assess generalization beyond Python, we also evaluate on SWE-bench Multilingual, a new dataset introduced in this paper.
+SWE-bench Multilingual consists of 300 task instances that cover 9 additional programming languages.
+See §F.2 for more details.
+We report the % resolved metric, the proportion of successfully resolved instances.
+4
+Results
+Table 3 compares the performance of Qwen 2.5 Coder Instruct models (7B and 32B), fine-tuned on 5,016 SWE-smith trajectories.
+We refer to them as SWE-agent-LM-7B and SWE-agent-LM-32B; the latter achieves state-of-the-art performance.
+| Model | System | Train Size | Lite | Verified |
+|---|---|---|---|---|
+| **Closed Weight Models** | | | | |
+| GPT-4o (OpenAI, 2024a) | Agentless | - | 32.0 | 38.8 |
+| | OpenHands | - | 22.0 | - |
+| | SWE-agent | - | 18.3 | 23.0 |
+| Claude 3.5 Sonnet (Anthropic, 2024) | Agentless | - | 40.7 | 50.8 |
+| | AutoCodeRover | - | - | 46.2 |
+| | OpenHands | - | 41.7 | 53.0 |
+| | SWE-agent | - | 23.0 | 33.6 |
+| Claude 3.7 Sonnet (Anthropic, 2025) | SWE-agent | - | 48.0 | 58.2 |
+| Llama3-SWE-RL-70B (Wei et al., 2025) | Agentless | 11M | - | 41.0 |
+| **Open Weight Models** | | | | |
+| Lingma-SWE-GPT-72B (Ma et al., 2024) | SWE-SynInfer | - | - | 28.8 |
+| Qwen3-235B-A22B (Qwen et al., 2025) | OpenHands | - | - | 34.4 |
+| R2E-Gym-32B (Jain et al., 2025) | OpenHands | 3.3k | - | 34.4 |
+| SWE-fixer-72B (Xie et al., 2025a) | SWE-Fixer | 110k | 24.7 | 32.8 |
+| SWE-gym-32B (Pan et al., 2024) | OpenHands | 491 | 15.3 | 20.6 |
+| SWE-agent-LM-7B | SWE-agent | 2k | 11.7 | 15.2 |
+| SWE-agent-LM-32B | SWE-agent | 5k | 30.7 | 40.2 |
+Table 3: Resolve rates for existing solutions on SWE-bench Lite and Verified, collected from Jimenez et al. (2024a), compared to models fine-tuned on SWE-smith.
+All performance numbers are pass@1.
+We do not compare against systems that use verifiers or multiple attempts at test time.
+The final dataset of 5,016 training points was curated as follows.
+We start by collecting a large pool of expert trajectories.
+First, we carried out each of the ablations in Section 4.1, giving us an initial set of 5,105 trajectories.
+Next, based on our observation that PR Mirror and LM Rewrite task instances yield the most effective expert trajectories (discussed below), we run the expert model on all task instances of these types, bumping up the total number to 6,457 task instances.
+Ultimately, we attempt to generate expert trajectories for 8,686 unique task instances, or 17.3% of the SWE-smith dataset.
+Reinforcing the difficulty rating findings from Section 2.2, we observe that SWE-smith task instances are non-trivial for the top agent systems today.
+The final pool of 6,457 represents a 36% resolve rate of all 17,906 attempts to solve one of the 8,686 task instances.
+Next, we perform minor filtering of this collection.
+As reported in Pan et al. (2024), we also observe that “easier” trajectories — task instances that are repeatedly solved across multiple runs — degrade model performance.
+Therefore, we limit the number of times any SWE-smith task instance is represented in the training set to 3 trajectories.
+This leads to the final 5,016 training set.
+More details in §F.3.
+Performance improves with more data points.
+Extending similar graphs from Jain et al. (2025); Pan et al. (2024), Figure 1 shows increasing performance with more trajectories.
+Comparison at the same training set size.
+To compare with prior works (Jain et al., 2025; Pan et al., 2024), we run expert trajectory generation on 1000 random SWE-smith task instances with SWE-agent + Claude 3.5 Sonnet (800) or GPT-4o (200).
+We then fine-tune the 32B model on 500 successful trajectories, a training set size both works report on.
+Our model achieves a 28.2% resolve rate on SWE-bench Verified, a relative difference of +8.2% with Pan et al. (2024) and +0.7% with Jain et al. (2025).
+4.1
+Ablations of SWE-smith
+We perform several ablations of how SWE-smith’s bug and problem statement generation strategies impact the quality of training data.
+We use Claude 3.7 Sonnet as the expert for fine-tuning Qwen 2.5 7B Coder Instruct, and report the performance on SWE-bench Verified.
+LM Rewrite and Procedural bugs are comparable to PR mirrors.
+We randomly sample 1000 instances per bug generation strategy (LM Modify, LM Rewrite, Procedural Modifications, PR Mirrors).
+Per instance, we generate issue text with an LM and run expert trajectory generation.
+We then fine-tune a student model per strategy, capping training points to the minimum number of successful trajectories from any strategy (507) for fair comparison.
+Table 4 summarizes the results.
+Trajectories generated from PR mirrors are empirically the most effective training data — this is expected, since they are most reflective of SWE-bench.
+What’s noteworthy is that trajectories from Procedural Modification and LM Rewrite instances lead to competitive models.
+There is a steep drop-off with LM Modify bugs.
+LM generated issues are comparable to real issues.
+We randomly sample 600 PR Mirror task instances.
+We compare LM generated issues with three alternatives — fixed issue templates, the source code + test logs of a random Fail-to-Pass test, and the original issue text associated with the PR.
+We again cap training points to the minimum number of successful trajectories (259) for fairness.
+As shown in Table 5, training on task instances with LM generated issues is empirically comparable to using the original issue text.
+Using fixed issue templates not only leads to the fewest successful trajectories, but also results in relatively homogeneous problem solving sequences.
+The expert trajectories from fixed issue templates have 31% fewer unique actions compared to LM generated text (379 vs. 550).
+While providing a Fail-to-Pass test case leads to more successful expert trajectories, leaking the evaluation criteria causes the model to skip over writing a reproduction script, which accounts for the performance drop.
+Of 500 SWE-bench Verified instances, the student model trained on LM-generated issues attempts to reproduce the bug for 379 of the runs.
+The model trained on test-based issues only does so for 127 cases, a 66% decrease.
+| Strategy | # Trajs. | % Resolved |
+|---|---|---|
+| LM Modify | 802 | 5.7 (±1.5) |
+| LM Rewrite | 507 | 8.8 (±1.7) |
+| Procedural | 745 | 8.6 (±1.8) |
+| PR Mirror | 557 | 9.2 (±1.7) |
+
+Table 4: Comparison of training on 1000 SWE-smith instances created with different strategies.
+| Issue | # Trajs. | % Resolved |
+|---|---|---|
+| Fixed | 259 | 6.4 (±1.5) |
+| F2P Test | 390 | 7.3 (±1.9) |
+| LM | 328 | 7.7 (±1.5) |
+| Original | 319 | 7.8 (±1.8) |
+
+Table 5: Comparing training on 600 PR Mirror instances with varied issue text.
+Task difficulty correlates with solvability but not with effectiveness as training data.
+First, we run our difficulty rating model on 10k randomly selected SWE-smith task instances.
+From this pool, we curate subsets of 1000 instances corresponding to the three difficulty levels, then run expert trajectory generation per subset 3 times.
+For the easy/medium/hard subsets, the resolve rates by the expert model are 58.6%, 41.0%, and 17.0% respectively.
+Next, from all successful trajectories, we create four fine-tuning datasets of 500 trajectories each corresponding to difficulty scores of 2, 4, 6, and 8.
+As mentioned in Section 2.2, the corresponding scores for easy/medium/hard are 1/5/9.
+Therefore, the SFT dataset for score 2 is made up of trajectories corresponding to 375 easy and 125 medium instances, and so on.
+Somewhat surprisingly, we do not observe strong correlation between increased difficulty and downstream performance.
+For the student models trained on the 2/4/6/8 difficulty SFT datasets, we get pass@1 scores of 12.4%, 10.8%, 13.6%, and 12.2% on SWE-bench Verified.
+Figure 3: We fine-tune a 7B base and our 32B models on 700 trajectories for SymPy.
+Specialization boosts performance with minor generalization loss.
+Figure 4: At 700 training samples, we observe performance increases logarithmically with repositories.
+Training on more repositories improves general performance.
+We train models in four settings by sampling 700 expert trajectories on Procedural Modification tasks from pools of 4, 25, 50, and 100 repositories.
+Echoing similar findings for code generation tasks (Xie et al., 2025b), we find that increasing repositories represented in the training set improves performance, as shown in Figure 4, with an approximately logarithmic relation between model performance and number of repositories.
+Repository-specialized models excel on the target repository with minor generalization loss.
+We experiment with training models to be specialists on one particular repository.
+To assess performance, we evaluate models on a subset of SWE-bench Verified tasks that are (1) from SymPy, and (2) created after January 1st, 2022, a total of
+22
+22
+22
+22
+instances.
+To create SymPy specific training data, we first select a base commit of SymPy just before the cutoff date.
+Next, we create
+1276
+1276
+1276
+1276
+Procedural Modification task instances, then generate
+700
+700
+700
+700
+expert trajectories.
+We evaluate specialization in two settings: (1) single-repository fine-tuning, and (2) specialist stage fine-tuning, both shown in Figure
+4
+.
+For single-repository tuning, we compare a model initialized with Qwen-2.5-Coder-Instruct 7B and trained on 700 instances sampled from 100 repositories, to the same Qwen base model but fine-tuned on the 700 SymPy instances only.
+For specialist stage fine-tuning, we simply compare SWE-agent-LM-32B to the same model further fine-tuned on the 700 SymPy instances.
+Specialization significantly boosts performance for the target repository with only slight drops in general performance in both the single-repository fine-tuning (21.2% vs. 13.6%) and specialist stage fine-tuning (42.4% vs. 33.3%) settings.
+4.2
+Analysis of Agent Behavior
+Figure 5:
+SWE-agent-LM-32B
+takes fewer steps to submit compared to Claude 3.7 Sonnet for instances resolved by both models.
+Figure 6:
+For unsuccessfully resolved tasks, a frequent failure mode is that
+SWE-agent-LM-32B
+will repeat actions.
+This section analyzes the behavior, failure modes, and efficiency of SWE-agent when run with SWE-agent-LM-32B or Claude 3.7 Sonnet on SWE-bench Verified.
+SWE-agent-LM-32B can solve tasks efficiently.
+SWE-agent-LM-32B
+resolves tasks in fewer steps on average (24.9) than Claude 3.7 Sonnet (29.1), though the difference becomes marginal when accounting for different average difficulties of the resolved tasks: On the overlap of tasks that are resolved by both LMs,
+SWE-agent-LM-32B
+uses 24.8 steps compared to 25.6 used by Claude 3.7 Sonnet (see Fig.
+6
+).
+While shorter trajectories are not always preferred (additional actions can be used for additional validation purposes, for example), this shows that
+SWE-agent-LM-32B
+solves tasks very efficiently.
+At the same time
+SWE-agent-LM-32B
+also demonstrates that it can remain focused throughout long trajectories, with 31 instances being resolved after 40 steps or more.
+We further highlight that naturally terminating agent submissions with SWE-agent-LM-32B achieve an accuracy nearly matching that of Claude 3.7 Sonnet (60% vs 63%), showing that SWE-agent-LM-32B is adept at determining whether an instance has been resolved.
+Footnote 1: "Naturally terminating" means excluding agent runs that are terminated due to errors or cost/step count limits. Note that SWE-agent still extracts and submits any changes performed by the agent in these cases, and some of them can be successful (for example, if the agent is terminated due to cost while testing already performed edits).
+As the overall cost and turn count averages scale strongly with the cost and turn limits, we reserve a more thorough analysis for §
+F.5.1
+.
+Repetitive actions are a key problem.
+We observe a tendency for SWE-agent-LM-32B to get stuck in long sequences of repetitive actions, in particular long sequences of calls that display different portions of a file instead of using search commands.
+Footnote 2: In fact, these str_replace_editor view commands make up 73% of the longest repetitive sequences. For this analysis, we look at repetitions of the base command, i.e., without any arguments. See §F.5 for more.
+More than 25% of
+SWE-agent-LM-32B
+trajectories have a repetitive sequence of at least length 10, compared to less than 4% for Claude 3.7 Sonnet (see Figure
+6
+).
+The occurrence of long repetitive sequences correlates strongly with the agent’s failure to solve the corresponding task instance, largely because the LM continues issuing similar commands until either the agent cost or turn limit is reached, at which point the run is terminated.
+For example, repetitive sequences of length 10 correspond to an 89% failure probability.
+Simple interventions from the agent scaffold can mitigate repetitive actions, but do not seem to improve resolve rates (see §
+F.5
+).
+Figure 7:
+More than half of the unresolved instances of
+SWE-agent-LM-32B
+correspond to runs terminated by cost/step limits, and these limits are frequently reached before source code has been modified. See §
+F.5
+for more.
+Localization is the dominant failure mode.
+Guided by a short plan in the system prompt, SWE-agent typically starts by
+localizing
+(search and read actions),
+reproducing
+(test file creation and execution), before modifying source files and validating the fixes.
+If the agent gets stuck at any of these stages or keeps on iterating, the agent loop is eventually interrupted by runtime limits (cost, number of LM calls, runtime).
+While this rarely happens with Claude 3.7 Sonnet, 53% of SWE-agent-LM-32B’s failures are associated with such limits (Figure 7).
+The agent often already gets stuck during localization or initial efforts to reproduce a bug, with endlessly repeated actions being a persistent issue.
+More on failure modes in §
+F.5
+.
+5
+Related Work
+LMs for Software Engineering.
+As contemporary LMs have saturated traditional code generation tasks
+(Austin et al.,
+2021
+; Chen et al.,
+2021
+)
+, software engineering benchmarks
+(Jain et al.,
+2024
+; Jimenez et al.,
+2024b
+; Yang et al.,
+2024b
+; Zhao et al.,
+2024
+; Zan et al.,
+2025
+)
+, notably SWE-bench, have become a new de facto evaluation setting due to their diverse, complex, real-world programming challenges.
+The most significant source of open source progress on SWE-bench has been the development of LM-based workflows
+(Orwall,
+2024
+; Xia et al.,
+2024
+; Zhang et al.,
+2024b
+)
+and agents
+(Antoniades et al.,
+2024
+; Wang et al.,
+2024
+; Yang et al.,
+2024a
+; Zhang et al.,
+2024a
+)
+.
+Workflow-based systems are typically human-engineered decompositions of a task into a sequence of sub-goals.
+Yang et al. (
+2024b
+)
+suggests such pipelines may not generalize effectively to non-Python repositories, requiring additional human intervention to re-adapt.
+We therefore elect to focus on generating trajectories with and for LM agent systems
+(Sumers et al.,
+2024
+; Yang et al.,
+2023
+; Yao et al.,
+2023b
+)
+.
+Because no workflow is imposed, agent systems inherently rely more on the LM to plan and refine its actions, putting more focus on an LM’s capabilities, not inference scaffolds.
+Training Datasets for Coding.
+Prior work around training data has focused on instruction following
+(Luo et al.,
+2023
+; Muennighoff et al.,
+2024
+; Shypula et al.,
+2024
+; Wei et al.,
+2024a
+;
+b
+; Yu et al.,
+2024
+)
+and preference learning
+(Liu et al.,
+2024a
+;
+b
+)
+for code completion tasks.
+Several recent works introduce training sets for retrieval augmented generation
+(Jimenez et al.,
+2024b
+; Xie et al.,
+2025a
+)
+, workflows
+(Wei et al.,
+2025
+)
+, and agent
+(Badertdinov et al.,
+2024
+; Ma et al.,
+2024
+; Pan et al.,
+2024
+; Jain et al.,
+2025
+)
+approaches to SWE-bench.
+Our work applies
+Haluptzok et al. (
+2023
+)
+at a repository level: by having an LM break a codebase, we drastically reduce the human effort needed to define a task and build its environment.
+Concurrent to our work,
+Xie et al. (
+2025b
+)
+(RePOST) also constructs execution environments for repository functions, but differs significantly in methodology and evaluation.
+RePOST sandboxes a function and its dependencies to a separate script, then generates tests with an LM, removing the original codebase as context.
+The tasks’ source is repository-level; the environments and tasks are not.
+RePOST evaluates solely on code generation (e.g., HumanEval
+(Chen et al.,
+2021
+)
+).
+Jain et al. (
+2025
+)
+(R2E-Gym) improves open source LMs’ performance on SWE-bench with inference time scaling and verifiers.
+R2E-Gym’s 51% resolve rate is not comparable to Table 3 results, as each instance is attempted 26 times.
+R2E-Gym’s 4.6k training instances are collected using SWE-bench’s pipeline, with some augmentations around using LMs to synthesize issue text and tests.
+To our knowledge, we are the first to address the limited scalability
+of previous approaches.
+6
+Discussion
+Limitations and future directions.
+First, SWE-smith’s collection pipeline is Python-centric.
+The mechanisms to identify programmatic objects (e.g. functions, classes) and perform transformations rely heavily on the Python-specific ast library.
+That said, SWE-smith’s collection strategy is transferable to other languages.
+Second, due to both compute/budget constraints and our work’s primary stance as a dataset contribution, we only include fine-tuning as a demonstration of SWE-smith’s effectiveness.
+We do not explore other training techniques such as reasoning capabilities elicited via reinforcement learning.
+Conclusion.
+We introduce SWE-smith, a dataset of 50k software engineering task instances from across 128 real world GitHub repositories.
+SWE-smith’s collection pipeline allows us to scale up task instances, environments, and trajectories at a fraction of prior costs without sacrificing faithfulness to open source software development practices.
+Using SWE-smith, we train SWE-agent-LM-32B, achieving a state-of-the-art 40.2% on SWE-bench Verified.
+Our experiments show how SWE-smith can be used to identify fundamental trends about developing SWE-agents.
+We believe SWE-smith provides the foundational data and infrastructure needed to train software engineering agents in a truly scalable manner.
+Acknowledgments
+We thank Princeton Language & Intelligence (PLI) for providing credits for running closed-source API models.
+Thanks to Samuel Ainsworth for his constant support of bitbop.io (https://bitbop.io/), the compute service with which the majority of the project was carried out.
+We’d also like to thank Akshat Bubna, Howard Halim, Andrew Liu, Peyton Walters, and the great team at Modal (
+https://modal.com/
+) for providing credits that made fine-tuning and model serving efforts extremely easy for this project.
+This work is partially supported by ONR grant N000142412532 and NSF grant IIS-2247357.
+We also thank Open Philanthropy and Andreessen Horowitz for providing funding for this work.
+Finally, thanks to Tianyu Gao, William Held, Niklas Muennighoff, Rafael Rafailov, Yijia Shao, Chenglei Si, Anikait Singh, Tianyi Zhang, Kexin Pei, and Karthik Narasimhan for constructive discussions and support throughout this project.
+References
+Anthropic (2024)
+Anthropic.
+Introducing claude 3.5 sonnet, 2024.
+URL
+https://www.anthropic.com/news/claude-3-5-sonnet
+.
+Anthropic (2025)
+Anthropic.
+Introducing claude 3.7 sonnet, 2025.
+URL
+https://www.anthropic.com/news/claude-3-7-sonnet
+.
+Antoniades et al. (2024)
+Antonis Antoniades, Albert Örwall, Kexun Zhang, Yuxi Xie, Anirudh Goyal, and William Wang.
+SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement, December 2024.
+URL
+http://arxiv.org/abs/2410.20285
+.
+arXiv:2410.20285 [cs].
+Austin et al. (2021)
+Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, and Charles Sutton.
+Program Synthesis with Large Language Models, August 2021.
+URL
+http://arxiv.org/abs/2108.07732
+.
+arXiv:2108.07732 [cs].
+Badertdinov et al. (2024)
+Ibragim Badertdinov, Maria Trofimova, Yuri Anapolskiy, Sergey Abramov, Karina Zainullina, Alexander Golubev, Sergey Polezhaev, Daria Litvintseva, Simon Karasik, Filipp Fisin, Sergey Skvortsov, Maxim Nekrashevich, Anton Shevtsov, and Boris Yangel.
+Scaling data collection for training software engineering agents.
+Nebius blog
+, 2024.
+Bogin et al. (2024)
+Ben Bogin, Kejuan Yang, Shashank Gupta, Kyle Richardson, Erin Bransom, Peter Clark, Ashish Sabharwal, and Tushar Khot.
+Super: Evaluating agents on setting up and executing tasks from research repositories, 2024.
+URL
+https://arxiv.org/abs/2409.07440
+.
+Chen et al. (2023)
+Baian Chen, Chang Shu, Ehsan Shareghi, Nigel Collier, Karthik Narasimhan, and Shunyu Yao.
+FireAct: Toward Language Agent Fine-tuning, October 2023.
+URL
+http://arxiv.org/abs/2310.05915
+.
+arXiv:2310.05915 [cs].
+Chen et al. (2021)
+Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, and Wojciech Zaremba.
+Evaluating Large Language Models Trained on Code, July 2021.
+URL
+http://arxiv.org/abs/2107.03374
+.
+arXiv:2107.03374 [cs].
+Chowdhury et al. (2024)
+Neil Chowdhury, James Aung, Chan Jun Shern, Oliver Jaffe, Dane Sherburn, Giulio Starace, Evan Mays, Rachel Dias, Marwan Aljubeh, Mia Glaese, et al.
+Introducing swe-bench verified, 2024.
+URL https://openai.com/index/introducing-swe-bench-verified
+, 2024.
+Daniel Han & team (2023)
+Daniel Han, Michael Han, and Unsloth team.
+Unsloth, 2023.
+URL
+http://github.com/unslothai/unsloth
+.
+Eliseeva et al. (2025)
+Aleksandra Eliseeva, Alexander Kovrigin, Ilia Kholkin, Egor Bogomolov, and Yaroslav Zharov.
+Envbench: A benchmark for automated environment setup, 2025.
+URL
+https://arxiv.org/abs/2503.14443
+.
+Haluptzok et al. (2023)
+Patrick Haluptzok, Matthew Bowers, and Adam Tauman Kalai.
+Language models can teach themselves to program better, 2023.
+URL
+https://arxiv.org/abs/2207.14502
+.
+Hu et al. (2021)
+Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen.
+Lora: Low-rank adaptation of large language models, 2021.
+URL
+https://arxiv.org/abs/2106.09685
+.
+Hui et al. (2024)
+Binyuan Hui, Jian Yang, Zeyu Cui, Jiaxi Yang, Dayiheng Liu, Lei Zhang, Tianyu Liu, Jiajun Zhang, Bowen Yu, Keming Lu, Kai Dang, Yang Fan, Yichang Zhang, An Yang, Rui Men, Fei Huang, Bo Zheng, Yibo Miao, Shanghaoran Quan, Yunlong Feng, Xingzhang Ren, Xuancheng Ren, Jingren Zhou, and Junyang Lin.
+Qwen2.5-coder technical report, 2024.
+URL
+https://arxiv.org/abs/2409.12186
+.
+Jain et al. (2024)
+Naman Jain, Manish Shetty, Tianjun Zhang, King Han, Koushik Sen, and Ion Stoica.
+R2e: Turning any github repository into a programming agent environment.
+In
+ICML 2024
+, 2024.
+Jain et al. (2025)
+Naman Jain, Jaskirat Singh, Manish Shetty, Liang Zheng, Koushik Sen, and Ion Stoica.
+R2e-gym: Procedural environments and hybrid verifiers for scaling open-weights swe agents, 2025.
+URL
+https://arxiv.org/abs/2504.07164
+.
+Jimenez et al. (2024a)
+Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan.
+Swe-bench leaderboard, 2024a.
+URL
+https://swe-bench.github.io/
+.
+Jimenez et al. (2024b)
+Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan.
+SWE-bench: Can Language Models Resolve Real-World GitHub Issues?, November 2024b.
+URL
+http://arxiv.org/abs/2310.06770
+.
+arXiv:2310.06770 [cs].
+Liu et al. (2024a)
+Jiawei Liu, Thanh Nguyen, Mingyue Shang, Hantian Ding, Xiaopeng Li, Yu Yu, Varun Kumar, and Zijian Wang.
+Learning code preference via synthetic evolution, 2024a.
+URL
+https://arxiv.org/abs/2410.03837
+.
+Liu et al. (2024b)
+Zhihan Liu, Shenao Zhang, Yongfei Liu, Boyi Liu, Yingxiang Yang, and Zhaoran Wang.
+DSTC: Direct Preference Learning with Only Self-Generated Tests and Code to Improve Code LMs, December 2024b.
+URL
+http://arxiv.org/abs/2411.13611
+.
+arXiv:2411.13611 [cs].
+Luo et al. (2023)
+Ziyang Luo, Can Xu, Pu Zhao, Qingfeng Sun, Xiubo Geng, Wenxiang Hu, Chongyang Tao, Jing Ma, Qingwei Lin, and Daxin Jiang.
+Wizardcoder: Empowering code large language models with evol-instruct, 2023.
+URL
+https://arxiv.org/abs/2306.08568
+.
+Ma et al. (2024)
+Yingwei Ma, Rongyu Cao, Yongchang Cao, Yue Zhang, Jue Chen, Yibo Liu, Yuchen Liu, Binhua Li, Fei Huang, and Yongbin Li.
+Lingma swe-gpt: An open development-process-centric language model for automated software improvement, 2024.
+URL
+https://arxiv.org/abs/2411.00622
+.
+Modal (2025)
+Modal.
+Modal: High-performance ai infrastructure, 2025.
+URL
+https://modal.com/
+.
+Muennighoff et al. (2024)
+Niklas Muennighoff, Qian Liu, Armel Zebaze, Qinkai Zheng, Binyuan Hui, Terry Yue Zhuo, Swayam Singh, Xiangru Tang, Leandro von Werra, and Shayne Longpre.
+OctoPack: Instruction Tuning Code Large Language Models, February 2024.
+URL
+http://arxiv.org/abs/2308.07124
+.
+arXiv:2308.07124 [cs].
+Murty et al. (2024)
+Shikhar Murty, Dzmitry Bahdanau, and Christopher D. Manning.
+NNetscape Navigator: Complex Demonstrations for Web Agents Without a Demonstrator, October 2024.
+URL
+http://arxiv.org/abs/2410.02907
+.
+arXiv:2410.02907 [cs].
+Mündler et al. (2025)
+Niels Mündler, Mark Niklas Müller, Jingxuan He, and Martin Vechev.
+Swt-bench: Testing and validating real-world bug-fixes with code agents, 2025.
+URL
+https://arxiv.org/abs/2406.12952
+.
+OpenAI (2024a)
+OpenAI.
+Gpt-4o system card, 2024a.
+URL
+https://arxiv.org/abs/2410.21276
+.
+OpenAI (2024b)
+OpenAI.
+Openai o3-mini system card, 2024b.
+URL
+https://cdn.openai.com/o3-mini-system-card-feb10.pdf
+.
+Orwall (2024)
+Albert Orwall.
+Moatless tools, 2024.
+URL
+https://github.com/aorwall/moatless-tools
+.
+Ou et al. (2024)
+Tianyue Ou, Frank F. Xu, Aman Madaan, Jiarui Liu, Robert Lo, Abishek Sridhar, Sudipta Sengupta, Dan Roth, Graham Neubig, and Shuyan Zhou.
+Synatra: Turning Indirect Knowledge into Direct Demonstrations for Digital Agents at Scale, September 2024.
+URL
+http://arxiv.org/abs/2409.15637
+.
+arXiv:2409.15637 [cs].
+Pan et al. (2024)
+Jiayi Pan, Xingyao Wang, Graham Neubig, Navdeep Jaitly, Heng Ji, Alane Suhr, and Yizhe Zhang.
+Training Software Engineering Agents and Verifiers with SWE-Gym, December 2024.
+URL
+http://arxiv.org/abs/2412.21139
+.
+arXiv:2412.21139 [cs].
+PyTorch (2024)
+PyTorch.
+torchtune: Pytorch’s finetuning library, April 2024.
+URL
+https://github.com/pytorch/torchtune
+.
+Qwen et al. (2025)
+Qwen, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tianyi Tang, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu.
+Qwen2.5 Technical Report, January 2025.
+URL
+http://arxiv.org/abs/2412.15115
+.
+arXiv:2412.15115 [cs].
+Shen et al. (2024)
+Junhong Shen, Atishay Jain, Zedian Xiao, Ishan Amlekar, Mouad Hadji, Aaron Podolny, and Ameet Talwalkar.
+Scribeagent: Towards specialized web agents using production-scale workflow data, 2024.
+URL
+https://arxiv.org/abs/2411.15004
+.
+Shypula et al. (2024)
+Alexander Shypula, Aman Madaan, Yimeng Zeng, Uri Alon, Jacob Gardner, Milad Hashemi, Graham Neubig, Parthasarathy Ranganathan, Osbert Bastani, and Amir Yazdanbakhsh.
+Learning performance-improving code edits, 2024.
+URL
+https://arxiv.org/abs/2302.07867
+.
+Sumers et al. (2024)
+Theodore R. Sumers, Shunyu Yao, Karthik Narasimhan, and Thomas L. Griffiths.
+Cognitive architectures for language agents, 2024.
+URL
+https://arxiv.org/abs/2309.02427
+.
+Vergopoulos et al. (2025)
+Konstantinos Vergopoulos, Mark Niklas Müller, and Martin Vechev.
+Automated benchmark generation for repository-level coding tasks, 2025.
+URL
+https://arxiv.org/abs/2503.07701
+.
+Wang et al. (2024)
+Xingyao Wang, Boxuan Li, Yufan Song, Frank F. Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, Hoang H. Tran, Fuqiang Li, Ren Ma, Mingzhang Zheng, Bill Qian, Yanjun Shao, Niklas Muennighoff, Yizhe Zhang, Binyuan Hui, Junyang Lin, Robert Brennan, Hao Peng, Heng Ji, and Graham Neubig.
+OpenHands: An Open Platform for AI Software Developers as Generalist Agents, October 2024.
+URL
+http://arxiv.org/abs/2407.16741
+.
+arXiv:2407.16741 [cs].
+Wei et al. (2024a)
+Yuxiang Wei, Federico Cassano, Jiawei Liu, Yifeng Ding, Naman Jain, Zachary Mueller, Harm de Vries, Leandro von Werra, Arjun Guha, and Lingming Zhang.
+SelfCodeAlign: Self-Alignment for Code Generation, November 2024a.
+URL
+http://arxiv.org/abs/2410.24198
+.
+arXiv:2410.24198 [cs].
+Wei et al. (2024b)
+Yuxiang Wei, Zhe Wang, Jiawei Liu, Yifeng Ding, and Lingming Zhang.
+Magicoder: Empowering code generation with oss-instruct, 2024b.
+URL
+https://arxiv.org/abs/2312.02120
+.
+Wei et al. (2025)
+Yuxiang Wei, Olivier Duchenne, Jade Copet, Quentin Carbonneaux, Lingming Zhang, Daniel Fried, Gabriel Synnaeve, Rishabh Singh, and Sida I. Wang.
+SWE-RL: Advancing llm reasoning via reinforcement learning on open software evolution, 2025.
+URL
+https://arxiv.org/abs/2502.18449
+.
+Xia et al. (2024)
+Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang.
+Agentless: Demystifying LLM-based Software Engineering Agents, October 2024.
+URL
+http://arxiv.org/abs/2407.01489
+.
+arXiv:2407.01489 [cs].
+Xiang et al. (2023)
+Jiannan Xiang, Tianhua Tao, Yi Gu, Tianmin Shu, Zirui Wang, Zichao Yang, and Zhiting Hu.
+Language Models Meet World Models: Embodied Experiences Enhance Language Models, October 2023.
+URL
+http://arxiv.org/abs/2305.10626
+.
+arXiv:2305.10626 [cs].
+Xie et al. (2025a)
+Chengxing Xie, Bowen Li, Chang Gao, He Du, Wai Lam, Difan Zou, and Kai Chen.
+Swe-fixer: Training open-source llms for effective and efficient github issue resolution, 2025a.
+URL
+https://arxiv.org/abs/2501.05040
+.
+Xie et al. (2024)
+Tianbao Xie, Danyang Zhang, Jixuan Chen, Xiaochuan Li, Siheng Zhao, Ruisheng Cao, Toh Jing Hua, Zhoujun Cheng, Dongchan Shin, Fangyu Lei, Yitao Liu, Yiheng Xu, Shuyan Zhou, Silvio Savarese, Caiming Xiong, Victor Zhong, and Tao Yu.
+OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments, May 2024.
+URL
+http://arxiv.org/abs/2404.07972
+.
+arXiv:2404.07972 [cs].
+Xie et al. (2025b)
+Yiqing Xie, Alex Xie, Divyanshu Sheth, Pengfei Liu, Daniel Fried, and Carolyn Rose.
+Repost: Scalable repository-level coding environment construction with sandbox testing, 2025b.
+URL
+https://arxiv.org/abs/2503.07358
+.
+Xu et al. (2024)
+Yiheng Xu, Dunjie Lu, Zhennan Shen, Junli Wang, Zekun Wang, Yuchen Mao, Caiming Xiong, and Tao Yu.
+AgentTrek: Agent Trajectory Synthesis via Guiding Replay with Web Tutorials, December 2024.
+URL
+http://arxiv.org/abs/2412.09605
+.
+arXiv:2412.09605 [cs].
+Yang et al. (2023)
+John Yang, Akshara Prabhakar, Karthik Narasimhan, and Shunyu Yao.
+InterCode: Standardizing and Benchmarking Interactive Coding with Execution Feedback, October 2023.
+URL
+http://arxiv.org/abs/2306.14898
+.
+arXiv:2306.14898 [cs].
+Yang et al. (2024a)
+John Yang, Carlos E. Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, Karthik Narasimhan, and Ofir Press.
+SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering, November 2024a.
+URL
+http://arxiv.org/abs/2405.15793
+.
+arXiv:2405.15793 [cs].
+Yang et al. (2024b)
+John Yang, Carlos E. Jimenez, Alex L. Zhang, Kilian Lieret, Joyce Yang, Xindi Wu, Ori Press, Niklas Muennighoff, Gabriel Synnaeve, Karthik R. Narasimhan, Diyi Yang, Sida I. Wang, and Ofir Press.
+SWE-bench Multimodal: Do AI Systems Generalize to Visual Software Domains?, October 2024b.
+URL
+http://arxiv.org/abs/2410.03859
+.
+arXiv:2410.03859 [cs].
+Yao et al. (2023a)
+Shunyu Yao, Howard Chen, John Yang, and Karthik Narasimhan.
+WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents, February 2023a.
+URL
+http://arxiv.org/abs/2207.01206
+.
+arXiv:2207.01206 [cs].
+Yao et al. (2023b)
+Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao.
+ReAct: Synergizing Reasoning and Acting in Language Models, March 2023b.
+URL
+http://arxiv.org/abs/2210.03629
+.
+arXiv:2210.03629 [cs].
+Yu et al. (2024)
+Zhaojian Yu, Xin Zhang, Ning Shang, Yangyu Huang, Can Xu, Yishujie Zhao, Wenxiang Hu, and Qiufeng Yin.
+Wavecoder: Widespread and versatile enhancement for code large language models by instruction tuning, 2024.
+URL
+https://arxiv.org/abs/2312.14187
+.
+Yuan et al. (2023)
+Zheng Yuan, Hongyi Yuan, Chengpeng Li, Guanting Dong, Keming Lu, Chuanqi Tan, Chang Zhou, and Jingren Zhou.
+Scaling relationship on learning mathematical reasoning with large language models, 2023.
+URL
+https://arxiv.org/abs/2308.01825
+.
+Zan et al. (2025)
+Daoguang Zan, Zhirong Huang, Wei Liu, Hanwu Chen, Linhao Zhang, Shulin Xin, Lu Chen, Qi Liu, Xiaojian Zhong, Aoyan Li, Siyao Liu, Yongsheng Xiao, Liangqiang Chen, Yuyu Zhang, Jing Su, Tianyu Liu, Rui Long, Kai Shen, and Liang Xiang.
+Multi-swe-bench: A multilingual benchmark for issue resolving, 2025.
+URL
+https://arxiv.org/abs/2504.02605
+.
+Zhang et al. (2024a)
+Kexun Zhang, Weiran Yao, Zuxin Liu, Yihao Feng, Zhiwei Liu, Rithesh Murthy, Tian Lan, Lei Li, Renze Lou, Jiacheng Xu, Bo Pang, Yingbo Zhou, Shelby Heinecke, Silvio Savarese, Huan Wang, and Caiming Xiong.
+Diversity Empowers Intelligence: Integrating Expertise of Software Engineering Agents, August 2024a.
+URL
+http://arxiv.org/abs/2408.07060
+.
+arXiv:2408.07060 [cs].
+Zhang et al. (2024b)
+Yuntong Zhang, Haifeng Ruan, Zhiyu Fan, and Abhik Roychoudhury.
+AutoCodeRover: Autonomous Program Improvement, July 2024b.
+URL
+http://arxiv.org/abs/2404.05427
+.
+arXiv:2404.05427 [cs].
+Zhao et al. (2024)
+Wenting Zhao, Nan Jiang, Celine Lee, Justin T. Chiu, Claire Cardie, Matthias Gallé, and Alexander M. Rush.
+Commit0: Library Generation from Scratch, December 2024.
+URL
+http://arxiv.org/abs/2412.01769
+.
+arXiv:2412.01769 [cs].
+Zhou et al. (2024)
+Shuyan Zhou, Frank F. Xu, Hao Zhu, Xuhui Zhou, Robert Lo, Abishek Sridhar, Xianyi Cheng, Tianyue Ou, Yonatan Bisk, Daniel Fried, Uri Alon, and Graham Neubig.
+WebArena: A Realistic Web Environment for Building Autonomous Agents, April 2024.
+URL
+http://arxiv.org/abs/2307.13854
+.
+arXiv:2307.13854 [cs].
+Appendix
+The appendix is generally structured as follows.
+In Sections A to D, we review details about SWE-smith’s infrastructure and collection strategies for curating the SWE-smith task instances and execution environments, providing comparisons to existing datasets such as SWE-bench and SWE-Gym along the way.
+In Sections E and onward, we discuss more about how we created the trajectories dataset, then provide additional ablations and results showcasing the effectiveness of SWE-smith as a dataset.
+Figure 8:
+An overview of pipelines in SWE-smith.
+Scripts/functions and manual steps are highlighted in blue.
+Artifacts that are also the inputs and outputs of these scripts are in orange.
+SWE-smith fits in seamlessly with the SWE-bench and SWE-agent ecosystem.
+Use SWE-smith to construct execution environments and generate task instances.
+Use SWE-agent to generate expert trajectories on SWE-smith task instances and run inference with models trained on these trajectories.
+Use SWE-bench to evaluate how good your models are at resolving GitHub issues and performing software engineering tasks.
+Appendix A
+Infrastructure
+We cover additional details about how SWE-smith works, specifically:
+• The form factor of a SWE-smith task instance.
+• How we identify repositories and the SWE-agent configuration we use to automatically install them.
+• How the task validation and evaluation harnesses work.
+A.1
+SWE-smith Task Instance
+We briefly review the format of a SWE-smith task instance, highlight how it is different from a SWE-bench task instance, and discuss why SWE-smith’s relatively simple infrastructure compared to SWE-bench allows us to scale task collection much more efficiently.
+A SWE-smith task instance is very similar in form factor to a SWE-bench task instance, with several minor differences.
+A SWE-smith task instance includes the following fields:
+•
+repo
+: The repository the task instance is from.
+•
+instance_id
+: A unique identifier (usually
+(repo).(bug_type).(hash)
+)
+•
+base_commit
+: Hash of the GitHub branch that points to the repository with the bug
+patch
+applied.
+•
+patch
+: The
+diff
+that causes the bug. It is applied to the original codebase to create the bug. Reverting this patch is effectively the solution.
+•
+problem_statement
+: The generated issue text that conveys the bug.
+It is provided to a model or system before it begins attempting a fix.
+•
+created_at
+: A timestamp matching when the bug was successfully validated and pushed to the mirror repository as a branch.
+•
+FAIL_TO_PASS
+: The unit tests that break when the test suite is run with the bug
+patch
+applied.
+•
+PASS_TO_PASS
+: The unit tests that do not break.
+These correspond to the set of all tests minus the
+FAIL_TO_PASS
+tests.
+We summarize the key distinctions between a SWE-smith and a SWE-bench task instance:
+• SWE-smith task instances do not include the version or environment_setup_commit fields, which SWE-bench requires as additional identifiers for specifying repository-specific installation instructions across time. In SWE-smith, unique installation instructions are specified for each (repository, commit).
+•
+The
+hints_text
+field is not included. In SWE-bench, this refers to the issue and PR thread comments written after the first commit of the corresponding PR.
+•
+The
+created_at
+field is assigned the timestamp reflecting when the bug was successfully validated.
+Originally,
+created_at
+refers to when a PR was created.
+• There is no test_patch field, as the SWE-smith collection pipeline does not create or synthesize any hidden tests. All FAIL_TO_PASS tests are visible and runnable in the repository at inference time.
+A.2
+Repository Selection
+In addition to the criteria discussed in Section
+2.1
+, we also ensure that a repository has a license that allows non-proprietary use.
+The majority of software licenses are permissive (BSD, MIT, Apache), while the remainder are largely protective licenses (GPL) that still allow for non-commercial use.
+We inspected the repositories with custom licenses and confirmed they allowed for the use cases exercised in our work.
+The licenses for each repository are fully listed in Table
+6
+.
+Apache License 2.0
+Project-MONAI/MONAI; alanjds/drf-nested-routers; arrow-py/arrow; buriy/python-readability; facebookresearch/fvcore; getmoto/moto; google/textfsm; iterative/dvc; jax-ml/jax; jd/tenacity; kayak/pypika; modin-project/modin; pyca/pyopenssl; spulec/freezegun; tkrajina/gpxpy; tornadoweb/tornado; weaveworks/grafanalib
+BSD 2-Clause "Simplified" License
+madzak/python-json-logger; pyasn1/pyasn1; pygments/pygments; sunpy/sunpy
+BSD 3-Clause "New" or "Revised" License
+Suor/funcy; alecthomas/voluptuous; andialbrecht/sqlparse; cookiecutter/cookiecutter; dask/dask; django/channels; django/daphne; encode/starlette; gawel/pyquery; gweis/isodate; john-kurkowski/tldextract; lepture/mistune; oauthlib/oauthlib; pallets/click; pallets/flask; pallets/jinja; pallets/markupsafe; pandas-dev/pandas; scrapy/scrapy; theskumar/python-dotenv
+GNU General Public License v3.0
+Cog-Creators/Red-DiscordBot; adrienverge/yamllint
+GNU Lesser General Public License v2.1
+chardet/chardet; paramiko/paramiko; pylint-dev/astroid
+GNU Lesser General Public License v3.0
+Knio/dominate
+ISC License
+kennethreitz/records
+MIT License
+amueller/word_cloud; borntyping/python-colorlog; bottlepy/bottle; cantools/cantools; cdgriffith/Box; cknd/stackprinter; conan-io/conan; cool-RR/PySnooper; datamade/usaddress; dbader/schedule; erikrose/parsimonious; facebookresearch/hydra; facelessuser/soupsieve; getnikola/nikola; graphql-python/graphene; hukkin/tomli; jaraco/inflect; jawah/charset_normalizer; joke2k/faker; keleshev/schema; life4/textdistance; luozhouyang/python-string-similarity; marshmallow-code/apispec; marshmallow-code/marshmallow; marshmallow-code/webargs; martinblech/xmltodict; matthewwithanm/python-markdownify; mewwts/addict; mido/mido; mozillazg/python-pinyin; msiemens/tinydb; pdfminer/pdfminer; pndurette/gTTS; pudo/dataset; pydantic/pydantic; pyparsing/pyparsing; pytest-dev/iniconfig; python-hyper/h11; python-jsonschema/jsonschema; python-openxml/python-docx; pyupio/safety; pyvista/pyvista; r1chardj0n3s/parse; rsalmei/alive-progress; rubik/radon; rustedpy/result; scanny/python-pptx; seatgeek/thefuzz; sloria/environs; sqlfluff/sqlfluff; termcolor/termcolor; tobymao/sqlglot; tox-dev/pipdeptree; tweepy/tweepy; un33k/python-slugify; vi3k6i5/flashtext
+Other
+Mimino666/langdetect; PyCQA/flake8; agronholm/exceptiongroup; agronholm/typeguard; aio-libs/async-timeout; benoitc/gunicorn; cloudpipe/cloudpickle; davidhalter/parso; django-money/django-money; gruns/furl; kurtmckee/feedparser; lincolnloop/python-qrcode; mahmoud/boltons; mahmoud/glom; mozilla/bleach; pexpect/ptyprocess; prettytable/prettytable; pwaller/pyfiglet; pydata/patsy; pydicom/pydicom; python-trio/trio; python/mypy; pyutils/line_profiler; seperman/deepdiff
+Table 6:
+License associated with each repository as of April 8, 2025. All licenses are permissive and allow for public, nonprofit use.
+We deliberately limit the search scope for repositories to those predominantly written in Python.
+Following precedents, focusing on Python repositories allowed us to form assumptions about installation and testing procedures (e.g. repository is organized as a PyPI package,
+pytest
+is the testing framework) that made scaling up automatic repository setup with SWE-agent more tractable.
+A worthwhile direction to consider for future work is expanding the coverage of repositories to be more comprehensive of codebases written in different programming languages, as Yang et al. (2024b) do, extending SWE-bench style evaluation to JavaScript repositories with multimodal inputs.
+Automated repository installation.
+The goal of this step is to first, get the installation and testing instructions for a repository, and second, create a Docker image containing the repository with the development environment set up.
+We provide the system prompt given to SWE-agent that asks it to install a repository in Figure
+6
+.
+Each repository installation task is initialized with a clone of the original repository.
+No additional steps (e.g.
+pypi
+package downloads,
+conda
+environment setup) are performed.
+We run SWE-agent with
+claude-3-5-sonnet-20241022
+with a maximum cost limit of $2 and a maximum call limit of 150.
+The installation run terminates whenever one of these conditions is met.
+For every run, we record the interactions.
+We then manually review the trajectory, identifying the appropriate installation and testing specifications.
+Each run incurs an average cost of $0.72 and an average of 17 steps before SWE-agent issues the
+submit
+command.
+The runs typically finish within two minutes.
+The majority of Python repositories require fewer steps — typically, SWE-agent will view the
+CONTRIBUTING.md
+, run the installation command provided verbatim in the text, and then runs
+pytest
+, showing all tests passing.
+A minority of repositories will require several steps because additional dependencies must be installed with
+apt-get
+.
+The manual review process following this requires 3 to 20 minutes.
+One author carried out this effort for 128 repositories, taking an estimated 18 human hours to accomplish.
+In the process of reaching 128 repositories, the author gave up on 17 repositories at the manual review stage.
+System prompt for installing a repository with SWE-agent
+<uploaded_files>
+{{working_dir}}
+</uploaded_files>
+I’ve uploaded a python code repository in the directory
+{{working_dir}}
+.
+Can you please install this repository?
+Your goal should be to configure the repository’s development environment such that existing tests pass.
+You are currently in the root directory of the repository, and nothing has been installed yet.
+You are in an Ubuntu 22.04 environment.
+The repository is predominantly written in Python. Here are several tips for installing it:
+1. A good place to start is to look for a
+CONTRIBUTING.[md|rst]
+file, which will often contain instructions on how to install the repository and any dependencies it may have. Occasionally, the
+README.md
+file may also contain installation instructions.
+2. Usually, a repository may have
+setup.py
+or
+pyproject.toml
+files which can be used to install the package.
+pip install -e .
+is commonly used, although many packages will also require an additional specifier that installs development packages as well (e.g.
+pip install -e .[dev]
+).
+3. To check whether the repository was installed successfully, run tests and see if they pass. You can usually find tests in a
+tests/
+or
+test/
+directory. You can run tests using
+pytest
+or
+unittest
+, depending on the framework used by the repository.
+4. Sometimes, you will need to install additional packages, often listed in a
+requirements.txt
+or
+environment.yml
+file. Also, be mindful of Ubuntu system dependencies that may need to be installed via
+apt-get
+(e.g.
+sudo apt-get install <package>
+).
+Once you are finished with installing the repository, run the
+submit
+command to submit your changes for review.
+Figure 6: Prompt provided to SWE-agent + an LM asking it to install a repository.
+A.3
+Validation, Evaluation Harnesses
+We adapt SWE-bench’s validation script to convert each bug patch into a SWE-bench style task instance.
+This step ensures the generated bugs can be run by existing SWE-bench solutions.
+The conversion involves two steps.
+First, the bug patch is applied and pushed as a branch to a mirror clone of the repository.
+Second, we create a SWE-bench style task instance from the bug patch, populating important fields such as Fail-to-Pass and Pass-to-Pass tests with information from the validation logs.
+Appendix B
+Bug Generation Strategies
+In this section, we review each of the bug generation strategies we employ in depth.
+While we experimented with several bug generation strategies, the ones we elect to include are those we found to satisfy several desirable properties.
+1.
+The approach works in a codebase-agnostic manner.
+2.
+The approach reliably yields usable task instances (meaning $1+$ passing tests break).
+3.
+The approach is controllable; via each strategy’s parameters, we can affect the quantity and quality of the generated bugs.
+System prompt for generating bugs with an LM
+You are a software developer doing chaos monkey testing.
+Your job is to rewrite a function such that it introduces a logical bug that will break existing unit test(s) in a codebase.
+To this end, some kinds of bugs you might introduce include:
+(Per inference call, only 3 of the following tips are randomly selected and shown)
+- Alter calculation order for incorrect results: Rearrange the sequence of operations in a calculation to subtly change the output (e.g., change (a + b) * c to a + (b * c)).
+- Introduce subtle data transformation errors: Modify data processing logic, such as flipping a sign, truncating a value, or applying the wrong transformation function.
+- Change variable assignments to alter computation state: Assign a wrong or outdated value to a variable that affects subsequent logic.
+- Mishandle edge cases for specific inputs: Change handling logic to ignore or improperly handle boundary cases, like an empty array or a null input.
+- Modify logic in conditionals or loops: Adjust conditions or loop boundaries (e.g., replace <= with <) to change the control flow.
+- Introduce off-by-one errors in indices or loop boundaries: Shift an index or iteration boundary by one, such as starting a loop at 1 instead of 0.
+- Adjust default values or constants to affect behavior: Change a hardcoded value or default parameter that alters how the function behaves under normal use.
+- Reorder operations while maintaining syntax: Rearrange steps in a process so the function produces incorrect intermediate results without breaking the code.
+- Swallow exceptions or return defaults silently: Introduce logic that catches an error but doesn’t log or handle it properly, leading to silent failures.
+Tips about the bug-introducing task:
+(At inference time, tips are randomly shuffled)
+- It should not cause compilation errors.
+- It should not be a syntax error.
+- It should be subtle and challenging to detect.
+- It should not modify the function signature.
+- It should not modify the documentation significantly.
+- For longer functions, if there is an opportunity to introduce multiple bugs, please do!
+- Please DO NOT INCLUDE COMMENTS IN THE CODE indicating the bug location or the bug itself.
+Your answer should be formatted as follows:
+Explanation:
+<explanation>
+Bugged Code:
+```
+<bugged_code>
+```
+Figure: System prompt provided to an LM to generate bugs by modifying an existing, working function.
+Text in red is not included in the actual prompt.
+B.1
+Generating with an LM
+We describe our workflows for generating bugs with an LM.
+For each function or class in a codebase, we prompt an LM to generate either a rewrite that introduces bugs or a complete re-implementation from scratch.
+This strategy is illustrated in Figure
+9
+.
+Figure 9:
+Workflow to generate bugs for a function or class with an LM.
+We first extract all functions or classes from a codebase, then enumerate across all candidates and prompt the LM to generate either a bug-laced rewrite or a re-implementation.
+Modify existing functions.
+Given a Python codebase, we use the
+ast
+library to identify all unique functions, excluding any functions found under a testing related directory (e.g.
+tests
+,
+testing
+).
+Next, given a function, the LM is asked to write a new version that introduces logical, runtime bugs.
+Within the prompt, shown in Figure
+B
+, several suggestions of types of bugs along with a demonstration of a rewrite are provided.
+Prompts for reimplementing bugs with an LM
+System Prompt
+You are a software developer and you have been asked to implement a function.
+You will be given the contents of an entire file, with one or more functions defined in it.
+Please implement the function(s) that are missing.
+Do NOT modify the function signature, including the function name, parameters, return types, or docstring if provided.
+Do NOT change any other code in the file.
+You should not use any external libraries.
+Task Instance Prompt
+Please implement the function func_signature in the following code:
+{file_src_code}
+Remember, you should not modify the function signature, including the function name, parameters, return types, or docstring if provided.
+Do NOT change any other code in the file.
+Format your output as:
+[explanation]
+{func_to_write}
+Figure: System prompt provided to an LM to generate bugs by re-implementing an existing target function.
+file_src_code
+refers to the original source file minus the target function’s original implementation.
+func_to_write
+refers to the signature and docstring of the target function.
+In our experiments, we use OpenAI’s o3 mini model
+(OpenAI,
+2024b
+)
+(
+o3-mini-2025-01-31
+) as the main base model for bug generation.
+Based on our empirical observations of an LM’s tendencies, we include several explicit guidelines in the prompt about what the rewrite should not do.
+Notably, it is important to ask the LM to not generate any inline comments denoting the location of a bug; we observe that without explicitly specifying this, model generation outputs tend to have inline comments pointing out the bug.
+We also want to avoid the complexities of identifying and removing such comments from a file diff representation.
+Second, we state that rewrites causing compilation or syntax errors (e.g. undeclared variables, function definition modifications) should be avoided because such bugs are relatively trivial to solve.
+We do not experiment extensively with different prompts or generating multiple buggy rewrites per function.
+Modify existing classes.
+This method involves a simple amendment to the function rewriting approach.
+Instead of identifying unique functions (
+ast.FunctionDef
+), the codebase traversal logic instead looks for classes (
+ast.ClassDef
+).
+Otherwise, all other aspects of the implementation are near identical to function rewriting, with minor changes to the prompt to make bug suggestions and the demonstration more class oriented.
+Rewrite existing functions.
+Instead of providing an LM with the original function, we explore an alternative strategy of asking an LM to re-implement a function from scratch.
+Similar to above, we again use the
+ast
+library to identify all unique functions.
+However, instead of directly asking for a bug, we remove the function’s implementation, then prompt the LM with the entire file containing the function (minus the original implementation).
+In the task description, we then explicitly ask for the LM to implement the function without changing the function signature.
+B.2
+Procedural Modification
+We explore a zero-cost approach to create bugs by performing random modifications to the
+ast
+representation of a function or class.
+A “procedural modification” refers to a function that takes in an
+ast
+and applies a fixed transformation to it, such as removing a loop or swapping the blocks of an if/else clause.
+This strategy is illustrated in Figure
+10
+.
+Figure 10:
+Workflow to generate bugs via procedural modifications.
+Per function/class, the source code is first converted into an
+ast
+.
+The modification then mutates the
+ast
+(e.g. removes an assignment statement).
+The
+ast
+is then converted back into source code with the specific modification introduced.
+Similar to the workflow for generating bugs with an LM, we first identify all functions or classes in a repository.
+Per procedural modification, we first impose a set of criteria that filters out any candidates for which the modification would be impossible.
+For instance, if the procedural modification removes a random conditional from a function, the modification’s criteria will filter out any candidates that are not functions or do not have a conditional.
+For the remaining candidates, the procedural modification is applied with controlled
+likelihood
+, where
+likelihood
+is a fraction indicating how often the procedural modification is applied within a candidate.
+For example, if the procedural modification removes a random conditional with a likelihood of 0.5, then for every conditional declared within the function, there is a 50% chance it gets removed.
+We introduce
+likelihood
+so procedural modifications do not lead to changes that are too difficult.
+Finally, the modified
+ast
+is converted back into source code.
+Table
+7
+is a complete list of filtering criteria that is used for any procedural modification.
+For the
+filter_min_complexity
+and
+filter_max_complexity
+criteria, we define a simple definition of “complexity” as a sum of the number of conditional blocks, loops, boolean operators, exception handling blocks, and comparison operators in a function.
+The purpose of
+filter_min_complexity
+is to remove simple, uninteresting functions (e.g. getter, setter methods) from consideration.
+filter_max_complexity
+is occasionally used to avoid changing long, monolithic functions.
+Index
+Criteria
+Description
+1
+filter_functions
+Is the
+ast
+a function definition
+2
+filter_classes
+Is the
+ast
+a class definition
+3
+filter_classes_has_base
+Is the
+ast
+a class definition with parents
+4
+filter_loops
+Does the
+ast
+contain a
+For
+or
+While
+loop?
+5
+filter_conditionals
+Does the
+ast
+contain a conditional block?
+6
+filter_assignments
+Is the
+ast
+a function def. with assignments?
+7
+filter_wrappers
+Does the
+ast
+contain
+try
+or
+with
+blocks?
+8
+filter_if_else
+Does the
+ast
+contain an
+if-else
+block?
+9
+filter_operators
+Does the
+ast
+contain binary, boolean operators?
+10
+filter_min_complexity
+Is the
+ast
+$\geq$
+a complexity score?
+11
+filter_max_complexity
+Is the
+ast
+$\leq$
+a complexity score?
+Table 7:
+Pool of criteria used to filter for functions or classes with specific properties.
+Per procedural modification, a subset of these criteria is first used to filter functions and/or classes from a codebase.
+The modification is then run on the remainder.
+Table
+8
+is an exhaustive list of all procedural modifications used to create bugs in a codebase.
+Procedural Modification
+Criteria
+Description
+Class
+Remove Functions
+2, 10
+Removes method(s) + reference(s).
+Remove Parent
+3, 10
+Removes base class from class header.
+Shuffle Methods
+2, 10
+Shuffles method definitions in a class.
+Control
+Invert If/Else
+8
+Inverts the if-else bodies of a condition.
+Flow
+Shuffle Lines
+11, 12
+Shuffles the lines of a function.
+Expressions
+Change Constants
+1, 9, 10
+$\pm 1$ to a constant numeric value.
+Break Chains
+1, 9, 10
+Removes operator(s), operator(s).
+Swap Operands
+1, 9, 10
+Mixes order of operands.
+Change Operator
+1, 9, 10
+Changes operator(s) (e.g. $+$ to $-$).
+Removal
+Loops
+1, 4, 10
+Remove loops (e.g.
+for
+,
+while
+).
+Conditionals
+1, 5, 10
+Remove conditionals (
+if
+).
+Assignments
+1, 6, 10
+Remove assignment statements.
+Wrappers
+1, 7, 10
+Remove exception (
+try
+), context (
+with
+).
+Table 8:
+The 13 procedural modification techniques we use to create bugs in a codebase.
+The “Criteria” column contains indices referencing the corresponding filter defined in Table
+7
+.
+There are four informal categories — Class, Control Flow, Expressions, Removal — which indicate the general type of modification being made.
+B.3
+Combine Bug Patches
+We discuss the two strategies we use to combine bug patches from the same file or the same module.
+In practice, we combine LM and procedurally generated bugs that have been validated successfully as usable task instances.
+Figure 11:
+Workflow to generate bugs by combining bug patches.
+We take $n$ patches (generated using an LM or procedural modification), then sequentially apply each bug patch to the codebase.
+If all individual patches apply successfully, we save the resulting single patch which now represents all $n$ bugs combined.
+From the same file.
+If two or more functions are defined within a single file, this strategy merges the function-level bug patches together.
+Given $n$ function-level bugs and $k$ as the number of bugs to combine, there are $\binom{n}{k}$ unique file-level candidate bug patches, which can be a large search space to cover.
+To make the search space tractable, ensure no single function-level bug is repeatedly used, and generate instances that reliably have $1+$ Fail to Pass tests, we implement the following approach described in Algorithm 1.
+\begin{algorithmic}
+\Require $codebase$, $bugs$; $num\_bugs$, $limit\_per\_file$; $max\_combos$
+\Require $min\_bugs \geq 2$; $max\_bugs \geq min\_bugs$
+\Procedure{CombineFileBugs}{}
+\For{each $file$ in $codebase$}
+\State $file\_bugs \leftarrow$ bugs that apply to $file$
+\State $combinations \leftarrow$ get\_combos($file\_bugs$, $num\_bugs$, $max\_combos$)
+\For{each $combo$ in $combinations$}
+\State Apply $combo$ to $codebase$
+\If{success}
+\State Save $combo$ to disk
+\If{$limit\_per\_file$ reached}
+\State \textbf{break}
+\EndIf
+\State $combinations \leftarrow$ [c for c in $combinations$ if c $\cap$ $combo = \emptyset$]
+\EndIf
+\EndFor
+\EndFor
+\EndProcedure
+\end{algorithmic}
+Algorithm 1
+Combine multiple patches from the same file.
+For each file in a codebase, we first identify the function-level bugs (or bug patches) that edit that file.
+The pool of bugs we draw from has been
+validated
+, meaning we have already ensured there is $1+$ Fail to Pass test(s) associated with the bug.
+From this pool of
+file_bugs
+, the
+get_combos
+function then generates up to
+max_combos
+sets of bugs, where the size of each set is
+num_bugs
+.
+For each
+combo
+, or set of bugs, the bugs are applied to the codebase one by one.
+If all patches are successfully combined, this means they were successfully merged, and the merged patch, which consists of multiple function-level bugs, is saved and re-validated as a single bug.
+Merging patches occasionally fails if there is an overlapping conflict between two files, akin to a merge conflict with
+git
+; this usually happens when a function is declared within another.
+To ensure a function-level bug is only used once, any remaining bug sets in
+combinations
+using any patch in
+combo
+are removed.
+The
+limit_per_file
+and
+max_combos
+parameters prevent any one file from being over-represented and constrain an otherwise combinatorially large search space.
+We run this algorithm across all codebase files, typically setting num_bugs $= [2, 4]$, limit_per_file $= 3$, max_combos $= 40$.
+Decreasing
+num_bugs
+or increasing the other three parameters improves the yield.
+From the same module.
+There are several ways one could imagine composing function-level bugs from multiple bugs, such as combining those that break the same test or have a programmatic relationship (e.g. function
+a
+calls function
+b
+).
+We found a relatively straightforward and effective approach to be combining files that edit the same “module”.
+By “module” we are referring to a subdirectory within the source code (e.g.
+sklearn/feature_extraction
+,
+astropy/convolution
+).
+Out of all SWE-bench instances that edit $2+$ files, 75% modify files within the same submodule, suggesting a high degree of intra-module code changes.
+The implementation for our approach is described in Algorithm 2.
+\begin{algorithmic}
+\Require $bugs$; $num\_bugs$; $limit\_per\_module$; $max\_combos$; $depth$
+\Require $num\_bugs \geq 2$
+\Procedure{CombineModuleBugs}{}
+\State $map\_path\_to\_bugs \leftarrow \{\}$
+\For{each $bug$ in $bugs$}
+\State $path \leftarrow$ get\_path\_from(bug)
+\State $map\_path\_to\_patches[path] \leftarrow [bug]$
+\EndFor
+\State Collapse nested paths based on $depth$
+\ForAll{$(path, patches)$ in $map\_path\_to\_patches$}
+\State $combinations \leftarrow$ get\_combos(patches, $num\_bugs$, $max\_combos$)
+\For{each $combo$ in $combinations$}
+\State Apply $combo$ to $codebase$
+\If{success and num\_files\_changed(combo) $\geq 2$}
+\State Save $combo$ to disk
+\If{$limit\_per\_module$ reached}
+\State \textbf{break}
+\EndIf
+\State $combinations \leftarrow$ [c for c in $combinations$ if c $\cap$ $combo = \emptyset$]
+\EndIf
+\EndFor
+\EndFor
+\EndProcedure
+\end{algorithmic}
+Algorithm 2
+Combine multiple patches from the same module.
+The implementation for this approach is similar to Algorithm
+1
+with two key changes.
+First, we do not do file-by-file or folder-by-folder traversal.
+Instead, using the diff patches, we create a dictionary
+map_path_to_bugs
+that mimics the file structure of a codebase.
+For example, if
+bug
+modifies path
+a/b/c/d.py
+, it is represented as
+map_path_to_bugs[a][b][c][d.py] = [bug]
+.
+Additional bugs that modify the same path are appended to the list.
+Since every bug is a function-level bug, there will never be a bug registered in multiple lists.
+We then “collapse” up to
+depth
+indices.
+So for instance, at depth $= 3$, the above data structure is collapsed into
+map_path_to_bugs[a/b/c][d.py] = [bug]
+.
+Finally, any nested dictionaries are collapsed into a single list of patches (e.g.
+map_path_to_bugs[a/b/c] = [bug]
+).
+Mirroring the procedure in Algorithm
+1
+, we then iterate across this dictionary’s values (lists of bugs).
+Second, we only save patches that modify $2+$ files; aggregate bugs (represented by
+combo
+) modifying a single file are not considered.
+Again, we run this strategy across all 100 repositories, with parameters num_bugs $= [2, 5]$, limit_per_module $= 10$, max_combos $= 100$, and depth $= 2$.
+Reducing
+num_bugs
+,
+depth
+and increasing the other parameters yields more bugs.
+We choose a
+depth
+of
+$2$
+because empirically, we find that meaningful modules are usually declared as immediate sub-folders of the main source code folder (e.g. in
+sklearn/feature_extraction
+,
+sklearn
+is the source code folder while
+feature_extraction
+is the module).
+A shallower depth leads to less meaningful groupings, while yield decreases significantly for every increased level of depth, particularly for smaller repositories.
+B.4
+Pull Request Mirroring
+We finally discuss the fourth and last strategy for generating bugs - mirroring real world pull requests (PR).
+We visualize this process in Figure
+12
+.
+Figure 12:
+Workflow to generate bugs by reverting changes made in the diff patch corresponding to a real GitHub pull request (PR).
+Given the patch and the files modified by the patch, we prompt the LM to generate a complete rewrite of each file that
+reverses
+the changes made in the PR.
+The changes are applied to the codebase, and we extract the patch, which now captures the reversal of the PR changes.
+Why use an LM?
+When we initially implemented this approach, we attempted to directly perform a
+git apply --reverse [patch]
+on the codebase.
+However, for the large majority of patches, this fails.
+We performed troubleshooting by inspecting
+$100$
+PR patches on the
+sqlfluff/sqlfluff
+repository, leading us to two observations.
+1.
+The majority of these PRs reflect changes that remain present in the codebase today (making the bug creation promising).
+2.
+However, many patches cannot be reversed because the exact location (e.g. lines, file) of the relevant code changed because of other changes.
+Therefore, we employ LMs to perform patch reversal, and find that reasoning models (e.g.
+o3-mini
+(OpenAI,
+2024b
+)
+) are particularly effective.
+Description of method.
+We follow SWE-bench’s methodology for crawling PRs created January 1st, 2023 and onwards, with minor and arbitrary exceptions for some repositories where we crawl older PRs as well.
+Per PR, we iterate across the file(s) changed by the patch.
+Per file, we prompt an LM with the file-specific changes from the patch along with the file’s source code in the current state of the repository (
+not
+the repository’s state corresponding to when the PR was applied, referred to as the
+base_commit
+in SWE-bench).
+The LM is asked to generate a rewrite of the file that reverts the changes reflected in the PR.
+We aggregate the changes across all file(s) into a single patch.
+Because we are interested in problems that our expert trajectory generation method (SWE-agent + Claude 3.7 Sonnet) has a chance of solving, we do not attempt to reproduce PRs that change more than
+$8$
+files.
+This constraint is imposed because no SWE-bench instance that edits more than
+$6$
+files has ever been solved
+(Jimenez et al.,
+2024a
+)
+.
+How well does PR mirroring work?
+We scrape the PRs corresponding to
+$100$
+randomly selected SWE-bench task instances from the
+django/django
+GitHub repository and attempt to recreate these task instances with
+\bugs
+’s collection process.
+We successfully recovered
+$92$
+of
+$100$
+task instances.
+Of these,
+$84$
+break identical F2P test(s), with the remaining
+$8$
+breaking a subset because some tests were removed over time.
+This sanity check gives us confidence that the PR mirroring strategy lives up to its name.
+Comparison to SWE-bench.
+This approach has several benefits and drawbacks compared to SWE-bench’s collection pipeline.
+First, it removes the need to create instance-specific Docker images — all PRs are mirrored against the same version of a repository.
+This also implies that there is no need to write installation specifications for past versions of a repository, which is typically the most laborious step in task construction with SWE-bench.
+Finally, this strategy also allows us to loosen the requirements on what PRs we attempt to convert into a task instance.
+In SWE-bench, the core requirements for what PRs to attempt to convert into a task instance include:
+1.
+It must edit
+$1+$
+code files (e.g. not just
+.md
+,
+.rst
+files).
+2.
+It must reference
+$1+$
+GitHub issues, which serves as the problem statement.
+3.
+It must edit
+$1+$
+testing related files (
+$1+$
+files with a
+test
+-adjacent keyword in it).
+With this collection strategy and
+\bugs
+’s focus on training data, the second and third requirements are no longer necessary.
+If there is no associated issue, issue text can simply be generated.
+If the patch does not contain any testing related changes, this is tolerable, as the validation stage will determine whether the PR breaks any tests.
+With these considerations, we purport that
+\bugs
+’s PR mirroring strategy can re-purpose a higher percentage of real world code changes for training purposes.
+The main downside is that the rest of the repository is out of sync with the state of the codebase when the PR was applied.
+As a result, it’s possible that changes in the behavior of the rest of the codebase may affect the issue’s reproducibility or the accuracy of the issue description (e.g. line numbers referenced in the issue text are likely somewhat off with respect to the codebase).
+However, a simple mitigation for this is to create a Docker image for a repository at an earlier commit that’s closer to the original creation date of the issue.
+While we do not carry out a targeted experiment, we hypothesize that using
+\bugs
+, we would be able to reproduce SWE-bench entirely with
+$10\times$ fewer human hours, with an estimated $2294 \times \$0.055 = \$126.17$ in costs.
+Appendix C
+Dataset Statistics
+We present additional breakdowns and analyses of the
+\bugs
+dataset, focusing on the kinds of repositories and bugs that are represented.
+Repository categorization.
+We present an exhaustive list of repositories used in
+\bugs
+in Table
+9
+.
+We categorize the repositories into seven general buckets: Data Parsing and Transformation ($39$), Web & API Development ($11$), Code Quality & Testing ($12$), Visualization & Presentation ($8$), System Tools & Protocols ($17$), Natural Language Processing ($7$), and Miscellaneous ($6$).
+The categorizations were performed by first determining an appropriate set of categories based on manual inspection supported by the descriptions and GitHub topics associated with each repository.
+After settling upon the buckets, we asked GPT-4o to provide a label based on the repository’s metadata and
+README
+dump.
+\bugs
+represents a wider and more variegated coverage of software tools and applications compared to any prior works.
+{CJK*}
+UTF8gbsn
+Repository
+Description
+Code Quality and Testing
+PyCQA/flake8
+flake8 is a python tool that glues together pycodestyle, pyflakes, mccabe, and third-party plugins to check the style and quality of some python code.
+Suor/funcy
+A fancy and practical functional tools
+adrienverge/yamllint
+A linter for YAML files.
+agronholm/typeguard
+Run-time type checker for Python
+cknd/stackprinter
+Debugging-friendly exceptions for Python
+cool-RR/PySnooper
+Never use print for debugging again
+getmoto/moto
+A library that allows you to easily mock out tests based on AWS infrastructure.
+pylint-dev/astroid
+A common base representation of python source code for pylint and other projects
+pytest-dev/iniconfig
+None
+python/mypy
+Optional static typing for Python
+pyupio/safety
+Safety checks Python dependencies for known security vulnerabilities and suggests the proper remediations for vulnerabilities detected.
+pyutils/line_profiler
+Line-by-line profiling for Python
+rubik/radon
+Various code metrics for Python code
+spulec/freezegun
+Let your Python tests travel through time
+sqlfluff/sqlfluff
+A modular SQL linter and auto-formatter with support for multiple dialects and templated code.
+Data Parsing and Transformation
+alecthomas/voluptuous
+CONTRIBUTIONS ONLY: Voluptuous, despite the name, is a Python data validation library.
+andialbrecht/sqlparse
+A non-validating SQL parser module for Python
+buriy/python-readability
+fast python port of arc90’s readability tool, updated to match latest readability.js!
+burnash/gspread
+Google Sheets Python API
+chardet/chardet
+Python character encoding detector
+cloudpipe/cloudpickle
+Extended pickling support for Python objects
+dask/dask
+Parallel computing with task scheduling
+datamade/usaddress
+:us: a python library for parsing unstructured United States address strings into address components
+davidhalter/parso
+A Python Parser
+erikrose/parsimonious
+The fastest pure-Python PEG parser I can muster
+facelessuser/soupsieve
+A modern CSS selector implementation for BeautifulSoup
+gawel/pyquery
+A jquery-like library for python
+google/textfsm
+Python module for parsing semi-structured text into python tables.
+gruns/furl
+URL parsing and manipulation made easy.
+gweis/isodate
+ISO 8601 date/time parser
+hukkin/tomli
+A lil’ TOML parser
+jawah/charset_normalizer
+Truly universal encoding detector in pure Python
+john-kurkowski/tldextract
+Accurately separates a URL’s subdomain, domain, and public suffix, using the Public Suffix List (PSL).
+joke2k/faker
+Faker is a Python package that generates fake data for you.
+jsvine/pdfplumber
+Plumb a PDF for detailed information about each char, rectangle, line, et cetera — and easily extract text and tables.
+kayak/pypika
+PyPika is a python SQL query builder that exposes the full richness of the SQL language using a syntax that reflects the resulting query. PyPika excels at all sorts of SQL queries but is especially useful for data analysis.
+keleshev/schema
+Schema validation just got Pythonic
+kennethreitz/records
+SQL for Humans™
+kurtmckee/feedparser
+Parse feeds in Python
+lepture/mistune
+A fast yet powerful Python Markdown parser with renderers and plugins.
+madzak/python-json-logger
+Json Formatter for the standard python logger
+mahmoud/glom
+☄️ Python’s nested data operator (and CLI), for all your declarative restructuring needs. Got data? Glom it! ☄️
+marshmallow-code/marshmallow
+A lightweight library for converting complex objects to and from simple Python datatypes.
+martinblech/xmltodict
+Python module that makes working with XML feel like you are working with JSON
+matthewwithanm/python-markdownify
+Convert HTML to Markdown
+mewwts/addict
+The Python Dict that’s better than heroin.
+mido/mido
+MIDI Objects for Python
+modin-project/modin
+Modin: Scale your Pandas workflows by changing a single line of code
+mozilla/bleach
+Bleach is an allowed-list-based HTML sanitizing library that escapes or strips markup and attributes
+msiemens/tinydb
+TinyDB is a lightweight document oriented database optimized for your happiness :)
+pandas-dev/pandas
+Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
+pdfminer/pdfminer.six
+Community maintained fork of pdfminer - we fathom PDF
+pudo/dataset
+Easy-to-use data handling for SQL data stores with support for implicit table creation, bulk loading, and transactions.
+pydantic/pydantic
+Data validation using Python type hints
+pydata/patsy
+Describing statistical models in Python using symbolic formulas
+pydicom/pydicom
+Read, modify and write DICOM files with python code
+pygments/pygments
+Pygments is a generic syntax highlighter written in Python
+pyparsing/pyparsing
+Python library for creating PEG parsers
+python-jsonschema/jsonschema
+An implementation of the JSON Schema specification for Python
+python-openxml/python-docx
+Create and modify Word documents with Python
+r1chardj0n3s/parse
+Parse strings using a specification based on the Python format() syntax.
+scanny/python-pptx
+Create Open XML PowerPoint documents in Python
+scrapy/scrapy
+Scrapy, a fast high-level web crawling & scraping framework for Python.
+seperman/deepdiff
+DeepDiff: Deep Difference and search of any Python object/data. DeepHash: Hash of any object based on its contents. Delta: Use deltas to reconstruct objects by adding deltas together.
+sloria/environs
+simplified environment variable parsing
+sunpy/sunpy
+SunPy - Python for Solar Physics
+tkrajina/gpxpy
+gpx-py is a python GPX parser. GPX (GPS eXchange Format) is an XML based file format for GPS tracks.
+tobymao/sqlglot
+Python SQL Parser and Transpiler
+un33k/python-slugify
+Returns unicode slugs
+Machine Learning and AI
+facebookresearch/fvcore
+Collection of common code that’s shared among different research projects in FAIR computer vision team.
+facebookresearch/hydra
+Hydra is a framework for elegantly configuring complex applications
+HIPS/autograd
+Efficiently computes derivatives of NumPy code.
+iterative/dvc
+Data Versioning and ML Experiments
+jaraco/inflect
+Correctly generate plurals, ordinals, indefinite articles; convert numbers to words
+life4/textdistance
+Compute distance between sequences. 30+ algorithms, pure python implementation, common interface, optional external libs usage.
+luozhouyang/python-string-similarity
+A library implementing different string similarity and distance measures using Python.
+Mimino666/langdetect
+Port of Google’s language-detection library to Python.
+mozillazg/python-pinyin
+汉字转拼音(pypinyin)
+pndurette/gTTS
+Python library and CLI tool to interface with Google Translate’s text-to-speech API
+Project-MONAI/MONAI
+AI Toolkit for Healthcare Imaging
+seatgeek/thefuzz
+Fuzzy String Matching in Python
+vi3k6i5/flashtext
+Extract Keywords from sentence or Replace keywords in sentences.
+System Tools and Protocols
+agronholm/exceptiongroup
+Backport of PEP 654 (exception groups)
+aio-libs/async-timeout
+asyncio-compatible timeout class
+arrow-py/arrow
+Better dates & times for Python
+borntyping/python-colorlog
+A colored formatter for the python logging module
+cantools/cantools
+CAN bus tools.
+conan-io/conan
+Conan - The open-source C and C++ package manager
+cookiecutter/cookiecutter
+A cross-platform command-line utility that creates projects from cookiecutters (project templates), e.g. Python package projects, C projects.
+dbader/schedule
+Python job scheduling for humans.
+gruns/icecream
+Never use print() to debug again.
+jd/tenacity
+Retrying library for Python
+mahmoud/boltons
+Like builtins, but boltons. 250+ constructs, recipes, and snippets which extend (and rely on nothing but) the Python standard library. Nothing like Michael Bolton.
+oauthlib/oauthlib
+A generic, spec-compliant, thorough implementation of the OAuth request-signing logic
+pallets/click
+Python composable command line interface toolkit
+paramiko/paramiko
+The leading native Python SSHv2 protocol library.
+pexpect/ptyprocess
+Run a subprocess in a pseudo terminal
+pyasn1/pyasn1
+Generic ASN.1 library for Python
+pyca/pyopenssl
+A Python wrapper around the OpenSSL library
+python-hyper/h11
+A pure-Python, bring-your-own-I/O implementation of HTTP/1.1
+python-trio/trio
+Trio – a friendly Python library for async concurrency and I/O
+rustedpy/result
+NOT MAINTAINED - A simple Rust like Result type for Python 3. Fully type annotated.
+termcolor/termcolor
+ANSI color formatting for output in terminal
+theskumar/python-dotenv
+Reads key-value pairs from a .env file and can set them as environment variables. It helps in developing applications following the 12-factor principles.
+tox-dev/pipdeptree
+A command line utility to display dependency tree of the installed Python packages
+Visualization and Presentation
+amueller/word_cloud
+A little word cloud generator in Python
+lincolnloop/python-qrcode
+Python QR Code image generator
+prettytable/prettytable
+Display tabular data in a visually appealing ASCII table format
+pwaller/pyfiglet
+An implementation of figlet written in Python
+rsalmei/alive-progress
+A new kind of Progress Bar, with real-time throughput, ETA, and very cool animations!
+weaveworks/grafanalib
+Python library for building Grafana dashboards
+Web and API Development
+Cog-Creators/Red-DiscordBot
+A multi-function Discord bot
+Knio/dominate
+Dominate is a Python library for creating and manipulating HTML documents using an elegant DOM API. It allows you to write HTML pages in pure Python very concisely, which eliminate the need to learn another template language, and to take advantage of the more powerful features of Python.
+alanjds/drf-nested-routers
+Nested Routers for Django Rest Framework
+benoitc/gunicorn
+gunicorn ’Green Unicorn’ is a WSGI HTTP Server for UNIX, fast clients and sleepy applications.
+bottlepy/bottle
+bottle.py is a fast and simple micro-framework for python web-applications.
+django-money/django-money
+Money fields for Django forms and models.
+django/channels
+Developer-friendly asynchrony for Django
+django/daphne
+Django Channels HTTP/WebSocket server
+encode/starlette
+The little ASGI framework that shines.
+getnikola/nikola
+A static website and blog generator
+graphql-python/graphene
+GraphQL framework for Python
+marshmallow-code/apispec
+A pluggable API specification generator. Currently supports the OpenAPI Specification (f.k.a. the Swagger specification)..
+marshmallow-code/webargs
+A friendly library for parsing HTTP request arguments, with built-in support for popular web frameworks, including Flask, Django, Bottle, Tornado, Pyramid, webapp2, Falcon, and aiohttp.
+pallets/jinja
+A very fast and expressive template engine.
+pallets/markupsafe
+Safely add untrusted strings to HTML/XML markup.
+tornadoweb/tornado
+Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed.
+tweepy/tweepy
+Twitter for Python!
+C.1
+Bug Generation Statistics
+We provide extensive details about different aspects of each of the bug generation strategies, including the yield rates, labor/monetary costs, and dataset characterizations.
+Yield rates.
+In Table
+10
+, we provide the yield rates for each bug generation method across all repositories in
+\bugs
+.
+In general, we find that the PR Mirroring has the lowest yield rate at
+$13.18$
+% (although this rate is somewhat higher than SWE-bench’s yield rate of
+$2294/93139 = 2.46$
+%).
+For using LMs to generate bugs, modifying functions to introduce bugs intentionally has a higher yield than asking LMs to perform a best-effort rewrite.
+The efficacy of Procedural Modifications varies by strategy.
+For instance, shuffling the functions declared in a class only breaks existing test(s)
+$1.93$
+% of the time, but inverting a conditional will lead to a task instance for
+$47.04$
+% of modifications.
+Finally, combining bug patches has an extremely high yield rate - this is to be expected because we only attempt to combine bug patches that have been validated as usable task instances breaking
+$1+$
+tests.
+Strategy
+# Repos
+# Candidates
+# Instances
+Yield Rate
+Combine (file)
+124
+6020
+5865
+97.43%
+Combine (module)
+65
+4396
+4227
+96.16%
+LM (Modify)
+108
+31950
+17887
+55.98%
+LM (Rewrite)
+128
+11908
+4173
+35.04%
+PR Mirroring
+108
+6934
+2344
+33.8%
+Procedural (Class Rm Base)
+103
+1401
+463
+33.05%
+Procedural (Class Rm Funcs)
+103
+2506
+1180
+47.09%
+Procedural (Class Shuffle Funcs)
+103
+2504
+47
+1.88%
+Procedural (Ctrl Invert If)
+105
+4695
+2321
+49.44%
+Procedural (Ctrl Shuffle)
+104
+9055
+4015
+44.34%
+Procedural (Op Break Chains)
+71
+747
+225
+30.12%
+Procedural (Op Change Const)
+77
+723
+257
+35.55%
+Procedural (Op Change)
+81
+1507
+450
+29.86%
+Procedural (Op Swap)
+87
+2141
+483
+22.56%
+Procedural (Remove Assign)
+121
+5470
+2661
+48.65%
+Procedural (Remove Cond)
+120
+5288
+2311
+43.7%
+Procedural (Remove Loop)
+110
+1945
+860
+44.22%
+Procedural (Remove Wrapper)
+80
+884
+368
+41.63%
+All
+129
+100074
+50137
+50.1%
+Table 10:
+Yield rates for different bug generation strategies covered in Section
+B
+.
+We show the number of repositories that each strategy was run on, the number of bug candidates generated by each strategy, and the number of instances, or the number of candidates that were validated to have
+$1+$
+Fail to Pass test.
+The yield rate for
+The number of repositories captured by each bug generation technique varies due to each strategy’s specific preconditions, which at times may not be effective for some repositories.
+For instance, the
+Procedural (Class *)
+set of methods only mutates Python classes.
+This strategy is fruitless for the minority of
+\bugs
+repositories that do not define any classes.
+The
+Procedural (Op Break Chains)
+method randomly removes operations and operands from expressions with two or more operations (e.g.
+$a+b+c \rightarrow a+b$
+) — such expressions are not always present in
+\bugs
+repositories.
+The collective yield rate across
+\bugs
+’s bug generation strategies is significantly higher than SWE-bench’s collection strategy.
+Yield Rate
+# of Repositories
+$0$-$25$%
+$10$
+$25$-$50$%
+$31$
+$50$-$75$%
+$60$
+$75$-$100$%
+$27$
+Table 11:
+Yield rates for different repositories represented in
+\bugs
+.
+The yield rate also varies with respect to the repository it is being applied to.
+We provide a summary of yield rates by repository in Table
+11
+.
+We generally observe that lower test coverage correlates with a lower yield rate.
+Dataset characterizations.
+In Table
+12
+, we provide statistics about the validated task instances produced by different bug generation strategies.
+Our work’s LM-based strategies rewrite one function in one file.
+Procedural modifications will also only change one file, but depending on the strategy,
+$1+$
+functions or classes may be changed.
+Combining multiple patches from the same file always produces a patch with
+$2+$
+functions edited.
+Combining across modules produces a patch with
+$2+$
+files edited.
+The targeted nature of each of the bug creation strategies is reflected in the typical number of functions and files that the bugs produced by each strategy edits.
+Strategy
+# Instances
+# F2P
+$\Delta$ Lines
+$\Delta$ Functions
+$\Delta$ Files
+Combine
+10092
+15 (5-48)
+19 (12-36)
+2 (2-3)
+1 (1-2)
+LM
+22060
+4 (1-17)
+6 (3-15)
+1 (1-1)
+1 (1-1)
+PR Mirroring
+2344
+3 (1-14)
+20 (8-55)
+2 (2-4)
+1 (1-2)
+Procedural
+15641
+7 (2-32)
+7 (5-15)
+1 (1-1)
+1 (1-1)
+Table 12:
+Statistics for attributes of a
+\bugs
+task instance across different bug generation strategies, reported as
+median (IQR)
+, where IQR is the inter-quartile range (25th–75th percentile).
+In Figure
+13
+, we show the distributions for different attributes of
+\bugs
+compared to other SWE-bench style datasets.
+Compared to prior works, there is a much higher proportion of task instances with more than one Fail-to-Pass test.
+For any one repository, we find that
+\bugs
+task instances collectively cause failures for a much higher percentage of the testing suite than other datasets; a potential benefit of this is that training on
+\bugs
+based trajectories may expose models to a much broader set of functionalities in a codebase.
+The number of lines and files edited by
+\bugs
+task instances is highly similar to the trend lines for SWE-bench Verified.
+Figure 13:
+Comparison of cumulative distributions for Fail-to-Pass tests along with the lines and files edited by the gold patch across
+\bugs
+and four SWE-bench style datasets.
+We note that unlike other datasets, the trend line of
+\bugs
+task instances is “adjustable”.
+In other words, the Figure
+13
+distributions are a capture of the task instances provided in this release of
+\bugs
+.
+However, because of
+\bugs
+’s flexible bug creation techniques, the distribution can be “shaped” if needed.
+For instance, generating more task instances using the bug patch combination method would shift all three curves in Figure
+13
+.
+We make this point to highlight the fact that the attributes of SWE-bench task instances are, in a sense, constrained by real world software development behavior.
+On the other hand,
+\bugs
+can be used to break tests and code that may not be reflected at all in any existing pull request.
+In this sense, we argue that LMs trained on
+\bugs
+have better “exposure” to a codebase compared to exclusively training on pull requests.
+Continuation of scaling execution environments.
+The validation and evaluation procedures for
+\bugs
+deviate slightly from SWE-bench’s harnesses.
+The main reasons for these differences can largely be attributed to the granularity of installation specifications.
+In SWE-bench, each task instance corresponds to a unique base commit, with additional
+version
+and
+environment_setup_commit
+keys needed as indirection for mapping an instance to the correct set of installation and testing instructions.
+Across time, the continuous evolution of a repository and its dependencies make for an incredibly high degree of variability in how a repository should be installed correctly.
+To solve this variability, the community has resorted to creating an image per task instance, as done in
+Chowdhury et al. (
+2024
+)
+.
+Therefore, for
+$2294$
+SWE-bench task instances, there are
+$2294$
+unique Docker images, each at a size of at least several gigabytes (
+$\sim 5$-$6$
+GBs).
+On the other hand, the simplicity and scalability of
+\bugs
+’s design allows one to support many task instances with comparatively much fewer Docker images.
+As mentioned above, installation and testing procedures are (repository, commit) specific.
+Therefore, when bugs are generated from each (repository, commit), all bugs can be reproduced and tested successfully from the same Docker image.
+In other words, if I generate
+$100$
+bugs for a repository at some commit, instead of
+$100$
+Docker images, only a single Docker image is required to run inference on any of the
+$100$
+task instances.
+This design is what enables
+\bugs
+to be significantly more space-efficient than SWE-bench.
+Based on the publicly released images, for SWE-bench’s
+$2294$
+task instances,
+$1.2$
+TBs of storage are required to download all Docker images locally.
+For SWE-bench Multimodal’s
+$517$
+task instances,
+$1.2$
+TBs are required.
+The higher per-instance Docker image size for SWE-bench Multimodal is due to how JavaScript dependency management tools (e.g.
+npm
+) require more storage compared to equivalent Python infrastructure (e.g.
+pypi
+).
+Pan et al. (
+2024
+)
+states that each image for the
+$2438$
+instances is an average of
+$2.6$
+GB, totaling 6 TB of storage.
+Such a storage requirement can be a significant barrier for academic practitioners.
+On the other hand, with more than
+$20\times$
+the number of bugs,
+\bugs
+requires only
+$125$
+Docker images total, corresponding to the number of unique (repository, commit) pairs (in this work, for each repository, we only determine installation and test specifications for one commit).
+The
+$125$
+images require a total of
+$290.54$
+GBs.
+In summary, compared to SWE-bench’s task collection strategy,
+\bugs
+’s design makes it easier to not only create task instances, but also train on them as well.
+C.2
+Case Study: SWE-bench &
+\bugs
+To better understand the differences between the SWE-bench and
+\bugs
+collection strategies, we perform
+\bugs
+collection on the
+pallets/flask
+GitHub repository, one of the
+$12$
+test split repositories from the original SWE-bench benchmark.
+We review the steps covered in Section
+2.1
+applied to
+pallets/flask
+in detail.
+First, we defined the installation and testing specifications for the
+pallets/flask
+repository at commit
+bc09840
+.
+Next, we apply the LM modification bug generation strategy to this version of the repository, generating
+$267$
+unique bugs.
+We observe several differences.
+First,
+the
+\bugs
+collection strategy yields a much higher number of bugs outright.
+From SWE-bench,
+$11$
+task instances are from the
+pallets/flask
+repository.
+The task instances were originally filtered from
+$2434$
+pull requests (PRs), with
+$107$
+satisfying SWE-bench’s filtering criteria of (1) being linked to one or more issues and (2) featuring 1+ new tests.
+Out of these
+$107$
+, the
+$11$
+(
+$0.45$
+% of
+$2434$
+) task instances represent the proportion of PRs that execution environments could be successfully constructed for.
+On the other hand, running the function-level rewriting strategy for bug generation originally yielded
+$402$
+candidates, of which
+$267$
+were determined to be valid task instances.
+Second,
+\bugs
+requires significantly less human effort while only incurring minor costs
+.
+Collecting the
+$11$
+pallets/flask
+task instances (steps include scraping PRs, determining repository versions across time, defining version-specific installation/test specifications, running execution-based validation multiple times) took an estimated
+$38$
+hours worth of human labor.
+On the contrary, defining installation and testing specifications for the latest commit of
+pallets/flask
+took
+$10$
+minutes.
+The subsequent function-level rewriting strategy for bugs took
+$23$
+minutes to run, incurring a total cost of just \$2.47 ($\sim$ \$0.00613
+per instance).
+The final execution-based validation step that filters out
+$402 - 267 = 135$
+unqualified bug candidates ran in
+$14$
+minutes.
+Since both the bug and problem statement generation strategies are repository agnostic, no additional human intervention is necessary for these steps.
+Head to head, per instance for the
+pallets/flask
+repository, SWE-bench style collection requires
+$38 \times 60 / 11 = 207.27$
+minutes compared to
+$0.176$
+minutes (
+$\sim 10.6$
+seconds) and \$0.00613
+in API costs using
+\bugs
+.
+Third,
+collectively,
+\bugs
+task instances break a significantly larger proportion of existing tests in a codebase
+.
+We define “bug coverage” as the proportion of tests broken by
+$1+$
+instance across all task instances.
+For the SWE-bench split of
+pallets/flask
+, there are
+$207$
+unique tests across all
+$11$
+instances.
+Of these
+$207$
+tests,
+$15$
+are broken by
+$1+$
+instance, corresponding to a bug coverage rate of
+$7.25$
+%.
+For the
+\bugs
+split of
+pallets/flask
+, there are
+$474$
+unique tests across
+$267$
+instances.
+The larger number of tests is due to increased test coverage in the pallets/flask repository as of Nov. 28, 2024 (when \bugs was collected) compared to June 2023 (when SWE-bench was collected).
+Of these 474 tests, 422 are broken by 1+ instance, a bug coverage rate of 89.03%.
+We attribute the significant difference to a consistent tendency in real world open source software development workflows: only a minority of tests are introduced to capture existing, errant behavior in the repository.
+The significant majority of tests are committed alongside working code, ensuring that already correct behavior is upheld.
+Well-maintained repositories will typically not merge commits that cause such tests to fail.
+This results in a large number of tests where few to no commits correspond to those tests’ failures.
+Finally,
+\bugs
+does not yield instances appropriate for evaluation
+.
+The
+\bugs
+pipeline as presented does not produce hidden tests, a crucial difference that makes SWE-bench more suitable for evaluation.
+Consequently, when expert trajectories are generated, the Fail-to-Pass tests are present in the repository at inference time.
+Furthermore, our issue generation strategy does not include checks for known problems such as underspecified text descriptions or solution leakage
+(Chowdhury et al.,
+2024
+)
+.
+Simple amendments could make
+\bugs
+task instances suitable for evaluation, such as deleting Fail-to-Pass test functions or files along with a validation procedure around the ambiguity and leakage of the issue text.
+Finally, thorough analyses of how faithful
+\bugs
+task instances are to real world issues and PRs would be necessary to justify synthetic bugs for evaluation.
+Appendix D
+Issue Generation
+We cover the four issue generation strategies we experiment with to determine issue text’s effect on how solvable a
+\bugs
+instance is along with the trajectory’s value as a training data point.
+Generated with LM.
+We prompt an LM with a randomly selected SWE-bench Verified problem statement, the bug patch, list of Fail-to-Pass tests, source code for one Fail-to-Pass test, and the execution logs of running all the Fail-to-Pass tests.
+We ask the LM to generate an issue that describes the bug conveyed in the patch in the style of the SWE-bench Verified demonstration.
+Figure
+13
+shows the system prompt for this strategy.
+Fixed issue templates.
+We create a set of 7 pre-defined issue templates, listed in Table 13.
+Each template uses information from the bug patch or Fail-to-Pass tests associated with every task instance.
+Given a dataset of task instances, we randomly select one of the templates to use as the problem statement according to the probabilities listed in Table
+13
+.
+The reason we assign the highest likelihood for the prompt that provides all four categories of information (bug type, files changed, functions changed, Fail-to-Pass tests) is to ensure that a higher proportion of task instances are well-specified.
+Template | Prob. | Information Provided
+Basic | 0.05 | None
+Files | 0.1 | States which file(s) have bug(s).
+Funcs | 0.15 | States which file(s) and func(s) have bug(s).
+Tests | 0.1 | States that some tests are failing.
+F2P Tests | 0.1 | States which tests are failing.
+Bug Type | 0.05 | States failure type.
+Bug Type + Files | 0.15 | States failure type and which file(s) have bug(s).
+Bug Type + Files + Test | 0.15 | States failure type, which file(s) have bug(s), and a random F2P test.
+Bug Type + Files + Funcs + Test | 0.15 | States failure type, which file(s) and func(s) have bug(s), and a random F2P test.
+Table 13:
+List of issue text templates we use to generate problem statements.
+Across all templates, four types of information are included — the files with bugs, functions with bugs, Fail-to-Pass test(s), and the type of bug.
+Templates that offer less information are generally assigned a lower probability.
+Fail-to-Pass test code and execution logs.
+Another approach is showing the source code and test execution logs for a randomly selected Fail-to-Pass test.
+This approach is motivated by the lack of reproduction code or expected/actual behavior of code communicated with fixed issue templates.
+We show code and execution logs only for a single Fail-to-Pass test; if a task instance has more than one Fail-to-Pass test, we do not disclose remaining tests.
+Original issue text.
+This strategy works exclusively for some task instances generated using PR Mirroring.
+If a PR is successfully mirrored, we use the text from the associated issues as the problem statement, exactly as done in SWE-bench.
+Of the 2345 task instances represented in \bugs mirrored from real-world PRs, 708 or 30.19% of these have one or more associated GitHub issue(s) to create a SWE-bench style problem statement.
+System prompt for generating issues with an LM
+You are a software engineer helping to create a realistic dataset of synthetic GitHub issues.
+You will be given the following input:
+1. Demonstration: A realistic GitHub issue to mimic (included in the <demonstration> tag).
+2. Patch: A git diff output/PR changes that introduces a bug (included in the <patch> tag).
+3. Test output: The output of running the tests after the patch is applied (included in the <test_output> tag).
+4. Test source code: Source code for one or more tests that failed (included in the <test_source_code> tag).
+Output: A realistic GitHub issue for the patch.
+Guidelines:
+- Mimic the style and structure of the demonstration issues.
+If the demonstration issues are not well structured, your output should also be not well structured.
+If the demonstrations use improper or no markdown, your output should also use improper or no markdown.
+If the demonstrations are short/long, your output should also be short/long (if possible).
+If the demonstrations include human “flavor text” or “fluff”, your output should also include human “flavor text” or “fluff”.
+Do this even if it conflicts with your default behavior of trying to be extremely concise and helpful.
+- DO NOT explain the fix/what caused the bug itself, focus on how to reproduce the issue it introduces
+- Do not mention pytest or what exact test failed. Instead, generate a realistic issue.
+- If possible, include information about how to reproduce the issue. An ideal reproduction script should raise an error
+or print an unexpected output together with the expected output.
+However, still include this information in a style very similar to the demonstration issues.
+Figure 13: System prompt provided to an LM to generate an issue based off the bug patch and testing information of a task instance along with a demonstration problem statement randomly selected from SWE-bench Verified.
+Appendix E
+Difficulty Rating
+We train a model that labels a task with one of three difficulty labels: $<15$ minutes (easy), 15 minutes - 1 hour (medium), and 1+ hour (hard).
+This model allows us to quantify the difficulty of individual task instances and, in aggregate, the difficulty of entire datasets.
+To train this model, we use 1699 annotations from Chowdhury et al. (2024).
+In their work towards curating SWE-bench Verified, a subset of 1699 SWE-bench task instances were labeled with four difficulty levels: $<15$ min, 15 min - 1 hr, 1 - 4 hrs, and 4+ hrs.
+Generally, three annotators were assigned to each instance, and the difficulty annotations were ensembled by taking the majority choice for a sample, or the median if there is no majority.
+The distribution of annotated difficulties, from easiest to hardest, is 24.5%, 53.5%, 19.4%, and 2.8%.
+Because there are very few samples in the 4+ hr category, we reclassify the 1 - 4 hr and 4+ hr instances into a single 1+ hr category.
+Next, we create corresponding train and test datasets at an 80/20% split, randomly shuffling the instances while ensuring the train and test distributions do not deviate significantly from the original.
+An instance’s problem statement and solution patch are provided as input, and one of the three difficulty labels serves as the target output.
+We perform LoRA fine-tuning
+(Hu et al.,
+2021
+)
+on a Qwen 2.5 32B Instruct model using the Unsloth
+(Daniel Han & team,
+2023
+)
+library.
+The model achieves an accuracy of 75.3% on the test set.
+All errant predictions are off by one; in other words, the model never predicted $<15$ min when the label was 1+ hr, and vice versa.
+Using this model, we can grade the difficulty of a
+\bugs
+instance once the bug patch and corresponding issue text have been created.
+To provide a succinct summary of difficulty for a dataset of SWE-bench style task instances, we propose a “difficulty score” metric.
+Each label corresponds to a numeric difficulty score of 1, 5, and 9, from easiest to hardest.
+The difficulty score of a dataset is therefore the average difficulty score across all of its task instances.
+Figure 14: Distribution of task instance difficulty (easy / medium / hard) for existing SWE-bench style datasets (left 5 bars) and \bugs (right 5 bars), assessed by our difficulty rating model.
+The average difficulty score for each dataset is listed above each bar.
+For \bugs, per bug strategy, we sample 1000 task instances with LM generated issue text.
+Dataset | # Instances | Score | easy | med | hard
+SWE-bench | 2294 | 5.014 | 438 | 1408 | 446
+Lite | 300 | 3.893 | 93 | 197 | 10
+Verified | 500 | 3.960 | 173 | 284 | 43
+SWE-bench Multimodal | 510 | 6.036 | 55 | 265 | 186
+SWE-gym | 2438 | 5.625 | 288 | 1456 | 664
+Lite | 230 | 3.890 | 67 | 156 | 4
+\bugs (LM Modify) | 1000 | 3.304 | 441 | 542 | 17
+\bugs (LM Rewrite) | 1000 | 5.272 | 68 | 796 | 136
+\bugs (Procedural) | 1000 | 3.596 | 374 | 603 | 23
+\bugs (PR Mirror) | 1000 | 4.876 | 206 | 619 | 175
+\bugs (Combine) | 1000 | 5.720 | 52 | 716 | 232
+Table 14: The score is averaged over all task instances, where easy / med / hard corresponds to 1 / 5 / 9.
+For \bugs, we sample 1000 task instances per bug strategy.
+Figure
+14
+summarizes our findings for difficulties across different SWE-bench style datasets.
+We provide a more thorough rundown of task instances per difficulty level in Table
+14
+.
+We find that different
+\bugs
+bug generation methods yield different levels of difficulty.
+LM Modify bugs are consistently rated to be easy — from several manual spot checks, we notice that while the prompt for LM Modify provides several examples of types of bugs and does not name specific issues to create, the large majority of bugs created by this strategy are simple variable assignment mistakes (e.g., `a=a; b=b` is changed to `a=b; b=a`).
+An open-ended prompt like ours does not actually yield high diversity in terms of mistakes created.
+Procedural modifications are, as expected, the next easiest, as the types of bugs created by this strategy are finite.
+PR Mirrors and LM Rewrites yield much harder tasks, confirmed not only by our bug rating model, but also the lower average resolve rate on these tasks by our expert model (SWE-agent + Claude 3.7 Sonnet).
+Finally, aggregating smaller functions together is a simple but effective strategy for creating bugs that are rated as more complex.
+This effect aligns with our original expectations; generally, bugs that require editing more functions and files tend to be rated as more difficult.
+\bugs
+can be used to create task instances with a range of difficulties.
+Appendix F
+Experiments
+In this section, we provide additional details about the configurations and parameters used to generate trajectories with an expert model and run inference on a fine-tuned model.
+We then provide additional ablations and analyses about the
+\bugs
+dataset and the agents trained on
+\bugs
+.
+F.1
+Training Details
+Rejection sampling fine-tuning.
+Our fine-tuning setup heavily inherits from
+Pan et al. (
+2024
+)
+’s work.
+We perform full parameter fine tuning using the torchtune (PyTorch, 2024) library, with learning rate 5e-5, maximum 3 epochs, and max context length of 32768.
+Training was carried out on Modal (Modal, 2025) on 2-8 NVIDIA H100 80G GPUs.
+As discussed in Section
+3
+, the procedure for rejection sampling fine-tuning (RFT) is as follows.
+We first generate expert demonstrations/trajectories using SWE-agent and a “strong” model (e.g. Claude 3.7 Sonnet, GPT 4o) on
+\bugs
+task instances.
+Of these, we then only train a student model on the trajectories corresponding to resolved instances.
+SWE-agent configuration.
+We use two different configurations, one for generating trajectories with an expert model, and a separate one for running inference on the fine-tuned Qwen student models.
+The configurations are generally quite similar, with minor differences around how LMs’ responses are elicited, the parsing mechanism for an LM response, constraints around message sizes, and the system prompt.
+We will first review the information common to both configurations.
+The prompt template informing an agent of the task’s nature and problem statement is included in Figure
+F.1
+.
+This prompt is very similar to the original SWE-agent prompt used in
+Yang et al. (
+2024a
+)
+.
+The prompt templates for showing environment feedback are identical as well.
+If there is execution output, the text is simply preceded by
+OBSERVATION: [output]
+.
+If there is no output (e.g., rm -r succeeds silently), then the agent is informed “Your command ran successfully and did not produce any output”.
+The agent computer interface (ACI) provided is also identical; SWE-agent provides LM with access to three general tools:
+•
+bash
+: Execute a bash command in terminal.
+•
+str_replace_editor
+: A tool for viewing, creating, and editing files.
+•
+submit
+: A special keyword for the LM to indicate the task is completed or if it is unable to proceed further with the task.
+Task Instance Prompt provided to SWE-agent
+<uploaded_files>
+{{working_dir}}
+</uploaded_files>
+I’ve uploaded a python code repository in the directory {{working_dir}}. Consider the following PR description:
+<pr_description>
+{{problem_statement}}
+</pr_description>
+Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?
+I’ve already taken care of all changes to any of the test files described in the <pr_description>. This means you DON’T have to modify the testing logic or any of the tests in any way!
+Your task is to make the minimal changes to non-tests files in the {{working_dir}} directory to ensure the <pr_description> is satisfied.
+Follow these steps to resolve the issue:
+1. As a first step, it might be a good idea to find and read code relevant to the <pr_description>
+2. Create a script to reproduce the error and execute it with `python <filename.py>` using the bash tool, to confirm the error
+3. Edit the source code of the repo to resolve the issue
+4. Rerun your reproduce script and confirm that the error is fixed!
+5. Think about edgecases and make sure your fix handles them as well
+Your thinking should be thorough and so it’s fine if it’s very long.
+\captionof
+figure
+A copy of the prompt provided to an LM via SWE-agent informing the LM of the nature of the task, the task description itself, and several tips on how to proceed.
+We briefly review the distinctions.
+First, tool invocation works differently for expert versus student models.
+For the Claude and GPT series models that are used as experts, we use function calling for models to invoke the aforementioned tools.
+On the other hand, the student model is asked to generate a response with XML tags to delineate the thought and action.
+Therefore, when fine-tuning on expert trajectories, a key processing step is to convert the expert trajectories’ function calling format into the XML style response — fine-tuning
+directly
+on the expert trajectories does not work.
+We note that we use these particular settings because as of the publication of this paper, this tool setting reflects the absolute state-of-the-art performance achieved with an open source agent system (SWE-agent) and any existing LM (Claude 3.7 Sonnet).
+It is certainly possible to explore more tool designs and experiment with different formatting calls, as many existing prior works, notably
+Yang et al. (
+2024a
+)
+, have performed.
+However, given the focus of our work, we do not bother with repeating such a “hyperparameter sweep” across configurations for the agent system, as this effort is expensive and has already been performed to suggest that the configuration we are using is ideal for expert level performance.
+For generating trajectories with expert models, we run with a maximum of 75 steps and a cost limit of \$2.00.
+A run terminates automatically when either of these limits is reached or the context window of the expert model is exceeded.
+The overwhelming majority of automatic terminations are due to the 75 maximum steps limit.
+For running inference with student models, we run with a maximum of 75 steps or a cost limit of \$2.00, where the run similarly terminates when either the steps, cost or context window limit is reached.
+(Footnote 3: We include the cost limit in addition to the step limit to provide realistic behavior with respect to handling long context. To calculate a cost value for our model, we use the gpt-4o cost function as of April, 2025.)
+For the student model, per LM inference call, we truncate the message history to only keep the 5 most recent tool outputs.
+While we occasionally sample trajectories with the expert model set at various temperatures, for the student model, the temperature is fixed at 0.0.
+F.2
+Evaluation Datasets
+SWE-bench.
+SWE-bench is a widely used benchmark that evaluates AI systems on their ability to resolve GitHub issues
+(Jimenez et al.,
+2024b
+)
+.
+Given a codebase along with a description of a bug or feature, the AI system is asked to modify the codebase in such a way that the issue presented in the description is resolved.
+SWE-bench consists of 2294 such task instances, collected from real world pull requests (PRs) and issues in 12 GitHub repositories that are predominantly Python.
+As discussed in Section 3, the Lite and Verified subsets are curated from the main SWE-bench repository with the goal of making evaluation either more efficient or more reliable.
+Since evaluation on the entirety of SWE-bench is fairly costly and does not have as many comparable references, we do not evaluate
+SWE-agent-LM-32B
+on the entire SWE-bench test set.
+SWE-bench Multimodal.
+SWE-bench Multimodal applies the SWE-bench collection strategy to 12 additional predominantly JavaScript and TypeScript GitHub repositories, where task instances are associated with issues that have visual asset(s) in them (Yang et al., 2024b).
+The evaluation dataset consists of 510 task instances.
+While the original work evaluates vision language models (VLMs) specifically, we do not evaluate SWE-agent-LM-32B which, as it is based on Qwen 2.5 Coder Instruct, does not have the ability to process images as inputs.
+jqlang/jq
+9
+redis/redis
+12
+micropython/micropython
+5
+valkey-io/valkey
+4
+nlohmann/json
+1
+fmtlib/fmt
+11
+C/C++
+42
+prometheus/prometheus
+8
+caddyserver/caddy
+14
+gin-gonic/gin
+8
+hashicorp/terraform
+5
+gohugoio/hugo
+7
+Go
+42
+briannesbitt/carbon
+10
+laravel/framework
+13
+phpoffice/phpspreadsheet
+10
+php-cs-fixer/php-cs-fixer
+10
+PHP
+43
+apache/druid
+5
+reactivex/rxjava
+1
+apache/lucene
+9
+projectlombok/lombok
+17
+google/gson
+9
+javaparser/javaparser
+2
+Java
+43
+babel/babel
+5
+mrdoob/three.js
+3
+vuejs/core
+5
+preactjs/preact
+17
+axios/axios
+6
+immutable-js/immutable-js
+2
+facebook/docusaurus
+5
+JS/TS
+43
+rubocop/rubocop
+16
+jekyll/jekyll
+5
+faker-ruby/faker
+2
+fastlane/fastlane
+7
+fluent/fluentd
+12
+jordansissel/fpm
+2
+Ruby
+44
+tokio-rs/axum
+7
+nushell/nushell
+5
+sharkdp/bat
+8
+burntsushi/ripgrep
+2
+uutils/coreutils
+5
+tokio-rs/tokio
+9
+astral-sh/ruff
+7
+Rust
+43
+Table 15:
+Number of task instances per repository and language in the SWE-bench Multilingual evaluation set.
+The entire dataset includes 300 task instances covering 9 languages.
+SWE-bench Multilingual.
+SWE-bench Multilingual is an evaluation dataset consisting of 300 task instances that we introduce with this work.
+A single author carried out SWE-bench’s collection strategy for 42 additional GitHub repositories, covering the following 9 programming languages: JavaScript, TypeScript, C, C++, Go, Java, PHP, Ruby, and Rust. These repositories span a wide range of application domains, including web frameworks, data storage and processing tools, core utilities, and widely used libraries. A brief summary of the dataset is presented in Table 15.
+Like SWE-bench Verified, we curate the dataset by excluding task instances deemed by a team of three authors to have ambiguous or underspecified issue text.
+Each task instance edits (meaning additions and removals) on average 48 lines of code.
+Similar to SWE-bench and
+\bugs
+, the median number of Fail-to-Pass tests is one.
+We introduce SWE-bench Multilingual to:
+1.
+Provide a benchmark to evaluate model and agent performance across a variety of programming languages and application domains. Existing agent systems often rely on Python-specific tooling, effectively overfitting to the original SWE-bench
+(Yang et al.,
+2024b
+)
+. Although SWE-bench Multimodal addresses this to some degree, its focus on visual inputs is a confounding factor for text-only evaluation of software engineering capabilities.
+2.
+Remain fully compatible with SWE-bench, so current users can adopt it without changing infrastructure.
+3.
+Keep the dataset small enough to run quickly. While concurrent work like
+Zan et al. (
+2025
+)
+provides more task instances in multiple languages, we purposely constrain the number of task instances so that the dataset is easy to run quickly.
+In §
+F.4
+, we briefly discuss how performance by existing state of the art methods for SWE-bench is markedly worse on SWE-bench Multilingual, then offer some clear directions for potential next steps to build better agentic coding models that would involve extending
+\bugs
+.
+F.3
+Trajectory Dataset Breakdown
+Purpose
+Bug Gen.
+Issue Gen.
+# Instances
+Temp.
+# Traj.
+claude-3-7-sonnet-20250219
+Ablation
+LM (Modify)
+LM
+1000
+0
+605
+(Bug Type)
+LM (Rewrite)
+LM
+1000
+0
+507
+Procedural
+LM
+1000
+0
+745
+PR Mirrors
+LM
+1000
+0
+557
+Ablation
+PR Mirrors
+Fixed
+600
+0
+259
+(Issue Type)
+PR Mirrors
+F2P Test
+600
+0
+390
+PR Mirrors
+Original
+600
+0
+328
+PR Mirrors
+LM
+600
+0
+319
+Ablation
+Procedural
+LM
+1000
+0
+721
+(Repositories)
+Procedural
+LM
+1000
+0
+709
+Procedural
+LM
+1000
+0
+723
+Procedural
+LM
+1000
+0
+707
+Final Dataset
+LM (Rewrite)
+LM
+3574
+0
+1003
+Curation
+PR Mirrors
+LM
+1049
+0
+349
+claude-3-5-sonnet-20240620
+Compare with prior work
+All
+LM
+800
+0
+535
+gpt-4o-2024-08-06
+Compare with prior work
+All
+LM
+200
+0
+89
+Table 16:
+Breakdown of trajectories sampled from
+\bugs
+.
+Trajectories were generated from subsets of
+\bugs
+that were either for the purpose of ablations or performance.
+All trajectories were generated with a maximum of 75 steps and a \$2 cost limit.
+Bug Type
+Count
+Combine (File)
+123
+Combine (Module)
+7
+LM (Modify)
+11
+LM (Rewrite)
+1532
+Procedural
+1495
+PR Mirror
+1848
+Table 17:
+Bug types represented in final training dataset.
+Repository | Count | Repository | Count
+getmoto/moto | 378 | sqlfluff/sqlfluff | 122
+pandas-dev/pandas | 320 | pylint-dev/astroid | 110
+conan-io/conan | 243 | pydicom/pydicom | 103
+pydantic/pydantic | 209 | tobymao/sqlglot | 101
+iterative/dvc | 181 | pygments/pygments | 99
+dask/dask | 139 | scanny/python-pptx | 98
+Table 18:
+Top ten repositories by number of trajectories represented in final dataset for main result.
+We provide a thorough review of the dataset of SWE-agent trajectories released with this work in Table
+16
+.
+The majority are generated with
+claude-3-7-sonnet-20250219
+.
+To compare with prior work, a minority were generated with
+claude-3-5-sonnet-20240620
+and
+gpt-4o-2024-08-06
+.
+As mentioned in Section 4, to guard against the easy data bias phenomenon, we impose a per-instance cap of 3, meaning for any task instance, we include at most 3 trajectories successfully resolving that task instance in our fine-tuning dataset.
+From the pool of trajectories reflected in Table 16, we curate a set of 5000 trajectories that we then use to train SWE-agent-LM-32B.
+Tables 18 and 17 show what repositories and bug types are represented in the final training dataset.
+In total, 123 repositories are represented, with at least 10 trajectories from 91 repositories.
+Trajectories are on average 58 turns long, meaning an LM typically takes 29 actions for a given demonstration trajectory.
+We visualize this distribution in Figure
+15
+.
+Figure 15:
+Distribution of number of turns for trajectories represented in the final dataset.
+F.4
+Training Analyses
+We provide additional experiments and discussions around training
+SWE-agent-LM-32B
+.
+Pass@k trend line.
+To calculate the Pass@1 score discussed in our main result, we ran SWE-agent with
+SWE-agent-LM-32B
+six times.
+In Figure 16, we observe increasing performance at higher values of k, a phenomenon that reflects observations in prior works across LMs for software engineering, code generation, web navigation, and theorem proving.
+While we do not explore work around inference time scaling and training a separate verifier model to select the best solution candidate generated by multiple roll-outs, as done in
+Pan et al. (
+2024
+)
+and
+Jain et al. (
+2025
+)
+,
+SWE-agent-LM-32B
+is fully compatible with the generate-then-select pipelines explored by such works.
+Given its strong Pass@1 performance,
+SWE-agent-LM-32B
+would likely be quite competitive for Best@k results as well.
+As mentioned before, all trajectories generated in the course of
+\bugs
+have been released publicly, which the community might find useful for training better verifiers.
+Rejection sampling fine-tuning ablation.
+To confirm that rejection sampling fine-tuning leads to better performance on the downstream task, we compare against a setting where we randomly sample n training points with no filtering criteria, at n = [100, 200, 400, 800, 1600], and fine-tune the same student model (Qwen 2.5 Coder Instruct 32B).
+We then run SWE-agent with each student model on the SWE-bench Verified dataset three times, with the “% Resolved” corresponding to the Pass@1 score.
+We show results in Figure
+17
+, which confirms that fine-tuning only on trajectories corresponding to successfully resolved tasks is better than randomly sampling trajectories.
+SWE-bench Multilingual performance.
+To assess how well SWE-agent-LM-32B and existing models generalize to non-Python coding domains, we evaluate the performance of our model, Qwen 2.5 Coder Instruct 32B, and Claude 3.7 Sonnet with SWE-agent on our new dataset, which we introduced in Section F.2.
+Out of 300 task instances, we found that Claude 3.7 Sonnet achieved a 43% Pass@1 resolve rate, which is significantly better than SWE-agent-LM-32B (8.4%) and Qwen 2.5 Coder Instruct (6.5%).
+SWE-agent-LM-32B
+does not demonstrate a significant improvement over the baseline model.
+Through several spot checks of different trajectories, we came to a working hypothesis that while the rejection sampling fine-tuning process had improved its ability to carry out multi-turn interactions in this task setting, there were instances where code edits reflected syntax closer to Python despite code and files viewed in previous steps clearly not being written in Python.
+While the result for SWE-agent-LM-32B on SWE-bench Multilingual is clearly subpar, we are excited by such a finding, as it motivates future work on top of \bugs.
+To elaborate, we expect that the path to open agent coding models capable of generalizing to many repositories and languages will be paved by more data and better training techniques, both of which
+\bugs
+is very capable of facilitating.
+First, regarding data, although we wrote
+\bugs
+to be Python centric, the collection methodology and bug generation techniques (especially LM based methods) should be readily transferable to other repositories.
+Second, the negative result on SWE-bench Multilingual provides a clear impetus for exploring whether better training techniques could lead to models that are trained on one code domain (e.g., Python), but can generalize to many languages and repositories.
+Figure 16:
+SWE-agent-LM-32B
+Pass@k curve on SWE-bench Verified.
+We observe higher % resolved when considering
+more runs.
+Figure 17:
+Rejection sampling fine-tuning
+leads to better performance than random sampling
+of trajectories for training.
+F.5
+Agent Behavioral Studies
+F.5.1
+Turn counts and cost
+While agents are frequently quoted with a singular cost-per-instance number, this can be very misleading in the case of SWE-agent-LM-32B.
+Because most of the failed instances fail due to termination by the cost or turn count limit, the average cost and turn counts depend strongly on these limits (see Fig. 18).
+We can also chart the number of resolved instances vs step limits.
+To avoid reevaluating the agent with multiple step limits, we use one run with step limit 75 and then assume that a successful agent run that terminates after step $n$ would have failed when restricted by a limit smaller than $n$.
+This chart corroborates the point made in section
+3
+: SWE-agent-LM-32B has a higher resolution rate for very low step limits.
+Figure 18:
+The average step count depends strongly on the prescribed step limit.
+Figure 19:
+Number of successful instances submitted before a given step limit.
+F.5.2
+Analysis of agent action space
+Reduction to
+base commands.
+In addition to the dedicated tools provided to the agent as part of the agent computer interface (Section
+F.1
+), the agent can execute arbitrary bash commands.
+This makes quantitative analyses of the agent action space challenging.
+For example, the agent might issue commands like
+PYTHONPATH=/testbed/repo cd /testbed/repo && python3 reproduce.py
+.
+We have found the following procedure to determine a
+base command
+effective to meaningfully describe the action:
+1.
+Strip any environment variable manipulation from the beginning of the command.
+2.
+When multiple commands are chained with
+&&
+or semicolons, only consider the last command.
+3.
+Remove all arguments. Because some commands have subcommands (e.g.,
+git checkout
+), we apply several basic heuristics to determine whether to keep the first or the first two words.
+Repetitive actions.
+We determine the longest repetitive sequence of actions by determining the longest sequence of identical base commands within the agent actions.
+Note that this means that e.g.,
+str_replace_editor view
+actions that target different files are considered to be repetitive actions as far as this analysis is concerned.
+F.5.3
+Failure mode analysis
+Categorizing the failure mode proceeds as shown in Figure
+20
+:
+1.
+Error conditions:
+If the agent terminates due to an error (environment errors, inability of the LM to correctly format its messages, etc.) or because it exceeded its maximum context window, we return the
+error
+or
+context
+category.
+2.
+Early termination:
+If the agent was terminated because of a step or cost limit, we return one of the
+stuck …
+subcategories. Note that the SWE-agent still attempts to extract a submission (list of changes/patch).
+We determine the subcategory based on the part of the agentic workflow loop in which the run was terminated:
+(a)
+If no source (i.e., non-test) file was modified (footnote 4: we exclude added files because solving SWE-bench instances always requires changes to existing files) and no attempt at testing was made, we return
+stuck at localization
+. If test commands were run (i.e.,
+python
+,
+pytest
+, …, or similar commands), we return
+stuck at reproduction
+.
+(b)
+If source files
+were
+modified, we check whether the changes include changes to all source files that are modified in the gold patch. If not, we return
+incorrect localization (stuck)
+, else
+incorrect edit (stuck)
+.
+3.
+Successful submission:
+If the agent terminated and submitted a solution naturally, we return
+incorrect localization
+or
+incorrect edit
+, depending on whether the changes from the submitted patch included changes to all files from the SWE-bench gold patch.
+Figure 20:
+Categorizing failure modes
+F.5.4
+Mitigating repetitive actions
+As described in section
+4
+,
+SWE-agent-LM-32B
+frequently shows highly repetitive actions for unresolved instances.
+In light of this, it seems promising to investigate whether agent scaffolding interventions can be used to mitigate the problem and increase the success rates.
+We make the following modification to the agent scaffold:
+•
+We add warning messages to the observation (command output) if a base command is repeated four (
+str_replace_editor view
+) or six (any other base command) times. The warning message advises trying different commands and, in particular, suggests locating relevant context using
+find
+or
+grep
+.
+•
+If the warning messages do not break the string of repetitive base commands and the repetition length reaches 6 (
+str_replace_editor view
+) or 8 (any other base command), every following action is resampled up to 10 times, stopping at the first base command that is distinct from the previous ones.
+To further increase the likelihood of breaking the cycle, we inject assistant messages or raise the temperature if the repetition length reaches 7 or 9.
+This effectively reduces the number of repetitive actions (see Fig.
+21
+).
+However, the overall number of resolved instances drops slightly to 192 ($38.4\%$).
+Variations of the above strategies yield similar outcomes: while repetition is suppressed, success rates do not improve substantially.
+This may suggest that repetitive actions are better understood as
+symptoms
+of the model’s difficulty in solving an instance (such as when the instance is out-of-distribution or particularly challenging) rather than constituting intrinsic failure modes.
+Figure 21:
+Scaffold interventions can drastically reduce the number of repetitive actions.
+Appendix G
+Miscellaneous
+Teaser figure description.
+We briefly describe how the left hand graph of Figure
+1
+, which depicts scaling of task instance collection for the
+\bugs
+vs. SWE-bench, was created.
+For
+\bugs
+, we simply collected the number of task instances for each repository.
+For SWE-bench, we ran the SWE-bench task instance candidate collection script on all $128$ repositories, which first crawls all PRs from a given repository.
+Then, each PR that edits at least one Python file and changes at least one testing-related file is converted into a candidate task instance.
+Finally, based on the average task instance yield rate reported in Jimenez et al. (2024b), we estimate the number of viable task instances to be $20\%$ of the candidates.
+We then determine the number of task instances for $n$ repositories at intervals of $5$ repositories ranging from $5$ to $250$, where the repositories are sorted by number of stars.
+In other words, the first five repositories we account for in the figure are the five with the fewest stars out of the $128$ repositories used.
+Extended related works.
+We discuss additional related works briefly, primarily about similar work towards synthesizing trajectories for training LM agents, but for the domain of web tasks.
+To improve the interactive capabilities of open source LMs
+(Chen et al.,
+2023
+)
+, prior works have also explored trajectory generation techniques for web benchmarks and settings
+(Xie et al.,
+2024
+; Yao et al.,
+2023a
+; Zhou et al.,
+2024
+)
+.
+For web navigation, existing strategies rely on (1) performing random walks which are then labeled retroactively with instructions
+(Xiang et al.,
+2023
+; Murty et al.,
+2024
+)
+, (2) using online web tutorials as a source of indirect supervision for generating synthetic trajectories
+(Ou et al.,
+2024
+)
+, or (3) collecting human demonstrations
+(Shen et al.,
+2024
+; Xu et al.,
+2024
+)
+.
+These procedures do not translate well to the software engineering setting; random sequences of command line interactions usually do not achieve meaningful effects on a codebase.
+Our cursory efforts around replaying trajectories synthesized from online code edit sequences (e.g., GitHub commit histories) were unsuccessful due to the limited information available, which primarily captures file-level changes without reflecting the underlying skills, decision-making, or the broader context of a software development process.
+Our exploration of using SWE-agent to automatically determine installation and testing specifications for a repository is heavily influenced by two research directions - automatic execution environment construction using LMs
+(Bogin et al.,
+2024
+; Eliseeva et al.,
+2025
+; Vergopoulos et al.,
+2025
+)
+, and generating unit tests using LMs
+(Mündler et al.,
+2025
+)
+.
+Although it needs far less effort than SWE-bench-style collection, \bugs still requires a small amount of human labor (around $8$ minutes total per repository).
+As we expand
+\bugs
+to more repositories and languages, we are continuing to consider how to completely automate the environment construction process end to end.
\ No newline at end of file
diff --git a/research/notes/chain-of-world-world-model-thinking-in-latent-motion.md b/research/notes/chain-of-world-world-model-thinking-in-latent-motion.md
new file mode 100644
index 0000000000000000000000000000000000000000..2434e10ce9d7b9f354ea6be6addeb3df1eebec9b
--- /dev/null
+++ b/research/notes/chain-of-world-world-model-thinking-in-latent-motion.md
@@ -0,0 +1,3280 @@
+---
+title: 'Chain of World: World Model Thinking in Latent Motion'
+id: chain-of-world-world-model-thinking-in-latent-motion
+tags:
+- deepread
+created: '2026-06-10T00:31:05.101951Z'
+source: https://arxiv.org/html/2603.03195
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:31:05.101809Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Chain of World: World Model Thinking in Latent Motion
+Title:
+Content selection saved. Describe the issue below:
+Description:
+License: arXiv.org perpetual non-exclusive license
+arXiv:2603.03195v1 [cs.CV] 03 Mar 2026
+Chain of World: World Model Thinking in Latent Motion
+Fuxiang Yang
+1,2
+Donglin Di
+2
+Lulu Tang
+3,6
+Xuancheng Zhang
+2
+Lei Fan
+4
+Hao Li
+2
+Wei Chen
+2
+Tonghua Su
+1,5
+Baorui Ma
+2
+1
+Harbin Institute of Technology
+2
+Li Auto
+3
+Beijing Academy of Artificial Intelligence (BAAI)
+4
+University of New South Wales
+5
+Chongqing Research Institute of HIT
+6
+Peking University
+hityangfx@foxmail.com, donglin.ddl@gmail.com, lulutang_@outlook.com
+xczhang.thu@gmail.com, lei.fan1@unsw.edu.au, {lihao43, chenwei10}@lixiang.com
+thsu@hit.edu.cn, mabaorui2014@gmail.com
+Work done during an internship at Li Auto. Corresponding author. Project leader and corresponding author.
+Abstract
+Vision-Language-Action (VLA) models are a promising path toward embodied intelligence, yet they often overlook the predictive and temporal-causal structure underlying visual dynamics.
+World-model VLAs address this by predicting future frames, but waste capacity reconstructing redundant backgrounds.
+Latent-action VLAs encode frame-to-frame transitions compactly, but lack temporally continuous dynamic modeling and world knowledge.
+To overcome these limitations, we introduce CoWVLA (Chain-of-World VLA), a new “Chain of World” paradigm that unifies world-model temporal reasoning with a disentangled latent motion representation.
+First, a pretrained video VAE serves as a latent motion extractor, explicitly factorizing video segments into structure and motion latents.
+Then, during pre-training, the VLA learns from an instruction and an initial frame to infer a continuous latent motion chain and predict the segment’s terminal frame.
+Finally, during co-fine-tuning, this latent dynamic is aligned with discrete action prediction by jointly modeling sparse keyframes and action sequences in a unified autoregressive decoder.
+This design preserves the world-model benefits of temporal reasoning and world knowledge while retaining the compactness and interpretability of latent actions, enabling efficient visuomotor learning.
+Extensive experiments on robotic simulation benchmarks show that CoWVLA outperforms existing world-model and latent-action approaches and achieves moderate computational efficiency, highlighting its potential as a more effective VLA pretraining paradigm.
+The project website can be found at https://fx-hit.github.io/cowvla-io.
+1
+Introduction
+Figure 1
+:
+Comparison of VLA pretraining strategies.
+(a)
+World Model
+: It predicts future visual frames, leading to redundant background reconstruction.
+(b)
+Latent Action
+: It learns the frame-to-frame transition using a visual encoder $E$, but lacks temporally continuous reasoning.
+(c)
+CoWVLA
+: Our method first uses a video encoder $E$ to decompose each video segment into motion and structure latents, and then trains the VLM to infer latent motion and predict the terminal frame of the segment given the instruction and the initial frame.
+Embodied intelligence aims to build agents that can perceive, understand, and act in the physical world.
+Vision-Language-Action (VLA) models represent a significant step toward this goal, unifying multimodal perception and motor control into end-to-end transformers
+[
+61
+,
+24
+,
+3
+,
+34
+]
+.
+While effective at mapping visual observations and language instructions directly to actions for many tasks, standard VLAs lack the future prediction capabilities that humans possess, which has spurred interest in enriching them with predictive world models
+[
+1
+,
+5
+]
+.
+A prominent approach integrates world models into VLAs by predicting future visual frames to explicitly model environmental dynamics, as illustrated in Figure
+1
+(a).
+Methods such as WorldVLA
+[
+7
+]
+, UniVLA
+[
+50
+]
+, and FlowVLA
+[
+58
+]
+typically built on large-scale autoregressive transformers, learn to anticipate future states and thus benefit action policy learning.
+While effective, this paradigm has fundamental limitations.
+It requires modeling entire visual frames containing substantial static and redundant background pixels, leading to near-trivial pixel replication rather than focusing on meaningful motion and dynamic change.
+Furthermore, quantizing images
+[
+15
+]
+into discrete tokens results in excessively long sequences and severe training inefficiency when multiple frames are used.
+From a cognitive standpoint, such frame prediction is misaligned with how humans model the world: we reason about motion and interactions rather than rebuilding every pixel in memory.
+This observation raises an important question:
+can we build a more compact, abstract, and dynamic form of world modeling?
+The latent action paradigm
+[
+54
+,
+12
+,
+6
+,
+11
+]
+offers compelling inspiration.
+As shown in Figure
+1
+(b), it encodes frame-to-frame transitions as latent actions, which serve as abstract motion carriers for world modeling, enabling large-scale pretraining using the pseudo-action labels built from videos.
+However, we identify two critical limitations in the current latent action paradigm compared to world models.
+First, world models perform temporally continuous dynamic modeling, whereas existing latent actions often focus only on the change between two frames
+[
+54
+,
+12
+,
+6
+]
+.
+Second, world models, through future frame prediction, learn generalizable knowledge for task execution and common sense about the world.
+In contrast, latent actions only encode “how to move”, but lack an understanding of what is moving, where the motion happens, or how the scene should evolve after the motion.
+To address these limitations, we propose Chain-of-World VLA (CoWVLA), which establishes a new paradigm that unifies the advantages of both approaches, as shown in Figure
+1
+(c).
+Our key insight is that effective world modeling requires both the compactness of motion representations and the temporal continuity and world knowledge of frame prediction.
+We argue that it is possible to extract continuous and compact motion representations from video clips, suggesting the need for a model capable of decoupling the content structure and motion in videos.
+Such motion representations serve as carriers for perceiving essential dynamic changes and further enable the model to reason about keyframes after temporal evolution, thereby preserving crucial visual landmarks.
+Specifically, our approach employs a pretrained video VAE as a latent motion extractor, which explicitly disentangles each video segment into structure and motion representations, providing compact and interpretable supervision for downstream visuomotor learning.
+We then train a unified VLA decoder through two stages.
+During the pre-training stage, the model learns to infer latent dynamics and predict the terminal frame of a video segment given the instruction and initial frame, thereby establishing a dynamics-aware world prior in the latent motion space.
+During the subsequent co-fine-tuning stage, this prior is further aligned with discrete action prediction by jointly modeling sparse keyframes and action sequences in a unified autoregressive manner.
+This design combines the interpretability and compactness of latent motion with the temporal reasoning and world knowledge of world models, achieving efficient and robust visuomotor learning without reconstructing redundant intermediate frames.
+In summary, our contributions are as follows:
+•
+We present CoWVLA, establishing the “Chain-of-World” paradigm that unifies world modeling and latent action learning through continuous latent-motion sequences and terminal keyframe prediction.
+•
+We introduce a structure-motion disentangled latent prior that yields interpretable, continuous, and effective dynamic representations.
+•
+We conduct extensive experiments demonstrating that CoWVLA achieves state-of-the-art performance across multiple benchmarks, surpassing existing world-model and latent-action approaches.
+2
+Related Work
+Vision-Language-Action Models.
+Deep learning has been widely applied in various industrial scenarios, such as visual anomaly detection
+[
+17
+,
+16
+]
+.
+Recent vision-language-action (VLA) models have rapidly advanced toward directly generating actions from visual and language inputs within a unified framework
+[
+61
+,
+24
+,
+34
+,
+25
+,
+36
+,
+35
+,
+22
+,
+3
+,
+18
+,
+41
+,
+46
+]
+.
+RT-2
+[
+61
+]
+pioneered this direction by treating robotic control as a sequence modeling problem, fine-tuning a pretrained vision-language model on robotic data to output discretized action tokens.
+This approach was scaled up by RT-X
+[
+34
+]
+, which demonstrated the benefits of joint training across diverse robot platforms and tasks. OpenVLA
+[
+24
+,
+25
+]
+further democratized this effort with an open-source implementation.
+FAST
+[
+35
+]
+introduced a unified frequency-domain formulation for discretizing actions, enhancing temporal correlation in discrete control.
+Meanwhile, another line of research explores continuous trajectory generation
+[
+13
+,
+3
+,
+28
+,
+21
+]
+.
+They leverage diffusion or flow-matching models to generate continuous, high-frequency action sequences.
+However, most existing methods primarily focus on action space modeling, with limited capability to capture how the environment evolves.
+Figure 2
+:
+Overview of the CoWVLA framework.
+CoWVLA consists of two core components: a latent motion extractor and a VLA decoder.
+The latent motion extractor, implemented as a video VAE, disentangles each video segment into a structure latent $z_{s}$ and two directional motion latents $z_{m}^{h}$ and $z_{m}^{w}$, which are concatenated into a unified latent motion vector $z_{m}$.
+The VLA decoder performs unified autoregressive modeling over multimodal sequences.
+During pre-training, the model takes the instruction and initial frame as input, and uses a learnable motion query $Q$ to predict the latent motion $\hat{z}_{m}$ while reconstructing the terminal frame of the video segment.
+During co-fine-tuning, the input expands into alternating keyframe–action pairs; $Q$ continues to aggregate temporally continuous latent dynamics, guiding multi-step action generation under sparse visual observations.
+World Models for Robotics.
+World models are commonly employed to capture environment states and their future evolution, and have been widely applied in areas such as autonomous driving
+[
+48
+,
+51
+]
+, image and video generation
+[
+5
+,
+31
+,
+53
+,
+45
+,
+42
+,
+14
+]
+, and robotics
+[
+50
+,
+7
+,
+1
+,
+38
+,
+19
+,
+56
+]
+.
+When combined with VLA models, most approaches
+[
+52
+,
+8
+,
+57
+,
+7
+,
+50
+,
+58
+]
+rely on predicting future visual states to provide implicit world knowledge and demonstrate improved performance in robotic manipulation.
+UVA
+[
+29
+]
+further jointly optimizes video prediction and action prediction using diffusion models, enhancing both visual reasoning and control inference efficiency.
+However, these methods require reconstructing full visual frame sequences, leading to high computational cost and heavy resource consumption.
+Latent Actions for Robotics.
+Latent-action methods learn a compact latent transition between two frames to model environment dynamics.
+LAPA
+[
+54
+]
+introduces a three-stage framework (including latent action quantization, latent pretraining, and action fine-tuning), leveraging large-scale pseudo-action supervision to improve learning of real-world robotic control. MoTo
+[
+12
+]
+follows this paradigm with enhancements in motion quantization and real action quality.
+TLA
+[
+6
+]
+further disentangles task-relevant and task-irrelevant motion factors.
+However, these approaches generally restrict latent action modeling to frame pairs, limiting their ability to capture long-range temporal dynamics. Although Villa-X
+[
+11
+]
+extends latent actions to multi-frame settings, it still generates one latent action per local frame pair, resulting in limited temporal consistency.
+Moreover, the latent action representations inevitably encode static appearance and contextual details.
+While TLA
+[
+6
+]
+mitigates this issue by decoupling task relevance, an ideal latent space should explicitly separate structure from motion, producing cleaner and more interpretable action representations.
+Video Compression and Decoupling.
+Recent methods in video representation learning have increasingly focused on compressing visual information into disentangled latent spaces that separately encode spatial structure and temporal motion
+[
+53
+,
+40
+,
+26
+,
+55
+,
+49
+]
+.
+The design of our latent motion space is inspired by these advances.
+Models like CMD
+[
+55
+]
+and VidTwin
+[
+49
+]
+have successfully disentangled overall content and dynamic information in a highly compressed latent space.
+This factorization provides a compact, continuous, and meaningful representation of how scenes evolve.
+While these models were developed for video generation, we are the first to hypothesize and demonstrate that their pretrained latent motion space can serve as a powerful dynamic prior for a robotic world model.
+3
+Method
+3.1
+Overall Framework
+We consider a robotic manipulation task that involves executing a sequence of actions conditioned on a language instruction and visual observations.
+The instruction is denoted as $T$.
+The raw action sequence is $\mathbf{A}_{1:t}=\{a_{1},\ldots,a_{t}\}$.
+To enable discrete sequence modeling, the action sequence $\mathbf{A}_{1:t}$ is partitioned into consecutive chunks of fixed length $l_{a}$, i.e.,
+$$\mathbf{A}_{1:t}=\bigcup_{j=1}^{N}\mathbf{A}^{j},\quad\mathbf{A}^{j}=\{a_{(j-1)l_{a}+1},\ldots,a_{jl_{a}}\},$$
+and each chunk $\mathbf{A}^{j}$ is then quantized into a discrete token sequence $\mathbf{A}_{q}^{j}$ using the FAST [35] algorithm.
+The corresponding raw visual observation sequence is represented as $\mathbf{V}_{1:t}=\{v_{1},\ldots,v_{t}\}$, where each frame $v_{i}\in\mathbb{R}^{H\times W\times 3}$.
+We extract the first frame of each action chunk as a keyframe:
+$$\tilde{\mathbf{V}}=\{\tilde{v}_{j}\}_{j=1}^{N}=\{v_{(j-1)l_{a}+1}\}_{j=1}^{N},$$
+where each $\tilde{v}_{j}$ is subsequently quantized into a visual token $\tilde{v}_{q}^{j}$ using VQGAN [15].
+Additionally, a learnable motion query token $Q\in\mathbb{R}^{D_{Q}}$ is introduced as a world dynamics query, whose hidden representation summarizes past context and provides a future dynamics-aware conditioning signal for generating subsequent vision or action tokens.
+The overall framework consists of two models and three training stages.
+The first model is the latent motion extractor (video VAE paradigm), which encodes a video sub-sequence $\mathbf{V}_{1:f}$ into an intermediate latent $z\in\mathbb{R}^{d_{z}\times f\times h\times w}$, and decomposes it into a structural feature $z_{s}$ and two directional motion features $z_{m}^{h}$ and $z_{m}^{w}$.
+The two motion components are concatenated to form a unified latent motion vector $z_{m}\in\mathbb{R}^{D_{m}}$, providing the ground-truth supervision.
+The second model is the VLA decoder (Transformer-decoder paradigm), which performs unified autoregressive next-token prediction across modalities.
+During pre-training, the input sequence is organized as $[T,v_{q}^{1},Q,v_{q}^{f}]$.
+The final hidden representation corresponding to the query token $Q$, obtained from the VLA decoder, is fed into an MLP to predict the latent motion $\hat{z}_{m}$.
+This stage enables the model to infer latent dynamics and future observations from language and the initial visual input.
+During co-fine-tuning, we use alternating keyframes and action tokens, e.g., $[T,\tilde{v}_{q}^{1},Q,\mathbf{A}_{q}^{1},\tilde{v}_{q}^{2},\mathbf{A}_{q}^{2},\ldots]$.
+The model continues to predict a latent motion vector $\hat{z}_{m}$ at the position of $Q$. As a result, the model maintains explicit dynamics reasoning under sparse keyframe observations and generates stable multi-step actions from compact latent representations.
+3.2
+Latent Motion Extractor
+To encode temporal dynamics in a compact latent space, we adopt a pretrained video variational autoencoder
+[
+49
+]
+as the latent motion extractor.
+As illustrated in Figure
+2
+, the extractor achieves structure–motion disentanglement through two dedicated branches.
+Given a video segment $\mathbf{V}_{1:f}$, the encoder produces a latent tensor
+$$z\in\mathbb{R}^{d_{z}\times f\times h\times w}.$$
+The structure branch employs a Q-Former [27] module with a set of learnable queries $\{q_{i}\}_{i=1}^{n_{q}}$ to aggregate global semantics and low-frequency dynamics along the temporal dimension, yielding
+$$z_{s}\in\mathbb{R}^{d_{s}\times n_{q}\times h_{s}\times w_{s}},\quad n_{q}\leq f.$$
+The motion branch operates along spatial dimensions: several convolutional layers reduce the dimension of $z$ and produce $z^{\prime}\in\mathbb{R}^{d_{m}\times f\times h_{m}\times w_{m}}$.
+Then, spatial averaging $\mu(\cdot)$ is applied independently along the height and width axes to extract directional motion embeddings:
+$$z_{m}^{h}=\mu_{h}(z^{\prime})\in\mathbb{R}^{d_{m}\times f\times w_{m}},\quad z_{m}^{w}=\mu_{w}(z^{\prime})\in\mathbb{R}^{d_{m}\times f\times h_{m}}.$$
+These two motion components are concatenated and flattened to form a unified latent motion representation:
+$$z_{m}\in\mathbb{R}^{D_{m}},\quad D_{m}=f\times d_{m}\times(h_{m}+w_{m}).$$
+In the decoder stage, the three latent components $(z_{s},z_{m}^{h},z_{m}^{w})$ are upsampled through convolutional and MLP layers to the same spatial and temporal size, summed together, and then fed into the decoder to reconstruct $\hat{\mathbf{V}}_{1:f}$.
+The training objective follows the original VAE design [49], combining reconstruction loss $\mathcal{L}_{\text{rec}}$, perceptual loss $\mathcal{L}_{p}$, adversarial loss $\mathcal{L}_{\text{GAN}}$, and KL-divergence regularization loss $\mathcal{L}_{\text{KL}}$ to preserve temporal consistency and visual realism:
+$$\mathcal{L}_{vae}=\mathcal{L}_{\text{rec}}+\lambda_{p}\mathcal{L}_{p}+\lambda_{\text{GAN}}\mathcal{L}_{\text{GAN}}+\lambda_{\text{KL}}\mathcal{L}_{\text{KL}}.\tag{1}$$
+Through explicit structure–motion disentanglement and mild adaptation, the extractor yields a compact, interpretable, and transferable latent representation well-suited for robotic scenarios, providing effective supervision for downstream VLA pre-training and co-fine-tuning.
+3.3
+Pre-training to Think in Latent Motion
+The pre-training stage aims to align language and initial visual observations with latent motion representations, enabling the model to reason about continuous temporal dynamics in the latent space and predict the terminal frame of the video segment.
+Given a continuous video segment $\mathbf{V}_{1:f}=\{v_{1},\ldots,v_{f}\}$, the latent motion extractor produces a latent motion supervision signal $z_{m}$.
+Its first and last frames are quantized into discrete visual tokens, denoted as $v_{q}^{1}$ and $v_{q}^{f}$, respectively.
+Based on this, we organize the input sequence to the VLA decoder as:
+$$[T,v_{q}^{1},Q,v_{q}^{f}],$$
+where $T$ denotes the instruction, $v_{q}^{1}$ represents the initial observation, $Q$ is a learnable motion query token, and $v_{q}^{f}$ corresponds to the visual state that would be reached after applying the underlying motion from $v_{1}$ through $z_{m}$.
+During the forward pass, the hidden state at the query position is fed to an MLP to predict the latent motion $\hat{z}_{m}$.
+To prevent information leakage, causal masking is applied so that $Q$ only attends to $\{T,v_{q}^{1}\}$ while being masked from $v_{q}^{f}$.
+The training objective contains latent motion supervision and terminal-frame visual consistency:
+$$\mathcal{L}_{\text{pretrain}}=\|\hat{z}_{m}-z_{m}\|_{2}^{2}+\sum_{x\in\{1,f\}}\mathrm{CE}(\hat{v}_{q}^{x},v_{q}^{x}),\tag{2}$$
+where the first term enforces that the latent representation extracted at $Q$ accurately summarizes the continuous motion from $v_{1}$ to $v_{f}$, while the second ensures that the model forms a coherent prediction of the resulting future state.
+Through this stage, the model learns to infer latent temporal dynamics directly from language and the initial frame, thus establishing a dynamics-aware prior for subsequent action modeling.
+3.4
+Co-Fine-Tuning for Aligning Latent Dynamics with Action Policies
+After the pre-training stage establishes a dynamics-aware prior in the latent motion space, the co-fine-tuning stage further aligns latent motion reasoning with discrete action modeling in a unified autoregressive framework, enabling stable multi-step control under sparse keyframe observations.
+Given a continuous video sequence $\mathbf{V}_{1:f}$ and its corresponding action sequence $\mathbf{A}_{1:f}$, we extract $N=f/l_{a}$ keyframes and quantize them into visual tokens:
+$$\tilde{\mathbf{V}}_{q}=\{\tilde{v}_{q}^{1},\ldots,\tilde{v}_{q}^{N}\},$$
+where $\tilde{v}_{q}^{j}=v_{q}^{(j-1)l_{a}+1}$.
+We further quantize the action sequence using FAST [35]:
+$$\mathbf{A}_{1:f}\ \xrightarrow{\text{FAST}}\ \{\mathbf{A}_{q}^{1},\ldots,\mathbf{A}_{q}^{N}\}.$$
+The input sequence adopts a “single-$Q$ for the full window” design:
+$$[T,\ \tilde{v}_{q}^{1},\ Q,\ \mathbf{A}_{q}^{1},\ \tilde{v}_{q}^{2},\ \mathbf{A}_{q}^{2},\ \ldots,\ \mathbf{A}_{q}^{N}],$$
+where the query token $Q$ appears only once after the first keyframe and serves as a latent dynamics aggregator for the entire temporal horizon. The decoder autoregressively predicts both action and visual tokens; the hidden state at $Q$ is passed through an MLP to produce a single latent motion vector $\hat{z}_{m}$, enforcing consistency between latent dynamics and subsequent predictions.
+As in pre-training, causal masking prevents $Q$ from attending to future keyframes and actions, compelling the model to reason over latent dynamics rather than directly peeking at future states.
+The co-fine-tuning objective consists of three terms:
+$$\mathcal{L}_{\text{finetune}}=\sum_{j=1}^{N}\mathrm{CE}\!\left(\hat{\mathbf{A}}_{q}^{j},\ \mathbf{A}_{q}^{j}\right)+\lambda_{1}\left\|\hat{z}_{m}-z_{m}(\mathbf{V}_{1:f})\right\|_{2}^{2}+\lambda_{2}\sum_{j=1}^{N}\mathrm{CE}\!\left(\hat{\tilde{v}}_{q}^{j},\ \tilde{v}_{q}^{j}\right). \tag{3}$$
+Here, $z_{m}(\mathbf{V}_{1:f})$ is a continuous latent motion supervision signal produced by the pretrained extractor.
+The first term ensures accurate execution of discrete actions.
+The second term encourages the latent representation at the query token to faithfully capture the continuous dynamics from $v_{1}$ to $v_{f}$.
+The third term anchors motion predictions to sparse visual checkpoints, maintaining consistent state transitions driven by the predicted dynamics.
+Table 1
+:
+Comparison of different methods on the LIBERO
+[
+32
+]
+and SimplerEnv-WidowX
+[
+30
+]
+benchmarks.
+The best and the second-best values for each metric are bold and
+underlined
+respectively.
+Model
+LIBERO
+SimplerEnv-WidowX
+SPATIAL
+OBJECT
+GOAL
+LONG
+Avg.
+Stack Block
+Put Carrot
+Put Spoon
+Put Eggplant
+Avg.
+OpenVLA
+[
+24
+]
+0.849
+0.884
+0.792
+0.537
+0.765
+0.000
+0.000
+0.000
+0.041
+0.010
+SpatialVLA
+[
+36
+]
+0.882
+0.899
+0.786
+0.555
+0.781
+0.292
+0.250
+0.167
+1.000
+0.427
+CogACT
+[
+28
+]
+0.960
+0.874
+0.868
+0.846
+0.887
+0.150
+0.508
+0.717
+0.675
+0.513
+Dita
+[
+21
+]
+0.842
+0.963
+0.854
+0.638
+0.824
+–
+–
+–
+–
+–
+$\pi_{0}$
+[
+3
+]
+0.968
+0.988
+0.958
+0.852
+0.942
+0.167
+0.000
+0.291
+0.625
+0.401
+$\pi_{0}$-FAST
+[
+35
+]
+0.964
+0.968
+0.886
+0.602
+0.855
+0.108
+0.219
+0.291
+0.666
+0.483
+GR00T N1
+[
+2
+]
+0.944
+0.976
+0.930
+0.906
+0.939
+0.167
+0.458
+0.625
+0.208
+0.495
+w/ Latent Actions
+LAPA
+[
+54
+]
+–
+–
+–
+–
+–
+0.542
+0.458
+0.708
+0.583
+0.573
+villa-X
+[
+11
+]
+0.975
+0.970
+0.915
+0.745
+0.901
+0.613
+0.463
+0.779
+0.646
+0.625
+TLA
+[
+6
+]
+0.965
+0.968
+0.956
+0.920
+0.952
+0.028
+0.556
+0.528
+0.806
+0.480
+w/ World Model
+WorldVLA
+[
+7
+]
+0.856
+0.890
+0.826
+0.590
+0.791
+–
+–
+–
+–
+–
+CoT-VLA
+[
+57
+]
+0.875
+0.916
+0.876
+0.690
+0.811
+–
+–
+–
+–
+–
+UniVLA
+[
+50
+]
+0.960
+0.992
+0.932
+0.914
+0.950
+0.292
+0.625
+0.833
+1.000
+0.687
+FlowVLA
+[
+58
+]
+0.932
+0.950
+0.916
+0.726
+0.881
+0.625
+0.625
+0.708
+1.000
+0.740
+Ours
+0.972
+0.978
+0.946
+0.928
+0.956
+0.625
+0.667
+0.792
+0.958
+0.760
+4
+Experiments
+4.1
+Benchmarks
+LIBERO.
+The LIBERO
+[
+32
+]
+benchmark is designed for studying knowledge transfer in multitask and lifelong robot learning, requiring both
+declarative knowledge
+about objects and spatial relations and
+procedural knowledge
+about motion and behaviors.
+It contains four task suites: LIBERO-Spatial emphasizes spatial reasoning by placing a bowl based on its location, LIBERO-Object focuses on object recognition via picking and placing distinct objects, LIBERO-Goal tests procedural learning with varying task goals under fixed objects, and LIBERO-Long contains ten long-horizon tasks with diverse objects, layouts, and goals.
+SimplerEnv.
+SimplerEnv
+[
+30
+]
+is a collection of manipulation evaluation environments for common real-world robot setups, showing strong correlation with real-robot performance. It enables assessing the transferability and generalization of models trained on real-world video data. We evaluate on four tasks using a 7-DoF WidowX robotic arm.
+4.2
+Implementation Details
+Our latent motion extractor is built upon a pretrained video VAE (VidTwin
+[
+49
+]
+) and is further fine-tuned on a robot-centric dataset consisting of 237k videos (details provided in the appendix).
+Each video segment is uniformly sampled to 16 frames and resized to $224\times 224$.
+The structure latent $z_{s}$ has a shape of $4\times 16\times 7\times 7$, while the directional motion embeddings $z_{m}^{h}$ and $z_{m}^{w}$ have shapes of $8\times 16\times 7$.
+The motion latent dimension is $D_{m}=1792$.
+The backbone of our VLA model follows the design of UniVLA
+[
+50
+]
+and is based on the 8.5B-parameter VLM Emu3
+[
+47
+]
+.
+Visual observations are quantized into discrete tokens using VQGAN
+[
+15
+]
+, while actions are partitioned into chunks and discretized into tokens using the FAST algorithm
+[
+35
+]
+.
+During the pre-training stage, we trained the model using the aforementioned 237k videos with pretrained Emu3 initialization.
+From each video, we extracted a frame sequence of length $f=16$, where the first and last frame tokens supervise visual modeling, and the latent motion extracted from VidTwin provides supervision.
+We trained using a batch size of 256 for 10k steps.
+During the co-fine-tuning stage, we initialized from the pretrained checkpoint and trained on the benchmark-specific datasets.
+For the LIBERO benchmark, we used the mixed data from the four task suites curated by OpenVLA
+[
+24
+]
+, including both third-person and wrist-mounted views.
+We trained the model with a batch size of 128 for 8k iterations, resized all images to $200\times 200$, set the action chunk length to $l_{a}=10$, and used $\lambda_{1}=0.1$ and $\lambda_{2}=0.01$.
+For SimplerEnv, we trained the model on the Bridge V2 dataset
+[
+43
+]
+with a batch size of 128 for 12k iterations.
+Single-view images were resized to $256\times 256$, the action chunk length was set to $l_{a}=5$, and we used $\lambda_{1}=0.1$ and $\lambda_{2}=0$.
+In the co-fine-tuning stage, we set $N=2$, where two visual observations and two corresponding ground-truth action chunks were used.
+Further training details and supplementary results are provided in the appendix.
+Table 2
+:
+Evaluation of VAE-Reconstructed Videos and downstream fine-tuning performance on SimplerEnv-WidowX
+[
+30
+]
+.
+Model
+Reconstruction Metrics
+Simulation Evaluation
+PSNR $\uparrow$
+SSIM $\uparrow$
+LPIPS $\downarrow$
+Stack Block
+Put Carrot
+Put Spoon
+Put Eggplant
+Average
+Pretrain
+32.7
+0.923
+0.122
+0.458
+0.750
+0.792
+0.917
+0.729
+Finetune
+33.4
+0.934
+0.123
+0.625
+0.667
+0.792
+0.958
+0.760
+Figure 3
+:
+Visualization of the disentangled motion and structure latents.
+We select two frames ($t_{1}$ and $t_{2}$) and show the original (Orig.) and reconstructed (Recon.) frames.
+“M. Recon.” and “S. Recon.” denote the reconstructions obtained by decoding only the motion latent or only the structure latent, respectively.
+The structure latent preserves the global scene layout, whereas the motion latent captures motion and fine-grained temporal details.
+4.3
+Comparison with SOTA Methods
+We compared CoWVLA against three representative categories of methods: VLA baselines (OpenVLA
+[
+24
+]
+, SpatialVLA
+[
+36
+]
+, CogACT
+[
+28
+]
+Dita
+[
+21
+]
+,
+$\pi_{0}$
+[
+3
+]
+,
+$\pi_{0}$-FAST
+[
+35
+]
+, GR00T-N1
+[
+2
+]
+), latent-action approaches (LAPA
+[
+54
+]
+, villa-X
+[
+11
+]
+, TLA
+[
+6
+]
+), and world-model approaches (WorldVLA
+[
+7
+]
+, CoT-VLA
+[
+57
+]
+, UniVLA
+[
+50
+]
+, FlowVLA
+[
+58
+]
+).
+These methods respectively model: (i) actions directly, (ii) frame-to-frame latent transitions, and (iii) pixel/token-level future frames.
+They collectively represent the main paradigms in current VLA pretraining and provide strong and fair comparison points.
+The results are shown in Table
+1
+.
+Overall, our CoWVLA achieves SOTA performance with superior cross-domain robustness.
+We observe that TLA achieves a strong 0.952 on LIBERO but significantly drops to 0.480 on SimplerEnv, while FlowVLA is strong on SimplerEnv (0.740) but noticeably weaker on LIBERO (0.881). UniVLA shows a more balanced performance (0.950/0.687).
+In contrast, CoWVLA achieves 0.956/0.760 on the two benchmarks, outperforming UniVLA on both and demonstrating higher absolute performance and greater cross-domain stability.
+4.4
+Latent Motion Analysis
+In this subsection, we analyze the effectiveness of the proposed disentangled latent space from three perspectives: the separation of structure and motion factors, the improved adaptiveness of the motion latent after fine-tuning on robot data, and the enhanced capability of modeling future dynamics. These results collectively verify that our latent motion representation provides a clearer physical prior and stronger action reasoning ability.
+Effective decoupling of structure and motion latent.
+As shown in Figure
+3
+, we reconstruct frames using only the motion latent (M. Recon.) or only the structure latent (S. Recon.).
+The structure latent preserves global scene layout and object appearance, whereas the motion latent captures robot arm trajectories and fine-grained temporal dynamics.
+Figure
+4
+provides additional evidence through cross-reconstruction.
+Since motion cues are subtle in individual frames, we visualize the pixel-wise differences, which highlight the motion-affected regions and show that injecting the motion latent alters only the dynamic parts while keeping the static structure intact.
+These visualizations demonstrate that our latent space effectively separates content structure and dynamic information, providing a more interpretable representation for downstream visuomotor reasoning.
+Fine-tuning on robot data improves motion latent quality.
+As presented in Table
+2
+, fine-tuning the latent motion extractor on robot data not only improves reconstruction quality (higher PSNR and SSIM) but also boosts downstream performance. In the SimplerEnv-WidowX evaluation, the average task success rate increases from 0.729 to 0.760. This confirms that motion latents adapted to the robot domain contain higher-quality dynamic cues that benefit policy learning.
+Motion latent enhances dynamic modeling for future frame prediction.
+As illustrated in Figure
+5
+, we visualize the future frame predictions under different pretraining strategies.
+From top to bottom, the tasks in each subfigure are:
+i) pick up the black bowl from the table center and place it on the plate,
+ii) sweep into a pile.
+World-model-based approaches reconstruct redundant background pixels and therefore struggle to focus on interactive motion, while single-goal-frame prediction lacks supervision of temporal evolution and often produces unstable goal frames.
+This leads both strategies to easily generate results with no changes, such as Figure
+5
+(b) Task i.
+In contrast, our model leverages the motion latent as a “chain of world” during reasoning, achieving physically plausible future states that align more closely with the instructions.
+Figure 4
+:
+Cross-reconstruction visualization.
+We extract the structure latent from the static video in the first row and the motion latent from the robot-arm motion video in the second row.
+By combining the two latents, we reconstruct the video shown in the third row.
+We compute the difference between the cross-reconstructed frames and the static frames to highlight the changed regions, which correspond to the robot arm’s motion.
+Figure 5
+:
+Comparative visualization of future-frame prediction strategies.
+There are two tasks demonstrated: i) pick up the black bowl from the table center and place it on the plate, and ii) sweep into a pile.
+(a) The world-model approach predicts five future frames.
+(b) The single-goal-frame approach predicts one goal frame.
+(c) Our method reasons through a learned motion latent $z_{m}$, producing more reasonable and instruction-aligned frames.
+4.5
+Ablation and Efficiency Analysis
+In this section, we conduct an in-depth analysis of key modules, hyperparameter settings, and training efficiency.
+Experiments in Table
+3
+and Table
+4
+adhere to a unified dataset and training configuration, with a batch size of 256 for 10k steps during the pre-training phase and a batch size of 128 for 8k steps during the co-fine-tuning phase.
+In Table
+3
+, we provide a unified comparison of the effectiveness of latent action, world model, and our proposed method.
+In Table
+4
+, we analyze the effect of the loss weighting ratio between the latent motion loss ($\lambda_{1}$) and the visual token loss ($\lambda_{2}$) on task success rates during the co-fine-tuning stage.
+In addition, we analyze the pre-training cost and task success rate of different methods in Figure
+6
+.
+The main conclusions are as follows.
+Table 3
+:
+Ablation study on the LIBERO
+[
+32
+]
+benchmark.
+Config
+Variant
+Spatial
+Object
+Goal
+Long
+Average
+Latent Action
+w/o LA
+0.622
+0.146
+0.694
+0.328
+0.448
+LAPA style
+0.718
+0.852
+0.804
+0.488
+0.716
+villa-X style
+0.840
+0.904
+0.834
+0.668
+0.812
+structure latent
+0.856
+0.898
+0.822
+0.692
+0.817
+motion latent
+0.916
+0.932
+0.886
+0.774
+0.877
+World Model
+UniVLA Style
+0.958
+0.978
+0.932
+0.898
+0.942
+CoT-VLA style
+0.942
+0.964
+0.950
+0.838
+0.924
+Ours
+motion
+0.960
+0.980
+0.922
+0.882
+0.936
+motion & cot
+0.948
+0.974
+0.958
+0.906
+0.947
+i) Our latent motion modeling significantly outperforms existing latent action methods.
+The “Latent Action” part of Table
+3
+compares several baselines.
+The “w/o LA” variant, which skips pre-training and fine-tunes directly on LIBERO data, achieves the lowest average success rate (0.448). “LAPA style” (0.716) and “villa-X style” (0.812) both outperform the “w/o LA” variant, with “villa-X style” achieving stronger performance by modeling richer multi-frame information.
+Our method separates the latent into a “structure latent” (0.817) capturing content and texture, and a “motion latent” (0.877) encoding dynamic information.
+Modeling with the cleaner motion notably improves task success rate.
+ii) World model methods show stronger overall performance than latent action methods.
+In the “World Model” part of Table
+3
+, both “UniVLA style” (pretrained with six frames) and “CoT-VLA style” (pretrained with initial and target frames) achieve higher success rates (0.942 and 0.924, respectively) than those methods in the “Latent Action” category.
+Notably, “UniVLA style”, which uses more frames, performs better, indicating that world model methods have a distinct advantage in temporal modeling and learning knowledge of environmental evolution.
+iii) Our method achieves superior performance to latent action and world models.
+The “Ours” part in Table
+3
+presents two configurations of our method. Both use latent motion supervision during pre-training and set $\lambda_{1}=0.1,\ \lambda_{2}=0$ during fine-tuning (i.e., using only real action and latent motion losses). The “motion” configuration does not use the final frame $v_{f}$ during pre-training and achieves a success rate of 0.936.
+In contrast, the “motion & cot” configuration adds supervision from $v_{f}$ during pre-training and improves the success rate to 0.947.
+This yields two conclusions: first, introducing latent motion during the fine-tuning phase effectively guides the inference of real actions; second, introducing $v_{f}$ as an evolutionary target during pre-training significantly enhances the model’s perception and understanding of environmental evolution.
+Table 4
+:
+Ablation study of loss weights on the LIBERO
+[
+32
+]
+benchmark.
+$\lambda_{1}$
+$\lambda_{2}$
+Spatial
+Object
+Goal
+Long
+Average
+0.0
+0.0
+0.922
+0.962
+0.862
+0.742
+0.872
+0.1
+0.0
+0.960
+0.980
+0.922
+0.882
+0.936
+1.0
+0.0
+0.958
+0.970
+0.950
+0.902
+0.945
+0.1
+0.05
+0.954
+0.972
+0.944
+0.914
+0.946
+0.1
+0.01
+0.970
+0.964
+0.958
+0.926
+0.955
+1.0
+0.01
+0.970
+0.956
+0.934
+0.922
+0.946
+Figure 6
+:
+Comparison of pre-training efficiency and task performance on LIBERO
+[
+32
+]
+across different methods.
+Blue and orange circles denote world-model and latent-action baselines, respectively, while green circles denote our configurations.
+Circle size indicates training-time GPU memory usage.
+Our method balances pre-training efficiency and performance, achieving a higher success rate with moderate computational efficiency.
+iv) Balancing latent motion and visual token losses during co-fine-tuning further improves performance.
+Table
+4
+presents an ablation study on the loss weights $\lambda_{1}$ (latent motion) and $\lambda_{2}$ (visual token) during the co-fine-tuning stage, based on the same pretrained model.
+First, we fix $\lambda_{2}=0$ to analyze the impact of $\lambda_{1}$. When $\lambda_{1}=0$ (no latent motion loss), the success rate is only 0.872. As $\lambda_{1}$ increases from 0.1 to 1.0, the success rate improves from 0.936 to 0.945, indicating that the guiding effect of latent motion is strengthening.
+Next, we introduce the visual token loss $\lambda_{2}$.
+By comparing ($\lambda_{1}=0.1,\lambda_{2}=0.05$) at 0.946 and ($\lambda_{1}=0.1,\lambda_{2}=0.01$) at 0.955, we find that the weight for visual token prediction should not be too high.
+Then we tune $\lambda_{1}=1.0$ and $\lambda_{2}=0.01$, achieving an average success rate of 0.946.
+This shows that simultaneously introducing latent motion ($\lambda_{1}=0.1$) and a low-weighted visual token prediction ($\lambda_{2}=0.01$) during the fine-tuning phase most effectively guides the inference of real actions.
+v) Our method balances pre-training efficiency and performance.
+As shown in Figure
+6
+, we compare several methods from Table
+3
+in terms of training speed, GPU memory usage, and task success rate (batch size = 4 per GPU).
+UniVLA is the slowest and most memory-intensive, while LAPA is the fastest but less successful.
+Our method has two configurations: “motion” without $v_{f}$ achieves the second-fastest speed and slightly lower performance than UniVLA, and “motion & cot” with $v_{f}$ achieves a better balance of efficiency and performance, surpassing UniVLA in both.
+5
+Conclusion
+In this work, we presented CoWVLA, which for the first time integrates the temporal reasoning capability of world models with a disentangled latent motion representation, enabling world modeling directly in a structure–motion separated latent space.
+By introducing the Chain-of-World paradigm, our method predicts a continuous latent motion chain and a terminal keyframe from the instruction and initial observation, compactly capturing temporal evolution and physical dynamics without reconstructing intermediate pixels.
+Extensive experiments on LIBERO and SimplerEnv benchmarks demonstrate that CoWVLA outperforms both world-model and latent-action approaches, while offering improved dynamic consistency and visuomotor grounding, thereby providing a more efficient pretraining route toward general-purpose robotic manipulation.
+Limitations
+.
+Despite its promising results, our approach still has limitations.
+The latent motion space remains dependent on the quality and domain coverage of the pretrained video VAE, which may introduce distribution mismatch in new environments.
+Moreover, the model relies on a large VLA backbone and substantial computational resources.
+We believe exploring more lightweight and scalable architectures, as well as further enhancing the coupling between latent dynamics and action learning, will broaden the applicability of our method to real-world robotics.
+6
+Acknowledgments
+This work was supported by the National Natural Science Foundation of China (Grant No. 62277011), Project of Chongqing MEITC (Grant No. YJX-2025001001009), and CAAI-CANN Open Fund, developed on OpenI Community.
+References
+Assran et al. [2025]
+Mido Assran, Adrien Bardes, David Fan, et al.
+V-jepa 2: Self-supervised video models enable understanding, prediction and planning.
+arXiv preprint arXiv:2506.09985
+, 2025.
+Bjorck et al. [2025]
+Johan Bjorck, Fernando Castañeda, Nikita Cherniadev, Xingye Da, Runyu Ding, et al.
+Gr00t n1: An open foundation model for generalist humanoid robots.
+arXiv preprint arXiv:2503.14734
+, 2025.
+Black et al. [2024]
+Kevin Black, Noah Brown, Danny Driess, et al.
+$\pi_{0}$: A vision-language-action flow model for general robot control.
+arXiv preprint arXiv:2410.24164
+, 2024.
+Brohan et al. [2022]
+Anthony Brohan, Noah Brown, et al.
+Rt-1: Robotics transformer for real-world control at scale.
+arXiv preprint arXiv:2212.06817
+, 2022.
+Bruce et al. [2024]
+Jake Bruce, Michael D Dennis, Ashley Edwards, Jack Parker-Holder, et al.
+Genie: Generative interactive environments.
+In
+ICML
+, 2024.
+Bu et al. [2025]
+Qingwen Bu, Yanting Yang, Jisong Cai, et al.
+Learning to act anywhere with task-centric latent actions.
+In
+RSS
+, 2025.
+Cen et al. [2025]
+Jun Cen, Chaohui Yu, Hangjie Yuan, Yuming Jiang, Siteng Huang, et al.
+Worldvla: Towards autoregressive action world model.
+arXiv preprint arXiv:2506.21539
+, 2025.
+Cheang et al. [2024]
+Chi-Lam Cheang, Guangzeng Chen, Ya Jing, Tao Kong, et al.
+Gr-2: A generative video-language-action model with web-scale knowledge for robot manipulation.
+arXiv preprint arXiv:2410.06158
+, 2024.
+Chen et al. [2023]
+Lili Chen, Shikhar Bahl, and Deepak Pathak.
+Playfusion: Skill acquisition via diffusion from language-annotated play.
+In
+CoRL
+, pages 2012–2029, 2023.
+Chen et al. [2024]
+Lawrence Yunliang Chen, Simeon Adebola, and Ken Goldberg.
+Berkeley UR5 demonstration dataset, 2024.
+Chen et al. [2025a]
+Xiaoyu Chen, Hangxing Wei, Pushi Zhang, Chuheng Zhang, Kaixin Wang, et al.
+villa-X: enhancing latent action modeling in vision-language-action models.
+arXiv preprint arXiv:2507.23682
+, 2025a.
+Chen et al. [2025b]
+Yi Chen, Yuying Ge, Yizhuo Li, Yixiao Ge, Mingyu Ding, Ying Shan, and Xihui Liu.
+Moto: Latent motion token as the bridging language for robot manipulation.
+In
+ICCV
+, 2025b.
+Chi et al. [2023]
+Cheng Chi, Siyuan Feng, Yilun Du, Zhenjia Xu, et al.
+Diffusion policy: Visuomotor policy learning via action diffusion.
+In
+RSS
+, 2023.
+Di et al. [2025]
+Donglin Di, He Feng, Wenzhang Sun, Yongjia Ma, Hao Li, Wei Chen, Lei Fan, Tonghua Su, and Xun Yang.
+Dh-facevid-1k: A large-scale high-quality dataset for face video generation.
+In
+ICCV
+, pages 12124–12134, 2025.
+Esser et al. [2021]
+Patrick Esser, Robin Rombach, and Bjorn Ommer.
+Taming transformers for high-resolution image synthesis.
+In
+CVPR
+, pages 12873–12883, 2021.
+Fan et al. [2025a]
+Lei Fan, Dongdong Fan, Zhiguang Hu, Yiwen Ding, Donglin Di, Kai Yi, Maurice Pagnucco, and Yang Song.
+Manta: A large-scale multi-view and visual-text anomaly detection dataset for tiny objects.
+In
+CVPR
+, pages 25518–25527, 2025a.
+Fan et al. [2025b]
+Lei Fan, Junjie Huang, Donglin Di, Anyang Su, Tianyou Song, Maurice Pagnucco, and Yang Song.
+Salvaging the overlooked: Leveraging class-aware contrastive learning for multi-class anomaly detection.
+In
+ICCV
+, pages 21419–21428, 2025b.
+Gao et al. [2025a]
+Chongkai Gao, Zixuan Liu, Zhenghao Chi, Junshan Huang, Xin Fei, Yiwen Hou, Yuxuan Zhang, Yudi Lin, Zhirui Fang, and Lin Shao.
+VLA-OS: Structuring and dissecting planning representations and paradigms in vision-language-action models.
+In
+NeurIPS
+, 2025a.
+Gao et al. [2025b]
+Shenyuan Gao, Siyuan Zhou, Yilun Du, Jun Zhang, and Chuang Gan.
+Adaworld: Learning adaptable world models with latent actions.
+In
+ICML
+, 2025b.
+Gu et al. [2023]
+Jiayuan Gu, Fanbo Xiang, Xuanlin Li, Zhan Ling, Xiqiang Liu, Tongzhou Mu, Yihe Tang, Stone Tao, Xinyue Wei, Yunchao Yao, et al.
+Maniskill2: A unified benchmark for generalizable manipulation skills.
+arXiv preprint arXiv:2302.04659
+, 2023.
+Hou et al. [2025]
+Zhi Hou, Tianyi Zhang, Yuwen Xiong, Haonan Duan, et al.
+Dita: Scaling diffusion transformer for generalist vision-language-action policy.
+In
+ICCV
+, 2025.
+Intelligence et al. [2025]
+Physical Intelligence, Kevin Black, Noah Brown, et al.
+$\pi_{0.5}$: a vision-language-action model with open-world generalization.
+arXiv preprint arXiv:2504.16054
+, 2025.
+Kalashnikov et al. [2018]
+Dmitry Kalashnikov, Alex Irpan, Peter Pastor, Julian Ibarz, et al.
+Scalable deep reinforcement learning for vision-based robotic manipulation.
+In
+CoRL
+, pages 651–673, 2018.
+Kim et al. [2024]
+Moo Jin Kim, Karl Pertsch, Siddharth Karamcheti, Ted Xiao, Ashwin Balakrishna, et al.
+OpenVLA: An open-source vision-language-action model.
+In
+CoRL
+, 2024.
+Kim et al. [2025]
+Moo Jin Kim, Chelsea Finn, and Percy Liang.
+Fine-tuning vision-language-action models: Optimizing speed and success.
+In
+RSS
+, 2025.
+Lew et al. [2025]
+Jaihyun Lew, Jooyoung Choi, Chaehun Shin, Dahuin Jung, and Sungroh Yoon.
+Disentangled motion modeling for video frame interpolation.
+In
+AAAI
+, pages 4607–4615, 2025.
+Li et al. [2023]
+Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi.
+Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models.
+In
+ICML
+, pages 19730–19742. PMLR, 2023.
+Li et al. [2024a]
+Qixiu Li, Yaobo Liang, Zeyu Wang, Lin Luo, et al.
+CogACT: A foundational vision-language-action model for synergizing cognition and action in robotic manipulation.
+arXiv preprint arXiv:2411.19650
+, 2024a.
+Li et al. [2025]
+Shuang Li, Yihuai Gao, Dorsa Sadigh, and Shuran Song.
+Unified video action model.
+In
+RSS
+, 2025.
+Li et al. [2024b]
+Xuanlin Li, Kyle Hsu, Jiayuan Gu, Oier Mees, Karl Pertsch, et al.
+Evaluating real-world robot manipulation policies in simulation.
+In
+CoRL
+, 2024b.
+Lin et al. [2024]
+Bin Lin, Yunyang Ge, Xinhua Cheng, et al.
+Open-sora plan: Open-source large video generation model.
+arXiv preprint arXiv:2412.00131
+, 2024.
+Liu et al. [2023]
+Bo Liu, Yifeng Zhu, Chongkai Gao, Yihao Feng, Qiang Liu, Yuke Zhu, and Peter Stone.
+LIBERO: Benchmarking knowledge transfer for lifelong robot learning.
+In
+NeurIPS
+, 2023.
+Mees et al. [2022]
+Oier Mees, Lukas Hermann, Erick Rosete-Beas, and Wolfram Burgard.
+Calvin: A benchmark for language-conditioned policy learning for long-horizon robot manipulation tasks.
+RA-L
+, 7(3):7327–7334, 2022.
+O’Neill et al. [2024]
+Abby O’Neill, Abdul Rehman, Abhiram Maddukuri, Abhishek Gupta, Abhishek Padalkar, et al.
+Open x-embodiment: Robotic learning datasets and rt-x models: Open x-embodiment collaboration.
+In
+ICRA
+, pages 6892–6903, 2024.
+Pertsch et al. [2025]
+Karl Pertsch, Kyle Stachowicz, Brian Ichter, Danny Driess, et al.
+Fast: Efficient action tokenization for vision-language-action models.
+arXiv preprint arXiv:2501.09747
+, 2025.
+Qu et al. [2025]
+Delin Qu, Haoming Song, Qizhi Chen, Yuanqi Yao, Xinyi Ye, et al.
+Spatialvla: Exploring spatial representations for visual-language-action model.
+In
+RSS
+, 2025.
+Rosete-Beas et al. [2023]
+Erick Rosete-Beas, Oier Mees, Gabriel Kalweit, Joschka Boedecker, and Wolfram Burgard.
+Latent plans for task-agnostic offline reinforcement learning.
+In
+CoRL
+, pages 1838–1849, 2023.
+Routray et al. [2026]
+Sandeep Routray, Hengkai Pan, Unnat Jain, Shikhar Bahl, and Deepak Pathak.
+Vipra: Video prediction for robot actions.
+In
+ICLR
+, 2026.
+Shah et al. [2023]
+Rutav Shah, Roberto Martín-Martín, and Yuke Zhu.
+Mutex: Learning unified policies from multimodal task specifications.
+arXiv preprint arXiv:2309.14320
+, 2023.
+Shi et al. [2024]
+Xiaoyu Shi, Zhaoyang Huang, Fu-Yun Wang, et al.
+Motion-i2v: Consistent and controllable image-to-video generation with explicit motion modeling.
+In
+ACM SIGGRAPH
+, pages 1–11, 2024.
+Sun et al. [2025]
+Shibo Sun, Xue Li, Donglin Di, Mingjie Wei, Lanshun Nie, Wei-Nan Zhang, Dechen Zhan, Yang Song, and Lei Fan.
+Llapa: A vision-language model framework for counterfactual-aware procedural planning.
+In
+ACM MM
+, pages 5020–5029, 2025.
+Sun et al. [2024]
+Zhenhong Sun, Junyan Wang, Zhiyu Tan, Daoyi Dong, Hailan Ma, Hao Li, and Dong Gong.
+Eggen: Image generation with multi-entity prior learning through entity guidance.
+In
+ACM MM
+, pages 6637–6645, 2024.
+Walke et al. [2023]
+Homer Rich Walke, Kevin Black, Tony Z Zhao, et al.
+Bridgedata v2: A dataset for robot learning at scale.
+In
+CoRL
+, pages 1723–1736, 2023.
+Wan et al. [2025]
+Team Wan, Ang Wang, Baole Ai, Bin Wen, Chaojie Mao, et al.
+Wan: Open and advanced large-scale video generative models.
+arXiv preprint arXiv:2503.20314
+, 2025.
+Wang et al. [2024a]
+Junyan Wang, Zhenhong Sun, Zhiyu Tan, Xuanbai Chen, Weihua Chen, Hao Li, Cheng Zhang, and Yang Song.
+Towards effective usage of human-centric priors in diffusion models for text-based human image generation.
+In
+CVPR
+, pages 8446–8455, 2024a.
+Wang et al. [2026a]
+Kun Wang, Xiao Feng, Mingcheng Qu, and Tonghua Su.
+Hmvla: Hyperbolic multimodal fusion for vision-language-action models.
+arXiv preprint arXiv:2602.02533
+, 2026a.
+Wang et al. [2024b]
+Xinlong Wang, Xiaosong Zhang, Zhengxiong Luo, Quan Sun, et al.
+Emu3: Next-token prediction is all you need.
+arXiv preprint arXiv:2409.18869
+, 2024b.
+Wang et al. [2024c]
+Yuqi Wang, Jiawei He, Lue Fan, Hongxin Li, Yuntao Chen, and Zhaoxiang Zhang.
+Driving into the future: Multiview visual forecasting and planning with world model for autonomous driving.
+In
+CVPR
+, pages 14749–14759, 2024c.
+Wang et al. [2025]
+Yuchi Wang, Junliang Guo, Xinyi Xie, Tianyu He, Xu Sun, and Jiang Bian.
+Vidtwin: Video vae with decoupled structure and dynamics.
+In
+CVPR
+, pages 22922–22932, 2025.
+Wang et al. [2026b]
+Yuqi Wang, Xinghang Li, Wenxuan Wang, Junbo Zhang, Yingyan Li, Yuntao Chen, Xinlong Wang, and Zhaoxiang Zhang.
+Unified vision-language-action model.
+In
+ICLR
+, 2026b.
+Wei et al. [2024]
+Julong Wei, Shanshuai Yuan, Pengfei Li, Qingda Hu, Zhongxue Gan, and Wenchao Ding.
+Occllama: An occupancy-language-action generative world model for autonomous driving.
+arXiv preprint arXiv:2409.03272
+, 2024.
+Wu et al. [2024a]
+Hongtao Wu et al.
+Unleashing large-scale video generative pre-training for visual robot manipulation.
+In
+ICLR
+, 2024a.
+Wu et al. [2024b]
+Jialong Wu, Shaofeng Yin, Ningya Feng, Xu He, Dong Li, Jianye Hao, and Mingsheng Long.
+iVideoGPT: Interactive videogpts are scalable world models.
+In
+NeurIPS
+, pages 68082–68119, 2024b.
+Ye et al. [2025]
+Seonghyeon Ye, Joel Jang, Byeongguk Jeon, Sejune Joo, Jianwei Yang, et al.
+Latent action pretraining from videos.
+In
+ICLR
+, 2025.
+Yu et al. [2024]
+Sihyun Yu, Weili Nie, De-An Huang, Boyi Li, Jinwoo Shin, and Anima Anandkumar.
+Efficient video diffusion models via content-frame motion-latent decomposition.
+In
+ICLR
+, 2024.
+Zhang et al. [2025]
+Wenyao Zhang, Hongsi Liu, Zekun Qi, Yunnan Wang, et al.
+Dreamvla: A vision-language-action model dreamed with comprehensive world knowledge.
+In
+NeurIPS
+, 2025.
+Zhao et al. [2025]
+Qingqing Zhao, Yao Lu, Moo Jin Kim, Zipeng Fu, Zhuoyang Zhang, Yecheng Wu, et al.
+Cot-VLA: Visual chain-of-thought reasoning for vision-language-action models.
+In
+CVPR
+, pages 1702–1713, 2025.
+Zhong et al. [2025]
+Zhide Zhong, Haodong Yan, Junfeng Li, Xiangchen Liu, Xin Gong, et al.
+Flowvla: Visual chain of thought-based motion reasoning for vision-language-action models.
+arXiv preprint arXiv:2508.18269
+, 2025.
+Zhou et al. [2023]
+Gaoyue Zhou, Victoria Dean, Mohan Kumar Srirama, Aravind Rajeswaran, Jyothish Pari, Kyle Hatch, Aryan Jain, Tianhe Yu, Pieter Abbeel, Lerrel Pinto, et al.
+Train offline, test online: A real robot learning benchmark.
+arXiv preprint arXiv:2306.00942
+, 2023.
+Zhu et al. [2023]
+Yifeng Zhu, Abhishek Joshi, Peter Stone, and Yuke Zhu.
+Viola: Imitation learning for vision-based manipulation with object proposal priors.
+In
+CoRL
+, pages 1199–1210, 2023.
+Zitkovich et al. [2023]
+Brianna Zitkovich, Tianhe Yu, Sichun Xu, Peng Xu, et al.
+RT-2: Vision-language-action models transfer web knowledge to robotic control.
+In
+CoRL
+, pages 2165–2183, 2023.
+\thetitle
+Supplementary Material
+1
+Implementation Details
+1.1
+Datasets
+We collected high-quality robot manipulation data for fine-tuning the Latent Motion Extractor (LME) and training the VLA, with the datasets summarized in Table
+1
+. Most of the data comes from the OXE
+[
+34
+]
+dataset, and we additionally include the Calvin
+[
+33
+]
+and Libero
+[
+32
+]
+simulation datasets. For LME fine-tuning, we use only episode frames. In the VLA pre-training stage, we use both episode frames and text instructions. Following UniVLA
+[
+50
+]
+, we adopt different sampling intervals for each dataset to ensure that the temporal gap between keyframes is approximately one second. We then uniformly sample 16 frames from the continuous frames covered by six keyframes for pre-training. Throughout this stage, only third-person view data is used, excluding wrist-camera views.
+During the VLA co-fine-tuning stage, we train on the benchmark-specific training sets using text instructions, frames, and actions. For example, the BridgeV2 dataset
+[
+43
+]
+is used for the SimplerEnv-Bridge evaluation
+[
+30
+]
+, while the Libero
+[
+32
+]
+evaluation uses the mixed data of four Libero task suites processed by OpenVLA
+[
+24
+]
+. In addition, the appendix includes extended experiments using the Fractal dataset
+[
+4
+]
+for the Simpler-Google Robot
+[
+30
+]
+evaluation and the Calvin dataset
+[
+33
+]
+for the Calvin evaluation, covering both ABCD$\rightarrow$D and ABC$\rightarrow$D task settings. Across the co-fine-tuning experiments, Bridge and Google Robot training use only third-person views, while Libero and Calvin use both third-person and wrist views.
+1.2
+Training Details
+For LME fine-tuning, we start from the VidTwin
+[
+49
+]
+pretrained model and fine-tune it on the video data from the datasets listed in Table
+1
+.
+We use 4 A800 GPUs with a per-GPU batch size of 4, randomly sampling 16 frames per video.
+Each frame is resized to $224\times 224$.
+The KL loss weight is set to 1e-6, and the reconstruction loss is reduced using the mean over all elements rather than the default reduction over the batch dimension only.
+We randomly sample 1000 videos from the training set as a validation set and select the checkpoint with the lowest reconstruction loss.
+The final model corresponds to the checkpoint trained for one epoch plus 20k iterations.
+For VLA pre-training, we initialize from the 8.5B Emu3
+[
+47
+]
+pretrained checkpoint and train on the datasets in Table
+1
+.
+The training is performed on 32 A800 GPUs with a per-GPU batch size of 8.
+Image observations are resized to $256\times 256$.
+We use the first and last frames of each video clip together with one learnable motion query, and the maximum sequence length is set to 2500 tokens.
+We train for 10k iterations in total, which takes roughly 24 hours.
+For VLA co-fine-tuning, we follow the evaluation protocols from UniVLA
+[
+50
+]
+for each benchmark.
+We load the checkpoint from the VLA pre-training stage and train with 16 A800 GPUs, using a batch size of 8 per GPU and full-parameter fine-tuning.
+The maximum sequence length is set to 3200 tokens.
+For SimplerEnv-WidowX
+[
+30
+]
+, we use BridgeV2
+[
+43
+]
+data with images resized to $256\times 256$ and train for 12k iterations.
+For SimplerEnv-Google Robot
+[
+30
+]
+, Fractal
+[
+4
+]
+images are resized to $240\times 192$, and training continues for 16k iterations.
+For Libero
+[
+32
+]
+, images are resized to $200\times 200$, and training runs for 8k iterations.
+For Calvin
+[
+33
+]
+, third-person views are resized to $200\times 200$ and wrist views to $80\times 80$, with training conducted for 12k iterations.
+The per-iteration training time across these configurations is similar; for example, Libero training takes about 25 hours for 8k iterations.
+Overall, each configuration requires roughly one to two days of training.
+Table 1
+:
+Training datasets.
+Dataset Name
+Count
+Berkeley Autolab Ur5
+[
+10
+]
+892
+Bridgev2
+[
+43
+]
+24879
+Cmu Play Fusion
+[
+9
+]
+576
+Fractal
+[
+4
+]
+65530
+Kuka
+[
+23
+]
+84202
+Maniskill
+[
+20
+]
+30029
+Taco Play
+[
+37
+]
+3242
+Toto
+[
+59
+]
+899
+Utaustin Mutex
+[
+39
+]
+1500
+Viola
+[
+60
+]
+135
+Calvin
+[
+33
+]
+22966
+Libero
+[
+32
+]
+1693
+Total
+236543
+Table 2
+:
+Long-horizon robotic manipulation evaluation on the CALVIN
+[
+33
+]
+benchmark.
+Methods marked with $\dagger$ are from our re-implementation.
+Method
+Task
+Tasks Completed in a Row
+Avg. Len
+↑
+\uparrow
+1
+2
+3
+4
+5
+UniVLA
+†
+[
+50
+]
+ABCD
+→
+\rightarrow
+D
+0.988
+0.934
+0.883
+0.829
+0.764
+4.398
+Ours
+0.972
+0.939
+0.894
+0.859
+0.809
+4.473
+TLA
+[
+6
+]
+ABC
+→
+\rightarrow
+D
+0.955
+0.858
+0.754
+0.669
+0.565
+3.800
+Dita
+[
+21
+]
+0.945
+0.825
+0.728
+0.613
+0.500
+3.610
+UniVLA
+†
+[
+50
+]
+0.972
+0.902
+0.826
+0.741
+0.661
+4.102
+Ours
+0.968
+0.912
+0.844
+0.779
+0.708
+4.211
+Table 3
+:
+Evaluation on SimplerEnv-Google Robot
+[
+30
+]
+across various manipulation tasks.
+Model
+Pick
+Move
+Drawer
+Place
+Average
+OpenVLA
+[
+24
+]
+0.180
+0.563
+0.630
+0.000
+0.343
+SpatialVLA
+[
+36
+]
+0.860
+0.779
+0.574
+0.090
+0.576
+MoTo
+[
+12
+]
+0.740
+0.604
+0.431
+0.000
+0.444
+villa-X
+[
+11
+]
+0.987
+0.750
+0.593
+0.056
+0.597
+UniVLA
+[
+50
+]
+0.870
+0.565
+0.194
+0.167
+0.449
+\rowcolor
+gray!20
+Ours
+0.923
+0.676
+0.428
+0.407
+0.609
+Figure 1
+:
+Sensitivity analysis of $N$ and $l_{a}$ on LIBERO.
+Table 4
+:
+Comparison between our latent motion representation and Wan 2.1 VAE latent $\mathbf{z}$ on LIBERO.
+Variant
+Pre-training
+Co-fine-tuning
+Spatial
+Object
+Goal
+Long
+Average
+Ours
+latent motion + terminal frame
++ latent motion
+0.948
+0.974
+0.958
+0.906
+0.947
+Wan2.1 VAE
+[
+44
+]
+latent
+𝐳
+\mathbf{z}
++ terminal frame
++ latent
+𝐳
+\mathbf{z}
+0.938
+0.950
+0.922
+0.868
+0.920
+Figure 2
+:
+Cross-Recon visualization on LIBERO
+[
+32
+]
+.
+The first six columns show temporally sampled frames from three rows: Structure (top), Motion (middle), and Cross-Recon (bottom).
+The Cross-Recon videos are generated by combining the static appearance from the Structure video with the motion representation extracted from the Motion video, revealing the transferred motion patterns.
+Each Cross-Recon frame is overlaid with a motion heatmap to highlight dynamic regions.
+The last column presents three summary maps: motion heatmaps obtained by averaging and maximizing per-frame absolute differences between Cross-Recon and Structure, and the end-effector trajectory estimated from the motion regions.
+Figure 3
+:
+Cross-Recon visualization on SimplerEnv
+[
+30
+]
+and Bridgev2
+[
+43
+]
+.
+Figure 4
+:
+Visualization of latent-motion clusters and corresponding video examples.
+(a) Unsupervised clustering results of clip-level motion trajectories. Each subplot shows the average 2D motion trajectory (obtained from the first two PCA components of the accumulated frame-wise motion deltas) for one cluster.
+(b) Representative video examples from clusters.
+Cluster 1 and 2 correspond to monotonic downward-like or upward-like motions, whereas Cluster 3 and 4 exhibit rightward-like or leftward-like behaviors.
+Figure 5
+:
+Comparative visualization of future-frame prediction strategies.
+1.3
+Interpretation of the World Model and the Latent Motion Chain
+Our method combines a world model formulation with latent action modeling.
+The world model component consists of two stages: pre-training and co-fine-tuning.
+During pre-training, the world model is not action-conditioned.
+This follows the representation adopted by UniVLA
+[
+50
+]
+and FlowVLA
+[
+58
+]
+, where the world model predicts future environment evolution given a language instruction and an initial state, rather than explicit actions.
+During the co-fine-tuning stage, we introduce an action-conditioned formulation:
+$p(v^{t+1}\mid v^{t},A^{t})$.
+Our latent motion does not explicitly perform multi-step rollouts. Instead, it provides a continuous and decoupled motion encoding over a temporal window, which can be interpreted as an implicit motion chain.
+2
+Additional Results
+2.1
+Analysis of keyframes and action chunk size
+We evaluate the number of sparse keyframes $N\in\{1,2,3,4,5\}$ and action chunk sizes $l_{a}\in\{5,10,20,25\}$ on LIBERO to understand the temporal granularity required by latent motion reasoning.
+As shown in Figure
+1
+, both hyperparameters exhibit a clear inverted-U trend. The best performance is achieved at $(N=2,l_{a}=10)$, corresponding to a $\sim$20-frame ($\approx$2 s) temporal horizon.
+When using only one keyframe ($N=1$), performance drops significantly across all suites, especially on long-horizon tasks, indicating that the latent motion becomes under-constrained. Increasing $N$ to 2 provides sufficient visual anchoring and yields the largest improvement. However, further increasing $N$ gradually degrades performance. With dense observations, the model can rely on short-term visual matching instead of inferring motion dynamics, weakening the benefit of latent temporal reasoning.
+A similar phenomenon appears for action chunk size. Small chunks ($l_{a}=5$) reduce temporal abstraction and make the policy closer to step-wise imitation. Large chunks ($l_{a}\geq 20$) introduce high uncertainty in future evolution, particularly harming the long-horizon tasks. The intermediate chunk size ($l_{a}=10$) achieves the best trade-off between predictability and abstraction.
+Overall, the results suggest that the proposed model performs best when sparse observations provide partial constraints while still requiring the model to infer continuous evolution. This supports our design motivation: the latent motion token serves as a dynamics aggregator over a medium temporal window rather than dense frame tracking or one-step prediction.
+2.2
+Comparison with other Video VAE
+To further analyze the role of latent motion representations, we replace VidTwin with the VAE from Wan 2.1
+[
+44
+]
+and conduct a controlled comparison. Specifically, we use the latent $\mathbf{z}$ extracted by the Wan 2.1 VAE as auxiliary supervision during both pre-training and co-fine-tuning.
+The Wan 2.1 VAE is trained on large-scale video data and therefore incorporates rich generic video priors. As shown in Table
+4
+, this variant achieves an average success rate of 0.920 on LIBERO. While competitive, it remains inferior to our latent motion design (0.947).
+2.3
+CALVIN
+Calvin
+[
+33
+]
+is an open-source simulated benchmark built on PyBullet, designed for learning long-horizon, language-conditioned robotic manipulation tasks.
+It provides a tabletop simulation environment containing 23 types of manipulation skills, such as lifting, pushing, rotating, and object relocation.
+These skills must be executed in sequence to complete multi-step tasks, introducing substantial uncertainty and randomness, which makes Calvin a highly challenging evaluation benchmark.
+The dataset includes a large number of expert demonstrations and is organized into multiple subsets.
+In our experiments, we use the ABCD$\rightarrow$D and ABC$\rightarrow$D subsets, and during training, we only utilize demonstrations that include natural language descriptions of the actions.
+Following the official evaluation protocol, all tests consist of 1000 episodes, each containing a sequence of five sub-tasks specified by natural language instructions.
+The main results are presented in Table
+2
+. Our method achieves an average success length of 4.473 on the ABCD$\rightarrow$D task and 4.211 on the ABC$\rightarrow$D task.
+For a fair comparison, we reproduced UniVLA
+[
+50
+]
+using the training sets listed in Table
+1
+, and followed a fine-tuning setup with 16 A800 GPUs and a per-GPU batch size of 8.
+Under the same training configuration, our approach outperforms UniVLA
+[
+50
+]
+.
+2.4
+SimplerEnv-Google Robot
+We also evaluate our method on the SimplerEnv-Google Robot benchmark.
+The evaluation primarily follows the visual matching protocol, which assesses the alignment between real and simulated visual appearances by overlaying real-world images onto simulated backgrounds and adjusting the textures of foreground objects and the robot within the simulator.
+This benchmark includes four tasks:
+pick coke can
+,
+move near
+,
+open/close drawer
+, and
+place in closed drawer
+.
+The main results are shown in Table
+3
+.
+Our method achieves an average success rate of 0.609, outperforming UniVLA
+[
+50
+]
+, villa-x
+[
+11
+]
+, MoTo
+[
+12
+]
+, and other baselines.
+Here, UniVLA refers to our reproduction.
+Our method surpasses UniVLA on all four tasks and shows a particularly large improvement on the
+place in closed drawer
+task.
+Figure 6
+:
+An Intel RealSense camera and a Realman RM75B robot.
+Figure 7
+:
+Comparison between data collection and real-world deployment during testing.
+2.5
+More Visualization
+We provide extended visualizations for the latent motion analysis presented in Section 4.4, with the main results shown in Figures
+2
+,
+3
+,
+4
+, and
+5
+.
+Effective decoupling of structure and motion latents.
+Figures
+2
+and
+3
+analyze representative samples from the Libero and Bridge datasets. The first six columns display temporally sampled frames from three rows: Structure (top), Motion (middle), and Cross-Reconstruction (bottom). The Cross-Recon videos are synthesized by combining the static appearance from the Structure video with the motion representation extracted from the Motion video, thereby revealing transferred motion patterns.
+Each Cross-Recon frame is overlaid with a motion heatmap to highlight dynamic regions.
+The final column summarizes three diagnostic maps: motion heatmaps computed by averaging and maximizing the per-frame absolute differences between Cross-Recon and Structure, as well as the end-effector trajectory estimated from the activated motion regions. As shown, the highlighted areas consistently follow the movement of the robot arm in the Motion video.
+In the video results, these regions fluctuate over time; for clarity in static visualization, we display aggregated highlights in the figures.
+We further analyze the distribution of motion latents, as shown in Figure
+4
+. To derive an interpretable trajectory representation from high-dimensional motion latents, we first extract per-frame motion features from each video clip and accumulate framewise differences to obtain a temporal sequence describing the overall motion trend of the clip. These sequences are then resampled to a fixed length across all clips and standardized globally. We subsequently apply PCA to the sequence features and take the first two principal components as a 2D trajectory for each clip. This representation preserves the dynamic structure encoded in the latent space while enabling clear comparison across clips.
+Figure
+4
+(a) shows unsupervised clustering of all motion trajectories in the 2D PCA space. To obtain cluster-level canonical shapes, we temporally align trajectories within each cluster via resampling and plot their mean curves along with 95% confidence intervals. Distinct trajectory patterns emerge across clusters—such as monotonic rises, two-stage reversals, and multi-phase back-and-forth motions—indicating that the model’s motion latent captures high-level motion semantics.
+To further validate the semantic consistency within each cluster, we randomly sample two video clips per cluster and visualize three uniformly sampled frames from each clip, as shown in Figure
+4
+(b). The clips within the same cluster exhibit highly similar motion trends in appearance, confirming that the structure of the motion-latent space yields meaningful discrimination among different action patterns.
+Motion latent enhances dynamic modeling for future frame prediction.
+As shown in Figure
+5
+, we further visualize future frame predictions under different pretraining strategies. From top to bottom, the examples correspond to four tasks:
+i) pick up the chocolate pudding and place it in the basket,
+ii) pour,
+iii) open the fridge, and
+iv) put the banana inside the drawer.
+In Figure
+5
+(a), world-model-based approaches suffer from reconstructing redundant background pixels, which can draw attention away from critical interactions and motion cues. As a result, the predicted future frames sometimes remain nearly unchanged, such as in tasks (ii) and (iii).
+Figure
+5
+(b) shows that predicting only the target frame often leads to unstable generation due to the absence of intermediate evolution steps: in task (i), the target frame nearly collapses back to the initial frame, and in task (iii), only one door of the fridge is generated.
+In contrast, our method leverages the motion latent $z_{m}$ as a chain-of-thought for motion, providing stronger guidance for future-frame prediction. The generated final frames align more accurately with the intended task instructions.
+3
+Real-Robot Experiments
+Experimental Setup.
+As shown in Figure
+6
+, we use the Realman RM75B robot, which is equipped with 7 degrees of freedom and a single gripper.
+An Intel RealSense camera is used to capture RGB images.
+We set up a cup-grasping experiment and collected a total of 127 episodes, consisting of 65,382 frames with corresponding actions.
+Each episode contains an average of 515 frames, corresponding to approximately 20 seconds in the real world.
+The dataset mainly includes grasping cups of four different colors, with the number of episodes per color as follows: red 31, blue 39, yellow 24, and purple 33.
+Figure
+7
+(a) shows some collected data.
+During training, all images are cropped and resized to 256×256. The action chunk size is set to 10. We train the model for 2k steps using 16 GPUs with a per-GPU batch size of 8.
+The data were collected in the afternoon and evening and then used for model training.
+Testing was conducted the following day.
+As shown in Figure
+7
+, the lighting conditions differ somewhat between data collection and real-world deployment.
+We found that the model was still able to correctly execute instructions under different lighting conditions.
+Figure
+7
+(b) shows two test cases in its first two rows: grasping a red/purple cup and placing it on a plate. Their background lighting differs from the training data, but the model is still able to execute the tasks successfully.
+BETA
\ No newline at end of file
diff --git a/research/notes/composer-2-technical-report.md b/research/notes/composer-2-technical-report.md
new file mode 100644
index 0000000000000000000000000000000000000000..5453ad04a61dbb89a036afe96567830e34e26ff3
--- /dev/null
+++ b/research/notes/composer-2-technical-report.md
@@ -0,0 +1,2518 @@
+---
+title: Composer 2 Technical Report
+id: composer-2-technical-report
+tags:
+- deepread
+created: '2026-06-10T00:23:34.475868Z'
+source: https://arxiv.org/html/2603.24477
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:34.475642Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Composer 2 Technical Report
+Title:
+Content selection saved. Describe the issue below:
+Description:
+License: CC BY 4.0
+arXiv:2603.24477v2 [cs.SE] 26 Mar 2026
+Composer 2 Technical Report
+Cursor Research Team
+1
+Introduction
+Composer 2 is a specialized model designed for agentic software engineering. The model demonstrates strong long-term planning and coding intelligence while maintaining the ability to efficiently solve problems for interactive use.
+The model scores strongly on CursorBench, our benchmark of real-world software engineering (Figure
+1
+), while also scoring at frontier levels on public software engineering benchmarks such as SWE-bench Multilingual
+[Jimenez
+et al.
+,
+2024
+]
+and Terminal-Bench
+[Merrill
+et al.
+,
+2026
+]
+.
+The model is trained in two phases: first, continued pretraining to improve the model’s knowledge and latent coding ability, followed by large-scale reinforcement learning to improve end-to-end coding performance through stronger reasoning, accurate multi-step execution, and coherence on long-horizon realistic coding problems.
+A core tenet of Composer training is to emulate real-world user challenges as closely as possible to minimize train-test mismatch. We develop infrastructure to support training in the same Cursor harness that is used by the deployed model, with equivalent tools and structure, and use environments that match real problems closely. To measure the ability of the model on increasingly difficult tasks, we introduce a benchmark derived from real software engineering problems in large codebases including our own.
+Composer 2 is a frontier-level coding model and demonstrates a process for training strong domain-specialized models. On our CursorBench evaluations the model achieves a major improvement in accuracy compared to previous Composer models (61.3). On public benchmarks the model scores 61.7 on Terminal-Bench and 73.7 on SWE-bench Multilingual in our harness, comparable to state-of-the-art systems.
+Figure 1:
+Composer 2 improves greatly from previous Composer models, achieving performance competitive with state-of-the-art models.
+By specializing entirely on coding ability, Composer attains such performance while being lower cost to serve than state-of-the-art model API pricing. See Section
+5
+for detailed evaluations.
+2
+Background and Related Work
+Generating code has been a standout application of large language models
+Feng
+et al.
+[
+2020
+]; Clement
+et al.
+[
+2020
+]; Chen
+et al.
+[
+2021
+]; Li
+et al.
+[
+2022
+]
+. Code provides a rich source of challenging training data that has supplemented language data in most large models
+Fried
+et al.
+[
+2023
+]; Li
+et al.
+[
+2023
+]; Lozhkov
+et al.
+[
+2024
+]; Rozière
+et al.
+[
+2023
+]; Guo
+et al.
+[
+2024
+]; DeepSeek-AI [
+2024a
+]; Allal
+et al.
+[
+2023
+]; Nijkamp
+et al.
+[
+2023
+]; Hui
+et al.
+[
+2024
+]; Wang
+et al.
+[
+2021
+,
+2023
+]; Team
+et al.
+[
+2024
+]; Mishra
+et al.
+[
+2024
+]
+. Early applications of code generation typically focused on autocomplete applications. Subsequently, instruction tuning turned models into coding assistants
+Luo
+et al.
+[
+2024
+]; Wei
+et al.
+[
+2024
+]; Zhuo
+et al.
+[
+2025
+]; Muennighoff
+et al.
+[
+2024
+]
+capable of responding to user requests. In the last year, software engineering
+agents
+have achieved widespread adoption, pushing models beyond chat to autonomously navigate repositories and solve complex engineering tasks
+Yang
+et al.
+[
+2024
+,
+2025
+]; Wang
+et al.
+[
+2025
+]; Qian
+et al.
+[
+2024
+]; Hong
+et al.
+[
+2023
+]
+.
+Software engineering agents aim to autonomously act to solve a given task prompt. Given an environment, i.e., a codebase and an isolated container for code execution, along with a prompt $x$ giving the agent its task, an agent produces a rollout consisting of a series of actions $a_{1},\ldots,a_{T}$, each of which makes one or more tool calls and yields responses $y_{1},\ldots,y_{T}$. Tool calls may modify the underlying environment, and the result of a rollout is the final state of this environment. Each action $a_{i}$ is selected by sampling from a language model policy $\pi_{\theta}(a_{i}\mid x,a_{1},y_{1},\ldots,a_{i-1},y_{i-1})$, after which a reward is given based on the code’s correctness, succinctness, and conformance to software engineering principles. In contrast to more constrained settings like competitive programming, a strong software engineering agent must perform non-trivial exploration, write its own tests, and construct the minimal changes necessary to solve the task prompt.
+Composer 2 has access to a small set of general tools that allow it to read and edit files, run shell commands, search the codebase using grep or semantic search, and search the web. Its prompt includes a system message, the tool call format specification, recent file information, past user messages, and the current task. The most common end result of this process is a set of changes to files in the codebase environment, although there are many other common use cases, such as answering questions, writing plans, resolving version control issues, or monitoring long-running jobs.
+Our main research thrust for Composer 2 investigates how scaling model training can reliably improve performance on real-world coding.
+We target this through two distinct training phases: continued pretraining (Section
+3
+), and asynchronous reinforcement learning (Section
+4
+). To measure progress, we construct a suite of challenging benchmarks (Section
+5
+).
+3
+Continued Pretraining
+The continued pretraining stage aims to improve the language model’s base knowledge, specifically in the domain of coding. Such continued pretraining has long been demonstrated to drastically improve downstream performance
+Gururangan
+et al.
+[
+2020
+]; Howard and Ruder [
+2018
+]
+. Taking this a step further, recent models use a staged training approach, progressively filtering towards higher quality data
+Hoffmann
+et al.
+[
+2022
+]; Touvron
+et al.
+[
+2023
+]; Ye and others [
+2024
+]
+. While we start with base models naturally trained with large amounts of code data, we find that additional supervised learning reliably improves knowledge benchmarks and leads to improved coding performance of the final coding agent.
+We used internal evaluations and inference performance considerations to select a base model. Our evaluations measure internal codebase perplexity, coding knowledge, and state tracking. For more details, see Appendix
+B
+. These evaluations led us to select Kimi K2.5
+Team [
+2026
+]
+, a 1.04T parameter / 32B active parameter Mixture-of-Experts model as our base model for Composer 2.
+3.1
+Training
+We extend Kimi K2.5 with a continued pretraining stage on a large code-dominated data mix. The purpose of this stage is to provide a base model for the subsequent agentic RL training by specializing the model on coding knowledge and capabilities. We divide this stage into three phases. We spend the bulk of compute at 32k token sequence length, followed by a shorter long-context extension phase to 256k sequence length, and finally a short SFT phase on targeted coding tasks. Training was performed in MXFP8 on NVIDIA B300s using the AdamW optimizer. See Section
+6.1
+for more training details. During training, we measure the evaluation loss on our internal codebase. We see that the loss decreases log-linearly over the course of the training run.
+Continued pretraining ultimately serves to improve downstream RL performance, and the connection between the two stages is an area of active research. We study the relationship between codebase perplexity and RL performance by applying our continued pretraining recipe to Qwen3-Coder-30B-A3B
+Team [
+2025e
+]
+. Continued pretraining is performed at three logarithmically spaced compute levels: small, medium, and large. Each of these checkpoints then undergoes SFT on a small dataset, followed by an identical RL run. Figure
+2
+(left) shows the relationship between the final loss after SFT and the RL reward after a fixed number of steps, demonstrating that cross-entropy loss is indeed predictive of downstream RL performance.
+Figure 2:
+Continued pretraining translates to downstream RL performance.
+Left: We study this relationship on a smaller Qwen model, examining checkpoints trained on a varying number of tokens. Right: The model undergoes a steady decrease in training perplexity.
+Multi-Token Prediction
+To serve the model faster in production, we train additional Multi-Token Prediction (MTP) layers
+Gloeckle
+et al.
+[
+2024
+]; DeepSeek-AI [
+2024b
+]
+to use with speculative decoding. We initialize the MTP layers from scratch and train them on the same data mix. To speed up convergence, we train the MTP layers with self-distillation, teaching the model to predict the exact logit distribution of the main LM head at each position. To ensure that this process generalizes, the MTP layers are trained atop a checkpoint cut from the middle of the continued pretraining run. During the final two phases (long-context and SFT), the MTP layers are included and trained jointly with the rest of the model.
+4
+Reinforcement Learning
+Figure 3:
+RL training tasks.
+Composer 2 is trained by reinforcement learning on a large set of coding tasks.
+These tasks are run in environments that emulate real Cursor sessions as closely as possible (see Section
+6.2
+for infrastructure details).
+At a high level, RL training consists of sampling a problem, simulating a group of rollouts from the agent with different solutions, and then updating the model weights based on solution quality.
+We create a problem distribution that reflects the most common use cases. Figure
+3
+shows the breakdown in terms of task category. Notably, our training distribution captures many aspects of software engineering absent from popular AI coding benchmarks. In later stages of training, we use simple heuristics—such as number of turns and thinking tokens of rollouts—to upsample increasingly harder data points.
+4.1
+Asynchronous RL Training
+Our reinforcement learning pipeline is built around learning from large-scale policy gradients while maintaining stability.
+We use a policy gradient algorithm with multiple samples per prompt
+Shao
+et al.
+[
+2024
+]; Ahmadian
+et al.
+[
+2024
+]
+and a fixed group size.
+We operate in the single-epoch regime, i.e., the same prompt is never trained on twice.
+We utilize Adam as our underlying optimizer and update the full parameter set. RL training operates in a highly asynchronous regime with independent training and rollout generation workers (see Section
+6.2
+for details).
+A number of policy gradient variants have been proposed in prior literature (Yu et al. [2025]; Zheng et al. [2025]; MiniMax [2025]; Liu et al. [2025a]). As in Dr. GRPO (Liu et al. [2025a]), we found that it is crucial to minimize the bias in the gradients that can arise from transforming the underlying advantage. Following this work, we remove the length standardization term from GRPO as it introduces a length bias. We do not normalize group advantages by their standard deviation, as it results in the degenerate case where small behavioral differences get massively upweighted within a group where every rollout achieves equal correctness.
+Yu et al. [2025] proposed to mask out rollouts that exceed the maximum sequence length. Some subsequent works employed this masking (Liu et al. [2025b]; Golubev et al. [2025]), while other works found it to yield mixed results. For instance, Liu et al. [2025a] found that masking overlong rollouts shows limited effectiveness on long-tail reasoning tasks but increases the accuracy and clarity of responses in medium and short-length reasoning tasks, and Du et al. [2025] found that overlong masking caused output length to grow too quickly. We did not see benefits with overlong masking at small scale and opted not to mask rollouts that exceed the maximum sequence length. Our self-summary system (discussed below) also limits the occurrence of these cases in practice.
+Since agent rollouts can be very long, especially when aiming for long-horizon coherency, it is important that our system maintains stability in the highly asynchronous regime. Our main strategy is to minimize how off-policy the samples become. On the infrastructure side, this divergence is reduced via fast weight synchronization and in-flight weight updates, similar to PipelineRL (Piché et al. [2025]). Inference workers are capable of updating weights mid-rollout, which means later tokens in a rollout are likely less off-policy. To reduce further divergence between the sampling and training policy, we replay MoE routing (Ma et al. [2025]). We discuss the implementation of our asynchronous RL pipeline in Section 6.2.
+Figure 4:
+Comparison of estimators of $\mathrm{KL}(p\,\|\,q)$ for two synthetic Gaussian distributions with unit variance and different means.
+Similar to prior work (Shao et al. [2024]; Team [2025d]), we use a Kullback–Leibler divergence for regularization, $\mathrm{KL}(q\,\|\,p)=\mathbb{E}_{x\sim q}\!\left[-\log r(x)\right]$, where $r(x)=p(x)/q(x)$.
+Many open-source implementations of RL estimate KL with the estimator $k_{3}=(r-1)-\log r$, defined in Schulman [2020]. The $k_{3}$ estimator is an unbiased estimator of KL and reduces variance when $p$ and $q$ are close. However, Amini et al. show in [Amini et al., 2025, Figure 1] that the variance increases drastically as $p$ and $q$ diverge. See Figure 4: for large KL values, the variance of the estimate is extremely large. (The $k_{2}$ estimator does not suffer from variance blow-up, but is biased.) Therefore, we use the standard estimator $k_{1}=-\log r$ instead.
+Figure 5:
+Both average and best-of-K performance increase over the RL training period.
+The above curves are reported on a held-out evaluation set, along with CursorBench tasks. Performance steadily improves throughout RL training. Importantly, we do not observe a tradeoff between average performance and best-of-K performance.
+A growing body of recent literature has argued that RL on LLMs often improves average performance primarily by concentrating probability mass on already-known successful trajectories, sometimes at the cost of policy entropy and output diversity (Yue et al. [2025]; Liang et al. [2026]; Chen et al. [2025]; Wen et al. [2026]; Tajwar et al. [2026]). Under this view, improvements at best-of-K may be limited because the model becomes better at selecting one high-confidence solution rather than expanding the set of reachable correct solutions. Against this backdrop, our results are notable: rather than observing a trade-off in which average reward rises while best-of-K remains flat, we find that our training improves both statistics as shown in Figure 5. This suggests that, in our setting, RL is not merely reweighting a fixed pool of reasoning paths, but is also improving the model’s effective coverage of correct solutions under repeated sampling.
+Self-Summarization
+To enable Composer 2 to work across long horizons, we use the self-summarization technique introduced in Composer 1.5 (Team [2025b]). Each training rollout can involve multiple generations chained together by summaries, rather than a single prompt–response pair. We use the final reward for all tokens produced by the model in the chain. This upweights both the agent responses in good trajectories and also the self-summarizations that made them work. At the same time, poor summaries that lose critical information are downweighted. As Composer trains, it learns to use self-summaries to process more information, even with a limited context window. For hard examples, it often self-summarizes multiple times. In our experiments, we find that self-summary consistently reduces the error compared to using separate prompt-based compaction, while using significantly fewer tokens and reusing the KV cache.
+4.2
+Agent Behavior
+While the primary goal of RL training is to improve model intelligence, we also aim to produce a model that provides a good developer experience.
+This is affected by the communication style of the model as well as the time and resources it takes to answer a question.
+Figure 6:
+Nonlinear penalties push the model to be quick on easy tasks and think more on hard tasks.
+For behavior and communication, we apply an array of auxiliary rewards to ensure the model provides a good experience. These include rewards for coding style, communication, and product-specific penalties for poor tool calls, such as creating to-do list items and then leaving them unfinished. During RL training, we monitor the model for emergent behaviors and occasionally introduce additional behavior rewards as needed. For example, we observed that the model would start to leave long chains-of-thought in comments or collapse to using the terminal tool only.
+To incentivize the model to produce solutions quickly on easy requests while allowing it to think longer on hard requests, we add a concave down and increasing nonlinear length penalty to the reward:
+$$C_{\text{length}\{k,q\}}(x)=\frac{(1+kx)^{1-q}-1}{k(1-q)},$$
+where $k$ and $q$ are hyperparameters which define the curvature of the penalty, and the input $x$ is a weighted combination of thinking tokens, tool calling tokens, tool output tokens, final message tokens, number of tool calls, and number of turns of a rollout.
+The nonlinearity reflects that on easy tasks, achievable with only a few tool calls, every additional bit of effort is felt more acutely than in long-horizon tasks, where the agent might iterate for hundreds of tool calls. See Figure 6 for some examples of the nonlinear curves produced by this equation.
+We find that utilizing such length penalties enables the model to learn particularly efficient behaviors, e.g., making multiple tool calls in parallel.
+5
+Real-World Evaluation with CursorBench
+The application of coding agents has evolved rapidly over the past year, expanding from simple, tightly-scoped edits to complex debugging, large-scale refactoring, and feature development.
+At Cursor, we have observed that performance on public evaluation benchmarks often correlates only loosely with the real-world utility of these models.
+We attribute this misalignment to four primary factors:
+•
+Domain Mismatch:
+As the capabilities of coding agents expand, static benchmarks often fail to capture the full spectrum of developer workflows.
+For instance, SWE-bench and its variants predominantly focus on isolated bug-fixing.
+Terminal-Bench covers a wider range of task types, but many of its tasks (e.g., computing chess moves) are abstract puzzles rather than typical software engineering operations.
+•
+Prompt Over-specification:
+Public benchmarks are typically highly specified, assuming a narrow set of correct solutions.
+In contrast, real developer requests are often underspecified and admit multiple valid architectural approaches.
+Consequently, public benchmarks either penalize correct alternative solutions or rely on unnaturally explicit prompts that bypass the challenge of interpreting ambiguous intent.
+•
+Data Contamination and Overfitting:
+Because public benchmarks are constructed from historical scrapes of open-source repositories, they are frequently leaked into model training mixtures, artificially inflating scores.
+Recently, OpenAI suspended reporting SWE-bench Verified results after finding evidence that frontier models could generate gold patches from memory [74].
+Beyond contamination, the fixed and narrow nature of these benchmarks can compress performance differences: for instance, Haiku 4.5 achieves 73.3% on SWE-bench Verified, very close to GPT-5’s 74.9%, misaligning with accuracy on broader and more diverse task distributions like Terminal-Bench.
+•
+Narrow Evaluation Scope:
+Existing coding evaluations predominantly measure functional correctness.
+In practice, developers also heavily weigh code quality, readability, latency, cost, and the quality of the agent’s interactive behavior throughout a session.
+(a)
+Lines changed in reference diff.
+(b)
+Problem description length.
+Figure 7:
+Compared to public benchmarks, CursorBench tasks have less-specified task prompts, and require an order of magnitude more code changes.
+We find this better represents the complexity and ambiguity of real-world software engineering requests.
+To address these limitations, we introduce CursorBench, an internal evaluation suite comprising tasks drawn from actual coding sessions of our engineering team.
+Because these tasks originate from real agent sessions rather than curated public repositories, CursorBench better reflects the true distribution of software engineering tasks while completely avoiding train-set contamination.
+Furthermore, rather than relying solely on functional correctness, we evaluate models using specific metrics targeting code quality, execution efficiency, and interactive agent behavior in realistic settings.
+Figure 7 highlights the structural differences between CursorBench and public evaluation sets.
+CursorBench tasks necessitate substantially more extensive code modifications, with a median of 181 lines changed compared to just 7–10 lines for SWE-bench Verified and Multilingual (Figure 7(a)).
+At the same time, CursorBench prompts are also more underspecified, featuring a median description length of only 390 characters versus 1,185–3,055 characters for public benchmarks (Figure 7(b)).
+This combination of broad execution scope and high intent ambiguity accurately reflects the intrinsic difficulty of real-world software engineering, where developers must frequently synthesize context from production logs, sparse user bug reports, and large existing codebases to derive a solution.
+Figures 8 and 12 show representative examples: one requires diagnosing a build-tool transpilation bug in a retry loop from a terse bug report and observability logs, while the other requires designing a tuned heuristic detector over hundreds of chat responses to quantify a subtle streaming regression and discover its hidden invariants.
+⬇
+// executeScoringRollout.ts - linked code snippet from the problem statement
+for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
+  try {
+    const request = new ScoringRequest(...);
+    const { ctx: Ctx, startSpan: taskSpan } = ctx.span("scoring");
+    using _taskSpan = taskSpan.start();
+    const result = await executeScoring(...);
+    let rawOutput = "";
+    if (result.response) {
+      rawOutput = result.response.join("\n");
+    }
+    const parsed = parseOutput(rawOutput);
+    if (parsed.parseError) {
+      lastError = parsed.parseError;
+      ctx.warn({ error: lastError }, "Error, will retry");
+      if (attempt < MAX_RETRIES) {
+        continue;
+      }
+    }
+    // ...
+  } catch (error) {
+    /* ... */
+  }
+}
+Problem statement:
+scoring attempt 2 and attempt 3 succeeded but i get “failed after 3 attempts. Last error: [canceled] User aborted request” error at the end
+@executeScoringRollout.ts (1084-1118)
+check if there is some bug in this
+Please see datadog logs at @logs and fix
+Figure 8:
+Example CursorBench task
+(truncated and obfuscated from our evaluation pipeline). The agent receives a terse bug report and must cross-reference the source code with production observability logs to diagnose the failure. The logs also contain unrelated production service warnings which are a red herring: the true root cause is an esbuild 0.20.2 downleveling bug for `using`. The transpiled output lowers the highlighted declaration into `var`-scoped error state that is not reset between retry iterations, causing stale failure state to be re-thrown from the generated `finally` block even after later attempts succeed.
+New CursorBench iterations are continually developed by our team.
+As user workflows evolve and agent capabilities improve, we regularly update the evaluation set to remain aligned with how developers actually use the product.
+Figure 9 shows how the benchmark has grown in complexity across iterations: compared to earlier versions of CursorBench, tasks from CursorBench-3 involve changing more than twice as many files and lines of code on average.
+In addition to increased problem size, the distribution of task types has also shifted, as developers increasingly delegate long-running command execution, experiment monitoring, and data analysis to agents.
+This continual refresh ensures that our evaluations remain aligned with the shifting frontier of real-world difficulty and do not saturate.
+Finally, we complement our primary CursorBench evaluation with a suite of targeted evaluations covering other aspects of coding agent quality and behavior. These include an intent evaluation, which assesses how the model handles ambiguous prompts; an instruction-following evaluation, which measures how well the model follows system prompts, user prompts, rules, and skills; an eager editing evaluation, which tests how the model responds to questions where it should avoid editing code; a code quality evaluation, which judges the quality of both code and comments; and an interruption evaluation, which quantifies how well the model handles mid-rollout interruptions and user feedback. We develop these evaluations by identifying important dimensions of agent behavior, selecting data points that elicit them, and writing rubrics to measure performance.
+Figure 9:
+Evolution of CursorBench across iterations.
+Each version incorporates more complex requests. CursorBench-3 more than doubles the median task size from the initial version, shown as the relative percent change in the bottom bar.
+6
+Infrastructure
+6.1
+Training Infrastructure
+Parallelism.
+Previous Composer training stacks combined Fully Sharded Data Parallelism (FSDP) (Rajbhandari et al. [2020]; Zhao et al. [2023]), Expert Parallelism (EP) (Shazeer et al. [2017]; Fedus et al. [2022]), and Tensor Parallelism (TP) (Shoeybi et al. [2019]).
+In the original MoE design, EP reused the same rank group as TP, so EP was not an independent scaling axis.
+This coupling kept the implementation simple, but constrained support for larger MoE configurations and would unnecessarily enable activation sharding in the continued pretraining phase, even when activation memory pressure is modest.
+Composer 2 instead uses Context Parallelism (CP) (Liu et al. [2024]; Jacobs et al. [2023])
+as the primary long-context scaling axis. CP requires less communication than TP and improves compute efficiency by preserving full hidden dimensions in various projections; in contrast, TP produces less efficient skinny local matrix multiplications. There are a few tricks we use to implement CP efficiently in the Multi-Head Latent Attention (MLA) architecture. To minimize communication overhead, we compute local KV latent vectors, all-gather the latent vectors across CP ranks, and then compute the KV projections. Although this replicates the projection on all CP ranks, the projection is small and reduces CP communications, allowing us to fully overlap CP communications with the computation of the Q projection. Additionally, while naive CP causes load imbalance during causal attention as later tokens have to attend to more tokens, we use the technique from
+Liu et al. [2024] to address this: we split the sequence into $2\times\text{CP}$ chunks, and the $i$-th rank processes chunks $i$ and $2\times\text{CP}-1-i$, resulting in roughly equal work during causal attention for all ranks. Finally, the context parallelism dimension is folded into the FSDP dimension, allowing us to use CP ranks to reduce per-GPU parameter/state memory usage.
+Composer 2 also introduces a more flexible expert-parallel design by decoupling EP from TP. This requires using different meshes for sharding dense layers and expert weights. EP is formed from DP and CP capacity, enabling support for larger expert-parallel degrees and making expert-grouped GEMMs more efficient with larger per-rank token batches. We use EP=8, CP=2 for the continued pretraining phase and EP=8, CP=8 for the RL phase. We use DeepEP to implement high-throughput token dispatch/combine
+Zhao
+et al.
+[
+2025
+]
+. DeepEP communication buffers have relatively low overhead, and DeepEP’s kernel uses 20 SMs by default, leaving headroom for concurrent compute. We also quantize the tokens to MXFP8 (discussed below) before dispatch for more efficient communication, which does not affect our precision since we already perform our expert computations in MXFP8. We keep the combine at BF16 for increased precision. To maximize compute–communication overlap, tokens are split into microbatches and pipelined across separate communication and compute streams.
+Finally, we found that it was critical for different DP ranks to have similar amounts of compute to achieve high utilization. In continued pretraining, DP balance is easily achieved with fixed sequence lengths. In RL, different rollouts of different prompts can result in very different sequence lengths, so before each training step, we run a global sequence packing stage to ensure balanced DP compute load. The packing algorithm takes into account the increased attention costs of longer sequences.
+Kernels.
+Figure 10:
+Overview of a single grouped GEMM training flow in our Mixture-of-Experts layer. Each colored block represents a single kernel launch.
+Composer 2 training uses in-house kernels written in CUDA, PTX, and ThunderKittens/ParallelKittens
+Spector
+et al.
+[
+2025
+]; Sul
+et al.
+[
+2025a
+]
+. The kernels primarily optimize low-precision training of the mixture-of-experts (MoE) layer. Our training recipe uses both MXFP8
+Open Compute Project [
+2023
+]
+and NVFP4
+NVIDIA [
+2025
+]
+precision formats. We exclusively target NVIDIA Blackwell GPUs for block-scaled tensor-core matrix multiplications (i.e., in-hardware dequantization during systolic-array matrix multiplication). Figure
+10
+illustrates a single grouped GEMM training flow within our MoE layer.
+For the MoE forward pass, we use a novel variant of NVFP4: values are quantized from BF16 into FP4E2M1 using FP8E4M3 per-block scales (block size = 16) and FP32 per-token scales. We found the original NVFP4 format, which uses FP32 per-tensor scales, fragile for two reasons. First, per-tensor scaling makes training batch-variant, collapsing numerical precision and causing the RL training to diverge. Second, inter-token scale values leak future token information into past tokens, resulting in biased gradients. Despite adding latency to the quantization and GEMM epilogue, per-token scaling thus proved to be the more effective scheme.
+For the MoE backward pass, we use the standard MXFP8 format with FP8E4M3 values and FP8E8M0 scales per 32-element block. We can do this because of the asymmetry in RL training. On the forward pass, it is necessary that the trainer match the inference for numerical stability. We therefore use trainer NVFP4 in order to support fast inference. The backward pass, however, runs only on the training cluster. This is not a bottleneck on system-wide RL efficiency, so we can afford higher precision to improve training stability.
+Finally, the choice of hardware-level math precision mattered considerably. For NVFP4 quantization, we found that using IEEE-compliant floating-point arithmetic (e.g., `__fdiv_rn`) is critical; using fast-approximation alternatives causes training to diverge after roughly a hundred RL steps. Conversely, using the fast-approximation path (e.g., `__fdividef`) for MXFP8 quantization has not caused any divergence since our initial training of Composer 1, so we select it for the best performance.
+We actively open-source our kernel implementations and support community efforts to improve the GPU kernel ecosystem. We collaborated closely with Colfax to implement the Flash Attention 4 backward kernel for the QK 192 / V 128 configuration (a.k.a. the "DeepSeek shapes"), which has been merged into the public repository
+Jay Shah [
+2026
+]
+. We also actively support the development of ThunderKittens in collaboration with the Hazy Research group at Stanford
+Sul
+et al.
+[
+2025a
+,
+c
+,
+b
+]; Sul and Ré [
+2026
+]
+. Recently, we open-sourced the state-of-the-art BF16, MXFP8, and NVFP4 GEMM implementations into ThunderKittens
+HazyResearch [
+2026
+]
+. Finally, we share our knowledge on quantization and MoE kernel implementation through online posts
+Team [
+2025a
+]
+.
+6.2
+RL Infrastructure
+Our RL infrastructure consists of four decoupled services: training, environments, inference, and evaluations. A decoupled service stack enables larger-scale global training, high availability, and independent scaling and sharding. The production training job for Composer 2 spanned 3 regions for GPU compute and 4 regions for CPU compute.
+Training
+We use a fully asynchronous, high-throughput training stack built on Ray
+Moritz
+et al.
+[
+2018
+]
+and PyTorch
+Paszke
+et al.
+[
+2019
+]
+. A centralized reconciler performs slot-based sample lifecycle state management, moving samples through a pipeline of distributed executors and implementing scheduling policies that balance sample generation throughput with policy staleness. We design all services within the trainer around the concept of futures, which allow for eager execution of computation when upstream dependencies are ready. We leverage the Ray object store to hold samples that are ready for consumption by train workers, which allows for natural spilling to local NVMe storage when nodes have insufficient CPU memory.
+To support large-scale post-training, all components within the trainer are fault-tolerant down to the process or process-group level. We run passive and active health checks on all nodes during training; upon detection of a hardware fault, we mark the node as unhealthy for scheduling but continue training with warm standby nodes. Decoupling training from inference and environment infrastructure naturally makes training more resilient to failures in these services; during the training run, we saw many cases where these services had partial or full outages without failing the training job. To minimize the number of training job restarts, we use a reactive configuration system and support live code updates on a per-process level; when new code is deployed, existing actors are drained of in-flight requests and transparently replaced.
+Replaying long-running coding rollouts is expensive. To mitigate expensive failures on job-level faults, we perform policy-aware checkpointing at the rollout level and group level in addition to conventional checkpointing of model weights at the step level. For rollout checkpointing, we rely on memory snapshots of the codebase environment state, so that upon recovery, we can pass the reconstructed codebase environment to verifiers. For group checkpointing, we write sequences with advantages tagged with policy versions to NFS; upon job restart, the scheduler considers these when determining whether to dispatch new work or simply load ready groups.
+Environments and Anyrun.
+Stateful codebase environments are a first-class artifact of our post-training stack. Environments are run on top of Anyrun, an internal compute platform built for running untrusted code at scale. This is the same compute platform that powers Cloud Agents and Automations in the Cursor product.
+All environment creation requests from the trainer are sent to a global service, which routes the request to an underlying Anyrun cluster. Our training workload is sharded across multiple Anyrun clusters for both instance availability and fault tolerance. Within a cluster, a distributed set of Anyrun managers schedule pods, scale cloud compute provisioned across multiple regions, and perform state reconciliation to manage hundreds of thousands of pods per cluster. Each pod is a dedicated Firecracker VM capable of running a full development environment, including a browser and GUI for computer use. We run pods on a large mixture of machine types and architectures (x86, ARM) to maximize instance availability.
+Scheduling throughput is particularly important for the bursty nature of RL workloads. Each Anyrun cluster is capable of scheduling more than 500 pods per second while maintaining desired binpacking requirements. One challenge with a naive packing strategy is that the steady-state resource usage for a pod can be dramatically lower than its peak during startup and can also be bursty due to overcommits. To solve this, we monitor and schedule with awareness of live readings of hardware pressure (CPU, memory, disk) along with more conventional scheduling heuristics.
+Anyrun supports forking and snapshotting of full coding environments at both the filesystem and memory level. This unlocks useful capabilities during RL, such as mid-trajectory rollout checkpointing and post-rollout state capture for future introspection. When a pod fork is requested, we attempt to first schedule the fork onto the same node; if not feasible due to space constraints, we live-migrate pod state to a node with capacity.
+Egress is carefully controlled in environments to limit any external impact. Any access to the internet from a pod must go through Anygress, an internal service within Anyrun responsible for proxying traffic, enforcing granular request policies, and dropping sensitive headers. To better replicate real-world environments, Anygress operates transparently instead of relying on proxy environment variables by injecting a trusted root CA on pod startup and redirecting pod traffic at the TCP layer.
+We train with tools that are representative of the harness in the Cursor client. Each codebase environment starts with a shared tool library that can be invoked over RPC. Some tools like semantic search have external dependencies and are handled outside of the environment. To support the full tool set available in the Cursor client, we maintain a shadow deployment of the Cursor backend that is used both during dataset preparation and rollouts. Sharing the production implementation in this way allows us to scale experiments and training safely while remaining faithful to the harness that Composer 2 will be deployed into.
+There are cases where we want tool behavior to differ between training and production settings. Concrete examples include enforcing stricter tool argument checks to encourage more precise model behavior, and removing certain tools to improve model steerability. To achieve this, the set of available tools and the desired behavior of each tool are dynamically determined for each environment.
+Inference and Weight Sync.
+We partner with Fireworks AI to run RL inference. Because Kimi K2.5 is a Mixture-of-Experts model, numerical differences can cause different experts to be chosen in the inference engine forward pass and trainer forward pass. If the trainer and inference engine do not agree on expert routing for each token, log-probabilities computed during training may not match the distribution from which tokens were sampled, introducing noise into the policy gradient. To address this, we employ router replay
+Zheng
+et al.
+[
+2025
+]; Ma
+et al.
+[
+2025
+]
+: during inference, the engine returns the selected expert indices for every token at every MoE layer, and during the training forward pass the router’s expert assignment is overridden to match. The router still computes gating scores so that gradients flow through it. We extend the basic replay scheme by filtering out replayed experts whose gating scores fall below a plausibility threshold derived from the router’s own top-$k$ selections, replacing them with the router’s candidates; we found that this reduces p99 numerics mismatch between the inference and training forward passes.
+Every training step, we synchronize updated weights to the inference engine by uploading to a shared S3 bucket. To minimize transfer size, we use delta compression: each rank caches its previous upload and transmits only the diff against the new weights. Because RL updates are small, even with full-parameter training these diffs compress to a handful of gigabytes for the 1T-parameter model. Uploading is fully sharded across all training ranks, allowing us to saturate the egress bandwidth of the training cluster; similarly, download on the Fireworks side is sharded across inference replicas. Compression, upload, and hotload signaling are fully pipelined in background workers so that training is never blocked. During the Composer 2 training run, we ran inference across geographically distributed clusters in the US and Europe. Each cluster independently downloads and reconstructs weights from the shared delta chain, requiring no direct connectivity to the training cluster, enabling world-scale distributed RL inference over commodity cloud storage.
+Online Evaluations.
+To provide faithful evaluations of our model during training, we run a pinned version of the production backend and Cursor client for each evaluation job. This provides high confidence that model behavior during evals is an exact replication of what our end users see, and also allows us to iterate on the Cursor harness and model system prompt using the same infrastructure. For each training step we want to evaluate, we acquire a lease for an evaluation deployment, automatically move GPUs to that deployment, and perform a cross-region weight sync of the evaluation checkpoint from the training cluster where it resides to the inference deployment.
+7
+Results
+7.1
+CursorBench
+We evaluate our models by running Cursor agents directly within Anyrun (Section 6.2), the same infrastructure that supports our reinforcement learning pipeline.
+For each task in CursorBench, we initialize the codebase environment and initial task prompt, and we run the agent exactly as it would execute in our production environment.
+Metrics.
+We compute accuracy aggregated over all tasks across multiple passes of the evaluation set to reduce variance.
+In addition to accuracy, we also measure efficiency metrics like completion tokens, end-to-end latency, and inference cost to ensure the model remains maximally useful for interactive developer workflows.
+Figure 11:
+On CursorBench, Composer 2 achieves a superior Pareto frontier in cost while remaining highly competitive in token efficiency.
+For GPT-5.4, Codex-5.3, Opus 4.6, and Sonnet 4.6, we plot the high (circle), medium (triangle), and low (square) effort variants.
+Table 1 reports the accuracy of various models on CursorBench-3.
+Composer 2 achieves 61.3%, representing a 37% relative improvement over Composer 1.5 and a 61% improvement over Composer 1.
+Compared to its base model, Kimi K2.5, Composer 2 demonstrates a substantial accuracy boost, validating the effectiveness of our continued pretraining and reinforcement learning pipeline.
+Furthermore, Composer 2 achieves accuracy competitive with the strongest frontier models despite being significantly cheaper at inference.
+Figure 11 contextualizes these accuracy metrics against resource consumption.
+Regarding token usage, Composer 2 generates trajectories comparable in length to other models while providing frontier-level accuracy, remaining highly token-efficient relative to other frontier models operating at similar accuracy levels.
+However, due to differences in active parameter counts, raw token usage does not fully capture inference efficiency.
+Since we do not have access to FLOPs used by API models, we provide the median inference cost per CursorBench task in Figure 11.
+Here, Composer 2 achieves a Pareto-optimal trade-off: its inference cost is similar to smaller or low-effort variants of models, while its accuracy remains competitive with much larger frontier models.
+Together, these results demonstrate that domain-specialized training can yield models that are simultaneously more accurate and more cost-effective than general-purpose alternatives for the demanding requirements of real-world software engineering.
+Table 1:
+Benchmark results across public and internal evaluation suites.
+For third-party models, we present results in an (our harness / self-reported) format where both are available. For Anthropic models on Terminal-Bench, we report the Claude Code scores from the official leaderboard in place of our harness evaluation. Overall, Composer 2 achieves accuracy competitive with the strongest frontier models.
+| Model | CursorBench | SWE-bench Multi. | Terminal-Bench |
+| --- | --- | --- | --- |
+| Composer 2 | 61.3 | 73.7 | 61.7 |
+| Composer 1.5 | 44.2 | 65.9 | 47.9 |
+| Composer 1 | 38.0 | 56.9 | 40.0 |
+| Opus 4.6 High | 58.2 | 75.8 / 77.8 | 58.0 / 65.4 |
+| Opus 4.5 High | 48.4 | 73.8 / 76.2 | 52.1 / 59.8 |
+| GPT-5.4 | 63.9 | 76.8 / - | 66.5† / 75.1 |
+| GPT-5.3 Codex | 59.1 | 74.8 / - | 64.8† / 77.3 |
+| GPT-5.2 | 56.5 | 68.3 / - | 60.5 / 62.2 |
+| GLM-5 | 42.7 | 66.9 / 73.3 | 59.6 / 56.2 |
+| Kimi K2.5 | 36.0 | 65.1 / 73.0 | 47.3 / 50.8 |
+
+† OpenAI safety filters refused 5 GPT-5.4 and 3 GPT-5.3-Codex tasks; refused problems scored as 0.
+7.2
+Public Benchmarks
+We further evaluate Composer 2 on two public benchmarks: SWE-bench Multilingual and Terminal-Bench (Table 1, last two columns).
+For Composer models, we compute scores using our own harness.
+For third-party models, we report results as (our harness / self-reported) where both are available; for Anthropic models on Terminal-Bench, we use the official Claude Code leaderboard scores rather than our own harness evaluations.
+For SWE-bench, we simply prepend “please solve this github issue” to the problem statement without instructions for writing or running test cases.
+For Terminal-Bench, we augment the user prompt with solution formatting instructions on where files should be placed or environment should be set up.
+On SWE-bench Multilingual, Composer 2 scores 73.7%, an improvement of 7.8 percentage points over Composer 1.5 and 16.8 points over Composer 1.
+On Terminal-Bench, Composer 2 achieves 61.7%, improving upon Composer 1.5 by 13.8 percentage points and Composer 1 by 21.7 points.
+Against its base model, Kimi K2.5, Composer 2 achieves similar performance on SWE-bench Multilingual and considerably improved performance on Terminal-Bench.
+Overall, Composer 2’s performance on these public benchmarks remains highly competitive with other state-of-the-art models.
+Across both benchmarks, each successive Composer version shows consistent gains, demonstrating that continued investment in both pretraining and reinforcement learning yields compounding gains for agentic software engineering.
+8
+Conclusion
+Composer 2 demonstrates that strong specialized models can be trained through continued pretraining and reinforcement learning. Starting from a strong general-purpose model, a model can be specialized to achieve frontier-level performance in agentic coding. The main insight, from both an algorithmic and infrastructure point of view, is to scale training while ensuring a close domain match with the target domain. We do this through careful domain benchmarking with CursorBench, harness and environment engineering, and behavioral reward development, along with rigorous infrastructure reliability.
+The results of Composer 2 give reason for optimism about the improvement still available through further scaling. While Composer 2 marks a steady improvement over previous versions, there are many cases where the model's intelligence or coherence can clearly be improved. The model trained in this work is large (1.04T parameters, 32B active) but likely smaller than other proprietary models of comparable ability. We believe there remains considerable room for development both architecturally and algorithmically.
+The scope of coding agents as a tool is also expanding from interactive problems to agentic tasks that would require hours of human time Kwa et al. [2025], with a general expectation that the horizon will grow quickly in the future Team [2025c]. For future Composer iterations, our team is focused on expanding the ability of the model to work on these problems through training methods that handle longer problems, both in the algorithms needed to effectively utilize longer-term training signal and in the infrastructure needed to support faithful long-horizon problems.
+References
+A. Ahmadian, C. Cremer, M. Gallé, M. Fadaee, J. Kreutzer, O. Pietquin, A. Üstün, and S. Hooker (2024)
+Back to basics: revisiting reinforce-style optimization for learning from human feedback in llms
+.
+In
+Proceedings of the 62nd Annual Meeting of the Association for Computational
+Linguistics (Volume 1: Long Papers), ACL 2024, Bangkok, Thailand,
+August 11-16, 2024
+,
+pp. 12248–12267
+.
+External Links:
+Link
+,
+Document
+Cited by:
+§4.1
+.
+A. AI (2025)
+LoCoDiff-bench: long context diff reconstruction benchmark
+.
+Note:
+https://abanteai.github.io/LoCoDiff-bench/
+Cited by:
+2nd item
+.
+Z. AI (2026)
+GLM-5: from vibe coding to agentic engineering
+.
+Note:
+https://z.ai/blog/glm-5
+Cited by:
+Appendix B
+.
+L. B. Allal, R. Li, D. Kocetkov, C. Mou, C. Akiki, C. M. Ferrandis, N. Muennighoff, M. Mishra, A. Gu, M. Dey,
+et al.
+(2023)
+SantaCoder: don’t reach for the stars!
+.
+In
+International Conference on Machine Learning, ICML 2023 Workshop on Knowledge and Logical Reasoning in the Era of Data-driven Learning
+,
+Cited by:
+§2
+.
+A. Amini, T. Vieira, and R. Cotterell (2025)
+Better estimation of the kullback–leibler divergence between language models
+.
+In
+The Thirty-ninth Annual Conference on Neural Information Processing Systems
+,
+External Links:
+Link
+Cited by:
+§4.1
+.
+M. Chen, J. Tworek, H. Jun, Q. Yuan, H. P. d. O. Pinto, J. Kaplan, H. Edwards, Y. Burda, N. Joseph, G. Brockman,
+et al.
+(2021)
+Evaluating large language models trained on code
+.
+arXiv preprint arXiv:2107.03374
+.
+External Links:
+Link
+Cited by:
+§2
+.
+Z. Chen, X. Qin, Y. Wu, Y. Ling, Q. Ye, W. X. Zhao, and G. Shi (2025)
+Pass@k training for adaptively balancing exploration and exploitation of large reasoning models
+.
+arXiv preprint arXiv:2508.10751
+.
+External Links:
+Document
+,
+Link
+Cited by:
+§4.1
+.
+C. Clement, D. Drain, J. Timcheck, A. Svyatkovskiy, and N. Sundaresan (2020)
+PyMT5: multi-mode translation of natural language and python code with transformers
+.
+In
+Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
+,
+pp. 9052–9065
+.
+Cited by:
+§2
+.
+DeepSeek-AI (2024a)
+DeepSeek-coder-v2: breaking the barrier of closed-source models in code intelligence
+.
+arXiv preprint arXiv:2406.11931
+.
+External Links:
+Link
+Cited by:
+§2
+.
+DeepSeek-AI (2024b)
+DeepSeek-v3 technical report
+.
+arXiv preprint arXiv:2412.19437
+.
+External Links:
+Link
+Cited by:
+§3.1
+.
+DeepSeek-AI (2025)
+DeepSeek-v3.2: pushing the frontier of open large language models
+.
+arXiv preprint arXiv:2512.02556
+.
+External Links:
+Link
+Cited by:
+Appendix B
+.
+D. Du, S. Liu, T. Yang, S. Chen, and Y. Li (2025)
+UloRL: an ultra-long output reinforcement learning approach for advancing large language models’ reasoning abilities
+.
+arXiv preprint arXiv:2507.19766
+.
+External Links:
+Link
+Cited by:
+§4.1
+.
+W. Fedus, B. Zoph, and N. Shazeer (2022)
+Switch transformers: scaling to trillion parameter models with simple and efficient sparsity
+.
+Journal of Machine Learning Research
+23
+(
+120
+),
+pp. 1–39
+.
+External Links:
+Link
+Cited by:
+§6.1
+.
+Z. Feng, D. Guo, D. Tang, N. Duan, X. Feng, M. Gong, L. Shou, B. Qin, T. Liu, D. Jiang,
+et al.
+(2020)
+Codebert: a pre-trained model for programming and natural languages
+.
+In
+Findings of the association for computational linguistics: EMNLP 2020
+,
+pp. 1536–1547
+.
+Cited by:
+§2
+.
+D. Fried, A. Aghajanyan, J. Lin, S. Wang, E. Wallace, F. Shi, R. Zhong, S. Yih, L. Zettlemoyer, and M. Lewis (2023)
+InCoder: A generative model for code infilling and synthesis
+.
+In
+The Eleventh International Conference on Learning Representations,
+ICLR 2023, Kigali, Rwanda, May 1-5, 2023
+,
+External Links:
+Link
+Cited by:
+§2
+.
+F. Gloeckle, B. Y. Idrissi, B. Rozière, D. Lopez-Paz, and G. Synnaeve (2024)
+Better & faster large language models via multi-token prediction
+.
+In
+Forty-first International Conference on Machine Learning, ICML 2024,
+Vienna, Austria, July 21-27, 2024
+,
+External Links:
+Link
+Cited by:
+§3.1
+.
+A. Golubev, M. Trofimova, S. Polezhaev, I. Badertdinov, M. Nekrashevich, A. Shevtsov, S. Karasik, S. Abramov, A. Andriushchenko, F. Fisin, S. Skvortsov, and B. Yangel (2025)
+Training long-context, multi-turn software engineering agents with reinforcement learning
+.
+arXiv preprint arXiv:2508.03501
+.
+External Links:
+Link
+Cited by:
+§4.1
+.
+D. Guo, Q. Zhu, D. Yang, Z. Xie, K. Dong, W. Zhang, G. Chen, X. Bi, Y. Wu, Y. K. Li,
+et al.
+(2024)
+DeepSeek-coder: when the large language model meets programming–the rise of code intelligence
+.
+arXiv preprint arXiv:2401.14196
+.
+External Links:
+Link
+Cited by:
+§2
+.
+S. Gururangan, A. Marasović, S. Swayamdipta, K. Lo, I. Beltagy, D. Downey, and N. A. Smith (2020)
+Don’t stop pretraining: adapt language models to domains and tasks
+.
+In
+Proceedings of the 58th Annual Meeting of the Association for Computational
+Linguistics, ACL 2020, Online, July 5-10, 2020
+,
+pp. 8342–8360
+.
+External Links:
+Link
+,
+Document
+Cited by:
+§3
+.
+HazyResearch (2026)
+ThunderKittens gemm kernels
+.
+External Links:
+Link
+Cited by:
+§6.1
+.
+J. Hoffmann, S. Borgeaud, A. Mensch, E. Buchatskaya, T. Cai, E. Rutherford, D. de Las Casas, L. A. Hendricks, J. Welbl, A. Clark,
+et al.
+(2022)
+Training compute-optimal large language models
+.
+In
+Advances in Neural Information Processing Systems 35: Annual Conference
+on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,
+LA, USA, November 28 - December 9, 2022
+,
+Cited by:
+§3
+.
+S. Hong, M. Zhuge, J. Chen, X. Zheng, Y. Cheng, J. Wang, C. Zhang, Z. Wang, S. K. S. Yau, Z. Lin,
+et al.
+(2023)
+MetaGPT: meta programming for a multi-agent collaborative framework
+.
+In
+The twelfth international conference on learning representations
+,
+Cited by:
+§2
+.
+J. Howard and S. Ruder (2018)
+Universal language model fine-tuning for text classification
+.
+In
+Proceedings of the 56th Annual Meeting of the Association for Computational
+Linguistics, ACL 2018, Melbourne, Australia, July 15-20, 2018, Volume
+1: Long Papers
+,
+pp. 328–339
+.
+External Links:
+Link
+,
+Document
+Cited by:
+§3
+.
+B. Hui, J. Yang, Z. Cui, J. Yang, D. Liu, L. Zhang, T. Liu, J. Zhang, B. Yu, K. Lu,
+et al.
+(2024)
+Qwen2.5-coder technical report
+.
+arXiv preprint arXiv:2409.12186
+.
+Cited by:
+§2
+.
+S. A. Jacobs, M. Tanaka, C. Zhang, M. Zhang, S. L. Song, S. Rajbhandari, and Y. He (2023)
+Deepspeed ulysses: system optimizations for enabling training of extreme long sequence transformer models
+.
+arXiv preprint arXiv:2309.14509
+.
+Cited by:
+§6.1
+.
+Jay Shah (2026)
+Flash attention pull request #2270
+.
+External Links:
+Link
+Cited by:
+§6.1
+.
+C. E. Jimenez, J. Yang, A. Wettig, S. Yao, K. Pei, O. Press, and K. R. Narasimhan (2024)
+SWE-bench: can language models resolve real-world github issues?
+.
+In
+The Twelfth International Conference on Learning Representations
+,
+External Links:
+Link
+Cited by:
+§1
+.
+T. Kwa, B. West, J. Becker, A. Deng, K. Garcia, M. Hasin, S. Jawhar, M. Kinniment, N. Rush, S. V. Arx, R. Bloom, T. Broadley, H. Du, B. Goodrich, N. Jurkovic, L. H. Miles, S. Nix, T. Lin, N. Parikh, D. Rein, L. J. K. Sato, H. Wijk, D. M. Ziegler, E. Barnes, and L. Chan (2025)
+Measuring ai ability to complete long tasks
+.
+Note:
+https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/
+Cited by:
+§8
+.
+R. Li, L. B. Allal, Y. Zi, N. Muennighoff, D. Kocetkov, C. Mou, M. Marone, C. Akiki, J. Li, J. Chim,
+et al.
+(2023)
+StarCoder: may the source be with you!
+.
+Trans. Mach. Learn. Res.
+2023
+.
+External Links:
+Link
+Cited by:
+§2
+.
+Y. Li, D. Choi, J. Chung, N. Kushman, J. Schrittwieser, R. Leblond, T. Eccles, J. Keeling, F. Gimeno, A. Dal Lago,
+et al.
+(2022)
+Competition-level code generation with alphacode
+.
+Science
+378
+(
+6624
+),
+pp. 1092–1097
+.
+Cited by:
+§2
+.
+X. Liang, Z. Li, Y. Gong, Y. Shen, Y. N. Wu, Z. Guo, and W. Chen (2026)
+Beyond pass@1: self-play with variational problem synthesis sustains RLVR
+.
+In
+The Fourteenth International Conference on Learning Representations,
+ICLR 2026
+,
+External Links:
+Link
+Cited by:
+§4.1
+.
+H. Liu, M. Zaharia, and P. Abbeel (2024)
+Ring attention with blockwise transformers for near-infinite context
+.
+In
+The Twelfth International Conference on Learning Representations,
+ICLR 2024, Vienna, Austria, May 7-11, 2024
+,
+External Links:
+Link
+Cited by:
+§6.1
+.
+Z. Liu, C. Chen, W. Li, P. Qi, T. Pang, C. Du, W. S. Lee, and M. Lin (2025a)
+Understanding r1-zero-like training: a critical perspective
+.
+arXiv preprint arXiv:2503.20783
+.
+External Links:
+Link
+Cited by:
+§4.1
+,
+§4.1
+.
+Z. Liu, Z. Yang, Y. Chen, C. Lee, M. Shoeybi, B. Catanzaro, and W. Ping (2025b)
+AceReason-nemotron 1.1: advancing math and code reasoning through sft and rl synergy
+.
+arXiv preprint arXiv:2506.13284
+.
+External Links:
+Link
+Cited by:
+§4.1
+.
+A. Lozhkov, R. Li, L. B. Allal, F. Cassano, J. Lamy-Poirier, N. Tazi, A. Tang, D. Pykhtar, J. Liu, Y. Wei,
+et al.
+(2024)
+StarCoder 2 and the stack v2: the next generation
+.
+arXiv preprint arXiv:2402.19173
+.
+External Links:
+Link
+Cited by:
+§2
+.
+Z. Luo, C. Xu, P. Zhao, Q. Sun, X. Geng, W. Hu, C. Tao, J. Ma, Q. Lin, and D. Jiang (2024)
+WizardCoder: empowering code large language models with evol-instruct
+.
+In
+The Twelfth International Conference on Learning Representations,
+ICLR 2024, Vienna, Austria, May 7-11, 2024
+,
+External Links:
+Link
+Cited by:
+§2
+.
+W. Ma, H. Zhang, L. Zhao, Y. Song, Y. Wang, Z. Sui, and F. Luo (2025)
+Stabilizing MoE reinforcement learning by aligning training and inference routers
+.
+arXiv preprint arXiv:2510.11370
+.
+External Links:
+2510.11370
+,
+Link
+Cited by:
+§4.1
+,
+§6.2
+.
+M. A. Merrill, A. G. Shaw, N. Carlini, B. Li, H. Raj, I. Bercovich, L. Shi, J. Y. Shin, T. Walshe, E. K. Buchanan,
+et al.
+(2026)
+Terminal-bench: benchmarking agents on hard, realistic tasks in command line interfaces
+.
+In
+The Fourteenth International Conference on Learning Representations,
+ICLR 2026
+,
+External Links:
+Link
+Cited by:
+§1
+.
+MiniMax (2025)
+MiniMax-m1: scaling test-time compute efficiently with lightning attention
+.
+arXiv preprint arXiv:2506.13585
+.
+External Links:
+Link
+Cited by:
+§4.1
+.
+M. Mishra, M. Stallone, G. Zhang, Y. Shen, A. Prasad, A. M. Soria, M. Merler, P. Selvam, S. Surendran, S. Singh,
+et al.
+(2024)
+Granite code models: a family of open foundation models for code intelligence
+.
+arXiv preprint arXiv:2405.04324
+.
+Cited by:
+§2
+.
+P. Moritz, R. Nishihara, S. Wang, A. Tumanov, R. Liaw, E. Liang, M. Elibol, Z. Yang, W. Paul, M. I. Jordan, and I. Stoica (2018)
+Ray: A distributed framework for emerging AI applications
+.
+In
+13th USENIX Symposium on Operating Systems Design and Implementation,
+OSDI 2018, Carlsbad, CA, USA, October 8-10, 2018
+,
+A. C. Arpaci-Dusseau and G. Voelker (Eds.)
+,
+pp. 561–577
+.
+External Links:
+Link
+Cited by:
+§6.2
+.
+N. Muennighoff, Q. Liu, A. R. Zebaze, Q. Zheng, B. Hui, T. Y. Zhuo, S. Singh, X. Tang, L. von Werra, and S. Longpre (2024)
+OctoPack: instruction tuning code large language models
+.
+In
+The Twelfth International Conference on Learning Representations,
+ICLR 2024, Vienna, Austria, May 7-11, 2024
+,
+External Links:
+Link
+Cited by:
+§2
+.
+E. Nijkamp, B. Pang, H. Hayashi, L. Tu, H. Wang, Y. Zhou, S. Savarese, and C. Xiong (2023)
+CodeGen: an open large language model for code with multi-turn program synthesis
+.
+In
+The Eleventh International Conference on Learning Representations,
+ICLR 2023, Kigali, Rwanda, May 1-5, 2023
+,
+External Links:
+Link
+Cited by:
+§2
+.
+NVIDIA (2025)
+Pretraining large language models with nvfp4
+.
+arXiv preprint arXiv:2509.25149
+.
+External Links:
+Link
+Cited by:
+§6.1
+.
+Open Compute Project (2023)
+OCP microscaling formats (mx) specification version 1.0
+.
+Note:
+https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+Cited by:
+§6.1
+.
+A. Paszke, S. Gross, F. Massa, A. Lerer, J. Bradbury, G. Chanan, T. Killeen, Z. Lin, N. Gimelshein, L. Antiga,
+et al.
+(2019)
+Pytorch: an imperative style, high-performance deep learning library
+.
+Advances in neural information processing systems
+32
+.
+Cited by:
+§6.2
+.
+A. Piché, E. Kamalloo, R. Pardinas, X. Chen, and D. Bahdanau (2025)
+PipelineRL: faster on-policy reinforcement learning for long sequence generation
+.
+arXiv preprint arXiv:2509.19128
+.
+External Links:
+Link
+Cited by:
+§4.1
+.
+C. Qian, W. Liu, H. Liu, N. Chen, Y. Dang, J. Li, C. Yang, W. Chen, Y. Su, X. Cong,
+et al.
+(2024)
+Chatdev: communicative agents for software development
+.
+In
+Proceedings of the 62nd annual meeting of the association for computational linguistics (volume 1: Long papers)
+,
+pp. 15174–15186
+.
+Cited by:
+§2
+.
+S. Rajbhandari, J. Rasley, O. Ruwase, and Y. He (2020)
+ZeRO: memory optimizations toward training trillion parameter models
+.
+In
+SC20: International Conference for High Performance Computing, Networking, Storage and Analysis
+,
+pp. 1–16
+.
+Cited by:
+§6.1
+.
+B. Rozière, J. Gehring, F. Gloeckle, S. Sootla, I. Gat, X. E. Tan, Y. Adi, J. Liu, R. Sauvestre, T. Remez,
+et al.
+(2023)
+Code llama: open foundation models for code
+.
+arXiv preprint arXiv:2308.12950
+.
+External Links:
+Link
+Cited by:
+§2
+.
+J. Schulman (2020)
+Approximating KL divergence
+.
+Note:
+Blog post
+Cited by:
+§4.1
+.
+Z. Shao, P. Wang, Q. Zhu, R. Xu, J. Song, M. Zhang, Y. K. Li, Y. Wu, and D. Guo (2024)
+DeepSeekMath: pushing the limits of mathematical reasoning in open language models
+.
+arXiv preprint arXiv:2402.03300
+.
+External Links:
+Link
+Cited by:
+§4.1
+,
+§4.1
+.
+N. Shazeer, A. Mirhoseini, K. Maziarz, A. Davis, Q. Le, G. Hinton, and J. Dean (2017)
+Outrageously large neural networks: the sparsely-gated mixture-of-experts layer
+.
+In
+International Conference on Learning Representations (ICLR)
+,
+External Links:
+Link
+Cited by:
+§6.1
+.
+M. Shoeybi, M. Patwary, R. Puri, P. LeGresley, J. Casper, and B. Catanzaro (2019)
+Megatron-LM: training multi-billion parameter language models using model parallelism
+.
+arXiv preprint arXiv:1909.08053
+.
+Cited by:
+§6.1
+.
+B. F. Spector, S. Arora, A. Singhal, A. Parthasarathy, D. Y. Fu, and C. Ré (2025)
+ThunderKittens: simple, fast, and adorable kernels
+.
+In
+The Thirteenth International Conference on Learning Representations
+,
+External Links:
+Link
+Cited by:
+§6.1
+.
+S. H. Sul, S. Arora, B. F. Spector, and C. Ré (2025a)
+ParallelKittens: systematic and practical simplification of multi-gpu ai kernels
+.
+arXiv preprint arXiv:2511.13940
+.
+External Links:
+Link
+Cited by:
+§6.1
+,
+§6.1
+.
+S. H. Sul, S. Arora, B. Spector, and C. Ré (2025b)
+Loads and loads of fluffy kittens
+.
+External Links:
+Link
+Cited by:
+§6.1
+.
+S. H. Sul, D. Lim, B. Spector, and C. Ré (2025c)
+One kernel for all your gpus
+.
+External Links:
+Link
+Cited by:
+§6.1
+.
+S. H. Sul and C. Ré (2026)
+ThunderKittens 2.0: even faster kernels for your gpus
+.
+External Links:
+Link
+Cited by:
+§6.1
+.
+F. Tajwar, G. Zeng, Y. Zhou, Y. Song, D. Arora, Y. Jiang, J. Schneider, R. Salakhutdinov, H. Feng, and A. Zanette (2026)
+Maximum likelihood reinforcement learning
+.
+arXiv preprint arXiv:2602.02710
+.
+External Links:
+Document
+,
+Link
+Cited by:
+§4.1
+.
+C. Team, H. Zhao, J. Hui, J. Howland, N. Nguyen, S. Zuo, A. Hu, C. A. Choquette-Choo, J. Shen, J. Kelley,
+et al.
+(2024)
+Codegemma: open code models based on gemma
+.
+arXiv preprint arXiv:2406.11409
+.
+Cited by:
+§2
+.
+C. Team (2025a)
+1.5x faster moe training with custom mxfp8 kernels
+.
+Note:
+https://cursor.com/blog/kernels
+Cited by:
+§6.1
+.
+C. Team (2025b)
+Self-summarization for composer
+.
+Note:
+https://cursor.com/blog/self-summarization
+Cited by:
+§4.1
+.
+C. Team (2025c)
+The third era of software
+.
+Note:
+https://cursor.com/blog/third-era
+Cited by:
+§8
+.
+K. Team (2025d)
+Kimi k1.5: scaling reinforcement learning with LLMs
+.
+arXiv preprint arXiv:2501.12599
+.
+Cited by:
+§4.1
+.
+K. Team (2026)
+Kimi K2.5: visual agentic intelligence
+.
+arXiv preprint arXiv:2602.02276
+.
+Cited by:
+Appendix B
+,
+Appendix B
+,
+§3
+.
+Q. Team (2025e)
+Qwen3 technical report
+.
+arXiv preprint arXiv:2505.09388
+.
+Cited by:
+§3.1
+.
+H. Touvron, T. Lavril, G. Izacard, X. Martinet, M. Lachaux, T. Lacroix, B. Rozière, N. Goyal, E. Hambro, F. Azhar,
+et al.
+(2023)
+LLaMA: open and efficient foundation language models
+.
+arXiv preprint arXiv:2302.13971
+.
+External Links:
+Link
+Cited by:
+§3
+.
+X. Wang, B. Li, Y. Song, F. F. Xu, X. Tang, M. Zhuge, J. Pan, Y. Song, B. Li, J. Singh, H. H. Tran, F. Li, R. Ma, M. Zheng, B. Qian, Y. Shao, N. Muennighoff, Y. Zhang, B. Hui, J. Lin, R. Brennan, H. Peng, H. Ji, and G. Neubig (2025)
+OpenHands: an open platform for AI software developers as generalist agents
+.
+In
+The Thirteenth International Conference on Learning Representations,
+ICLR 2025, Singapore, April 24-28, 2025
+,
+External Links:
+Link
+Cited by:
+§2
+.
+Y. Wang, H. Le, A. Gotmare, N. Bui, J. Li, and S. Hoi (2023)
+Codet5+: open code large language models for code understanding and generation
+.
+In
+Proceedings of the 2023 conference on empirical methods in natural language processing
+,
+pp. 1069–1088
+.
+Cited by:
+§2
+.
+Y. Wang, W. Wang, S. Joty, and S. C. Hoi (2021)
+Codet5: identifier-aware unified pre-trained encoder-decoder models for code understanding and generation
+.
+In
+Proceedings of the 2021 conference on empirical methods in natural language processing
+,
+pp. 8696–8708
+.
+Cited by:
+§2
+.
+Y. Wei, Z. Wang, J. Liu, Y. Ding, and L. Zhang (2024)
+Magicoder: empowering code generation with OSS-Instruct
+.
+In
+Forty-first International Conference on Machine Learning, ICML 2024,
+Vienna, Austria, July 21-27, 2024
+,
+Proceedings of Machine Learning Research
+,
+pp. 52632–52657
+.
+External Links:
+Link
+Cited by:
+§2
+.
+X. Wen, Z. Liu, S. Zheng, Z. Xu, S. Ye, Z. Wu, X. Liang, Y. Wang, J. Li, Z. Miao, J. Bian, and M. Yang (2026)
+Reinforcement learning with verifiable rewards implicitly incentivizes correct reasoning in base LLMs
+.
+In
+The Fourteenth International Conference on Learning Representations,
+ICLR 2026
+,
+External Links:
+Link
+Cited by:
+§4.1
+.
+[74]
+()
+Why SWE-bench Verified no longer measures frontier coding capabilities — openai.com
+.
+Note:
+https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified/
+[Accessed 24-03-2026]
+Cited by:
+3rd item
+.
+J. Yang, C. E. Jimenez, A. Wettig, K. Lieret, S. Yao, K. Narasimhan, and O. Press (2024)
+SWE-agent: agent-computer interfaces enable automated software engineering
+.
+In
+Advances in Neural Information Processing Systems 38: Annual Conference
+on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver,
+BC, Canada, December 10 - 15, 2024
+,
+External Links:
+Link
+Cited by:
+§2
+.
+J. Yang, K. Lieret, C. E. Jimenez, A. Wettig, K. Khandpur, Y. Zhang, B. Hui, O. Press, L. Schmidt, and D. Yang (2025)
+SWE-smith: scaling data for software engineering agents
+.
+In
+Advances in Neural Information Processing Systems 38: Annual Conference
+on Neural Information Processing Systems 2025, NeurIPS 2025,
+San Diego, CA, USA, December 1-4, 2025
+,
+External Links:
+Link
+Cited by:
+§2
+.
+J. Ye
+et al.
+(2024)
+Data mixing made efficient: a biannual survey of data mixing for LLM pre-training
+.
+arXiv preprint arXiv:2403.16952
+.
+External Links:
+Link
+Cited by:
+§3
+.
+Q. Yu, Z. Zhang, R. Zhu, Y. Yuan, X. Zuo, Y. Yue, W. Dai, T. Fan, G. Liu, J. Liu, L. Liu, X. Liu, H. Lin, Z. Lin, B. Ma, G. Sheng, Y. Tong, C. Zhang, M. Zhang, R. Zhang, W. Zhang, H. Zhu, J. Zhu, J. Chen, J. Chen, C. Wang, H. Yu, Y. Song, X. Wei, H. Zhou, J. Liu, W. Ma, Y. Zhang, L. Yan, Y. Wu, and M. Wang (2025)
+DAPO: an open-source LLM reinforcement learning system at scale
+.
+In
+The Thirty-ninth Annual Conference on Neural Information Processing Systems,
+NeurIPS 2025
+,
+External Links:
+Link
+Cited by:
+§4.1
+,
+§4.1
+.
+Y. Yue, Z. Chen, R. Lu, A. Zhao, Z. Wang, Y. Yue, S. Song, and G. Huang (2025)
+Does reinforcement learning really incentivize reasoning capacity in LLMs beyond the base model?
+.
+In
+The Thirty-ninth Annual Conference on Neural Information Processing Systems,
+NeurIPS 2025
+,
+Note:
+Oral
+External Links:
+Link
+Cited by:
+§4.1
+.
+C. Zhao, S. Zhou, L. Zhang, C. Deng, Z. Xu, Y. Liu, K. Yu, J. Li, and L. Zhao (2025)
+DeepEP: an efficient expert-parallel communication library
+.
+GitHub
+.
+Note:
+https://github.com/deepseek-ai/DeepEP
+Cited by:
+§6.1
+.
+Y. Zhao, A. Gu, R. Varma, L. Luo, C. Huang, M. Xu, L. Wright, H. Shojanazeri, M. Ott, S. Shleifer,
+et al.
+(2023)
+PyTorch fsdp: experiences on scaling fully sharded data parallel
+.
+arXiv preprint arXiv:2304.11277
+.
+Cited by:
+§6.1
+.
+C. Zheng, S. Liu, M. Li, X. Chen, B. Yu, C. Gao, K. Dang, Y. Liu, R. Men, A. Yang, J. Zhou, and J. Lin (2025)
+Group sequence policy optimization
+.
+arXiv preprint arXiv:2507.18071
+.
+External Links:
+2507.18071
+,
+Link
+Cited by:
+§4.1
+,
+§6.2
+.
+T. Y. Zhuo, A. R. Zebaze, L. Von Werra, H. de Vries, Q. Liu, and N. Muennighoff (2025)
+Parameter-efficient instruction tuning code large language models: an empirical study
+.
+In
+ICLR 2025 Third Workshop on Deep Learning for Code
+,
+Cited by:
+§2
+.
+Appendix A
+Contributors
+The Composer research team consists of:
+Aaron Chan,
+Ahmed Shalaby,
+Alexander Wettig,
+Aman Sanger,
+Andrew Zhai,
+Anurag Ajay,
+Ashvin Nair,
+Charlie Snell,
+Chen Lu,
+Chen Shen,
+Emily Jia,
+Federico Cassano,
+Hanpeng Liu,
+Haoyu Chen,
+Henry Wildermuth,
+Jacob Jackson,
+Janet Li,
+Jediah Katz,
+Jiajun Yao,
+Joey Hejna,
+Josh Warner,
+Julius Vering,
+Kevin Frans,
+Lee Danilek,
+Less Wright,
+Lujing Cen,
+Luke Melas-Kyriazi,
+Michael Truell,
+Michiel de Jong,
+Naman Jain,
+Nate Schmidt,
+Nathan Wang,
+Niklas Muennighoff,
+Oleg Rybkin,
+Paul Loh,
+Phillip Kravtsov,
+Rishabh Yadav,
+Sahil Shah,
+Sam Kottler,
+Alexander M Rush,
+Shengtong Zhang,
+Shomil Jain,
+Sriram Sankar,
+Stefan Heule,
+Stuart H. Sul,
+Sualeh Asif,
+Victor Rong,
+Wanqi Zhu,
+William Lin,
+Yuchen Wu,
+Yuri Volkov,
+Yury Zemlyanskiy,
+Zack Holbrook,
+Zhiyuan Zhang
+Appendix B
+Base Model Selection
+Before training, we evaluated several potential open-source base models including GLM-5 AI [2026], Kimi K2.5 Team [2026], and DeepSeek V3.2 DeepSeek-AI [2025]. Three base model evaluations contributed to our selection of Kimi K2.5:
+- **Coding knowledge**: We score factual knowledge with an internal benchmark called FreshBench. FreshBench is a question-answer benchmark adversarially constructed against previous Composer models. We identify turns where Composer had to read library source code or perform a web search to solve a coding task. From these traces we create question-answer pairs, validating the answers with a web searching agent.
+- **State tracking**: While editing a repository, coding agents often need to understand dozens of past file edits before taking an action. LoCoDiff AI [2025] is a benchmark that asks the model to recreate the state of a file after many diffs, an important base skill for model long-term memory. State tracking is an internal benchmark similar to LoCoDiff built from our monorepo. Instead of measuring raw accuracy, which we found sensitive to single-character errors, we report the average character-level distance.
+- **Codebase perplexity**: We measure perplexity to determine the coding intelligence of the base model. We use our private monorepo as an uncontaminated source, concatenating the files alphabetically and computing the sum of the negative log-likelihoods over a rolling window.
+We intentionally do not consider coding agent benchmarks when testing base models. We find that such benchmarks are less predictive of final performance, as agentic and long-horizon capabilities can drastically change during the RL stage.
+Table 2 shows the results of the analysis. All three models considered perform quite well in these experiments. We selected Kimi K2.5 Team [2026] due to its generally strong performance as well as additional considerations such as its efficiency in our infrastructure.
+| Model | FreshBench ↑ | State Tracking ↓ | Negative Log-Likelihood ↓ |
+| --- | --- | --- | --- |
+| DeepSeek V3.2 | 68.9% | 66 | 11.75M |
+| Kimi K2.5 | 83.2% | 86 | 13.81M |
+| GLM-5 | 79.2% | 92 | 14.11M |
+| GPT-5.4 | 92.5% | 103 | - |
+| Claude 4.6 Opus | 88.9% | 65 | - |
+| Gemini 3 Flash | 84.5% | 27 | - |
+| Claude 4.5 Sonnet | 80.1% | 69 | - |
+| Claude 4.5 Haiku | 61.7% | 177 | - |
+Table 2:
+Base models evaluated on our internal benchmarks.
+Negative log-likelihood is measured over our internal codebase.
+Appendix C
+CursorBench
+C.1
+Streaming Prefix Detection
+The following is another example CursorBench task.
+Problem statement:
+We’re seeing a weird streaming bug in some chat responses:
+Now I
+Now I need to updat
+Now I need to update this.
+Now I need to update this. I ha
+Now I need to update this. I have the
+Instead of getting proper streaming deltas, we get repeated growing prefixes like the snippet. I think this happens mostly inside think tokens. I want to know how common this is. Look at 954 response json files in @logs folder
+Figure 12:
+Example CursorBench task.
+The agent must infer the failure mode from a partial symptom report, write a heuristic detection algorithm over 954 heterogeneous chat responses, and carefully tune that heuristic to recover an exact count of malformed prefix-streaming cases without overcounting normal incremental output. Additionally, a variant of the bug produces an “interleave stutter” where the initial prefix chain is only two lines long before stabilizing into a repeating line with incrementing repetitions, and the agent must carefully examine chat responses to discover this.
+The following listing shows the algorithmic core of the reference diff for this task.
+⬇
+```python
+MIN_CHAIN = 3
+MIN_SEED_LEN = 2
+MAX_SEED_LEN = 50
+
+
+def find_prefix_chain(text: str) -> tuple[int, str] | None:
+    if len(text) < 10:
+        return None
+    first_nl = text.find("\n")
+    if first_nl < MIN_SEED_LEN or first_nl > MAX_SEED_LEN:
+        return None
+    seed = text[:first_nl]
+    needle = "\n" + seed
+    starts = [0]
+    pos = 0
+    while True:
+        idx = text.find(needle, pos)
+        if idx == -1:
+            break
+        starts.append(idx + 1)
+        pos = idx + 1
+    if len(starts) < MIN_CHAIN:
+        return None
+    ends = [s - 1 for s in starts[1:]] + [len(text)]
+    chunks = [text[s:e] for s, e in zip(starts, ends)]
+    chain = 1
+    for i in range(len(chunks) - 1):
+        cur, nxt = chunks[i], chunks[i + 1]
+        if len(cur) < len(nxt) and nxt.startswith(cur):
+            chain += 1
+        else:
+            break
+    return (chain, seed) if chain >= MIN_CHAIN else None
+
+
+def iter_think_blocks(text: str):
+    pos = 0
+    while True:
+        open_idx = text.find("<think>", pos)
+        if open_idx == -1:
+            return
+        close_idx = text.find("</think>", open_idx)
+        if close_idx == -1:
+            yield text[open_idx + 7:].lstrip("\n")
+            return
+        yield text[open_idx + 7:close_idx].lstrip("\n")
+        pos = close_idx + 8
+
+
+def has_prefix_streaming_bug(chat_response: str) -> bool:
+    return any(
+        find_prefix_chain(block) is not None
+        for block in iter_think_blocks(chat_response)
+    )
+```
+BETA
\ No newline at end of file
diff --git a/research/notes/cwm-an-open-weights-llm-for-research-on-code-generation-with-world-models.md b/research/notes/cwm-an-open-weights-llm-for-research-on-code-generation-with-world-models.md
new file mode 100644
index 0000000000000000000000000000000000000000..5f6e44ee95c499ba91cf6eb1cea721d150f0143b
--- /dev/null
+++ b/research/notes/cwm-an-open-weights-llm-for-research-on-code-generation-with-world-models.md
@@ -0,0 +1,16916 @@
+---
+title: 'CWM: An Open-Weights LLM for Research on Code Generation with World Models'
+id: cwm-an-open-weights-llm-for-research-on-code-generation-with-world-models
+tags:
+- deepread
+created: '2026-06-10T00:30:44.771058Z'
+source: https://arxiv.org/html/2510.02387
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:44.770910Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+CWM: An Open-Weights LLM for Research on Code Generation with World Models
+Inference Code: github.com/facebookresearch/cwm
+Model Weights: ai.meta.com/resources/models-and-libraries/cwm-downloads, huggingface.co/facebook/cwm, huggingface.co/facebook/cwm-sft, huggingface.co/facebook/cwm-pretrain
+CWM: An Open-Weights LLM for Research on Code Generation with World Models
+Meta FAIR CodeGen Team (September 29, 2025)
+Abstract
+We release Code World Model (CWM), a 32-billion-parameter open-weights LLM, to advance research on code generation with world models.
+To improve code understanding beyond what can be learned from training on static code alone, we mid-train CWM on a large amount of observation-action trajectories from Python interpreter and agentic Docker environments, and perform extensive multi-task reasoning RL in verifiable coding, math, and multi-turn software engineering environments.
+With CWM, we provide a strong testbed for researchers to explore the opportunities world modeling affords for improving code generation with reasoning and planning in computational environments.
+We present first steps of how world models can benefit agentic coding, enable step-by-step simulation of Python code execution, and show early results of how reasoning can benefit from the latter.
+CWM is a dense, decoder-only LLM trained with a context size of up to 131k tokens.
+Independent of its world modeling capabilities, CWM offers strong performance on general coding and math tasks: it reaches pass@1 scores of 65.8% on SWE-bench Verified (with test-time scaling), 68.6% on LiveCodeBench, 96.6% on Math-500, and 76.0% on AIME 2024.
+To support further research on code world modeling, we release model checkpoints after mid-training, SFT, and RL.
+1
+Introduction
+Software development is one of the domains where Large Language Models (LLMs) have already had a significant real-world impact (Cui et al., 2024; Bick et al., 2024).
+They have quickly been adopted into the workflows of software engineers worldwide, and their capabilities are advancing fast: from only supporting programmers with small snippets of code to fixing issues or writing code bases autonomously (Yeverechyahu et al., 2024; Handa et al., 2025).
+However, reliably generating high-quality code remains a challenge even for the current generation of LLMs, with benchmarks consistently revealing shortcomings upon release (Hendrycks et al., 2021a; Chen et al., 2021; Aider Team, 2025; Jimenez et al., 2024).
+We believe that advancing code generation with LLMs may require new training and modeling paradigms. Typically, code is treated the same as any other text data during pre-training: the model learns to predict code line by line, from left to right and top to bottom. We think this is not sufficient – to master coding, one must understand not just what code *looks like* but what it *does* when executed. Such skill is instrumental to the everyday work of software engineers: at a local level, they understand how the execution of a line of code changes the state of the local variables, and, at a global level, they can make predictions about how changes to a codebase affect program outputs. Yet, teaching LLMs such *code world modeling* capabilities is typically not considered before post-training.
+Figure 1
+:
+Overview of the
+CWM
+training stages and the model checkpoints that we release. We generally report performance of the final CWM (instruct, RL trained) model, except where otherwise stated.
+We release Code World Model (
+CWM
+), a new LLM for code generation and reasoning that has been trained on large amounts of code world modeling data.
+Concretely,
+CWM
+is mid-trained on two different kinds of observation-action trajectories that capture important aspects of software development: Python code execution traces and agentic interactions with Docker environments. Mid-training on such data at scale should help improve coding performance by grounding our model’s predictions in the underlying dynamical systems and provide a superior starting point for RL.
+For the Python execution data, actions are Python statements and observations contain the contents of the local variables. By training
+CWM
+on a trajectory of observation-action pairs conditioned on the code only as context, we directly teach the model how the execution of a line of Python affects the state of the local variables. Our premise here is that teaching
+CWM
+the semantics and not just syntax of programs should help with writing code as well as with reasoning tasks like verification, testing, and debugging.
+We also train
+CWM
+on a large-scale collection of synthetically generated agentic interactions with computational environments. These trajectories are generated with our so-called ForagerAgent, which “forages” for data covering agentic software engineering scenarios such as implementing missing functionality or fixing bugs from error messages. Actions here are shell-like commands or code edits generated by the agent, and observations are responses from the running environment. While it is not uncommon for recent models to include similar data, this is mostly done at smaller scale during post-training
+(Yang et al.,
+2025b
+)
+.
+Data from ForagerAgent, on the other hand, is large scale and included already during mid-training, helping shape internal representations ahead of post-training.
+CWM uses a dense, decoder-only Transformer architecture (Vaswani et al., 2017; Radford et al., 2018) with 32B parameters, interleaved sliding window attention supporting up to 131k tokens context size, and is trained over pre-, mid-, and post-training phases (see Figure 1). With quantization, inference with CWM can be performed on a single 80 GB NVIDIA H100.
+Beyond world modeling capabilities, CWM achieves strong performance on general and agentic coding and reasoning tasks relative to other open-weights models of comparable size: it reaches pass@1 scores of 65.8% on SWE-bench Verified (with test-time scaling; see Figure 2), 68.6% on LiveCodeBench-v5, 96.6% on Math-500, 76.0% on AIME 2024, and 94.3% on CruxEval Output.
+First and foremost, the release of
+CWM
+is meant to enable novel research on improving code generation with world modeling.
+We are excited by this vision and our report provides early supportive evidence: we give examples of how world models can benefit agentic coding, enable step-by-step simulation of Python code execution, and show early results of how reasoning can benefit from the latter.
+However, we believe the best is yet to come and hope to join forces with the open source research community to explore how world models can be used to leverage reasoning and planning to improve code generation.
+To this end, we release both the final weights and intermediate checkpoints under a noncommercial research license.
+Given CWM’s competitive performance, we conducted a preparedness assessment which concluded that CWM is unlikely to increase catastrophic risks beyond those present in the current model ecosystem. (Footnote 1: See ai.meta.com/research/publications/cwm-preparedness and Section 8 for details.)
+Figure 2: On SWE-bench Verified, CWM outperforms open-weight models with similar parameter counts and is even competitive with much larger or closed-weight LLMs.
+The base score for CWM is computed with a single attempt per instance (no retries, majority voting, or parallel candidates), averaged over multiple runs to reduce variance.
+For “Test Time Scaling”, we generate multiple candidates in parallel and then submit one patch based on ranking.
+The “Test Time Scaling” score for GPT-oss models is with the *high* reasoning budget, while the lower score is with the *low* reasoning budget.
+(\*: GPT-5 and GPT-oss use a custom subset of 477 problems, while CWM is evaluated on the full set of 500 problems.)
+2
+Code world model datasets
+CWM
+is trained on a large variety of datasets across pre-, mid-, and post-training phases. We focus strongly on code and code world modeling data across all stages of training. We highlight two large-scale data collection efforts that empower
+CWM
+’s world modeling capabilities: Python execution traces and ForagerAgent. We refer to
+Section
+˜
+4
+for more traditional ingredients in our datamixes.
+2.1
+Executable repository images: building repositories at scale
+Figure 3
+:
+CWM format for Python traces. Given a source code context and a marker of the trace starting point,
+CWM
+predicts a series of stack frames representing the Program states and the actions (executed code).
+A core prerequisite for capturing Python execution traces and agentic trajectories in real-world software engineering tasks is executing code in repositories at scale. For isolation and repeatability, we build these repositories as Docker containers, referred to as
+executable repository images
+. These images contain a preconfigured environment capable of running repository code and tests without additional setup.
+As manually building arbitrary GitHub repositories cannot scale to our desired dataset size, we apply both LLM- and CI-assisted methods.
+For the former, an LLM-backed agent, denoted as RepoAgent, was tasked with setting up the development environment of a target repository, finding test files, and ensuring that a significant number of them could run and pass. To support its efforts, we provide RepoAgent with human-readable documentation extracted from the target repository. Although this further improves RepoAgent’s success rates, human-targeted documentation can suffer from inaccuracies due to lack of verifiability and insufficient maintenance incentives. In contrast, machine-targeted instructions must remain accurate for successful builds, with platforms like GitHub immediately signaling failures.
+Therefore, we also developed the
+Activ
+(Act in virtual) pipeline to repurpose GitHub Actions CI execution for building executable repository images. This pipeline runs the workflows locally via the
+act
+(Lee,
+2019
+)
+library. Since many GitHub Actions workflows are not designed for third-party execution and not limited to CI builds, we modify the target repository’s source code and trigger an early exit after the completion of a single successful build.
+As all GitHub Actions workflow jobs run simultaneously in individual containers and the build state is transient, we add or modify the repository’s pytest configuration files, to inject a fixture that is automatically run at test-execution time. This fixture captures the build state of the container running unit tests. We then commit and push the resulting image from each repository, as further detailed in
+Section
+˜
+13
+.
+Running both RepoAgent and Activ methods in parallel, we created over 35k unique executable repository images.
+2.2
+Python tracing: neural code interpretation data
+The first type of
+CWM
+data we present is memory tracing of Python programs. This involves gathering executable functions or executable repository images, and running them using different IO pairs or CI tests, while capturing the state of the memory, chiefly the local variables, after each line is executed. This process enables us to align code and execution trace to simulate observation-action data within the computational environment. Prior work empirically shows this approach is beneficial in improving general code generation and understanding capabilities
+(Armengol-Estapé et al.,
+2025
+;
+Zhang et al.,
+)
+. Neural code interpretation further has the potential to go beyond traditional interpreters, with applications such as tracing through unexecutable code or combining it with reasoning capabilities.
+Next, we describe the different sources from which we gather execution trace data.
+Function-level tracing.
+We collect a dataset of Python functions from online sources and automatically generate input-output pairs with a combination of fuzzing and prompting Llama3-70B-Instruct. Our tracing process captures the state of the Python program (interpreter stack frames) at different intermediate execution points, corresponding to events of the Python interpreter (e.g., executed lines, return statements, exceptions). The final dataset contains over 120M traced Python functions.
+We post-process the raw traces to construct observation-action pairs. The observation contains local variables and stack frame metadata immediately prior to executing a line of code, the action is the specific Python line being executed, and the subsequent observation captures the resulting local variable states and additional event metadata such as return statements; we disregard global variables and external side effects. The variable values that do not change with respect to the previous step are summarized with an ellipsis. We prefix the trace data with the source code context.
+Figure 3 illustrates the CWM format for Python traces. Given a Python code context and a marker of the tracing initial point, the model follows with a series of Python stack frame predictions (in the form of a JSON-formatted dictionary with the local variables) and the corresponding actions (i.e., the part of the code that is being executed). The frame, action, and argument separators, as well as the trace context start indicator, are represented using custom tokens.
+We refer the reader to Section 11 for trace prediction examples and Section 17 for a specification of this trace representation format.
+We re-use both the tracing app and trace formatting for all other execution trace data described in the remainder of this section.
+CodeContests solutions tracing.
+We also generate tracing data for solutions to competitive programming problems. Concretely, we use Llama-3.1-70B-Instruct to generate Python solutions to training set problems in CodeContests (Li et al., 2022), reusing the framework of Gehring et al. (2025). Generations are filtered to ensure a balance of incorrect and correct submissions, leading to an overall count of 262k. We trace these solutions with inputs from the provided unit tests and filter out long traces with more than 10k line events or large traces taking up more than 1 MB disk space, leaving us with 33k effective code snippets and 70k traces.
+Repository-level tracing.
+We also performed Python execution tracing for the unit tests of more than 21k available and traceable repository images. For a subset of these repositories, we use the repository’s git log to randomly select additional commits prior to our built commit. Since the build environment is configured for the current commit’s dependencies, older commits may fail to execute. We attempted tracing for up to 40 historical commits per repository but capped successful traces at 4 commits per repository to avoid over-representation of any single repository. This process resulted in around 70k execution-traced commits.
+We post-process raw traces in two steps. First, we “episodify” our traces, extracting function-level traces from raw pytest traces with configurable stack depth and stochastic step-in probability. When stochastic step-in occurs, function calls are probabilistically included in their parent trace rather than a separate episode to simulate variable execution depth. In a second step, we then gather and compress the source code context from the target repository that is necessary for predicting the observation-action trajectory.
+To the resulting context-trace pair, we then apply the same
+CWM
+formatting as before.
+Natural language tracing.
+Lastly, we generate a dataset of step-by-step descriptions of Python code execution in
+natural language
+rather than our strict JSON-like format from before. Natural language explanations of code execution are closer in domain to other LLM tasks, which we hope will simplify knowledge transfer to other context such as reasoning in code generation. This less-structured format also has other advantages, such as allowing for injection of semantic context (e.g., “this operation preserves the structure property of the max heap”) or for compressing traces by dynamically skipping less interesting parts of the trajectory (e.g., repeated logic within a for loop). We generate this data by prompting Qwen3-32B-FP8 (without thinking)
+(Yang et al.,
+2025a
+)
+to re-write execution traces from our function-level and CodeContests trace datasets. After removing cases where the final output prediction from Qwen diverges from the ground truth trace, we obtain
+75
+M
+75\text{\,}\mathrm{M}
+trajectories from standalone Python functions and
+110
+k
+110\text{\,}\mathrm{k}
+from CodeContests data.
+2.3
+ForagerAgent: agentic midtraining data generation
+We mid-train
+CWM
+on a large-scale dataset of interactions between an LLM-based software engineering agent and a computational environment. This data is generated with our so-called ForagerAgent, which collects multi-step trajectories by prompting an LLM with a software engineering task to solve in the context of a particular code repository. Exposing
+CWM
+to such data at large scale early on should improve subsequent post-training in similar environments, as model predictions should already be grounded in environment dynamics.
+The actions available to the agent are derived from the standard SWE-Agent
+(Yang et al.,
+2024
+)
+toolset: (i) create a file, (ii) edit a file, (iii) run a bash command, and (iv) view or navigate inside a file. The trajectory is concluded once the LLM, either Llama3-70B-Instruct
+(Dubey et al.,
+2024
+)
+or Qwen3-235B-A22B (w/o thinking)
+(Yang et al.,
+2025a
+)
+, believes the task has been solved or the number of tokens, turns, or API costs exceed a hard limit. Like the repository-level tracing data, ForagerAgent relies on our set of executable repository images (see
+Section
+˜
+2.1
+) to seed problem generation. To avoid contamination, we filter out all repositories (and their forks) that are used in SWE-bench. The tasks presented to the model can be categorized into two groups: synthetic tasks and real-world tasks, which we call
+mutate-fix
+and
+issue-fix
+.
+Mutate-fix tasks
+. For mutate-fix tasks, we start with a working codebase and then synthetically introduce a bug for the agent to fix. We begin by identifying functions (and methods – omitted for brevity below) that can be verified using the repository test suite.
+As a first step, we filter these functions to the subset for which all unit tests pass successfully.
+We then consider the following set of mutations to synthetically introduce a bug into these functions:
+•
+Functions: remove either a portion of the function or the entire function.
+•
+Arguments: remove arguments from the function definition or randomly re-order function call arguments.
+•
+Variables: sample a pair of variables in the function and swap all their occurrences.
+•
+Statements: remove an import or return statement.
+•
+Operators: replace operators (binary, unary, or boolean) in statements in the function.
+We filter out mutations that cannot be applied for a given function by parsing the corresponding abstract syntax tree (AST). Lastly, we verify that applying the candidate mutation does in fact cause the associated unit tests to fail.
+We can now use the mutation as a starting point for agentic data collection: we instruct the agent to inspect the mutated function, run its unit tests, and resolve the failing tests by fixing the bug.
+Issue-fix tasks
+. For issue-fix tasks, we prompt the model to fix real issues in our set of repositories, using both issue and pull request data from GitHub.
+We check out commits preceding bug-fixing PRs and task the agent with resolving failing unit tests, providing the corresponding GitHub issue descriptions for context.
+We ensure unit tests are failing before the PRs and that their resolution is necessary and sufficient for addressing the issues.
+Post-processing.
+To avoid overfitting to repetitive interactions, we apply a near-deduplication of trajectories foraged from the same source repository: we first represent a trajectory by the concatenation of its actions, then encode the trajectory using MinHash, and lastly drop trajectories such that the pairwise Jaccard similarity for all encoded trajectories we keep is less than 0.5. Because our goal with the ForagerAgent data is to learn a comprehensive world model of agentic interactions with code environments, we do not filter trajectories based on whether they succeed at bug or issue resolution. Following the same motivation, we further train the model to predict both agent and environment turns, although we stochastically mask loss for 50% of observations as they exhibit limited diversity. Overall, we are left with 3M trajectories obtained from 10.2k images, and we refer to Table 1 for more detailed statistics.
+Table 1: Statistics of ForagerAgent trajectories. We gather 3M trajectories from 10.2k images and 3.15k underlying repositories. The trajectories are split 55–45 between issue- and mutate-fix tasks.
+
+| Repos | Images | Trajectories | Issues-Fix | Mutate-Fix: Functions | Mutate-Fix: Arguments | Mutate-Fix: Variables | Mutate-Fix: Statements | Mutate-Fix: Operators |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| 3.15k | 10.2k | 3M | 55% | 7% | 9% | 6% | 11% | 12% |
+
+3
+Examples of code world modeling
+Before introducing the
+CWM
+architecture and benchmark results more formally, in this section, we share a few example generations from the final model that illustrate our excitement for code world modeling.
+For competitive programming,
+Figure
+˜
+4
+shows an example where
+CWM
+first creates an initial solution after reasoning about the problem statement, then constructs input-output pairs to assess the correctness of its solution, before finally comparing its own prediction against the actual results of program execution.
+Although we did not train
+CWM
+for it directly, this is exactly the kind of reasoning about environment dynamics that we believe code world models can enable.
+We are excited about future work that explicitly integrates reasoning about environment feedback to improve agentic code generation.
+While
+CWM
+is trained to simulate the execution of Python code line-by-line, we believe the formal integration of trace prediction into natural language reasoning is interesting future work.
+Figure
+˜
+5
+presents an early prototype of this, showing
+CWM
+already generalizes to such behavior under light prompting.
+We believe that execution trace prediction enables
+grounded
+reasoning about code generation and execution, without requiring access to live execution environments.
+This can take explicit forms, such as in our prototype, but we also envision more implicit forms of regularizing reasoning through world modeling.
+We further plan to extend
+CWM
+’s trace prediction to create a “neural debugger” whose capabilities go beyond traditional debugging, such as jumping to future lines of code in constant time, predicting inputs to reach arbitrary states, or learning abstract representations of program states that support program verification, debugging, or generation.
+Relatedly, we are excited about more direct uses of Python execution trace prediction to improve code generation.
+Under appropriate prompting,
+CWM
+can be used to predict execution traces for functions that are not given directly in the context but are instead constrained only through natural language or formal asserts.
+After trace prediction, the model then constructs the function body from the generated actions, which are the executed Python statements.
+We show early examples of this in
+Figures
+˜
+11.26
+and
+11.27
+.
+This method of reasoning-via-tracing for program generation is reminiscent of work by
+Hoare (
+1971
+); Dijkstra (
+1976
+)
+on using logical reasoning about program execution to guide development rather than only using it to confirm correctness post-hoc.
+⬇
+Okay
+,
+I
+need
+to
+solve
+this
+problem
+where
+I
+have
+to
+check
+if
+a
+given
+string
+is
+evenly
+separated
+.
+The
+definition
+says
+that
+between
+every
+pair
+of
+the
+same
+characters
+,
+there
+must
+be
+an
+even
+number
+of
+characters
+.
+So
+,
+for
+example
+,
+in
+"
+abba
+",
+each
+’
+a
+’
+is
+at
+positions
+0
+and
+3.
+The
+distance
+between
+them
+is
+3-0
+=
+3,
+but
+the
+number
+of
+characters
+in
+between
+is
+3-0-1
+=
+2,
+which
+is
+even
+.
+So
+that
+works
+.
+But
+in
+"
+aabba
+",
+the
+first
+two
+’
+a
+’
+s
+are
+at
+positions
+0
+and
+1.
+The
+number
+of
+characters
+between
+them
+is
+0,
+which
+is
+even
+.
+Let
+me
+double
+check
+.
+But
+the
+example
+says
+it
+’
+s
+not
+correct
+.
+Hmm
+,
+wait
+maybe
+I
+’
+m
+misunderstanding
+the
+problem
+...
+⬇
+<
+tool
+:
+create
+>
+/
+code
+/
+solution
+.
+cpp
+#
+include
+<
+bits
+/
+stdc
+++.
+h
+>
+using
+namespace
+std
+;
+bool
+is_evenly_separated
+(
+const
+string
+&
+s
+)
+{
+unordered_map
+<
+char
+,
+int
+>
+prev_index
+;
+...
+</
+tool
+>
+⬇
+File
+created
+successfully
+at
+:
+/
+code
+/
+solution
+.
+cpp
+(35
+lines
+).
+⬇
+...
+Looking
+back
+,
+the
+code
+reads
+a
+string
+,
+passes
+it
+to
+the
+function
+.
+Let
+me
+create
+some
+test
+cases
+.
+Test
+1:
+’
+abba
+’
+->
+expected
+YES
+->
+the
+code
+should
+return
+YES
+.
+Test
+2:
+’
+aabba
+’
+->
+expected
+NO
+->
+because
+when
+it
+processes
+the
+third
+character
+,
+it
+’
+s
+the
+third
+occurrence
+of
+’
+a
+’.
+Wait
+for
+’
+aabba
+’:
+Indices
+0:
+a
+(
+occurs
+once
+)
+index1
+:
+a
+now
+(
+occurs
+twice
+.
+check
+0
+and
+1
+->
+difference
+is
+1
+->
+???
+But
+difference
+is
+-1
+->
+mod
+2
+is
+-1
+%
+Then
+loop
+reaches
+index
+4
+(
+a
+’
+s
+third
+occurrence
+),
+which
+code
+detects
+count
+for
+a
+which
+is
+already
+2
+->
+returns
+NO
+.
+So
+code
+should
+output
+NO
+here
+.
+Other
+test
+:
+’
+aaa
+’
+(
+NO
+when
+processing
+third
+a
+).
+test4
+:
+’
+abcd
+’
+->
+YES
+(
+each
+character
+appears
+once
+or
+twice
+in
+non
+-
+same
+parity
+)
+->
+code
+OK
+.
+ttest5
+:
+’
+aabb
+’
+->
+the
+two
+a
+’
+s
+(
+index
+0
+and1
+->
+difference
+1,
+ok
+).
+Two
+b
+’
+s
+(
+index
+2,3
+difference
+->
+1
+is
+ok
+).
+Code
+returns
+YES
+.
+Now
+let
+’
+s
+compile
+and
+run
+the
+code
+.
+⬇
+<
+tool
+:
+bash
+>
+g
+++
+-
+std
+=
+c
+++17
+-
+O2
+solution
+.
+cpp
+-
+o
+solution
+;
+echo
+-
+e
+"
+abba
+\
+n
+"
+|
+./
+solution
+echo
+-
+e
+"
+aabba
+\
+n
+"
+|
+./
+solution
+echo
+-
+e
+"
+aaa
+\
+n
+"
+|
+./
+solution
+echo
+-
+e
+"
+abcd
+\
+n
+"
+|
+./
+solution
+echo
+-
+e
+"
+aabb
+\
+n
+"
+|
+./
+solution
+</
+tool
+>
+⬇
+YES
+NO
+NO
+YES
+YES
+⬇
+All
+the
+test
+cases
+match
+the
+expected
+results
+.
+The
+code
+works
+correctly
+for
+these
+samples
+and
+edge
+cases
+.
+For
+very
+large
+inputs
+,
+the
+code
+has
+O
+(
+n
+)
+time
+and
+O
+(1)
+space
+(
+since
+there
+are
+26
+possible
+characters
+,
+hash
+maps
+would
+use
+at
+most
+26
+entries
+).
+Therefore
+,
+the
+solution
+is
+efficient
+and
+should
+pass
+all
+test
+cases
+,
+including
+those
+up
+to
+1
+e6
+characters
+long
+.
+The
+approach
+checks
+for
+each
+character
+occurrence
+and
+ensures
+that
+each
+pair
+of
+occurrences
+meet
+the
+parity
+requirement
+.
+Also
+,
+it
+immediately
+terminates
+upon
+finding
+a
+third
+occurrence
+or
+a
+pair
+with
+same
+parity
+indices
+,
+which
+is
+crucial
+for
+efficient
+processing
+.
+I
+’
+m
+confident
+that
+there
+are
+no
+bugs
+,
+and
+the
+solution
+meets
+the
+problem
+’
+s
+requirements
+.
+⬇
+The
+approach
+involves
+checking
+each
+character
+’
+s
+occurrences
+to
+ensure
+that
+no
+character
+appears
+more
+than
+twice
+,
+and
+any
+pair
+of
+occurrences
+of
+a
+character
+must
+be
+an
+odd
+number
+of
+positions
+apart
+(
+i
+.
+e
+.,
+their
+indices
+have
+different
+parities
+).
+‘‘‘
+cpp
+#
+include
+<
+bits
+/
+stdc
+++.
+h
+>
+using
+namespace
+std
+;
+bool
+is_evenly_separated
+(
+const
+string
+&
+s
+)
+{...
+Figure 4
+:
+Example of
+CWM
+solving a competitive programming problem in agentic fashion. The reasoning steps are shortened and some details are omitted due to space constraints. (Tool calls in purple, environment feedback in orange, and reasoning in blue.)
+⬇
+Given
+a
+python
+code
+function
+and
+an
+assert
+statement
+containing
+a
+specific
+input
+,
+provide
+the
+assertion
+with
+the
+exact
+literal
+output
+that
+the
+function
+returns
+with
+that
+input
+.
+Do
+not
+include
+any
+mathematical
+expressions
+or
+function
+calls
+--
+only
+the
+final
+literal
+value
+.
+Your
+response
+should
+be
+solely
+the
+assertion
+,
+enclosed
+within
+[
+ANSWER
+]
+and
+[/
+ANSWER
+]
+tags
+.
+You
+are
+a
+computational
+world
+model
+and
+can
+predict
+the
+program
+execution
+.
+Your
+execution
+trace
+prediction
+format
+MUST
+follow
+this
+structure
+:
+1.
+The
+execution
+trace
+prediction
+starts
+with
+the
+<|
+trace_context_start
+|>
+token
+and
+ends
+with
+a
+final
+<|
+frame_sep
+|>
+token
+.
+2.
+For
+each
+code
+execution
+step
+:
+-
+Begin
+with
+<|
+frame_sep
+|>
+followed
+by
+the
+event
+token
+which
+can
+be
+<|
+call_sep
+|>,
+<|
+line_sep
+|>,
+<|
+return_sep
+|>
+or
+<|
+exception_sep
+|>.
+-
+After
+<|
+call_sep
+|>
+or
+<|
+line_sep
+|>
+put
+the
+local
+variable
+states
+as
+dictionary
+in
+JSON
+format
+followed
+by
+the
+<|
+action_sep
+|>
+token
+and
+the
+current
+source
+code
+line
+.
+-
+After
+<|
+return_sep
+|>,
+<|
+exception_sep
+|>
+directly
+put
+the
+<|
+action_sep
+|>
+token
+and
+the
+current
+source
+code
+line
+followed
+by
+an
+<|
+arg_sep
+|>
+token
+and
+the
+return
+or
+exception
+arguments
+.
+3.
+Provide
+the
+full
+assertion
+with
+the
+correct
+output
+that
+you
+obtained
+after
+<|
+return_sep
+|>
+in
+[
+ANSWER
+]
+and
+[/
+ANSWER
+]
+tags
+Here
+is
+an
+example
+of
+how
+you
+would
+predict
+the
+output
+of
+the
+program
+using
+your
+trace
+prediction
+capability
+:
+Python
+function
+:
+def
+f
+(
+a
+,
+b
+):
+y
+=
+a
+for
+i
+in
+range
+(
+b
+):
+y
++=
+y
+*
+i
+return
+y
+assert
+f
+(1,3)
+==
+??
+<
+think
+>
+your
+internal
+reasoning
+</
+think
+>
+Let
+’
+s
+verify
+this
+by
+putting
+the
+code
+into
+a
+trace
+context
+and
+call
+the
+function
+in
+the
+main
+()
+function
+and
+then
+trace
+the
+execution
+of
+the
+main
+function
+.
+We
+indicate
+the
+entry
+point
+of
+the
+execution
+trace
+with
+a
+#
+<<
+START_OF_TRACE
+marker
+.
+def
+f
+(
+a
+,
+b
+):
+y
+=
+a
+for
+i
+in
+range
+(
+b
+):
+y
++=
+y
+*
+i
+return
+y
+def
+main
+():
+#
+<<
+START_OF_TRACE
+return
+f
+(1,3)
+<|
+frame_sep
+|><|
+call_sep
+|>{}<|
+action_sep
+|>
+def
+main
+():
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|><|
+line_sep
+|>{}<|
+action_sep
+|>
+return
+f
+(1,3)
+<|
+frame_sep
+|><|
+call_sep
+|>{"
+a
+":
+"1",
+"
+b
+":
+"3"}<|
+action_sep
+|>
+def
+f
+(
+a
+,
+b
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+".."}<|
+action_sep
+|>
+y
+=
+a
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+"..",
+"
+y
+":
+"1"}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+b
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+"..",
+"
+y
+":
+"..",
+"
+i
+":
+"0"}<|
+action_sep
+|>
+y
++=
+y
+*
+i
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+"..",
+"
+y
+":
+"..",
+"
+i
+":
+".."}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+b
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+"..",
+"
+y
+":
+"..",
+"
+i
+":
+"1"}<|
+action_sep
+|>
+y
++=
+y
+*
+i
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+"..",
+"
+y
+":
+"2",
+"
+i
+":
+".."}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+b
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+"..",
+"
+y
+":
+"..",
+"
+i
+":
+"2"}<|
+action_sep
+|>
+y
++=
+y
+*
+i
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+"..",
+"
+y
+":
+"6",
+"
+i
+":
+".."}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+b
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+"..",
+"
+y
+":
+"..",
+"
+i
+":
+".."}<|
+action_sep
+|>
+return
+y
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+y
+<|
+arg_sep
+|>"6"<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+f
+(1,3)
+<|
+arg_sep
+|>"6"<|
+frame_sep
+|>
+Now
+let
+us
+analyze
+the
+trace
+.
+The
+return
+argument
+of
+the
+function
+call
+f
+(1,3)
+in
+the
+main
+()
+function
+is
+"6"
+in
+JSON
+format
+,
+so
+the
+return
+value
+is
+6.
+[
+ANSWER
+]
+assert
+f
+(1,3)
+==
+6
+[/
+ANSWER
+]
+Python
+function
+:
+def
+f
+(
+d
+,
+k
+):
+new_d
+=
+{}
+for
+key
+,
+val
+in
+d
+.
+items
+():
+if
+key
+<
+k
+:
+new_d
+[
+key
+]
+=
+val
+return
+new_d
+assert
+f
+({1:
+2,
+2:
+4,
+3:
+3},
+3)
+==
+??
+⬇
+Let
+’
+s
+analyze
+the
+function
+call
+f
+({1:
+2,
+2:
+4,
+3:
+3},
+3).
+The
+function
+f
+takes
+a
+dictionary
+d
+and
+a
+key
+k
+.
+It
+creates
+a
+new
+dictionary
+new_d
+and
+iterates
+over
+the
+items
+in
+d
+.
+For
+each
+key
+-
+value
+pair
+,
+if
+the
+key
+is
+less
+than k
+,
+it
+adds
+that
+key
+-
+value
+pair
+to
+new_d
+.
+In
+this
+case
+,
+the
+dictionary
+is
+{1:
+2,
+2:
+4,
+3:
+3}
+and
+the
+key
+is
+3.
+We
+need
+to
+check
+each
+key
+in
+the
+dictionary
+to
+see
+if
+it
+’
+s
+less
+than
+3.
+-
+Key
+1:
+1
+<
+3
+is
+True
+,
+so
+we
+add
+1:
+2
+to
+new_d
+.
+-
+Key
+2:
+2
+<
+3
+is
+True
+,
+so
+we
+add
+2:
+4
+to
+new_d
+.
+-
+Key
+3:
+3
+<
+3
+is
+False
+,
+so
+we
+skip
+3:
+3.
+So
+,
+the
+resulting
+dictionary
+new_d
+should
+be
+{1:
+2,
+2:
+4}.
+</
+think
+>
+⬇
+Let
+’
+s
+verify
+this
+by
+putting
+the
+code
+into
+a
+trace
+context
+and
+call
+the
+function
+in
+the
+main
+()
+function
+and
+then
+trace
+the
+execution
+of
+the
+main
+function
+.
+We
+indicate
+the
+entry
+point
+of
+the
+execution
+trace
+with
+a
+#
+<<
+START_OF_TRACE
+marker
+.
+def
+f
+(
+d
+,
+k
+):
+new_d
+=
+{}
+for
+key
+,
+val
+in
+d
+.
+items
+():
+if
+key
+<
+k
+:
+new_d
+[
+key
+]
+=
+val
+return
+new_d
+def
+main
+():
+#
+<<
+START_OF_TRACE
+return
+f
+({1:
+2,
+2:
+4,
+3:
+3},
+3)
+<|
+frame_sep
+|><|
+call_sep
+|>{}<|
+action_sep
+|>
+def
+main
+():
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|><|
+line_sep
+|>{}<|
+action_sep
+|>
+return
+f
+({1:
+2,
+2:
+4,
+3:
+3},
+3)
+<|
+frame_sep
+|><|
+call_sep
+|>{"
+d
+":
+"{1:
+2,
+2:
+4,
+3:
+3}",
+"
+k
+":
+"3"}<|
+action_sep
+|>
+def
+f
+(
+d
+,
+k
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+".."}<|
+action_sep
+|>
+new_d
+=
+{}
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"{}"}<|
+action_sep
+|>
+for
+key
+,
+val
+in
+d
+.
+items
+():
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"..",
+"
+key
+":
+"1",
+"
+val
+":
+"2"}<|
+action_sep
+|>
+if
+key
+<
+k
+:
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"..",
+"
+key
+":
+"..",
+"
+val
+":
+".."}<|
+action_sep
+|>
+new_d
+[
+key
+]
+=
+val
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"{1:
+2}",
+"
+key
+":
+"..",
+"
+val
+":
+".."}<|
+action_sep
+|>
+for
+key
+,
+val
+in
+d
+.
+items
+():
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"..",
+"
+key
+":
+"2",
+"
+val
+":
+"4"}<|
+action_sep
+|>
+if
+key
+<
+k
+:
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"..",
+"
+key
+":
+"..",
+"
+val
+":
+".."}<|
+action_sep
+|>
+new_d
+[
+key
+]
+=
+val
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"{1:
+2,
+2:
+4}",
+"
+key
+":
+"..",
+"
+val
+":
+".."}<|
+action_sep
+|>
+for
+key
+,
+val
+in
+d
+.
+items
+():
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"..",
+"
+key
+":
+"3",
+"
+val
+":
+"3"}<|
+action_sep
+|>
+if
+key
+<
+k
+:
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"..",
+"
+key
+":
+"..",
+"
+val
+":
+".."}<|
+action_sep
+|>
+for
+key
+,
+val
+in
+d
+.
+items
+():
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+d
+":
+"..",
+"
+k
+":
+"..",
+"
+new_d
+":
+"..",
+"
+key
+":
+"..",
+"
+val
+":
+".."}<|
+action_sep
+|>
+return
+new_d
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+new_d
+<|
+arg_sep
+|>"{1:
+2,
+2:
+4}"<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+f
+({1:
+2,
+2:
+4,
+3:
+3},
+3)
+<|
+arg_sep
+|>"{1:
+2,
+2:
+4}"<|
+frame_sep
+|>
+Now
+let
+us
+analyze
+the
+trace
+.
+The
+return
+argument
+of
+the
+function
+call
+f
+({1:
+2,
+2:
+4,
+3:
+3},
+3)
+in
+the
+main
+()
+function
+is
+"{1:
+2,
+2:
+4}"
+in
+JSON
+format
+,
+so
+the
+return
+value
+is
+{1:
+2,
+2:
+4}.
+[
+ANSWER
+]
+assert
+f
+({1:
+2,
+2:
+4,
+3:
+3},
+3)
+==
+{1:
+2,
+2:
+4}
+[/
+ANSWER
+]
+Figure 5
+:
+Example with execution trace prediction and reasoning. In this example, we add an execution trace example to the prompt. After reasoning in natural language about the code, the model uses its execution trace prediction capability to confirm the correct return value
+{1: 2, 2: 4}
+. We encode all special tokens (e.g.
+<|frame_sep|>
+) as such.
+We refer to
+Section
+˜
+11
+for additional example trajectories from
+CWM
+, including self-correcting SWE behavior, a comparison of the advantages and disadvantages of strict trace prediction compared to natural language (and why we think their combination is worth exploring), further demonstrations of how we can adapt trace prediction to elicit code generation, and an example of using
+CWM
+to predict program termination.
+(a)
+CWM
+architecture: GQA with alternating local ($8\,\mathrm{k}$) and dynamic global ($131\,\mathrm{k}$) sliding window attention.
+(b)
+Overview of the
+CWM
+training and inference types for inputs and outputs.
+Figure 6
+:
+Figures illustrating the
+CWM
+Transformer architecture and the main types of data introduced in the different training steps and used at inference time.
+4
+CWM
+: architecture, pre-training, and scaling laws
+We next share details on the
+CWM
+architecture, final pre-training recipe, and scaling law experiments. Specific details about our efficient training infrastructure can be found in
+Section 6.
+4.1
+Architecture and hyper-parameters
+Model architecture.
+CWM
+is a $32$-billion-parameter dense decoder-only model.
+We choose a dense architecture over sparse alternatives for ease-of-use in downstream open source research.
+CWM
+uses an alternating pattern of local and global attention blocks interleaved in a $3\!:\!1$ ratio (see Figure 6(a)) with sliding window sizes of $8192$ and $131\,072$ tokens, respectively. Transformer blocks use Grouped-Query-Attention
+(Ainslie et al.,
+2023
+)
+with $48$ query heads and $8$ key-value heads. We use SwiGLU activation functions
+(Shazeer,
+2020
+)
+, RMSNorm
+(Zhang and Sennrich,
+2019
+)
+with pre-normalization, Rotary Positional Embedding (RoPE)
+(Su et al.,
+2021
+)
+, and we train with full document-causal masking. To support long-context modeling, we follow
+Roziere et al. (
+2023
+); Xiong et al. (
+2023
+)
+and apply Scaled RoPE with $\theta = 1\,\mathrm{M}$ and scale factor $16$ from mid-training onwards. We give a full overview of
+CWM
+parameters and architecture choices in
+Table
+˜
+2
+.
+Training hyper-parameters.
+We train the model with the AdamW optimizer
+(Loshchilov and Hutter,
+2019
+)
+with $\beta_{1}=0.9$, $\beta_{2}=0.95$, weight decay of $0.1$, and gradient clipping at norm $1.0$.
+After $2000$ steps of linear warmup, we use a cosine decay learning rate schedule with peak learning rate $8\times 10^{-4}$, decaying by a factor of $100\times$ over the training horizon. The cosine decay schedule is calculated for a total training duration of $13\,\mathrm{T}$ tokens, with the last $5\,\mathrm{T}$ tokens of the scheduler used during mid-training.
+Key hyper-parameters were determined using scaling laws, which we detail in
+Section
+˜
+4.3
+.
+Table 2
+:
+Key hyper‑parameters of the 32 B
+CWM
+.
+Parameter
+Value
+Number of parameters
+32
+B
+32\text{\,}\mathrm{B}
+Layers
+64
+64
+Hidden dimension
+6144
+6144
+Intermediate dimension
+21 504
+21\,504
+Number of attention heads / dimension
+48
+48
+/
+128
+128
+Number of key-value heads
+8
+8
+Local window size
+8192
+8192
+tokens
+Max global context
+131 072
+131\,072
+tokens
+Activation function
+SwiGLU
+Normalization
+RMSNorm (pre-norm)
+Positional Encoding
+Scaled RoPE (
+θ
+=
+10
+6
+\theta=10^{6}
+, scale factor
+=
+16
+=16
+)
+Vocabulary size
+128 256
+128\,256
+tokens
+Tokenizer.
+CWM
+uses the Llama 3 tokenizer
+(Dubey et al.,
+2024
+)
+which is a fast Byte-Pair Encoding tokenizer implemented with TikToken.
+(Footnote 2: See https://github.com/openai/tiktoken.)
+The vocabulary contains $128\,000$ regular tokens as well as $256$ reserved tokens.
+We keep the control tokens from Llama 3 and leverage unused reserved tokens to support our tracing and reasoning use cases.
+4.2
+Two-stage pre-training
+CWM
+pre-training consists of two stages sharing learning-rate scheduler and optimizer states but differing in datamix and maximum document lengths:
+1.
+General pre-training
+: We begin with an initial pre-training phase on $8\,\mathrm{T}$ tokens from a diverse range of mostly English sources, with an emphasis on coding data (making up about $30\,\%$ of the mix) as well as STEM and general knowledge. We pre-train our model with a global batch size of $8.4\,\mathrm{M}$ tokens and a context length of $8192$ tokens. (Footnote 3: Note that our “local” attention blocks are therefore effectively global during pre-training.)
+2.
+Code world model mid-training
+: We then mid-train the model for an additional $5\,\mathrm{T}$ tokens. We here depart from our more generalist pre-training datamix and introduce a number of datasets in support of our code world modeling objectives. We mid-train with a global batch size of $33\,\mathrm{M}$ tokens and maximum context length of $131\,\mathrm{k}$ tokens. (Footnote 4: We have observed lackluster performance when training on long-context data at smaller batch sizes. We speculate that increasing the batch size (in tokens) is beneficial for training on long-context data, as the decrease in the number of documents contained in each batch increases the variance in our gradient estimate.)
+Mid-training is the key stage for teaching code world modeling capabilities.
+Next, we discuss the changes we make to the pre-training recipe during mid-training to optimize
+CWM
+performance.
+Mid-training datamix.
+For mid-training, we introduce the ForagerAgent and Python execution tracing data, our main
+CWM
+datasets introduced in
+Section
+˜
+2
+, into the datamix. We additionally include code- and reasoning-related data such as datasets derived from GitHub pull requests similar to SWE-RL
+(Wei et al.,
+2025
+)
+, data from compiler intermediate representations
+(Cummins et al.,
+2024
+)
+, Triton PyTorch kernels similar to
+Paliskara and Saroufim (
+2025
+)
+, and formal mathematics in Lean covering statement and proof translation, as well as world modeling (see
+Section
+˜
+18
+).
+CWM
+-specific data makes up
+30
+%
+30\text{\,}\mathrm{\char 37\relax}
+of the overall mid-training datamix.
+We further increase the fraction of general code data to
+40
+%
+40\text{\,}\mathrm{\char 37\relax}
+and keep
+30
+%
+30\text{\,}\mathrm{\char 37\relax}
+for rehearsal of the initial pre-training datamix, as this proved essential in retaining performance on standard evaluations.
+Within the rehearsal fraction, we now upweight higher quality datasets such as those containing math or long context data, while making sure to avoid over-epoching.
+We summarize the types of data used across
+CWM
+training stages in
+Figure
+˜
+6(b)
+.
+Mid-training datamix ablations.
+For many of the datasets introduced during mid-training we can afford to train for multiple epochs. To determine the desired number of epochs per dataset, we perform a series of scaling law experiments
+(Kaplan et al.,
+2020
+)
+that
+simulate
+different levels of epoching
+(Dubey et al.,
+2024
+)
+. In agreement with the literature on this
+(Muennighoff et al.,
+2023
+)
+, we generally find that multi-epoch training improves downstream task performance, albeit at diminishing returns, before eventually leading to overfitting. By selecting target epochs such that metrics indicate little to no diminishing returns, we arrive at between
+1
+1
+and
+4
+4
+target epochs per dataset.
+The final proportion of a dataset in the mid-training mix is then calculated such that the desired number of epochs is reached at the end of mid-training.
+When estimating the number of steps per epoch, we account for both token packing (wrapping) for pre-training data and truncation for chat data.
+Long-context mid-training.
+A significant amount of the mid-training data is long-context, with about
+30
+%
+30\text{\,}\mathrm{\char 37\relax}
+of documents exceeding
+65
+k
+65\text{\,}\mathrm{k}
+tokens.
+This motivates our decision to increase the maximum sequence length to
+131
+k
+131\text{\,}\mathrm{k}
+tokens for all of mid-training.
+Consequently, we do not need a dedicated long-context finetuning phase common in many other recipes
+(Yang et al.,
+2025a
+; Agarwal et al.,
+2025
+; Dubey et al.,
+2024
+)
+. While our local-global pattern reduces the cost of long-context attention (see
+Section
+˜
+4.1
+), we still found that data-parallel workers with short-context documents would often wait for ranks with long-context data during distributed training. To improve iteration speeds, we “bucketize” all documents by sequence length, ensuring all workers draw documents from the same bucket at a given step. We choose the bucket boundaries as
+(
+0
+,
+16385
+]
+(0,16385]
+,
+(
+16385
+,
+65537
+]
+(16385,65537]
+, and
+(
+65537
+,
+∞
+)
+(65537,\infty)
+tokens and take care that the marginal probability of sampling a dataset is unchanged from bucketization. Note that, to achieve further speedups, we limit the maximum global attention size to
+32 768
+32\,768
+in the medium bucket.
+4.3
+Scaling laws
+Scaling laws for LLMs that predict model performance as a function of compute, data, and model size have been studied extensively
+(Kaplan et al.,
+2020
+; Hoffmann et al.,
+2022
+; Bi et al.,
+2024
+)
+. These empirical laws enable the estimation of the expected loss for a given compute budget, the identification of the optimal scaling strategy between model and data size, and an informed selection of training hyper-parameters. Following
+Bi et al. (
+2024
+)
+, we develop scaling laws for optimal hyper-parameter prediction for the pre-training of
+CWM
+.
+We adopt the compute budget formula
+$C=M\cdot D$, where $M$ is the model size represented as the number of non-embedding FLOP per token and $D$
+is the data scale corresponding to the total number of training tokens. For a decoder-only Transformer, the number of FLOP per token is approximated by
+$$M=\underbrace{6N_{\text{ne}}}_{\text{linear term}}+\underbrace{6dLS}_{\text{attention term}},\tag{1}$$
+where $N_{\text{ne}}$ is the number of parameters excluding embeddings, $d$ is the model hidden dimension, $S$ is the sequence length, and $L$
+is the number of layers. This formula explicitly accounts for the computational cost of self-attention, which constitutes a significant portion of the total compute, especially for smaller models and longer contexts where attention overhead is relatively more pronounced, as discussed in
+Bi et al. (
+2024
+)
+. We refer to
+Section
+˜
+14
+for further detail.
+Recent LLMs are trained beyond data-optimal regimes
+(Dubey et al.,
+2024
+)
+to optimize inference costs and produce smaller yet capable models.
+Gadre et al. (
+2024
+)
+show that models scale predictably for a fixed model-to-data ratio and advocate for scaling laws that mirror the setting of the final pretraining run. Therefore, we maintain a fixed model-to-data ratio of
+$D/M=40$ across compute budgets, matching the target ratio of our $32$B parameter model pre-trained on $8\,\mathrm{T}$ tokens. This ratio is roughly $8$ times more data than would be compute optimal according to the Chinchilla paper
+(Hoffmann et al.,
+2022
+)
+.
+We conduct a quasi-random search over batch size and learning rate across eight increasing compute scales, ranging from
+$2\times 10^{18}$ to $2\times 10^{20}$
+FLOP. For each scale, we keep the configurations within 1% of the best validation loss and fit the batch size
+$BS$ and learning rate $LR$ with respect to the compute budget $C$. Consistent with prior work, $BS$ grows and $LR$ declines gradually with $C$
+, while near-optimal hyper-parameters span a broad range. However, likely due to our different pre-training data, our equations for learning rate and batch size diverge from
+Bi et al. (
+2024
+)
+:
+$$\begin{aligned}LR(C)&=19.29\cdot C^{-0.177},\\ BS(C)&=30.17\cdot C^{0.231}.\end{aligned}\tag{2}$$
+See
+Section
+˜
+14
+for additional details.
+5
+Post-training: SFT, RL algorithms and environments
+Our post-training phase improves
+CWM
+’s ability to solve complex programming-related problems with reasoning, building on the internal code world model learned during earlier training stages. Concretely, we first perform supervised finetuning (SFT) to improve both reasoning and general instruction-following capabilities. We then carry out large-scale multi-task multi-turn reinforcement learning on coding contests, math questions, and software engineering environments. We describe the SFT stage, our RL algorithms, data and environments, and detail our joint RL training recipe. As we do not intend to develop a general-purpose chatbot, we deliberately omit an RLHF stage.
+5.1
+SFT
+We perform SFT for
+100
+B
+100\text{\,}\mathrm{B}
+tokens, distributed across
+50
+k
+50\text{\,}\mathrm{k}
+steps with a global batch size of
+2
+M
+2\text{\,}\mathrm{M}
+tokens and
+32
+k
+32\text{\,}\mathrm{k}
+token sequence lengths.
+We share optimization hyperparameters with pre-training but change the learning rate schedule to
+1
+k
+1\text{\,}\mathrm{k}
+steps of linear warmup followed by a constant learning rate of
+1
+×
+10
+−
+5
+1\text{\times}{10}^{-5}
+.
+In preliminary experiments, keeping a constant learning rate achieved similar evaluation metrics to annealing with cosine schedules while enabling high learning rate training during RL.
+We further observed a performance decrease when SFT-ing at longer sequence lengths.
+We suspect this is due to the configuration of our dataloader which always sequence-packs inputs (per data-parallel rank and local batch) from a single dataset. For very small datasets and large context sizes, this reduces the number of unique steps in which such datasets can be observed, which may negatively affect performance.
+Datamix.
+We train on a diverse mix of internal and open-access data during SFT, including standard instruction-following datasets.
+About
+30
+%
+30\text{\,}\mathrm{\char 37\relax}
+of the datamix is rehearsal from mid-training (which itself includes
+30
+%
+30\text{\,}\mathrm{\char 37\relax}
+pre-training data). This is to avoid overfitting to the SFT distribution ahead of RL and retain
+CWM
+capabilities taught in mid-training.
+Our datamix also contains agentic SWE RL trajectories (see
+Section
+˜
+5.3.1
+), some of which have been rejection-sampled from earlier iterations of the
+CWM
+itself. We have generally found it useful to iteratively improve the starting point for RL by including trajectories from earlier iterations in the next SFT. Similarly, we include external datasets with reasoning traces, as we have found the performance benefit from them carries through to our final post-RL model. Specifically, we use the OpenMathReasoning
+(Moshkov et al.,
+2025
+)
+and OpenCodeReasoning
+(Ahmad et al.,
+2025
+)
+datasets that rely on DeepSeek-R1
+(Guo et al.,
+2025
+)
+.
+Reasoning tokens.
+For SFT training on reasoning data, we introduce
+<|reasoning_thinking_start|>
+and
+<|reasoning_thinking_end|>
+tokens that surround any reasoning text. Because we mask the loss on all
+<|reasoning_thinking_start|>
+tokens, the model does not learn to generate them.
+This enables both reasoning and non-reasoning behavior for the
+CWM
+-SFT model: non-reasoning mode is active by default and reasoning mode can be activated by injecting
+<|reasoning_thinking_start|>
+into the beginning of assistant responses.
+Note that we discontinue the use of these reasoning tokens during RL as explained in
+Section
+˜
+5.3
+.
+5.2
+RL algorithm
+We use a variant of Group Relative Policy Optimization (GRPO) to train
+CWM
+(Shao et al.,
+2024
+)
+.
+GRPO is a policy gradient method that uses the PPO loss
+(Schulman et al.,
+2017
+)
+in combination with Monte Carlo value estimation instead of a value model, as used in PPO. Many works have proposed improvements to and fixes of GRPO
+(Yu et al.,
+2025
+; Liu et al.,
+2025
+; Hu et al.,
+2025
+; Mistral-AI et al.,
+2025
+)
+.
+We incorporate a number of these and include further changes to support multi-turn RL and efficient asynchronous RL. A formal description of our RL algorithm can be found in
+Section
+˜
+12
+.
+Differences from GRPO.
+We deviate from the original GRPO algorithm in the following ways:
+•
+Multi-turn:
+GRPO was originally developed for single-turn (prompt $\rightarrow$ response) environments. Instead, we use a multi-turn variant where the sequence contains both model- and environment-generated tokens after the prompt, resulting in the need for masking via $M_{i,t}$. Furthermore, whereas GRPO used the reward $r_{i}$, we use the return $R_{i}$ (sum of rewards) in the advantage calculation.
+•
+Asynchronous:
+Whereas GRPO uses a synchronous setup, where nodes switch between generating batches of completions and training on them, we use asynchronous RL, resulting in much higher throughput.
+•
+No $\sigma$ normalization:
+GRPO calculates the advantage as the centered and scaled terminal reward $\hat{A}_{i}=(r_{i}-\mu)/\sigma$, where $\mu$ and $\sigma$ are the mean and standard deviation of rewards in the batch. This introduces a difficulty bias (Liu et al., 2025), which we avoid by using the more conventional $\hat{A}_{i}=(R_{i}-\mu)$.
+•
+No length normalization:
+As noted by
+Liu et al. (
+2025
+)
+, dividing the loss by the trajectory length as done in GRPO leads to a length bias, whereby the agent is incentivized to increase the length on hard problems so as to lower the average loss. To avoid this bias, we divide by the maximum number of tokens in a trajectory, which matches the maximum context size of our model
+$N=131072$
+.
+•
+Batching strategy:
+We batch by a maximum token limit instead of the common approach that keeps a fixed number of trajectories per batch. This change aims to improve efficiency and stabilize training by lowering the variance in batch size between different optimization steps – this is important in combination with our removal of length normalization. As a result, different trajectories that belong to the same group might contribute to separate optimization steps. The proportion of groups that get split can be decreased by increasing the maximum token limit or the number of gradient accumulation steps, but in practice we observe this not to be an issue even when more than half of the groups are split.
+•
+Clip-higher:
+Following Yu et al. (2025), we use a higher upper clip value, $\varepsilon_{\text{high}}=0.25$ and $\varepsilon_{\text{low}}=0.2$, to prevent entropy collapse.
+•
+No KL:
+Using clip-higher to prevent entropy collapse, we found it unnecessary to use KL regularization. (Footnote 5: We recommend using the k2 estimator (Schulman, 2020), whose gradient is an unbiased estimator of the forward $\operatorname{KL}(\pi_{\theta},\pi_{\text{old}})$, rather than the k3 estimator used by GRPO, whose gradient is an unbiased estimator of the reverse $\operatorname{KL}(\pi_{\text{old}},\pi_{\theta})$.)
+•
+Skip zero-advantage trajectories:
+The
+effective
+batch size is the number of tokens that do not have zero advantage and contribute to the gradient. We reduce variance in the effective batch size by skipping all zero-advantage trajectories.
+•
+Skip stale trajectories:
+To limit the degree of off-policyness, we skip trajectories whose most recent tokens were generated from a policy more than
+100
+100
+training steps behind the current policy.
+•
+Weighted mean return:
+We found that longer trajectories are more likely to fail
+(Hassid et al.,
+2025
+)
+, leading to the majority of
+tokens
+having a negative advantage. To avoid biasing the token-averaged return, we compute
+μ
+\mu
+as a length-weighted average.
+•
+Gibberish detection:
+While gibberish typically leads to lower rewards and naturally decreases at the beginning of RL, it can increase later when some successful gibberish trajectories get reinforced, especially for agentic SWE RL. So we explicitly reject any trajectory containing any token
+$y_{t}$ that is both rare and generated with low probability: $\operatorname{id}(y_{t})>100\,000$ and $\operatorname{logprob}(y_{t})<-\log(128\,256)-2$, where $128\,256$ is the vocabulary size and the thresholds are tuned for high precision. Gibberish typically consists of a window of tokens generated at low probability. BPE tokens are sorted by merge order, so a large $\operatorname{id}$ corresponds to a rare token. Generating such tokens suggests that the model is generating at high entropy and over-weighting rare tokens. This method stopped any increase in gibberish generation and performed better than detectors based on logprob and position alone.
+5.3
+RL environments & data
+We consider four types of RL tasks: Agentic software engineering (SWE) (
+Section
+˜
+5.3.1
+), Coding (
+Section
+˜
+5.3.2
+), Agentic coding (
+Section
+˜
+5.3.3
+), and Mathematics (
+Section
+˜
+5.3.4
+).
+Each RL task is defined by a dataset (containing prompts, a verification suite like unit tests, and additional metadata) and an environment that the agent interacts with.
+We integrate these tasks into a joint RL training phase which we detail in
+Section
+˜
+5.4
+.
+We further refer to
+Section
+˜
+6.2
+for implementation details regarding our environments and RL training infrastructure.
+Our environments constitute partially observable Markov decision processes: a language model is employed as an agent, producing actions based on the preceding sequence of action-observation pairs and an initial prompt. All environments for training
+CWM
+utilize software-based verification of outcomes, producing a single terminal reward signal per rollout. In the remainder of this section, we describe the specification of environment, i.e., prompt and observation design, reward function, and the corresponding datasets.
+During reasoning RL, we discontinue the use of SFT reasoning tokens and replace them with clear-text
+<think>
+</think>
+tags.
+Early RL experiments on top of the SFT model showed long initial reasoning traces and slow improvements. We attribute this to our SFT reasoning data, which enhances reasoning performance but limits exploration during RL training. Switching out reasoning tags resulted in shorter responses, higher starting entropies, and significantly improved final performance. This suggests our approach leads to a best-of-both worlds scenario: the model’s familiarity with reasoning responses from SFT enables rapid improvements early on in RL, while the introduction of the new tokens allows the model to develop its own reasoning that is guided – but not restricted – by the SFT data.
+5.3.1
+Agentic SWE
+Figure 7
+:
+SWE RL design. An agent solves software engineering tasks end-to-end through long-horizon agent-environment interactions via reasoning and tool use (up to 128 turns and $131\,\mathrm{k}$ context size). SWE RL employs a minimal toolset:
+bash
+as the core, with
+edit
+,
+create
+, and
+submit
+as lightweight
+bash
+plugins. The reward combines hidden test outcomes with patch similarity, where the similarity reward is applied when tests fail to provide auxiliary learning signals.
+Design.
+Agentic SWE RL substantially improves our model’s software reasoning and engineering capability (e.g., on SWE-bench Verified
+(Jimenez et al.,
+2024
+)
+) by enhancing the model’s agentic reasoning and tool-use skills (see
+Section
+˜
+20
+for the capability evolution during RL training). Its philosophy is to remain simple yet general: an LLM agent tackles a task end-to-end through reasoning and tool execution, without relying on task-specific post-processing.
+The same design is applied to the agentic coding environment (see
+Section
+˜
+5.3.3
+). Each SWE RL trajectory has a single human user turn (besides the system prompt) containing the issue description and multiple turns of agent-environment interactions.
+During training, we allow long-horizon interaction, with a maximum of $128$ turns over a context window of $131\,\mathrm{k}$ tokens.
+As shown in
+Figure
+˜
+7
+, the agent is equipped with four tools to solve a given task (e.g., software issue), where we embrace a minimal tool design centered on bash and editing, inspired by Sonnet 3.5
+(Anthropic,
+2025
+)
+:
+•
+bash
+: executing commands in a stateful shell session,
+•
+edit
+: modifying an existing file using the search/replace format used by Agentless
+(Xia et al.,
+2024
+)
+and Aider
+(Aider Team,
+2025
+)
+,
+•
+create
+: creating a new file in the sandbox, and
+•
+submit
+: marking something (e.g., a file path) as the final submission according to the task requirement.
+The runtime implementation of the tools follows SWE-agent
+(Yang et al.,
+2024
+)
+and OpenHands
+(Wang et al.,
+2025
+)
+, where
+bash
+is a stateful shell session running in a persistent server process, and serves as the main component, while other customized tools are treated as plugins that can be “de-sugared” into simple bash commands. For example, the
+edit
+and
+create
+tools are two standalone Python scripts, and the
+submit
+tool, when used for file paths, reduces to
+cat <path>
+to retrieve the file content.
+Figure 8
+:
+SWE RL interaction example.
+The agent interacts extensively with the repository sandbox through reasoning, exploration, editing, and test execution, submitting a final patch using
+git diff
+along with a summary.
+The user prompt includes custom instructions for resolving software issues.
+For example, the prompt shown in
+Figure
+˜
+8
+asks the agent to “…thoroughly explore the codebase, think hard, and leverage significant execution to verify correctness by writing comprehensive tests to validate your solution…”, which it follows in its subsequent actions. Notable differences from prior designs are that (1) our agent must generate the complete end-to-end patch directly via
+git diff
+rather than relying on task-specific post-processing, and (2) it must also produce a summary explaining how it resolves the issue, to improve clarity and usability. We also retain all reasoning turns for logical coherence.
+Reward.
+We adopt a hybrid reward for SWE RL. When all the hidden tests pass, the reward is $1$. If not, we adopt the patch similarity reward used in the SWE-RL paper
+(Wei et al.,
+2025
+)
+. Unlike the SWE-RL paper, which uses a continuous reward value, we apply a discrete and threshold-based design to improve the training stability by avoiding rewarding low-similarity patches.
+In detail, when the computed similarity is above the threshold of $0.5$, the reward is $0$; otherwise, the reward is $-1$.
+This reward shaping showed benefits in early ablation, because a higher patch similarity incentivizes the model to localize the actual bugs more precisely and to produce a closer fix to the oracle patch.
+This also helps the model to gain more learning signals from difficult issues for which it cannot produce any test-passing patch.
+Data self-bootstrapping.
+In SWE RL,
+CWM
+is required to solve software issues as a
+reasoning agent
+(i.e., through both reasoning and tool use).
+However, such data is not publicly available, and the format is completely new to the model before RL and different from our ForagerAgent data, so early iterations of our model struggled to interact with the software environment across long horizons without making format errors. To address this, we perform an iterative self-bootstrapping process to collect high-quality agentic reasoning traces and supply them back to the joint SFT stage so that
+CWM
+can have a better prior distribution before RL. This process not only helps with format adherence but also significantly improves our model’s software engineering capability both before and after RL.
+Figure 9
+:
+SWE RL self-bootstrapping. Starting from a pre-RL checkpoint, we iteratively perform RL, rejection-sample high-quality reasoning traces, and feed them back into SFT. This process improves data quality and format adherence across iterations, raising success rates, and providing stronger initialization for joint RL.
+As shown in
+Figure
+˜
+9
+, we start from a pre-RL
+CWM
+checkpoint (not the final
+CWM
+SFT) that has not been SFT-ed on any SWE trajectories in the reasoning agent format.
+Over three main iterations, we perform RL and use the RL-ed model (i.e., the SWE expert in the figure) to do rejection sampling. We then select high-quality traces from the rejection samples using custom heuristics (e.g., long trajectories that pass all hidden tests without any tool use errors). Next, we perform SFT with this data on top of the original model.
+This filtering helps reduce biases that RL fails to eliminate, such as the tendency to make editing mistakes. Then, iteratively, we start RL with the new SFT-ed model and collect higher-quality traces for the next round. Eventually, we include the final set of the traces into the joint SFT mix to prepare for the final joint RL.
+This results in the final
+CWM
+SFT model.
+For each iteration, we redo SFT on the original midtraining checkpoint and discard old trajectories.
+Importantly, we find that the bootstrapped data greatly improve the performance of the SFT checkpoint on SWE-bench Verified. Without SWE RL traces, the SFT model hardly resolves any issues due to format errors. With more iterations of bootstrapping, the data quality improves significantly, and the success rate increases from $30\%$, to $37\%$, and to $43\%$ pass@1 over SWE-bench Verified. During earlier iterations, we record the offline pass rate for each instance and use it as the GRPO baseline in later iterations. This lets us set the group size to 1 and speed up each epoch. We find this technique leads to faster SWE RL training. In the final joint RL, we still perform online estimation of the GRPO baseline for consistency with other environments and for a higher performance ceiling.
+Data sourcing and filtering.
+We reuse the executable repository images from our mid-training data generation efforts (
+Section
+˜
+2.1
+). Since issue solving requires additional metadata (e.g., issue text, base commit hashes, and diff patches), we join these repositories with publicly available issue and pull request metadata to create repository-issue pairs. The
+git log
+history enables us to create one-to-many repository-issue pairs. We also include publicly available training data such as SWE-Gym
+(Pan et al.,
+2025
+)
+and R2E-Gym
+(Jain et al.,
+2025b
+)
+, further filtered by us for quality (e.g., removing non-verifiable instances whose tests cannot pass). All training data are decontaminated against SWE-bench Verified at repository-level granularity, see
+Section
+˜
+15
+.
+We estimate the difficulty of each instance using the pass@1 score from CWM SFT, calculated over at least $32$ samples. Instances with a pass@1 above $95\%$ are filtered out as easy, while those with a non-zero pass@1 are included in the primary dataset. Instances with a $0\%$ pass@1 are placed in a secondary dataset that is sampled less frequently at the beginning. To make these hard problems solvable, we augment their prompts by adding the hidden test as a hint. This augmentation increases the pass@1 rate from $0\%$ to approximately $30\%$. Later in training, we remove hints from hard instances so the model learns to solve them from scratch. Finally, this process yields $12.6\,\mathrm{k}$ unique training instances: $6.9\,\mathrm{k}$ in the primary set and $5.7\,\mathrm{k}$ in the secondary.
+5.3.2
+Coding
+Figure 10
+:
+Prompt template for math (left path) and competitive programming (right path) RL tasks.
+Design.
+RL for competitive programming aims to teach the model to write correct programs for challenging tasks and to reason about code and algorithms. Our competitive programming environment presents the problem to the agent in the first turn and optionally allows follow-up attempts, during which the environment provides execution feedback. It supports multiple programming languages and provides detailed feedback on syntax errors, timeouts, and incorrect test outputs. The environment terminates either when the maximum number of turns is reached or when the agent produces a correct solution. In the joint RL run, we limit the number of attempts to one but allow up to $64\,\mathrm{K}$ tokens in responses to enable extensive reasoning.
+We adopt a lightweight prompt template which is shown in
+Figure
+˜
+10
+. The system prompt asks for reasoning delimited by
+<think>...</think>
+in clear-text. The user prompt specifies the programming language and instructs the agent to put the code solution inside a markdown block.
+Reward.
+We assign a reward of $-1$ for incorrect trajectories and $1$ for correct ones. A trajectory is correct if it meets all of the following criteria:
+•
+Contains exactly one
+</think>
+tag, signaling successful reasoning completion.
+•
+Contains exactly one markdown block in the model’s generated answer.
+•
+The code solution passes all unit tests within the specified time and memory limits. We execute the unit tests in parallel using an internal code execution service on remote machines.
+Data sourcing and filtering.
+We source coding problems from various programming contest websites. A problem typically consists of a problem description, limitations on the input and output domains, memory and time limits, and input-output examples. In addition, each problem comes with a set of tests that we use to verify the correctness of candidate solutions: a solution is considered correct only if it produces the expected output for every given test input.
+We decontaminate the coding problems against test benchmarks and de-duplicate them to ensure that each training problem is unique. In both cases, we use MinHash-based similarity detection (footnote 6: https://github.com/serega/gaoya), applying word- or character-based matching depending on the length of each document. This process ensures the integrity of our evaluations, which is especially important for code generation and mathematical problems, where data contamination can significantly affect performance metrics.
+We use Llama-3.3-70B-Instruct to identify and remove poorly posed problems, such as those containing gibberish, missing or truncated problem statements, or lacking input/output descriptions. We do not apply any difficulty-based filtering. After decontamination (see Section 15), the final code RL dataset has $81\,\mathrm{k}$ prompts.
+5.3.3
+Agentic coding
+Figure 11
+:
+Example interaction for the agentic coding RL environment. The agent uses reasoning and tools to solve competitive programming problems. Before generating a final solution, the agent summarizes the interaction.
+The agentic coding environment combines the reasoning and tool use features of the SWE RL environment (
+Section
+˜
+5.3.1
+) with the competitive programming setup described in
+Section
+˜
+5.3.2
+. The user prompt is customized for solving competitive programming tasks and explicitly asks the agent to write and run tests to check and improve the solution, as illustrated in
+Figure
+˜
+11
+. Different from SWE RL, there is no
+submit
+tool in this environment. Instead, the agent needs to provide the solution in its final response, which is then extracted for evaluation. We consider two programming languages, Python and C++, where the images used for agentic interaction are
+python:3.11-slim
+for Python and
+python:3.11-bookworm
+(with
+gcc 12
+support) for C++.
+The final solution is then evaluated using the same execution infrastructure as for the competitive programming environment.
+5.3.4
+Mathematics
+Design.
+Although not the main focus of this research work, we consider mathematical reasoning as another RL task to further strengthen and generalize CWM’s reasoning capabilities. We restrict these problems to questions that have definitive and easy-to-verify answers. Both the questions and answers are formulated in LaTeX, similar to much of the math content found on the web.
+We also include a tool-enabled version of the math environment, adhering to the format described in Section 17. Here, the agent may invoke the Python interpreter with custom code. Standard output and error contents will form the next observation and the agent is prompted to continue solving the task. We impose a limit of $4$ tool calls per episode and a $10\,\mathrm{s}$ timeout per call.
+The prompt template shares its system prompt with the coding environment but differs in the user prompt. As shown in Figure 10, the user prompt instructs the agent to place the final answer inside a LaTeX box.
+Reward.
+Every trajectory is classified as either correct (reward = $1$) or incorrect (reward = $-1$). Correctness is defined as:
+•
+Exactly one
+</think>
+tag, signaling successful reasoning completion.
+•
+Exactly one
+$\boxed{}$
+for the predicted answer.
+•
+Our verifier emits
+True
+for the comparison between the predicted answer and the ground-truth answer.
+Since there is no general normal form for mathematical expressions, the verifier checks whether the predicted answer is equivalent to the ground truth answer using a variety of heuristics detailed in
+Section
+˜
+16
+.
+Data sourcing and filtering.
+We gather math questions and answers from publicly available sources. To remove duplicates, we use the MinHash LSH algorithm to identify similar problems and verify that they have the same answer using our verifier. We also filter out problems that were solved correctly in all attempts ($32$ out of $32$) by our SFT model in non-reasoning mode. This helps avoid wasting compute on easy problems and reduces the risk of reinforcing incorrect reasoning followed by a correctly memorized answer. The prompt set used for RL training contains a total of $278\,\mathrm{k}$ problem-answer pairs.
+5.4
+Joint RL
+Figure 12
+:
+Async RL systems overview. Worker nodes generate trajectory batches from multiple RL environments and send them to trainer nodes via a transfer queue. Trainer nodes form training batches either from worker-provided data or the rehearsal mix, packing trajectories up to the maximum context length for a single gradient update. Environment execution and verification can occur locally on worker nodes or remotely on another cluster or in the cloud.
+Finally, we train CWM using all of the above-mentioned RL tasks. As shown in Figure 12, joint RL uses our asynchronous RL infrastructure: worker nodes generate $G$ trajectories per prompt from multiple RL environments and send them to trainer nodes through the data transfer queue. The trainer nodes then form training batches either from these worker-provided batches or directly from the SFT datamix (Section 5.1) for rehearsal. We refer to Section 6.2 for further detail on engineering aspects of our asynchronous RL infrastructure.
+Data and RL environment mix.
+Worker nodes produce trajectories from three main data sources: software engineering, competitive programming, and mathematics. They use the four RL environments we describe in the previous sections, which we refer to as agentic SWE (
+Section
+˜
+5.3.1
+), code (
+Section
+˜
+5.3.2
+), agentic coding (
+Section
+˜
+5.3.3
+), and
+math (
+Section
+˜
+5.3.4
+). Each data source may contain multiple datasets from different origins; however, all datasets within the same data source share a consistent format and problem domain. We sample $40\%$ of tasks from software engineering, $40\%$ from competitive programming, and $20\%$ from mathematics. Rehearsal batches constitute $1/3$ of the training data and are integrated with a standard negative log likelihood loss, scaled by a factor of $0.1$ to match the gradient magnitudes obtained with GRPO (Section 5.2).
+Three-stage training.
+We split our joint RL training into three distinct stages. Between stages, we adapt the task distribution and employ custom reward shaping techniques.
+•
+Stage 1 – Reasoning format bootstrapping:
+In the initial training stage, we soft-control the length of generations in math and coding tasks with an action length reward schedule.
+For the $40\%$ of tasks related to competitive programming, we evenly sample from four environments: code Python, code C++, agentic coding Python, and agentic coding C++ (10% each).
+For a subset of SWE tasks identified as challenging, we include a hint in the prompts and downsample their occurrence (4% of overall tasks; Section 5.3.1).
+•
+Stage 2 – Increasing task diversity and data resampling:
+After $14\,125$ gradient steps, we increase the proportion of competitive programming tasks in the datamix to $50\%$ and reduce the fraction of SWE tasks to $30\%$.
+We also include additional environment variations for each task.
+Specifically, we add Rust, Go, Java, and JavaScript versions of the code environment, which, together with Python and C++, now constitute $25\%$ of the datamix. The other half of the competitive programming tasks use the agentic coding environment, to which we do not add new languages.
+For the SWE data, we disable plugins with a $50\%$ chance, such that file edits require standard terminal commands.
+We further remove hints from the challenging SWE subset and oversample it in a $4:1$ ratio when plugins are used, and reverse this ratio when plugins are disabled.
+Competitive programming and SWE datasources are filtered to include only instances with a solve rate in $[0.1, 0.7]$ in order to maximize the learning signal.
+For math tasks, we enable Python tool calling for $2\%$ of the total datamix.
+At $16\,500$ steps, we apply filtering based on solve rate with the $[0.1, 0.7]$ interval to our math dataset as well.
+For SWE data, we create fine-grained subsets for each 0.1 solve-rate interval from $(0.0, 0.7]$, such as $(0.1, 0.2]$ and $(0.6, 0.7]$, and sample harder examples more frequently using weights inversely proportional to the interval’s midpoint.
+Hyperparameters.
+After an initial linear warmup over 100 steps, we employ a learning rate of $2.5 \times 10^{-7}$ throughout training.
+The maximum batch size for each gradient step is $8.4\,\mathrm{M}$ tokens during the first stage and $16.8\,\mathrm{M}$ tokens for the second stage.
+Gradients are clipped to norm 0.1. We use $G=8$ rollouts per data point, and new model weights are broadcast to workers after 4 gradient steps. We list further GRPO-specific hyperparameters in Section 5.2.
+Length reward scheduling.
+In both the code and mathematics environments, we allow context lengths of up to $64\,\mathrm{k}$. We observe that, at the start of RL training, the model rapidly increases its response length, leading to inefficient token usage. To address this, we penalize the reward for correct but overlong solutions similar to DAPO (Yu et al., 2025), but gradually phase out this penalty over training. Specifically, we linearly interpolate the reward between $1$ and $-1$ for correct answers with a length that exceeds a soft maximum ($8\,\mathrm{k}$ at the beginning of training) but is lower than the hard maximum of $64\,\mathrm{k}$. This provides a dense reward signal to the model that incentivizes it to reduce its response length, while still providing a positive signal when the answer is correct.
+During training, we gradually increase the soft maximum in a continuous manner until it is equal to the hard maximum after $10\,\mathrm{k}$ training steps. See Figure 13 for an illustration of this process.
+Figure 13
+:
+Length reward scheduling for RL training. The decaying threshold $\delta$ starts at $8\,\mathrm{k}$ at the start of training and linearly increases to its $64\,\mathrm{k}$ limit over $10\,000$ steps.
+6
+Code and infrastructure
+This section discusses details of our training pipeline, including efficiency-related features leveraged for
+CWM
+training and the asynchronous RL architecture.
+6.1
+Techniques for efficient training
+CWM
+is trained on H100s using a combination of Fully-Sharded Data Parallelism (FSDP) and Tensor Parallelism (TP), see
+Table
+˜
+3
+. We adopt FlashAttention-3
+(Dao et al.,
+2022
+; Dao,
+2024
+)
+to improve training speed and reduce memory overhead. Additionally, we incorporate several optimizations towards efficient training.
+fp8
+matrix multiplication.
+All linear layers in transformer blocks used
+float8
+low-precision mode, similar to
+Micikevicius et al. (
+2022
+)
+, achieving twice the nominal FLOPs of
+bfloat16
+on Hopper GPUs. (For RL training, fp8 precision hurt performance and hence we used
+bfloat16
+for linear layers in transformer blocks.) We used dynamic “row-wise” scaling, also called “outer-vector”, that is, operands were scaled along their reduction dimension. We used the
+e4m3
+variant exclusively, and we disabled fast-accumulation throughout. The two matrix multiplication operations for the gradient computation in the backwards pass use special setups:
+w.grad
+is computed in
+bfloat16
+(which increased accuracy and precluded the need to transpose its operands to satisfy
+float8
+layout constraints, which make kernel fusion difficult);
+in.grad
+uses “tensor-wise” scaling for its weight operand, i.e., a single scaling factor for the whole tensor, which again makes transposition more efficient. In practice, we try to issue “unscaled” matrix multiplication kernels, since introducing scaling in the kernel epilogue worsens performance, and instead perform the scaling as a manual post-processing step, which can be fused into subsequent kernels.
+Reducing communication overhead for tensor parallelism.
+We reduced the communication overhead of tensor parallelism (which we implement as sequence parallelism) by more effectively overlapping it with computation via decomposition and micropipelining, using PyTorch’s Asynchronous Tensor Parallel (Async-TP) feature (footnote 7: see “Async tensor parallelism in PyTorch with TorchTitan”), derived from xFormers (Lefaudeux et al., 2022) and originally inspired by Wang et al. (2022a). We implemented this optimization by hand for the matrix multiplication needed to compute w.grad during the backward pass (sharded along the reduction dimension), as there is no de facto support in PyTorch.
+fp8
+with tensor parallelism.
+When TP is enabled, we adapted our
+fp8
+recipe to further improve performance. We use “sub-row-wise” scaling where appropriate to align quantization boundaries with TP shards, which both avoids communication and improves accuracy. We perform all-gathers in
+fp8
+, which improves throughput and also enables fusing quantization into previous kernels (e.g., LayerNorm). During the backward pass, however, we sometimes all-gather the same data twice (once in
+fp8
+and once in
+bf16
+, since it will be consumed by two separate matmuls, one for each dtype). Because of Async-TP, however, this adds zero overhead and enables fusing quantization into previous kernels.
+Reducing memory consumption.
+We use PyTorch’s AutoAC
+8
+8
+8
+Enabled by setting
+torch._functorch.config.activation_memory_budget < 1
+.
+for activation checkpointing, which is integrated in the “partitioner” layer of the
+torch.compile
+stack, and uses an integer-linear program solver to optimize the memory-versus-recomputation tradeoff given a user-provided budget. We also leverage PyTorch’s vocab- and loss-parallel helpers to reduce memory consumption.
+Table 3: Summary of the training setup for the different CWM training stages on H100 GPUs.
+| Phase | Seq. Length | Batch size | # GPUs | DP shards | TP shards |
+|---|---|---|---|---|---|
+| Pre-training | $8\,\mathrm{k}$ | $8.4\,\mathrm{M}$ | 2048 | 1024 | 2 |
+| Mid-training | $131\,\mathrm{k}$ | $33.6\,\mathrm{M}$ | 2048 | 256 | 8 |
+| Supervised Fine-tuning | $32\,\mathrm{k}$ | $2.1\,\mathrm{M}$ | 256 | 32 | 8 |
+| Reinforcement Learning | $131\,\mathrm{k}$ | $8.4\,\mathrm{M}$ / $16.8\,\mathrm{M}$ | 2560/4608 | 64 | 8 |
+6.2
+RL systems
+We train our models using our own asynchronous distributed RL framework. The key distinction from standard LLM training lies in the data collection process: in RL, training data is gathered through rollouts where the agent interacts with an environment.
+Rollouts.
+As shown in
+Figure
+˜
+14
+, a rollout consists of a sequence of iterative agent-environment interactions. Each environment implements two methods:
+•
+start
+: start a new episode by producing an initial state and an observation (prompt) based on a sample from the dataset. The state encapsulates the contents of the hidden environment along with any specific resources corresponding to the current episode.
+•
+step
+: takes an action (sequence of tokens) leading to a state transition. The new observation includes all information visible to the agent and necessary during training or inference, such as the latest action, observation, and reward.
+Figure 14
+:
+Overview of how agents interact with RL environments to produce trajectories.
+All interactions between the agent and the environment are token-based. In addition, the environment can suggest context switches to erase past history or restart from scratch, allowing multi-context trajectories.
+Our environments adhere to a common
+trajectory format
+, which prescribes that a trajectory consists of a sequence of messages, whose format is detailed in
+Section
+˜
+17
+.
+Training.
+GPUs are divided into a set of
+workers
+that continuously perform rollouts and
+trainers
+that update the current policy. Workers send batches of trajectories to the trainers as soon as they are completed and trainers send updated model weights to the workers periodically. After a model update is received and applied on a worker, the worker continues generation of partially completed trajectories using the old KV-cache. This approach ensures continuously high GPU utilization (see
+Figure
+˜
+15
+) and has been used in our previous work
+(Synnaeve et al.,
+2019
+; Gehring et al.,
+2025
+; Tang et al.,
+2025
+; Cohen et al.,
+2025
+)
+and notable RL frameworks such as PipelineRL
+(Piche et al.,
+2025
+)
+.
+Inference.
+We use our own throughput-optimized inference backend FastGen
+(Carbonneaux,
+2025
+)
+.
+FastGen supports batched inference, CUDA graphs, paged attention
+(Kwon et al.,
+2023
+)
+, chunked prefills, host-side KV-cache, tensor parallelism, and CPU/GPU profiling.
+In batched inference, one generates tokens for each sequence in a batch in parallel, continuing without synchronizing CUDA streams until a block of tokens (e.g., $32$) is completed.
+After each block, completed sequences are truncated at stop tokens and returned, and new sequences are added to the batch so as to keep the batch size constant.
+For more details, see
+Carbonneaux (
+2025
+)
+.
+Parallelism.
+We support various kinds of parallelism on both trainer and worker nodes.
+Trainers operate largely as in pretraining (see
+Section
+˜
+4.2
+), supporting FSDP and TP.
+Worker nodes are grouped with TP to perform batched inference.
+Whereas all trainer GPUs are synchronized, the worker groups/model replicas operate asynchronously from each other and from the trainers.
+Model transfer.
+For efficient model transfers, we use our custom PyTorch distributed backend, moodist
+9
+9
+9
+See
+https://github.com/facebookresearch/moodist
+.
+(Mella,
+2025
+)
+. It implements efficient queues that transfer data directly between GPU and CPU memory via InfiniBand both within the same compute node and between different nodes. This facilitates transferring model weights directly from the trainer’s GPU memory to the worker’s CPU memory. With FSDP, each trainer has a shard of the model weights. These shards must be concatenated and sent to the workers.
+Model transfer consists of three stages:
+1.
+Each trainer sends their local shard from GPU memory to the CPU memory of a single worker.
+2.
+The workers perform a distributed concatenation similar to an all-gather, such that each worker ends up with all model weights.
+3.
+Each worker then individually applies the new weights.
+The trainers are only involved in the first stage, which minimizes the amount of time model transfer takes on the trainers. On the workers, the first and second stages both occur entirely in CPU memory, which allows them to overlap with generations. The third stage is simply a CPU to GPU memory copy, so it is reasonably fast. When TP is enabled, this process occurs individually for each data-parallel group.
+Execution infrastructure.
+Our training pipeline leverages an internal code execution service to safely execute tens of thousands of code snippets per second, in parallel across multiple programming languages and asynchronously in isolated containerized environments. This code execution service is integrated into our training loop to provide execution results including stdout, stderr, exit codes, and environment state as feedback to the LLM.
+Containerized execution for agentic RL.
+We use a custom tool-based execution environment for agentic reinforcement learning, enabling agents to interact with containerized environments through structured tool calls for agentic tasks. It features a core tool execution framework based on flexible container backends (e.g., Docker execution services or Modal
+(
+Modal Team,
+)
+), implementation of remote execution servers and clients as an interface to a persistent shell session, plugins that can be defined as standalone Python scripts invoked through bash, along with evaluation infrastructure for reward calculation or benchmarking.
+Figure 15
+:
+In
+CWM
+-RL, model weights can be updated at any time on the worker side: between trajectories, within a trajectory between steps, or even during token generation. Compared to traditional RL, this removes all synchronization overhead, maximizing worker throughput while minimizing idle time. In exchange for never blocking inference, we accept that trajectories will potentially use mixed weights, though frequent model updates ensure that generations remain reasonably on-policy. Different workers may not update their weights at the same time: the system waits for each worker to signal readiness before sending new weights to avoid memory overload.
+7
+Experimental results
+We begin this section by analyzing the impact of incorporating
+CWM
+data during mid-training for a small-scale ablation.
+Next, we evaluate
+CWM
+and compare its performance against relevant baselines, focusing on coding and mathematical reasoning tasks.
+We consider agentic evaluation for coding tasks, together with additional computation-oriented evaluations covering
+(i) output prediction with execution traces and reasoning, (ii) full execution trace prediction, (iii) program termination prediction, and (iv) prediction and generation of algorithmic complexity.
+Finally, we evaluate
+CWM
+considering established benchmarks for competitive programming, mathematical reasoning, non-reasoning evaluation, and long-context.
+Unless otherwise mentioned, we use a temperature of $1.0$ and a top-p value of $0.95$ for all evaluations.
+7.1
+The impact of
+CWM
+data
+To evaluate the effect of incorporating CWM data during mid-training, we perform ablations with $8\,\mathrm{B}$ parameter models trained for $7\,\mathrm{T}$ tokens. We first pre-trained one model for $6\,\mathrm{T}$ tokens and then studied different mid-training datamixes for the remaining $1\,\mathrm{T}$ tokens, ablating the two CWM datasets, ForagerAgent and Python execution trace data, as well as our GitHub PR trajectory data.
+10
+10
+10
+One may wonder if using the non-agentic PR data alone is sufficient for reaching strong performance on SWE-bench Verified.
+After mid-training, all variants underwent a fine-tuning phase comparable to our main setup for
+CWM
+described in
+Section
+˜
+5.1
+but excluding the RL phase. We report results on CruxEval-O, CruxEval-I, NLLs over SWE-bench Verified (SBV) oracle patches, and NLLs over agentic SBV trajectories (truncated to
+32
+k
+32\text{\,}\mathrm{k}
+sequence length) for the models out of mid-training and pass@1 SBV numbers for the models after SFT.
+The results in
+Table
+˜
+4
+show that the best performance across our set of metrics is achieved when using all datasets together. This effect carries over to our SBV evaluation of the SFT model, demonstrating how mid-training data choices can positively affect final model performance. Looking at the impact of individual datasets, we find the inclusion of the PR data helps oracle SBV NLLs and SBV pass@1 but not the agentic SBV trajectory NLLs or CruxEval. Further incorporating execution trace data significantly improves CruxEval-input and -output prediction but leaves all SBV-related metrics unaffected. Lastly, only the addition of ForagerAgent data improves agentic SBV NLLs.
+The ForagerAgent data is further able to improve SBV pass@1 scores by another
+3.7
+%
+3.7\text{\,}\mathrm{\char 37\relax}
+.
+Table 4
+:
+Our ablation study reveals a positive impact on performance from introducing GitHub PR trajectory, Python execution tracing, and ForagerAgent data during mid-training. We report results for CruxEval-output, CruxEval-input, NLLs on oracle SWE-bench Verified (SBV) trajectories, NLLs on agentic SBV trajectories, and SBV pass@1 scores. All results are for
+8
+B
+8\text{\,}\mathrm{B}
+models, jointly pre-trained for
+6
+T
+6\text{\,}\mathrm{T}
+tokens followed by
+1
+T
+1\text{\,}\mathrm{T}
+tokens of mid-training ablation, with SBV pass@1 reported after an additional SFT phase.
+PRs
+Tracing
+Forager
+CruxEval-O
+↑
+\uparrow
+CruxEval-I
+↑
+\uparrow
+Oracle SBV NLL
+↓
+\downarrow
+Agentic SBV NLL (32k)
+↓
+\downarrow
+SBV
+↑
+\uparrow
+✗
+✗
+✗
+45.4
+44.1
+0.64
+0.39
+14.6
+✓
+✗
+✗
+44.6
+45.8
+0.55
+0.37
+18.6
+✓
+✓
+✗
+73.9
+51.5
+0.54
+0.38
+18.4
+✓
+✓
+✓
+74.5
+54.8
+0.54
+0.29
+22.1
+7.2
+Agentic evaluation
+SWE-bench Verified.
+Figures
+˜
+2
+and
+16
+show results for SWE-bench Verified.
+CWM
+achieves pass@1 resolve rates of
+65.8
+%
+65.8\text{\,}\mathrm{\char 37\relax}
+with test-time-scaling and
+53.9
+%
+53.9\text{\,}\mathrm{\char 37\relax}
+without test-time scaling (averaged over 4 runs). With test-time scaling,
+CWM
+outperforms open-weight models at similar size and is competitive to larger and proprietary models. The base score without test-time scaling also surpasses open-weight models with similar parameter counts and remains respectable even when comparing to much larger models such as GPT-oss-120B
+(Agarwal et al.,
+2025
+)
+, Qwen3-Coder
+(Yang et al.,
+2025a
+)
+, and Kimi K2
+(Kimi Team et al.,
+2025
+)
+.
+Figure 16
+:
+SWE-bench Verified pass@1 scores.
+CWM
+achieves best-in-class performance with and without test-time-scaling (tts), achieving
+65.8
+%
+65.8\text{\,}\mathrm{\char 37\relax}
+and
+53.9
+%
+53.9\text{\,}\mathrm{\char 37\relax}
+respectively. Note that GPT-oss scores are computed with respect to a limited subset of
+477
+477
+out of
+500
+500
+problems.
+For Test-Time-Scaling (TTS) on SWE-bench Verified, we first generate
+k
+k
+candidate solutions as well as
+40
+40
+novel
+unit tests in parallel agentic loops for each instance.
+Like Agentless
+(Xia et al.,
+2024
+)
+, we ask the model to generate tests that verify patch correctness
+and
+reproduce the original bug, enabling us to filter out tests that fail to reproduce errors. Following SWE-RL
+(Wei et al.,
+2025
+)
+, we keep the top-5 majority tests for each instance.
+Since candidate solutions are often similar in the number of
+existing
+tests they pass, we prioritize the strongest candidates by keeping only those patches that pass the highest number of existing tests. We then execute the remaining patches on the filtered set of novel tests and select the patch with the highest pass rate for submission. In case of ties, we prioritize the majority patch, and if the tie remains, we choose the patch whose trajectory has fewer tokens. We refer to this approach as best@
+k
+k
+.
+In
+Figure
+˜
+16
+, we report results for best@
+k
+k
+for
+k
+=
+16
+k=16
+, which achieves a
+65.8
+%
+65.8\text{\,}\mathrm{\char 37\relax}
+resolve rate. As a simple alternative to best@
+k
+k
+, we found that majority voting
+(Wang et al.,
+2022b
+)
+of candidate patches, based on exact string matching and without any test generation or execution, leads to a pass rate of
+58.4
+%
+58.4\text{\,}\mathrm{\char 37\relax}
+. In
+Figure
+˜
+17(a)
+, we report best@
+k
+k
+and pass@
+k
+k
+across different values of
+k
+k
+.
+As expected, pass@k improves monotonically with larger
+k
+k
+, ultimately reaching a success rate of
+80.4
+%
+80.4\text{\,}\mathrm{\char 37\relax}
+at
+k
+=
+40
+k=40
+. For best@
+k
+k
+, performance improves sharply from
+k
+=
+2
+k=2
+before plateauing around
+k
+=
+16
+k=16
+. For majority-voting, performance improves gradually from
+k
+=
+2
+k=2
+and plateaus at
+k
+=
+24
+k=24
+.
+(a)
+(b)
+Figure 17
+:
+(a) Test time scaling (TTS) with both our best@$k$ method and majority voting can significantly increase pass@$1$ rates for
+CWM
+on SWE-bench Verified. (b) Accuracy of
+CWM
+on Aider Polyglot by programming language using the whole file edit format.
+Alternative harnesses for SWE-bench Verified.
+To better understand the robustness of
+CWM
+to the choice of evaluation harness and tool-calling implementations, we perform experiments with third-party approaches, namely Mini-SWE-Agent
+(Yang et al.,
+2024
+)
+and OpenHands
+(Wang et al.,
+2025
+)
+. For both, we shorten and adapt the system prompt to better align with the SWE RL prompt (see
+Figure
+˜
+11
+) and make sure to keep reasoning output as part of the message history. We configure both harnesses to use OpenAI function calling, which sends messages along with structured tool descriptions. When prompting the model, we format and append the available tools to the system prompt. When the model decides to call a tool, the call is parsed and returned in a
+tool_calls
+field in our response. This makes sure that tools are rendered with a syntax template suitable for prompting
+CWM
+.
+For Mini-SWE-Agent, we follow the official budget of
+250
+250
+turns. For OpenHands, we report results for
+40
+40
+,
+128
+128
+, and
+500
+500
+turns.
+Additionally, we report results for our harness when limiting tool use to bash commands only. As Table 5 shows, although resolve rates degrade when using different agents, tool implementations, or limiting tool choices,
+CWM
+provides robust and reasonable performance across all setups.
+Table 5
+:
+SWE-bench Verified resolve rates for alternative agentic harnesses are lower than the
+53.9
+%
+53.9\text{\,}\mathrm{\char 37\relax}
+pass@1 achieved with our approach, but performance remains reasonable across the board.
+Harness
+Configuration
+Resolve Rate (%)
+Mini-SWE-Agent
+250 turns
+37.6
+OpenHands
+40 turns
+36.0
+128 turns
+42.6
+500 turns
+40.8
+Ours
+(bash-only)
+128 turns
+42.1
+Ours
+128 turns
+53.9
+Table 6
+:
+Results on Aider Polyglot for
+CWM
+and baselines from the official leaderboard.
+Model
+Format
+Pass 1@2 (%)
+o3-pro (high)
+Diff
+84.9
+DeepSeek R1 (0528)
+Diff
+71.4
+Qwen3 235B A22B diff, no think
+Diff
+59.6
+Kimi K2
+Diff
+59.1
+gpt-oss-120b (high)
+Diff
+41.8
+Qwen3-32B
+Diff
+40.0
+Gemini 2.0 Pro exp-02-05
+Whole File
+35.6
+CWM
+Whole File
+35.1
+Grok 3 Mini Beta (low)
+Whole File
+34.7
+o1-mini-2024-09-12
+Whole File
+32.9
+gpt-4.1-mini
+Diff
+27.1
+Codestral 25.01
+Whole File
+11.1
+Multi-lingual coding.
+The Aider Polyglot benchmark
+(Aider Team,
+2025
+)
+measures coding ability across a diverse set of programming languages using challenging exercises from Exercism.
+11
+11
+11
+See
+https://exercism.org/
+.
+The primary metric is the pass rate on the second attempt, allowing the model to iterate on test failures once. We make a few changes to the harness to align it with the
+CWM
+training distribution, such as removing hard-coded assistant messages from the history, concatenating adjacent messages of the same role, removing examples from the system prompt, turning off auto-linting and stripping of reasoning traces, and reiterating in the prompt that exact matches are needed. We evaluate with reasoning, at temperature
+0.4
+0.4
+, and without test-time-scaling. Although Aider Polyglot may not fully qualify as a truly agentic benchmark – given its lack of dynamic tool use beyond code execution and limited interaction – we include it here in light of the self-correction capabilities that it allows for.
+As shown in
+Table
+˜
+6
+, CWM achieves
+35.1
+%
+35.1\text{\,}\mathrm{\char 37\relax}
+accuracy, comparable to other models in its class such as Qwen3-32B (
+40.0
+%
+40.0\text{\,}\mathrm{\char 37\relax}
+) and other models using the “whole file” edit format such as Gemini 2.0 Pro (
+35.6
+%
+35.6\text{\,}\mathrm{\char 37\relax}
+).
+We also observe good generalization performance across the six languages tested in the benchmark, as shown in
+Figure
+˜
+17(b)
+. Many top-performing models, such as o3-pro (
+84.9
+%
+84.9\text{\,}\mathrm{\char 37\relax}
+)
+(OpenAI,
+2025b
+)
+, DeepSeek R1 (
+71.4
+%
+71.4\text{\,}\mathrm{\char 37\relax}
+), and Qwen3 235B (
+59.6
+%
+59.6\text{\,}\mathrm{\char 37\relax}
+), achieve substantially higher scores using the “diff” edit format. However,
+CWM
+was not optimized for this format and does not reach competitive performance with it.
+Terminal-Bench.
+Another multi-turn agentic coding benchmark that is gaining in popularity is Terminal-Bench
+(The Terminal-Bench Team,
+2025
+)
+. In Terminal-Bench, the agent is asked to solve various complex tasks by operating directly in a tmux session.
+Again, we align the prompts and response parsing of the Terminus-1
+12
+12
+12
+See
+https://www.tbench.ai/terminus
+.
+agent provided by the benchmark with our RL training phase: we modify the system prompt to use the tools that
+CWM
+was trained with (see
+Figure
+˜
+8
+) and parse the model output back into the format that Terminus-1 expects. We also include reasoning tokens from prior turns into the agent’s history.
+In this setup,
+CWM
+achieves a
+26.25
+%
+26.25\text{\,}\mathrm{\char 37\relax}
+accuracy with the Terminus 1 agent following the default budget of
+50
+50
+turns.
+Table
+˜
+7
+shows this places
+CWM
+below o4-mini but above Gemini 2.5 Pro on the Terminal-Bench leaderboard.
+Table 7
+:
+Results on Terminal-Bench for
+CWM
+and baselines from the official leaderboard.
+Model
+Agent
+Accuracy (%)
+OpenAI-Multiple
+OB-1
+59.0
+GPT-5
+OB-1
+49.0
+GPT-5
+Terminus 1
+30.0
+o4-mini
+Goose
+27.5
+CWM
+Terminus 1
+26.3
+Gemini 2.5 Pro
+Terminus 1
+25.3
+o4-mini
+Terminus 1
+18.5
+Grok 3 Beta
+Terminus 1
+17.5
+Gemini 2.5 Flash
+Terminus 1
+16.8
+Qwen3-32B
+TerminalAgent
+15.5
+Table 8
+:
+Execution trace prediction is competitive with reasoning for CruxEval-output pass@1 scores. For
+CWM
+, we use temperature
+0.6
+0.6
+, top-p
+0.95
+0.95
+, and
+10
+10
+generations, while for
+CWM
+SFT we use greedy decoding.
+Budget
+Mode
+CWM SFT
+CWM
+small
+Language w/o CoT
+67.8
+66.6
+Trace Step
+59.1
+58.1
+large
+Language w/ CoT
+83.3
+94.3
+Trace Full
+87.3
+87.7
+7.3
+Execution trace prediction
+Next, we analyze the ability of
+CWM
+to perform trace prediction, analyze its predictions, and explore its ability to predict program termination.
+CruxEval-O as execution trace prediction.
+The following experiment evaluates
+CWM
+’s ability to predict Python execution traces using the format introduced in
+Section
+˜
+2.2
+. We prompt the model with functions and input arguments from the CruxEval test set, ask it to predict the function execution trace line-by-line, and then compare its output prediction to the ground truth. To elicit trace prediction, we construct prompts following our custom trace format, with the input containing the function as the code context, the call arguments as the state, and the line containing the function definition as the first action.
+In addition to this “full” execution trace prediction scenario, we also study a single-“step” scenario, for which we ask the model to directly predict the return value of the function. This is achieved by replacing the
+<|line_sep|>
+token with
+<|return_sep|>
+.
+We illustrate both formats in
+Figure
+˜
+11.22
+. We compare the “step” scenario to classic CruxEval-output prediction, which few-shot prompts the model to directly predict outputs given function definitions and inputs.
+Consequently, we compare the “full” trace prediction mode to CruxEval-output with reasoning, which allows
+CWM
+to use reasoning as introduced in
+Section
+˜
+5.3
+before predicting the function output.
+Our results in
+Table
+˜
+8
+show that large compute budgets, either allowing for execution trace prediction or reasoning, produce better results.
+CWM
+achieves a best score of
+94.0
+%
+94.0\text{\,}\mathrm{\char 37\relax}
+in natural language reasoning mode, while full trace prediction achieves
+88
+%
+88\text{\,}\mathrm{\char 37\relax}
+. Note that language reasoning traces are significantly more verbose, using
+1164
+1164
+tokens on average compared to
+497
+497
+tokens for full trace prediction. We also report results for
+CWM
+after SFT, which achieves its best result of
+87.3
+%
+87.3\text{\,}\mathrm{\char 37\relax}
+using full execution trace prediction.
+Single-step trace prediction is not competitive with classic few-shot prompting for either
+CWM
+model.
+Execution trace prediction analysis.
+Following the previous paragraph, we present a detailed evaluation of the quality of the execution traces predicted by
+CWM
+for validation sets of CruxEval and our function-level data. Concretely, we measure the fraction of generated traces that follow our format (Valid Trace Format) and the observation (action) exact match accuracy (Observation (Action) Exact Match), which measures the number of observations (actions) exactly matching ground truth relative to the total number of observations (actions) per execution trace. Our trace format specifies the state as a JSON dump of a dictionary containing the local variables.
+We report the fraction of state predictions matching this format (Valid JSON Format).
+Additionally, Key (+Value) Match measures the average fraction of matching keys (and values) per state prediction.
+The results in
+Table
+˜
+9
+show that
+CWM
+adheres to the correct trace and observation format for all data sources, achieving more than
+99
+%
+99\text{\,}\mathrm{\char 37\relax}
+format matching across the board.
+CWM
+is able to accurately predict the execution trace as well as intermediate observations and actions, which is reflected in scores larger than
+96
+%
+96\text{\,}\mathrm{\char 37\relax}
+for Observation/Action Exact Match and larger than
+97
+%
+97\text{\,}\mathrm{\char 37\relax}
+in Key (+Value) Match.
+Table 9
+:
+Detailed analysis of execution trace prediction with
+CWM
+and greedy decoding. We present a breakdown of the accuracy of the individual components of trace prediction for validation set inputs from CruxEval as well as our function-level data.
+The CruxEval pass@
+1
+1
+score here differs from the one in
+Table
+˜
+8
+(
+87.7
+%
+87.7\text{\,}\mathrm{\char 37\relax}
+) due to greedy decoding.
+Overall, we find solid accuracy across state and action prediction.
+CruxEval
+Function-level
+Output
+pass@1
+88.0
+94.4
+Trace
+Valid Trace Format
+99.6
+100.0
+State Exact Match
+96.9
+96.4
+Action Exact Match
+96.5
+98.0
+States
+Valid JSON Format
+100.0
+100.0
+Key Match
+99.1
+99.0
+Key+Value Match
+98.1
+97.9
+Statistics
+Avg State Length (Token)
+11.7
+18.8
+Avg Action Length (Token)
+11.2
+10.0
+7.4
+Program termination prediction
+The question of
+whether a program terminates is a reasoning problem which goes beyond what can be shown by considering individual finite traces as in
+CWM
+training: non-termination cannot be observed by executing a trace in finite time, and termination on all inputs cannot be feasibly observed by enumerating traces.
+Figure
+˜
+11.28
+in the Appendix illustrates termination reasoning, whereby
+CWM
+considers several concrete inputs before generalizing to the conclusion of terminating on all inputs.
+We propose HaltEval-prelim, a novel benchmark obtained by automatically translating C programs with termination annotations into Python using LLaMA-3-70B via few-shot prompting. The C programs are sourced from the International Competition on Software Verification (SVCOMP) and the Termination Problems Database (TPDB).
+13
+13
+13
+See
+https://sv-comp.sosy-lab.org/
+and
+https://termination-portal.org/wiki/TPDB
+.
+Each original problem comes with termination/non-termination annotations, which we manually verify are preserved during the Python translation phase and otherwise discard.
+We obtained a balanced dataset consisting of
+115
+115
+terminating (
+T
+) and
+115
+115
+non-terminating (
+NT
+) Python programs.
+We query LLMs to judge whether a program terminates (answer #T) or diverges (answer $f(n)$, where $n$ leads to divergence, followed by the comment #NT).
+We reward a divergence claim if $f(n)$ times out after 5 seconds.
+If, however, the model predicts #NT, and the ground truth is #T, it is not rewarded, even if execution exceeds the timeout. For instance, if $f$’s ground truth is #T and $f(42)$ runs for 7.5 million years and then terminates, our scoring (pass@1) will not reward a #NT claim for $f(42)$ even though it trips the timeout.
+Our use of timeout as a proxy for divergence is similar to
+Alon and David (
+2022
+)
+in judging correct non-termination claims, but different in that timeout is not used as a ground truth for termination claims. This results in an eval that is conservative in the sense that it awards scores that could be higher than that given by a perfect oracle, but never lower.
+It would be worth exploring replacing the ground truths and input validation by logical proofs of termination and non-termination
+(Cook et al.,
+2011
+; Gupta et al.,
+2008
+)
+.
+Table
+˜
+10
+reports results for
+CWM
+, Qwen3-32B, and Llama3-70B with direct prediction, prompted chain-of-thought (CoT), and reasoning (for
+CWM
+and Qwen3-32B only). “Reasoning” here means use of the
+<think> ... </think>
+format from RL. We report CoT prompting numbers to represent an attempt to approximate reasoning that is compatible with Llama3-70B.
+As a reference, we also provide the scores of a constant classifier tagging all programs as terminating, which would obtain a pass@1 of $0.5$.
+When comparing CWM and Qwen3, results suggest that Qwen3 reaches better direct and CoT performance; under the reasoning setup, however, both models improve significantly, reaching a comparable performance of $\sim 0.94$ pass@$1$.
+Table 10
+:
+HaltEval-prelim pass@1
+scores for different LLMs in different prompting settings. For reasoning we use temperature
+0.6
+0.6
+, top-p of
+0.95
+0.95
+, and
+10
+10
+generations, while for direct and CoT predictions we use greedy decoding.
+Constant
+CWM
+Qwen3-32B
+Llama-3-70B
+T
+Direct
+CoT
+Reasoning
+Direct
+CoT
+Reasoning
+Direct
+CoT
+pass@1
+0.5
+0.37
+0.55
+0.94
+0.49
+0.68
+0.94
+0.43
+0.48
+We initially designed HaltEval-prelim under the assumption that termination would be difficult to assess, given its undecidability. The strong results achieved by both
+CWM
+and Qwen3-32B with reasoning were therefore unexpected. Still, these findings should be interpreted cautiously: the benchmark is based on small, self-contained programs and does not reflect the challenges of real-world software, where bugs must be detected in large and complex codebases. Hence, success on this preliminary dataset may not translate directly to practice. Moreover, termination in real systems is highly imbalanced – typically with hundreds or thousands of terminating loops for every non-terminating one – unlike the balanced distribution in our dataset
+(Vanegue et al.,
+2025
+)
+.
+7.5
+Algorithmic complexity prediction
+We evaluate
+CWM
+on two tasks from
+BigO(Bench)
+(Chambon et al.,
+2025
+)
+: complexity prediction, determining the Big-O time/space complexity of existing code, and complexity generation, solving coding problems while adhering to specified complexity constraints. We report all@$1$ scores, which require correct LLM output simultaneously across all possible complexity classes for a given problem. For complexity generation, we also report the pass@$1$ score with and without the complexity requirement (the solution still needs to be correct), and a best@$1$ score that corresponds to pass@$1$ on the lowest complexity class of each problem, dismissing suboptimal classes.
+Results for
+CWM
+, Qwen3-32B, Qwen3-coder-30B, and Gemma-3-27B are all presented in
+Table
+˜
+11
+. To ensure the comparison with external models remains as fair as possible, we choose to re-evaluate them alongside
+CWM
+in the same evaluation setting.
+For both tasks, we use
+BigO(Bench)
+’s official setup, after performing a prompt ablation that did not seem to further boost performance.
+On time complexity prediction,
+CWM
+achieves the best all@
+1
+1
+score of all compared models but fares worse for space complexity. In particular, looking at the official benchmark leaderboard,
+14
+14
+14
+See
+https://facebookresearch.github.io/BigOBench/leaderboard.html
+at the time of writing.
+CWM
+ranks second overall on time complexity prediction (all@
+1
+1
+) across all reported models of all sizes.
+For time complexity generation,
+CWM
+achieves the best overall pass@
+1
+1
+, best@
+1
+1
+, and all@
+1
+1
+scores for our set of models, and also ranks second in general looking at the official benchmark scores.
+For space complexity generation, our model ranks first for pass@
+1
+1
+on code only, and second behind Qwen3-32B in terms of the remaining metrics. We note that
+CWM
+stands out in particular in time complexity reasoning, systematically outperforming other models across all metrics on both prediction and generation variants. Moreover, when complexity requirements are set aside, the model’s performance on code-only pass@
+1
+1
+degrades far less than for other models, indicating
+CWM
+is able to maintain focus on fundamental task requirements while effectively handling additional constraints.
+Table 11
+:
+BigOBench
+results comparing
+CWM
+against Qwen3-32B (with reasoning), Qwen3-coder-30B, and Gemma-3-27B on complexity prediction and complexity generation, for both time and space complexity.
+CWM
+outperforms our set of baseline models for all metrics on time complexity prediction and generation. For space complexity generation,
+CWM
+performs best on code-only pass@
+1
+1
+and ranks second on the remaining metrics.
+We refer to the main text for details on the task and metrics.
+CWM
+Qwen3-32B
+Qwen3-coder-30B
+Gemma-3-27B
+Prediction
+Time Complexity
+- all@
+1
+1
+41.3
+39.0
+36.6
+37.7
+Space Complexity
+- all@
+1
+1
+12.3
+15.1
+9.1
+13.1
+Generation
+Time Complexity
+Code Only - pass@
+1
+1
+76.1
+70.0
+43.8
+34.4
+Code & Complexity - pass@
+1
+1
+31.3
+29.1
+20.3
+13.3
+Code & Complexity - best@
+1
+1
+48.6
+43.5
+27.2
+15.2
+Code & Complexity - all@
+1
+1
+7.6
+6.5
+5.5
+2.1
+Space Complexity
+Code Only - pass@
+1
+1
+73.2
+65.9
+45.1
+36.4
+Code & Complexity - pass@
+1
+1
+24.1
+25.5
+17.7
+14.6
+Code & Complexity - best@
+1
+1
+36.6
+39.6
+26.3
+20.6
+Code & Complexity - all@
+1
+1
+3.2
+5.1
+2.4
+1.5
+7.6
+Code and mathematical reasoning
+We present results on LiveCodeBench (LCB,
+Jain et al. (
+2025a
+)
+), concretely the LCBv5 and LCBv6 date ranges
+01.10.2024-01.02.2025
+15
+15
+15
+For LCBv5, we report results starting from October to be consistent with the numbers reported by Qwen3.
+and
+01.08.2024-01.05.2025
+, in
+Table
+˜
+12
+.
+16
+16
+16
+Results for Magistral were taken from the official reported numbers for 1.2 version, where no explicit dates were mentioned.
+We here compare
+CWM
+to relevant baseline models with similar parameter counts.
+Again, we observe highly competitive performance on par with Magistral-small-1.2
+(Rastogi et al.,
+2025
+)
+, Qwen3-32B, and gpt-oss-20B
+(Agarwal et al.,
+2025
+)
+.
+Table
+˜
+12
+also contains pass@1 results for
+CWM
+on Math-500
+(Lightman et al.,
+2023
+)
+, AIME24
+(OpenAI,
+2024
+)
+, and AIME25 – all averaged over
+n
+=
+20
+n=20
+samples.
+CWM
+performs slightly worse across the board, with a notable gap compared to gpt-oss-20B (high) on AIME.
+In
+Figure
+˜
+18(a)
+, we additionally report test-time scaling results using majority voting and short-3@k for
+CWM
+on AIME.
+Short-m@k
+(Hassid et al.,
+2025
+)
+begins sampling $k$ answers in parallel but stops sampling once the first $m$ generations are complete, and then selects the most common answer among these $m$.
+CWM
+performance on AIME24 increases by up to
+11
+%
+11\text{\,}\mathrm{\char 37\relax}
+at
+k
+=
+10
+k=10
+with majority voting.
+Short-3@k achieves performance comparable to majority voting, while significantly reducing computational cost for a given
+k
+k
+.
+Table 12
+:
+Agentic, code, and mathematical reasoning benchmarks. We compare
+CWM
+to baselines with roughly the same number of parameters. (
+†
+\dagger
+: LCB results for gpt-oss-20B (high) suffered from repeated time-outs due to repetitive reasoning, despite our prompt-tuning efforts – which boosted gpt-oss (low/medium) scores by about
+10
+%
+10\text{\,}\mathrm{\char 37\relax}
+.)
+Magistral-small-1.2-24B
+Qwen3 -32B
+gpt-oss-20B (low / med / high)
+CWM
+LCBv5
+70.0
+65.7
+54.2 / 66.9 /
+0
+–
+†
+.0
+68.6
+LCBv6
+61.6
+61.9
+47.3 /
+62.0
+/
+0
+–
+†
+.0
+63.5
+Math-500
+-
+97.2
+–
+96.6
+AIME24
+86.1
+81.4
+42.1 / 80.0 /
+92.1
+*
+76.0
+AIME25
+77.3
+72.9
+37.1 / 72.1 /
+91.7
+*
+68.2
+7.7
+Non-reasoning evaluations
+Although our main focus with
+CWM
+is code world modeling, we also provide evaluation results of
+CWM
+on a set of standard tasks covering code, math, and general knowledge without reasoning mode enabled. We here compare to models with similar parameter counts, such as Qwen3-32B or Gemma-3-27B, as baselines, and we use greedy generation instead of sampling at non-zero temperature.
+The results in
+Table
+˜
+13
+show that
+CWM
+typically performs better than Gemma-3-27B, similar to Qwen2.5-32B, but worse than Qwen3-32B. An interesting exception to this is CruxEval-O
+(Gu et al.,
+2024
+)
+, where the introduction of the tracing data (see
+Section
+˜
+2.2
+) likely helps
+CWM
+gain an advantage. Note that we achieve even better results on CruxEval-Output when using reasoning (see
+Section
+˜
+7.3
+).
+Next, we consider two long-context evaluation benchmarks: LoCoDiff
+(Mentat AI Team,
+2025
+)
+and RULER
+(Hsieh et al.,
+2024
+)
+. In LoCoDiff, models are provided with the commit history of a specific file and asked to construct its final version. To succeed, models must follow the files’ evolution – from the initial commit, through diffs on multiple branches, to the resolution of merge conflicts. Performance is evaluated by the proportion of files for which the model reproduces the target version exactly. We compare
+CWM
+to DeepSeek-R1 0528, Claude Sonnet 4
+(Anthropic,
+2025
+)
+, Gemini 2.5 Pro 06-05
+(Comanici et al.,
+2025
+)
+, Kimi K2, GPT-5, and gpt-oss-120B. Qwen3-32B is not on the leaderboard and has a shorter native context length.
+For LoCoDiff, the results in
+Figure
+˜
+18(b)
+show that, while all models suffer a degradation in performance as the sequence length increases,
+CWM
+provides better performance than DeepSeek-R1 0528 and gpt-oss-120B and is competitive with large scale commercial models (e.g., GPT-5 and Gemini 2.5 Pro) on both short and long sequences, with a significant gap to Claude Sonnet 4 only. We present results for RULER in
+Section
+˜
+19
+.
+Table 13
+:
+Performance of CWM and $\textsc{CWM}_{\textup{\tiny{Mid}}}$ (CWM after mid-training) on a set of general, math, and coding tasks without any reasoning, compared to a set of recent baseline models with similar parameter counts.
+CWM
+CWM
+Mid
+{}_{\textup{\tiny{Mid}}}
+Qwen3-32B
+Qwen2.5-32B
+Gemma-3-27B
+Llama-3-70B
+Llama-4-Scout
+MMLU
+77.7
+73.6
+83.6
+83.3
+78.7
+79.3
+78.3
+MMLU-Pro
+60.2
+52.3
+65.5
+55.1
+52.9
+53.8
+56.1
+GPQA
+40.6
+31.7
+49.5
+48.0
+26.3
+–
+40.4
+GSM8k
+93.3
+84.7
+93.4
+92.9
+81.2
+83.7
+85.4
+HumanEval-Plus
+75.0
+68.3
+72.1
+66.3
+55.8
+–
+59.9
+MBPP
+73.4
+67.8
+78.2
+73.6
+68.4
+66.2
+68.6
+CRUX-O
+83.4
+78.9
+72.5
+67.8
+60.0
+–
+61.9
+(a)
+(b)
+Figure 18
+:
+(a) Test-time scaling on AIME24 with majority voting and short-3@k. See main text for details. (b) LoCoDiff results for
+CWM
+and baselines considering different sequence lengths buckets.
+8
+Transparency, Risks & Limitations
+8.1
+Transparency on external models and data
+As mentioned previously in the relevant sections, we use data from external LLMs in four contexts: (i) ForagerAgent, (ii) trace-to-natural language conversion, (iii) function tracing, and (iv) the SFT phase. For the ForagerAgent, we employ Llama3-70B-Instruct
+(Dubey et al.,
+2024
+)
+and Qwen3-235B-A22B (without thinking)
+(Yang et al.,
+2025a
+)
+as base models to interact with the computational environment. For converting raw Python traces into natural language, we use Qwen3-32B-FP8 (without thinking)
+(Yang et al.,
+2025a
+)
+. For function tracing, we use Llama3-70B-Instruct to generate Python function inputs and to generate solutions for CodeContests data. Finally, during SFT, we incorporate trajectories from DeepSeek-R1
+(Guo et al.,
+2025
+)
+through the OpenMathReasoning
+(Moshkov et al.,
+2025
+)
+and OpenCodeReasoning
+(Ahmad et al.,
+2025
+)
+datasets. We used mitigated versions of the OpenCodeReasoning and OpenMathReasoning datasets, where mitigations included algorithmic bias filtering and cybersecurity protections. We applied similar mitigations when using Qwen3-32B-FP8 to generate data for training. No external LLM tokens were used beyond those explicitly mentioned in these four contexts.
+8.2
+Code World Model Preparedness Report
+Despite its relatively small size of $32\,\mathrm{B}$ parameters,
+CWM
+outperforms open-weight models of similar size and is competitive with larger and proprietary models on verified software engineering benchmarks. To anticipate risks from this release, including potentially novel risks, we conducted an automated assessment of CWM capabilities relevant to the domains identified in our Frontier AI Framework
+(Footnote 17: See https://ai.meta.com/static-resource/meta-frontier-ai-framework.)
+that could present potentially catastrophic risks, namely Cyber and Chemical & Biological risks. As part of ongoing work to improve the robustness of our evaluations and the reliability of our models, we also include a preliminary propensity evaluation, with plans to expand this area in future assessments.
+We performed this assessment by testing the relative performance of
+CWM
+against a set of popular and capable open-source models that represent a baseline of capabilities available in the open ecosystem: Qwen3-Coder-480B-A35B-Instruct
+(Yang et al.,
+2025a
+)
+, Llama 4 Maverick
+(Meta AI,
+2025
+)
+, and gpt-oss-120B
+(OpenAI,
+2025a
+)
+.
+Based on the results of these assessments, we believe that the open-source release of
+CWM
+is unlikely to meaningfully increase risks related to Cybersecurity or Chemical & Biological threats beyond the current ecosystem baseline.
+Additionally, our preliminary evaluations suggest that
+CWM
+shows undesirable propensities at rates comparable to most open-source models, though some models (e.g., gpt-oss-120B) achieve substantially lower rates.
+These results indicate that
+CWM
+is within the “moderate” risk threshold for the catastrophic domains defined in Meta’s Frontier AI Framework.
+17
+We share the details in the Code World Model Preparedness Report.
+(Footnote 18: Code World Model Preparedness Report, available at https://ai.meta.com/research/publications/cwm-preparedness.)
+8.3
+Limitations & future research
+We explicitly release CWM as a research model under a noncommercial research license for the community to explore the opportunities afforded by world modeling and reasoning in computational environments. As such, our models come with a number of limitations which we outline below to help the research community make the most of
+CWM
+, while being aware of its shortcomings and avoiding accidental misuse.
+As these are research-only models, they are not suitable for production use cases. Although we have performed some limited evaluations, we have not conducted a full range of possible evaluations for these models. The performance of
+CWM
+in production and real-world scenarios has not been evaluated by Meta. These models have not been fully evaluated or trained for user-facing interactions and they are not intended for such use. Researchers are recommended to exercise caution when deploying or using these models.
+Similarly,
+CWM
+should not be used as a general-purpose assistant or chat model. While it was exposed to some level of instruction-following data during SFT,
+CWM
+has not undergone any thorough optimization for general chat-bot use, such as RLHF
+(Ouyang et al.,
+2022
+)
+. General chat use is not an intended use of CWM and generations may diverge from expectations and/or be inappropriate or inaccurate. Further,
+CWM
+training focuses strongly on code generation and reasoning with code. Thus, our models may be lacking in other domains such as factual knowledge or classic natural language tasks.
+CWM
+is not trained for use as a general-purpose assistant or chat model and has not been aligned on, or fully evaluated for, content risks. We make available system level protections – like Llama Guard, Prompt Guard, and Code Shield – as a solution to help manage content generation in research environments.
+(Footnote 19: See https://www.llama.com/llama-protections.)
+However, these system level protections alone are unlikely to be sufficient to enable production uses of
+CWM
+and further evaluations and fine-tuning may be required.
+CWM
+is intended to be used in English only. It is not multilingual and performance in other languages has not been evaluated or optimized.
+Lastly, while we are excited about the opportunities that world modeling affords, these are only our first steps in this direction. Our code world modeling dataset collection efforts focus on explicit Python execution, and expanding this set to include other programming languages or symbolic execution is left for future work. Finding robust ways to leverage world model knowledge to improve performance across a variety of tasks via prompting or fine-tuning is a ripe area for research. Similarly, planning with code world models, either using formal inference frameworks or informally during reasoning, is an exciting direction for research and core to our motivation for building
+CWM
+in the first place. In some sense, one might compare the current state of
+CWMs
+to LLMs before CoT
+(Wei et al.,
+2023
+)
+: the capabilities are there, we just need to find out how to make the most of them.
+9
+Conclusion
+Our vision is for Code World Models to bridge the gap between language‑level reasoning and executable semantics. We believe that coding and agentic use cases of LLMs will benefit from having a world model, a learned transition function between states conditioned on actions.
+With the release of
+CWM
+, we present the first steps of this vision.
+Our ablations already show that world modeling data, Python execution traces, and executable Docker environments can be directly beneficial for downstream task performance.
+More broadly though,
+CWM
+provides a strong test-bed for future research in zero-shot planning, grounded chain-of-thought, and reinforcement learning with sparse, verifiable rewards.
+Similar to our early results with execution trace prediction, we believe that the Python tracing world model enables research on reasoning about code generation, execution, correctness, and verification.
+World models should improve reinforcement learning because agents that are already familiar with the dynamics of the environment can focus on learning which actions lead to rewards.
+More research is needed to consistently leverage the benefits of incorporating world models into LLMs during pre-training across tasks.
+Ultimately, models that can reason about the consequences of their actions should be much more efficient in their interactions with the environment, which should allow for scaling the complexity of the tasks they perform.
+Authors: Meta FAIR CodeGen Team
+Alphabetic order for core contributors, from second author onward and excluding senior authors, and contributors.
+Core contributors
+Jade Copet
+Quentin Carbonneaux
+Gal Cohen
+Jonas Gehring
+Jacob Kahn
+Jannik Kossen
+Felix Kreuk
+Emily McMilin
+Michel Meyer
+Yuxiang Wei
+David Zhang
+Kunhao Zheng
+Contributors
+Jordi Armengol-Estapé
+Pedram Bashiri
+Maximilian Beck
+Pierre Chambon
+Abhishek Charnalia
+Chris Cummins
+Juliette Decugis
+Zacharias V. Fisches
+François Fleuret
+Fabian Gloeckle
+Alex Gu
+Michael Hassid
+Daniel Haziza
+Badr Youbi Idrissi
+Christian Keller
+Rahul Kindi
+Hugh Leather
+Gallil Maimon
+Aram Markosyan
+Francisco Massa
+Pierre-Emmanuel Mazaré
+Vegard Mella
+Naila Murray
+Keyur Muzumdar
+Peter O’Hearn
+Matteo Pagliardini
+Dmitrii Pedchenko
+Tal Remez
+Volker Seeker
+Marco Selvi
+Oren Sultan
+Sida Wang
+Luca Wehrstedt
+Ori Yoran
+Lingming Zhang
+Senior core contributors
+Taco Cohen
+Yossi Adi
+Gabriel Synnaeve
+References
+Agarwal et al. (2025)
+Sandhini Agarwal, Lama Ahmad, Jason Ai, Sam Altman, Andy Applebaum, Edwin
+Arbus, Rahul K Arora, Yu Bai, Bowen Baker, Haiming Bao, et al.
+gpt-oss-120b & gpt-oss-20b model card.
+arXiv preprint arXiv:2508.10925
+, 2025.
+Ahmad et al. (2025)
+Wasi Uddin Ahmad, Sean Narenthiran, Somshubra Majumdar, Aleksander Ficek,
+Siddhartha Jain, Jocelyn Huang, Vahid Noroozi, and Boris Ginsburg.
+Opencodereasoning: Advancing data distillation for competitive
+coding.
+arXiv preprint arXiv:2504.01943
+, 2025.
+Aider Team (2025)
+Aider Team.
+aider, 2025.
+https://github.com/Aider-AI/aider
+.
+GitHub repository; accessed 2025-08-18.
+Ainslie et al. (2023)
+Joshua Ainslie, James Lee-Thorp, Michiel De Jong, Yury Zemlyanskiy, Federico
+Lebrón, and Sumit Sanghai.
+Gqa: Training generalized multi-query transformer models from
+multi-head checkpoints.
+arXiv preprint arXiv:2305.13245
+, 2023.
+Alon and David (2022)
+Yoav Alon and Cristina David.
+Using graph neural networks for program termination.
+In Abhik Roychoudhury, Cristian Cadar, and Miryung Kim, editors,
+Proceedings of the 30th ACM Joint European Software Engineering
+Conference and Symposium on the Foundations of Software Engineering,
+ESEC/FSE 2022, Singapore, Singapore, November 14-18, 2022
+, pages 910–921.
+ACM, 2022.
+10.1145/3540250.3549095
+.
+https://doi.org/10.1145/3540250.3549095
+.
+Anthropic (2025)
+Anthropic.
+Claude 3.7 sonnet and claude code, February 2025.
+https://www.anthropic.com/news/claude-3-7-sonnet
+.
+Anthropic (2025)
+Anthropic.
+Raising the bar on swe-bench verified with claude 3.5 sonnet.
+2025.
+https://www.anthropic.com/engineering/swe-bench-sonnet
+.
+Accessed 2025-08-18.
+Aram H. Markosyan (2024)
+Aram H. Markosyan, Gabriel Synnaeve, and Hugh Leather.
+Leanuniverse: A library for consistent and scalable lean4 dataset
+management, 2024.
+Armengol-Estapé et al. (2025)
+Jordi Armengol-Estapé, Quentin Carbonneaux, Tianjun Zhang, Aram H
+Markosyan, Volker Seeker, Chris Cummins, Melanie Kambadur, Michael FP
+O’Boyle, Sida Wang, Gabriel Synnaeve, et al.
+What i cannot execute, i do not understand: Training and evaluating
+llms on program execution traces.
+arXiv preprint arXiv:2503.05703
+, 2025.
+Austin et al. (2021)
+Jacob Austin, Augustus Odena, Maxwell I. Nye, Maarten Bosma, Henryk
+Michalewski, David Dohan, Ellen Jiang, Carrie J. Cai, Michael Terry, Quoc V.
+Le, and Charles Sutton.
+Program synthesis with large language models.
+CoRR
+, abs/2108.07732, 2021.
+https://arxiv.org/abs/2108.07732
+.
+Azerbayev et al. (2023)
+Zhangir Azerbayev, Bartosz Piotrowski, Hailey Schoelkopf, Edward W. Ayers,
+Dragomir Radev, and Jeremy Avigad.
+Proofnet: Autoformalizing and formally proving undergraduate-level
+mathematics, 2023.
+https://arxiv.org/abs/2302.12433
+.
+Bi et al. (2024)
+Xiao Bi, Deli Chen, Guanting Chen, Shanhuang Chen, Damai Dai, Chengqi Deng,
+Honghui Ding, Kai Dong, Qiushi Du, Zhe Fu, et al.
+Deepseek llm: Scaling open-source language models with longtermism.
+arXiv preprint arXiv:2401.02954
+, 2024.
+Bick et al. (2024)
+Alexander Bick, Adam Blandin, and David J Deming.
+The rapid adoption of generative ai.
+Technical report, National Bureau of Economic Research, 2024.
+Bisk et al. (2020)
+Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, and Yejin Choi.
+PIQA: reasoning about physical commonsense in natural language.
+In
+The Thirty-Fourth AAAI Conference on Artificial
+Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of
+Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium
+on Educational Advances in Artificial Intelligence, EAAI 2020, New York,
+NY, USA, February 7-12, 2020
+, pages 7432–7439. AAAI Press, 2020.
+10.1609/AAAI.V34I05.6239
+.
+https://doi.org/10.1609/aaai.v34i05.6239
+.
+Carbonneaux (2025)
+Quentin Carbonneaux.
+Fastgen, 2025.
+https://github.com/facebookresearch/fastgen
+.
+Chambon et al. (2025)
+Pierre Chambon, Baptiste Roziere, Benoit Sagot, and Gabriel Synnaeve.
+Bigo(bench) – can llms generate code with controlled time and space
+complexity?, 2025.
+https://arxiv.org/abs/2503.15242
+.
+Chen et al. (2021)
+Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Pondé
+de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph,
+Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy
+Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder,
+Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens
+Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias
+Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss,
+William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor
+Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher
+Hesse, Andrew N. Carr, Jan Leike, Joshua Achiam, Vedant Misra, Evan Morikawa,
+Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter
+Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, and
+Wojciech Zaremba.
+Evaluating large language models trained on code.
+CoRR
+, abs/2107.03374, 2021.
+https://arxiv.org/abs/2107.03374
+.
+Clark et al. (2018)
+Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa
+Schoenick, and Oyvind Tafjord.
+Think you have solved question answering? try arc, the AI2
+reasoning challenge.
+CoRR
+, abs/1803.05457, 2018.
+http://arxiv.org/abs/1803.05457
+.
+Cobbe et al. (2021)
+Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz
+Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano,
+Christopher Hesse, and John Schulman.
+Training verifiers to solve math word problems.
+CoRR
+, abs/2110.14168, 2021.
+https://arxiv.org/abs/2110.14168
+.
+Cohen et al. (2025)
+Taco Cohen, David W. Zhang, Kunhao Zheng, Yunhao Tang, Rémi Munos, and
+Gabriel Synnaeve.
+Soft policy optimization: Online off-policy RL for sequence models.
+CoRR
+, abs/2503.05453, 2025.
+10.48550/ARXIV.2503.05453
+.
+https://doi.org/10.48550/arXiv.2503.05453
+.
+Comanici et al. (2025)
+Gheorghe Comanici, Eric Bieber, Mike Schaekermann, Ice Pasupat, Noveen
+Sachdeva, Inderjit Dhillon, Marcel Blistein, Ori Ram, Dan Zhang, Evan Rosen,
+et al.
+Gemini 2.5: Pushing the frontier with advanced reasoning,
+multimodality, long context, and next generation agentic capabilities.
+arXiv preprint arXiv:2507.06261
+, 2025.
+CompFiles authors (2025)
+CompFiles authors.
+Compfiles.
+https://github.com/dwrensha/compfiles
+, 2025.
+Cook et al. (2011)
+Byron Cook, Andreas Podelski, and Andrey Rybalchenko.
+Proving program termination.
+Commun. ACM
+, 54(5):88–98, 2011.
+10.1145/1941487.1941509
+.
+https://doi.org/10.1145/1941487.1941509
+.
+Cui et al. (2024)
+Zheyuan Cui, Mert Demirer, Sonia Jaffe, Leon Musolff, Sida Peng, and Tobias
+Salz.
+The Effects of Generative AI on High Skilled Work: Evidence from
+Three Field Experiments with Software Developers.
+SSRN eLibrary
+, 2024.
+10.2139/ssrn.4945566
+.
+Cummins et al. (2024)
+Chris Cummins, Volker Seeker, Dejan Grubisic, Baptiste Roziere, Jonas Gehring,
+Gabriel Synnaeve, and Hugh Leather.
+Meta large language model compiler: Foundation models of compiler
+optimization.
+arXiv preprint arXiv:2407.02524
+, 2024.
+Dao (2024)
+Tri Dao.
+FlashAttention-2: Faster attention with better parallelism and work
+partitioning.
+In
+International Conference on Learning Representations
+(ICLR)
+, 2024.
+Dao et al. (2022)
+Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher Ré.
+FlashAttention: Fast and memory-efficient exact attention with
+IO-awareness.
+In
+Advances in Neural Information Processing Systems
+(NeurIPS)
+, 2022.
+Dijkstra (1976)
+Edsger W. Dijkstra.
+A Discipline of Programming
+.
+Prentice-Hall, 1976.
+ISBN 013215871X.
+https://www.worldcat.org/oclc/01958445
+.
+Dua et al. (2019)
+Dheeru Dua, Yizhong Wang, Pradeep Dasigi, Gabriel Stanovsky, Sameer Singh, and
+Matt Gardner.
+DROP: A reading comprehension benchmark requiring discrete
+reasoning over paragraphs.
+In Jill Burstein, Christy Doran, and Thamar Solorio, editors,
+Proceedings of the 2019 Conference of the North American Chapter of the
+Association for Computational Linguistics: Human Language Technologies,
+NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and
+Short Papers)
+, pages 2368–2378. Association for Computational Linguistics,
+2019.
+10.18653/V1/N19-1246
+.
+https://doi.org/10.18653/v1/n19-1246
+.
+Dubey et al. (2024)
+Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad
+Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan,
+et al.
+The llama 3 herd of models.
+arXiv e-prints
+, pages arXiv–2407, 2024.
+Gadre et al. (2024)
+Samir Yitzhak Gadre, Georgios Smyrnis, Vaishaal Shankar, Suchin Gururangan,
+Mitchell Wortsman, Rulin Shao, Jean Mercat, Alex Fang, Jeffrey Li, Sedrick
+Keh, et al.
+Language models scale reliably with over-training and on downstream
+tasks.
+arXiv preprint arXiv:2403.08540
+, 2024.
+Gao et al. (2025)
+Bofei Gao, Feifan Song, Zhe Yang, Zefan Cai, Yibo Miao, Qingxiu Dong, Lei Li,
+Chenghao Ma, Liang Chen, Runxin Xu, Zhengyang Tang, Benyou Wang, Daoguang
+Zan, Shanghaoran Quan, Ge Zhang, Lei Sha, Yichang Zhang, Xuancheng Ren,
+Tianyu Liu, and Baobao Chang.
+Omni-math: A universal olympiad level mathematic benchmark for
+large language models.
+In
+The Thirteenth International Conference on Learning
+Representations, ICLR 2025, Singapore, April 24-28, 2025
+. OpenReview.net,
+2025.
+https://openreview.net/forum?id=yaqPf0KAlN
+.
+Gehring et al. (2025)
+Jonas Gehring, Kunhao Zheng, Jade Copet, Vegard Mella, Quentin Carbonneaux,
+Taco Cohen, and Gabriel Synnaeve.
+Rlef: Grounding code llms in execution feedback with reinforcement
+learning, 2025.
+https://arxiv.org/abs/2410.02089
+.
+Gu et al. (2024)
+Alex Gu, Baptiste Rozière, Hugh Leather, Armando Solar-Lezama, Gabriel
+Synnaeve, and Sida I. Wang.
+Cruxeval: A benchmark for code reasoning, understanding and
+execution.
+arXiv preprint arXiv:2401.03065
+, 2024.
+Guo et al. (2025)
+Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu,
+Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al.
+Deepseek-r1: Incentivizing reasoning capability in llms via
+reinforcement learning.
+arXiv preprint arXiv:2501.12948
+, 2025.
+Gupta et al. (2008)
+Ashutosh Gupta, Thomas A. Henzinger, Rupak Majumdar, Andrey Rybalchenko, and
+Ru-Gang Xu.
+Proving non-termination.
+In George C. Necula and Philip Wadler, editors,
+Proceedings of
+the 35th ACM SIGPLAN-SIGACT Symposium on Principles of Programming
+Languages, POPL 2008, San Francisco, California, USA, January 7-12, 2008
+,
+pages 147–158. ACM, 2008.
+10.1145/1328438.1328459
+.
+https://doi.org/10.1145/1328438.1328459
+.
+Handa et al. (2025)
+Kunal Handa, Alex Tamkin, Miles McCain, Saffron Huang, Esin Durmus, Sarah Heck,
+Jared Mueller, Jerry Hong, Stuart Ritchie, Tim Belonax, et al.
+Which economic tasks are performed with ai? evidence from millions of
+claude conversations.
+arXiv preprint arXiv:2503.04761
+, 2025.
+Hassid et al. (2025)
+Michael Hassid, Gabriel Synnaeve, Yossi Adi, and Roy Schwartz.
+Don’t overthink it. preferring shorter thinking chains for improved
+llm reasoning.
+arXiv preprint arXiv:2505.17813
+, 2025.
+Hendrycks et al. (2021a)
+Dan Hendrycks, Steven Basart, Saurav Kadavath, Mantas Mazeika, Akul Arora,
+Ethan Guo, Collin Burns, Samir Puranik, Horace He, Dawn Song, et al.
+Measuring coding challenge competence with apps.
+arXiv preprint arXiv:2105.09938
+, 2021a.
+Hendrycks et al. (2021b)
+Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric
+Tang, Dawn Song, and Jacob Steinhardt.
+Measuring mathematical problem solving with the MATH dataset.
+In Joaquin Vanschoren and Sai-Kit Yeung, editors,
+Proceedings
+of the Neural Information Processing Systems Track on Datasets and Benchmarks
+1, NeurIPS Datasets and Benchmarks 2021, December 2021, virtual
+,
+2021b.
+https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html
+.
+Hoare (1971)
+C. A. R. Hoare.
+Proof of a program: FIND.
+Commun. ACM
+, 14(1):39–45, 1971.
+10.1145/362452.362489
+.
+https://doi.org/10.1145/362452.362489
+.
+Hoffmann et al. (2022)
+Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor
+Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes
+Welbl, Aidan Clark, et al.
+Training compute-optimal large language models.
+arXiv preprint arXiv:2203.15556
+, 2022.
+Hsieh et al. (2024)
+Cheng-Ping Hsieh, Simeng Sun, Samuel Kriman, Shantanu Acharya, Dima Rekesh, Fei
+Jia, Yang Zhang, and Boris Ginsburg.
+Ruler: What’s the real context size of your long-context language
+models?
+arXiv preprint arXiv:2404.06654
+, 2024.
+Hu et al. (2025)
+Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, Xiangyu Zhang, and Heung-Yeung
+Shum.
+Open-reasoner-zero: An open source approach to scaling up
+reinforcement learning on the base model, 2025.
+https://arxiv.org/abs/2503.24290
+.
+Jain et al. (2025a)
+Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida
+Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica.
+Livecodebench: Holistic and contamination free evaluation of large
+language models for code.
+In
+The Thirteenth International Conference on Learning
+Representations, ICLR 2025, Singapore, April 24-28, 2025
+. OpenReview.net,
+2025a.
+https://openreview.net/forum?id=chfJJYC3iL
+.
+Jain et al. (2025b)
+Naman Jain, Jaskirat Singh, Manish Shetty, Liang Zheng, Koushik Sen, and Ion
+Stoica.
+R2e-gym: Procedural environments and hybrid verifiers for scaling
+open-weights swe agents.
+arXiv preprint arXiv:2504.07164
+, 2025b.
+Jimenez et al. (2024)
+Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir
+Press, and Karthik R. Narasimhan.
+Swe-bench: Can language models resolve real-world github issues?
+In
+The Twelfth International Conference on Learning
+Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024
+.
+OpenReview.net, 2024.
+https://openreview.net/forum?id=VTF8yNQM66
+.
+Kaplan et al. (2020)
+Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon
+Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei.
+Scaling laws for neural language models.
+arXiv preprint arXiv:2001.08361
+, 2020.
+Kimi Team et al. (2025)
+Kimi Team, Yifan Bai, Yiping Bao, Guanduo Chen, Jiahao Chen, Ningxin
+Chen, Ruijue Chen, Yanru Chen, Yuankun Chen, Yutian Chen, et al.
+Kimi k2: Open agentic intelligence.
+arXiv preprint arXiv:2507.20534
+, 2025.
+Kwon et al. (2023)
+Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu,
+Joseph E. Gonzalez, Hao Zhang, and Ion Stoica.
+Efficient memory management for large language model serving with
+pagedattention.
+In
+Proceedings of the ACM SIGOPS 29th Symposium on Operating
+Systems Principles
+, 2023.
+Kydlicek et al. (2025)
+Hynek Kydlicek, Alina Lozovskaya, Nathan Habib, and Clémentine Fourrier.
+Math-verify, 2025.
+https://github.com/huggingface/Math-Verify
+.
+Lee (2019)
+Casey Lee.
+act, 2019.
+https://github.com/nektos/act
+.
+Lefaudeux et al. (2022)
+Benjamin Lefaudeux, Francisco Massa, Diana Liskovich, Wenhan Xiong, Vittorio
+Caggiano, Sean Naren, Min Xu, Jieru Hu, Marta Tintore, Susan Zhang, et al.
+xformers: A modular and hackable transformer modelling library, 2022.
+Li et al. (2024)
+Tianle Li, Wei-Lin Chiang, Evan Frick, Lisa Dunlap, Tianhao Wu, Banghua Zhu,
+Joseph E. Gonzalez, and Ion Stoica.
+From crowdsourced data to high-quality benchmarks: Arena-hard and
+benchbuilder pipeline.
+CoRR
+, abs/2406.11939, 2024.
+10.48550/ARXIV.2406.11939
+.
+https://doi.org/10.48550/arXiv.2406.11939
+.
+Li et al. (2022)
+Yujia Li, David H. Choi, Junyoung Chung, Nate Kushman, Julian Schrittwieser,
+Rémi Leblond, Tom Eccles, James Keeling, Felix Gimeno, Agustin Dal
+Lago, Thomas Hubert, Peter Choy, Cyprien de Masson d’Autume, Igor Babuschkin,
+Xinyun Chen, Po-Sen Huang, Johannes Welbl, Sven Gowal, Alexey Cherepanov,
+James Molloy, Daniel J. Mankowitz, Esme Sutherland Robson, Pushmeet Kohli,
+Nando de Freitas, Koray Kavukcuoglu, and Oriol Vinyals.
+Competition-level code generation with alphacode.
+CoRR
+, abs/2203.07814, 2022.
+10.48550/ARXIV.2203.07814
+.
+https://doi.org/10.48550/arXiv.2203.07814
+.
+Lightman et al. (2023)
+Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker,
+Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe.
+Let’s verify step by step.
+In
+The Twelfth International Conference on Learning
+Representations
+, 2023.
+Lin et al. (2025)
+Yong Lin, Shange Tang, Bohan Lyu, Jiayun Wu, Hongzhou Lin, Kaiyu Yang, Jia Li,
+Mengzhou Xia, Danqi Chen, Sanjeev Arora, and Chi Jin.
+Goedel-prover: A frontier model for open-source automated theorem
+proving, 2025.
+https://arxiv.org/abs/2502.07640
+.
+Liu et al. (2025)
+Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun
+Lee, and Min Lin.
+Understanding r1-zero-like training: A critical perspective.
+arXiv preprint arXiv:2503.20783
+, 2025.
+Loshchilov and Hutter (2019)
+Ilya Loshchilov and Frank Hutter.
+Decoupled weight decay regularization.
+In
+International Conference on Learning Representations
+, 2019.
+https://openreview.net/forum?id=Bkg6RiCqY7
+.
+mathlib Community (2020)
+The mathlib Community.
+The lean mathematical library.
+In
+Proceedings of the 9th ACM SIGPLAN International Conference
+on Certified Programs and Proofs
+, CPP 2020, page 367–381, New York, NY,
+USA, 2020. Association for Computing Machinery.
+ISBN 9781450370974.
+10.1145/3372885.3373824
+.
+https://doi.org/10.1145/3372885.3373824
+.
+Mella (2025)
+Vegard Mella.
+Moodist, 2025.
+https://github.com/facebookresearch/moodist
+.
+Mentat AI Team (2025)
+Mentat AI Team.
+Locodiff-bench: Natural long-context code benchmark, 2025.
+https://github.com/AbanteAI/LoCoDiff-bench
+.
+Meta AI (2025)
+Meta AI.
+Llama 4 model card.
+https://github.com/meta-llama/llama-models/blob/main/models/llama4/MODEL_CARD.md
+,
+2025.
+Accessed: 2025-09-18.
+Meurer et al. (2017)
+Aaron Meurer, Christopher P. Smith, Mateusz Paprocki, Ondřej
+Čertík, Sergey B. Kirpichev, Matthew Rocklin, AMiT Kumar, Sergiu
+Ivanov, Jason K. Moore, Sartaj Singh, Thilina Rathnayake, Sean Vig, Brian E.
+Granger, Richard P. Muller, Francesco Bonazzi, Harsh Gupta, Shivam Vats,
+Fredrik Johansson, Fabian Pedregosa, Matthew J. Curry, Andy R. Terrel,
+Štěpán Roučka, Ashutosh Saboo, Isuru Fernando, Sumith Kulal,
+Robert Cimrman, and Anthony Scopatz.
+Sympy: symbolic computing in python.
+PeerJ Computer Science
+, 3:e103, January 2017.
+ISSN 2376-5992.
+10.7717/peerj-cs.103
+.
+https://doi.org/10.7717/peerj-cs.103
+.
+Micikevicius et al. (2022)
+Paulius Micikevicius, Dusan Stosic, Neil Burgess, Marius Cornea, Pradeep Dubey,
+Richard Grisenthwaite, Sangwon Ha, Alexander Heinecke, Patrick Judd, John
+Kamalu, et al.
+Fp8 formats for deep learning.
+arXiv preprint arXiv:2209.05433
+, 2022.
+Mihaylov et al. (2018)
+Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal.
+Can a suit of armor conduct electricity? A new dataset for open
+book question answering.
+In Ellen Riloff, David Chiang, Julia Hockenmaier, and Jun’ichi
+Tsujii, editors,
+Proceedings of the 2018 Conference on Empirical
+Methods in Natural Language Processing, Brussels, Belgium, October 31 -
+November 4, 2018
+, pages 2381–2391. Association for Computational
+Linguistics, 2018.
+10.18653/V1/D18-1260
+.
+https://doi.org/10.18653/v1/d18-1260
+.
+Mistral-AI et al. (2025)
+Mistral-AI, Abhinav Rastogi, Albert Q. Jiang, Andy Lo, Gabrielle Berrada,
+Guillaume Lample, Jason Rute, Joep Barmentlo, Karmesh Yadav, Kartik
+Khandelwal, Khyathi Raghavi Chandu, Léonard Blier, Lucile Saulnier, Matthieu
+Dinot, Maxime Darrin, Neha Gupta, Roman Soletskyi, Sagar Vaze, Teven Le Scao,
+Yihan Wang, Adam Yang, Alexander H. Liu, Alexandre Sablayrolles, Amélie
+Héliou, Amélie Martin, Andy Ehrenberg, Anmol Agarwal, Antoine Roux, Arthur
+Darcet, Arthur Mensch, Baptiste Bout, Baptiste Rozière, Baudouin De
+Monicault, Chris Bamford, Christian Wallenwein, Christophe Renaudin,
+Clémence Lanfranchi, Darius Dabert, Devon Mizelle, Diego de las Casas,
+Elliot Chane-Sane, Emilien Fugier, Emma Bou Hanna, Gauthier Delerce, Gauthier
+Guinet, Georgii Novikov, Guillaume Martin, Himanshu Jaju, Jan Ludziejewski,
+Jean-Hadrien Chabran, Jean-Malo Delignon, Joachim Studnia, Jonas Amar,
+Josselin Somerville Roberts, Julien Denize, Karan Saxena, Kush Jain, Lingxiao
+Zhao, Louis Martin, Luyu Gao, Lélio Renard Lavaud, Marie Pellat, Mathilde
+Guillaumin, Mathis Felardos, Maximilian Augustin, Mickaël Seznec, Nikhil
+Raghuraman, Olivier Duchenne, Patricia Wang, Patrick von Platen, Patryk
+Saffer, Paul Jacob, Paul Wambergue, Paula Kurylowicz, Pavankumar Reddy
+Muddireddy, Philomène Chagniot, Pierre Stock, Pravesh Agrawal, Romain
+Sauvestre, Rémi Delacourt, Sanchit Gandhi, Sandeep Subramanian, Shashwat
+Dalal, Siddharth Gandhi, Soham Ghosh, Srijan Mishra, Sumukh Aithal, Szymon
+Antoniak, Thibault Schueller, Thibaut Lavril, Thomas Robert, Thomas Wang,
+Timothée Lacroix, Valeriia Nemychnikova, Victor Paltz, Virgile Richard,
+Wen-Ding Li, William Marshall, Xuanyu Zhang, and Yunhao Tang.
+Magistral, 2025.
+https://arxiv.org/abs/2506.10910
+.
+(68)
+Modal Team.
+Modal: High-performance ai infrastructure.
+https://modal.com/docs
+.
+Accessed 2025-08-18.
+Moshkov et al. (2025)
+Ivan Moshkov, Darragh Hanley, Ivan Sorokin, Shubham Toshniwal, Christof Henkel,
+Benedikt Schifferer, Wei Du, and Igor Gitman.
+Aimo-2 winning solution: Building state-of-the-art mathematical
+reasoning models with openmathreasoning dataset.
+arXiv preprint arXiv:2504.16891
+, 2025.
+Moura and Ullrich (2021)
+Leonardo de Moura and Sebastian Ullrich.
+The lean 4 theorem prover and programming language.
+In André Platzer and Geoff Sutcliffe, editors,
+Automated
+Deduction – CADE 28
+, pages 625–635, Cham, 2021. Springer International
+Publishing.
+ISBN 978-3-030-79876-5.
+Muennighoff et al. (2023)
+Niklas Muennighoff, Alexander Rush, Boaz Barak, Teven Le Scao, Nouamane Tazi,
+Aleksandra Piktus, Sampo Pyysalo, Thomas Wolf, and Colin A Raffel.
+Scaling data-constrained language models.
+Advances in Neural Information Processing Systems
+,
+36:50358–50376, 2023.
+OpenAI (2024)
+OpenAI.
+Learning to reason with llms, September 2024.
+https://openai.com/index/learning-to-reason-with-llms/
+.
+OpenAI (2025a)
+OpenAI.
+gpt-oss-120b & gpt-oss-20b model card.
+arXiv preprint arXiv:2508.10925
+, 2025a.
+OpenAI (2025b)
+OpenAI.
+Introducing OpenAI o3 and o4-mini, April 2025b.
+https://openai.com/index/introducing-o3-and-o4-mini/
+.
+Ouyang et al. (2022)
+Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela
+Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al.
+Training language models to follow instructions with human feedback.
+Advances in neural information processing systems
+,
+35:27730–27744, 2022.
+Paliskara and Saroufim (2025)
+Sahan Paliskara and Mark Saroufim.
+Kernelbook, 5 2025.
+https://huggingface.co/datasets/GPUMODE/KernelBook
+.
+Pan et al. (2025)
+Jiayi Pan, Xingyao Wang, Graham Neubig, Navdeep Jaitly, Heng Ji, Alane Suhr,
+and Yizhe Zhang.
+Training software engineering agents and verifiers with swe‑gym.
+In
+Proceedings of the 42nd International Conference on Machine
+Learning (ICML 2025)
+, 2025.
+https://arxiv.org/abs/2412.21139
+.
+arXiv:2412.21139, accepted at ICML 2025.
+Piche et al. (2025)
+Alex Piche, Rafael Pardinas, Ehsan Kamalloo, and Dzmitry Bahdanau.
+Pipeline RL: fast LLM agent training, 2025.
+https://huggingface.co/blog/ServiceNow/pipelinerl
+.
+Radford et al. (2018)
+Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever.
+Improving language understanding by generative pre-training.
+Technical report, OpenAI, 2018.
+https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf
+.
+Rastogi et al. (2025)
+Abhinav Rastogi, Albert Q Jiang, Andy Lo, Gabrielle Berrada, Guillaume Lample,
+Jason Rute, Joep Barmentlo, Karmesh Yadav, Kartik Khandelwal, Khyathi Raghavi
+Chandu, et al.
+Magistral.
+arXiv preprint arXiv:2506.10910
+, 2025.
+Rein et al. (2023)
+David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe
+Pang, Julien Dirani, Julian Michael, and Samuel R. Bowman.
+GPQA: A graduate-level google-proof q&a benchmark.
+CoRR
+, abs/2311.12022, 2023.
+10.48550/ARXIV.2311.12022
+.
+https://doi.org/10.48550/arXiv.2311.12022
+.
+Roziere et al. (2023)
+Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat,
+Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Romain Sauvestre, Tal Remez,
+et al.
+Code llama: Open foundation models for code.
+arXiv preprint arXiv:2308.12950
+, 2023.
+Saavedra et al. (2024)
+Nuno Saavedra, André Silva, and Martin Monperrus.
+Gitbug-actions: Building reproducible bug-fix benchmarks with github
+actions.
+In
+Proceedings of the 2024 IEEE/ACM 46th International
+Conference on Software Engineering: Companion Proceedings
+, ICSE-Companion
+’24, page 1–5, New York, NY, USA, 2024. Association for Computing
+Machinery.
+ISBN 9798400705021.
+10.1145/3639478.3640023
+.
+https://doi.org/10.1145/3639478.3640023
+.
+Sakaguchi et al. (2020)
+Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi.
+Winogrande: An adversarial winograd schema challenge at scale.
+In
+The Thirty-Fourth AAAI Conference on Artificial
+Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of
+Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium
+on Educational Advances in Artificial Intelligence, EAAI 2020, New York,
+NY, USA, February 7-12, 2020
+, pages 8732–8740. AAAI Press, 2020.
+10.1609/AAAI.V34I05.6399
+.
+https://doi.org/10.1609/aaai.v34i05.6399
+.
+Schulman (2020)
+John Schulman.
+Approximating KL divergence, 2020.
+https://joschu.net/blog/kl-approx.html
+.
+Schulman et al. (2017)
+John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017.
+Shao et al. (2024)
+Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei
+Zhang, Mingchuan Zhang, Y. K. Li, Y. Wu, and Daya Guo.
+Deepseekmath: Pushing the limits of mathematical reasoning in open
+language models, 2024.
+https://arxiv.org/abs/2402.03300
+.
+Shazeer (2020)
+Noam Shazeer.
+GLU variants improve transformer, 2020.
+https://arxiv.org/abs/2002.05202
+.
+Su et al. (2021)
+Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu.
+Roformer: Enhanced transformer with rotary position embedding, 2021.
+https://arxiv.org/abs/2104.09864
+.
+Synnaeve et al. (2019)
+Gabriel Synnaeve, Jonas Gehring, Zeming Lin, Daniel Haziza, Nicolas Usunier,
+Danielle Rothermel, Vegard Mella, Da Ju, Nicolas Carion, Laura Gustafson,
+et al.
+Growing up together: Structured exploration for large action spaces.
+2019.
+Talmor et al. (2019)
+Alon Talmor, Jonathan Herzig, Nicholas Lourie, and Jonathan Berant.
+Commonsenseqa: A question answering challenge targeting commonsense
+knowledge.
+In Jill Burstein, Christy Doran, and Thamar Solorio, editors,
+Proceedings of the 2019 Conference of the North American Chapter of the
+Association for Computational Linguistics: Human Language Technologies,
+NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and
+Short Papers)
+, pages 4149–4158. Association for Computational Linguistics,
+2019.
+10.18653/V1/N19-1421
+.
+https://doi.org/10.18653/v1/n19-1421
+.
+Tang et al. (2025)
+Yunhao Tang, Kunhao Zheng, Gabriel Synnaeve, and Rémi Munos.
+Optimizing language models for inference time objectives using
+reinforcement learning.
+CoRR
+, abs/2503.19595, 2025.
+10.48550/ARXIV.2503.19595
+.
+https://doi.org/10.48550/arXiv.2503.19595
+.
+The Terminal-Bench Team (2025)
+The Terminal-Bench Team.
+Terminal-bench: A benchmark for ai agents in terminal environments,
+Apr 2025.
+https://github.com/laude-institute/terminal-bench
+.
+Vanegue et al. (2025)
+Julien Vanegue, Jules Villard, Peter O’Hearn, and Azalea Raad.
+Non-termination proving: 100 million loc and beyond, 2025.
+https://arxiv.org/abs/2509.05293
+.
+Vaswani et al. (2017)
+Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
+Aidan N Gomez, Łukasz Kaiser, and Illia Polosukhin.
+Attention is all you need.
+Advances in neural information processing systems
+, 30, 2017.
+Wang et al. (2022a)
+Shibo Wang, Jinliang Wei, Amit Sabne, Andy Davis, Berkin Ilbeyi, Blake
+Hechtman, Dehao Chen, Karthik Srinivasa Murthy, Marcello Maggioni, Qiao
+Zhang, et al.
+Overlap communication with dependent computation via decomposition in
+large deep learning models.
+In
+Proceedings of the 28th ACM International Conference on
+Architectural Support for Programming Languages and Operating Systems, Volume
+1
+, pages 93–106, 2022a.
+Wang et al. (2025)
+Xingyao Wang, Boxuan Li, Yufan Song, Frank F. Xu, Xianu Tang, Mingchen Zhuge,
+Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, Hoang H. Tran, Fuqiang Li,
+Ren Ma, Mingzhang Zheng, Bill Qian, Yanjun Shao, Niklas Muennighoff, Yizhe
+Zhang, Binyuan Hui, Junyang Lin, Robert Brennan, Hao Peng, Heng Ji, and
+Graham Neubig.
+Openhands: An open platform for AI software developers as
+generalist agents.
+In
+The Thirteenth International Conference on Learning
+Representations
+, 2025.
+https://openreview.net/forum?id=OJd3ayDDoF
+.
+Wang et al. (2022b)
+Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, Sharan Narang,
+Aakanksha Chowdhery, and Denny Zhou.
+Self-consistency improves chain of thought reasoning in language
+models.
+arXiv preprint arXiv:2203.11171
+, 2022b.
+Wei et al. (2023)
+Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia,
+Ed Chi, Quoc Le, and Denny Zhou.
+Chain-of-thought prompting elicits reasoning in large language
+models, 2023.
+https://arxiv.org/abs/2201.11903
+.
+Wei et al. (2024)
+Jason Wei, Nguyen Karina, Hyung Won Chung, Yunxin Joy Jiao, Spencer Papay,
+Amelia Glaese, John Schulman, and William Fedus.
+Measuring short-form factuality in large language models.
+CoRR
+, abs/2411.04368, 2024.
+10.48550/ARXIV.2411.04368
+.
+https://doi.org/10.48550/arXiv.2411.04368
+.
+Wei et al. (2025)
+Yuxiang Wei, Olivier Duchenne, Jade Copet, Quentin Carbonneaux, Lingming Zhang,
+Daniel Fried, Gabriel Synnaeve, Rishabh Singh, and Sida I. Wang.
+Swe-rl: Advancing llm reasoning via reinforcement learning on open
+software evolution.
+arXiv preprint arXiv:2502.18449
+, 2025.
+Xia et al. (2024)
+Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang.
+Agentless: Demystifying llm-based software engineering agents.
+arXiv preprint arXiv:2407.01489
+, 2024.
+Xiong et al. (2023)
+Wenhan Xiong, Jingyu Liu, Igor Molybog, Hejia Zhang, Prajjwal Bhargava, Rui
+Hou, Louis Martin, Rashi Rungta, Karthik Abinav Sankararaman, Barlas Oguz,
+et al.
+Effective long-context scaling of foundation models.
+arXiv preprint arXiv:2309.16039
+, 2023.
+Yang et al. (2025a)
+An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen
+Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al.
+Qwen3 technical report.
+arXiv preprint arXiv:2505.09388
+, 2025a.
+Yang et al. (2024)
+John Yang, Carlos E Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao,
+Karthik Narasimhan, and Ofir Press.
+Swe-agent: Agent-computer interfaces enable automated software
+engineering.
+Advances in Neural Information Processing Systems
+,
+37:50528–50652, 2024.
+Yang et al. (2025b)
+John Yang, Kilian Lieret, Carlos E Jimenez, Alexander Wettig, Kabir Khandpur,
+Yanzhe Zhang, Binyuan Hui, Ofir Press, Ludwig Schmidt, and Diyi Yang.
+Swe-smith: Scaling data for software engineering agents.
+arXiv preprint arXiv:2504.21798
+, 2025b.
+Yeverechyahu et al. (2024)
+Doron Yeverechyahu, Raveesh Mayya, and Gal Oestreicher-Singer.
+The impact of large language models on open-source innovation:
+Evidence from github copilot.
+arXiv preprint arXiv:2409.08379
+, 2024.
+Ying et al. (2025)
+Huaiyuan Ying, Zijian Wu, Yihan Geng, Zheng Yuan, Dahua Lin, and Kai Chen.
+Lean workbook: A large-scale lean problem set formalized from natural
+language math problems, 2025.
+https://arxiv.org/abs/2406.03847
+.
+Yu et al. (2025)
+Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Tiantian
+Fan, Gaohong Liu, Lingjun Liu, Xin Liu, et al.
+Dapo: An open-source llm reinforcement learning system at scale.
+arXiv preprint arXiv:2503.14476
+, 2025.
+Yue et al. (2024)
+Albert S. Yue, Lovish Madaan, Ted Moskovitz, DJ Strouse, and Aaditya K. Singh.
+HARP: A challenging human-annotated math reasoning benchmark.
+CoRR
+, abs/2412.08819, 2024.
+10.48550/ARXIV.2412.08819
+.
+https://doi.org/10.48550/arXiv.2412.08819
+.
+Zellers et al. (2019)
+Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi.
+Hellaswag: Can a machine really finish your sentence?
+In Anna Korhonen, David R. Traum, and Lluís Màrquez,
+editors,
+Proceedings of the 57th Conference of the Association for
+Computational Linguistics, ACL 2019, Florence, Italy, July 28- August 2,
+2019, Volume 1: Long Papers
+, pages 4791–4800. Association for Computational
+Linguistics, 2019.
+10.18653/V1/P19-1472
+.
+https://doi.org/10.18653/v1/p19-1472
+.
+Zhang and Sennrich (2019)
+Biao Zhang and Rico Sennrich.
+Root mean square layer normalization
+.
+Curran Associates Inc., Red Hook, NY, USA, 2019.
+Zhang et al. (n.d.)
+David W Zhang, Michaël Defferrard, Corrado Rainone, and Roland Memisevic.
+Grounding code understanding in step-by-step execution.
+https://openreview.net/forum?id=MUr7Fl93QS
+.
+Zheng et al. (2022)
+Kunhao Zheng, Jesse Michael Han, and Stanislas Polu.
+Minif2f: a cross-system benchmark for formal olympiad-level
+mathematics, 2022.
+https://arxiv.org/abs/2109.00110
+.
+Zheng et al. (2023)
+Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu,
+Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang,
+Joseph E. Gonzalez, and Ion Stoica.
+Judging llm-as-a-judge with mt-bench and chatbot arena.
+In Alice Oh, Tristan Naumann, Amir Globerson, Kate Saenko, Moritz
+Hardt, and Sergey Levine, editors,
+Advances in Neural Information
+Processing Systems 36: Annual Conference on Neural Information Processing
+Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023
+,
+2023.
+http://papers.nips.cc/paper_files/paper/2023/hash/91f18a1287b398d378ef22505bf41832-Abstract-Datasets_and_Benchmarks.html
+.
+Appendix
+10
+Acknowledgments
+The authors thank Ariel Stolerman, Ayelet Regev Dabah, Dani Shames, Tamir Meyer and Nadav Azaria for support in building executable repository images at scale; Jeff Yang, Yonatan Komornik and Tarun Anand for support in curating GitHub PR and Issue metadata; Qian Liang, Meng Zhang, Hanwen Zha, Ananya Saxena, Emily Dinan, Melanie Kambadur for the support in data preparation; Yining Yang, Sten Sootla, Chris Waterson and Michael Jiang for support in the development of RepoAgent and additional repository images;
+Eslam Elnikety, Jamie Cahill, Christine Wang, Don Landrum, Sadman Fahmid, Andrew Hamiel, Ned Newton, Andrii Golovei, Rashmi Narasimha, Zack Leman, Mehrdad Mahdavi, Leon Yang, Joshua Fink, Sargun Dillon, Jeff Hanson and Zach Wentz for the internal sandboxing platform and the code execution and Docker execution services built atop it, enabling secure and massively parallel execution of untrusted code; Mathurin Videau, Leonid Shamis, Jeremy Reizenstein, Maria Lomeli, Lucca Bertoncini, Vivien Cabannes, Charles Arnal and Pascal Kesseli for their contributions to the CWM research codebase and training and evaluation infrastructure; Julien Vanegue for advice on practical aspects of the halting problem;
+Daniel Fried and Rishabh Singh for support in designing and developing Agentic SWE RL;
+the Modal team – especially Jonathon Belotti, Matthew Saltz, Colin Weld, Peyton Walters, Deven Navani, Michael Waskom, Advay Pal, Akshat Bubna, Alec Powell, Lucy Zhang, and Eric Zhang – for extensive support with remote execution, infrastructure, and platform stability;
+Lovish Madaan, Binh Tang, Viktor Kerkez, Rishabh Agarwal, Alan Schelten, Xuewei Wang and Jeremy Fu for support with mathematical expression comparison code.
+11
+CWM
+Examples
+Extending
+Section 3
+, we here present additional examples of using
+CWM
+for SWE reasoning, trace prediction, and a combination of the two.
+Reasoning agent.
+Figure 11.19
+shows an example of
+CWM
+solving an SWE-bench Verified problem in a bash-only environment, which is more challenging than environments that provide dedicated tools for common tasks such as file editing. In this example, the model makes incorrect edits in the initial turns but realizes its error and restores the original file state using
+git checkout
+, followed by producing a correct edit with
+sed
+.
+Figure 11.20
+demonstrates that
+CWM
+can leverage test execution to verify patch correctness before submission. In this specific example, the agent makes sure that the changes it makes do not break any existing functionality. Only after this verification, the agent submits the patch and generates a summary.
+Lastly,
+Figure 11.21
+shows the default SWE RL setting where
+CWM
+is paired with the
+edit
+tool. In this example,
+CWM
+performs extensive reasoning before making the edit. The
+edit
+tool then provides agent-friendly feedback showing the surrounding code after the change.
+Python execution trace prediction.
+Figures 11.23
+and
+11.24
+showcase Python execution trace prediction at inference time and compare it to reasoning about program execution in natural language.
+For
+Figure 11.23
+, a Python list is modified while iterating over it.
+In execution trace prediction mode, the model tracks all list modifications and predicts the output correctly.
+With natural language reasoning, the model fails to predict the correct return value – even though it appears to recognize the list modification during reasoning.
+Conversely,
+Figure 11.24
+presents an example requiring the evaluation of a complex Python statement.
+Execution trace prediction fails to correctly predict the outcome of the statement in a single prediction step, but natural language reasoning breaks down the complex statement into simpler expressions and then combines those into a correct result.
+We believe that combining the groundedness of trace prediction with the flexibility of natural language reasoning makes for interesting future research.
+Figure 11.25
+demonstrates how
+CWM
+’s execution trace prediction capabilities allow it to function as a neural Python debugger.
+We think that equipping
+CWM
+with debugging capabilities that are not available with traditional debuggers, such as skipping loops in constant time, jumping to arbitrary lines of code, or predicting inputs to reach arbitrary states is highly interesting future work.
+Figure 11.22
+shows how we use execution trace prediction for CruxEval output prediction in our experiments in
+Section 7.3
+.
+Figure 11.26
+demonstrates how
+CWM
+’s Python execution trace prediction capability can be used for code generation.
+By specifying a set of
+asserts
+consistent with the desired behavior and simply pretending to
+import
+the desired function, without actually giving a function definition,
+CWM
+starts to generate actions consistent with the desired function.
+It is possible that the model acquired this capability of jointly tracing and generating code because for some tracing data we do not include the source context of third-party libraries.
+We expand on this in
+Figure 11.27
+, the example of mixing tracing and code generation discussed earlier in Section
+3
+.
+Future work could build on
+CWM
+’s capabilities here and explore how execution trace prediction can be used to improve code generation.
+Program Termination.
+Figure 11.28
+illustrates termination reasoning, whereby CWM considers several concrete inputs before generalizing to the conclusion of terminating on all inputs.
+⬇
+The
+issue
+persists
+due
+to
+an
+indentation
+error
+.
+I
+need
+to
+correct
+the
+indentation
+in
+the
+sky_coordinate
+.
+py
+file
+.
+However
+,
+since
+the
+’
+edit
+’
+tool
+is
+not
+available
+,
+I
+’
+ll
+try
+a
+different
+approach
+.
+Let
+me
+check
+the
+current
+state
+of
+the
+__getattr__
+method
+in
+sky_coordinate
+.
+py
+.
+⬇
+<
+tool
+:
+bash
+>
+sed
+-
+n
+890,910
+p
+/
+testbed
+/
+astropy
+/
+coordinates
+/
+sky_coordinate
+.
+py
+</
+tool
+>
+⬇
+<
+output
+>
+return
+getattr
+(
+self
+.
+_sky_coord_frame
+,
+attr
+)
+#
+Try
+to
+interpret
+as
+a
+new
+frame
+for
+transforming
+.
+frame_cls
+=
+frame_transform_graph
+.
+lookup_name
+(
+attr
+)
+if
+frame_cls
+is
+not
+None
+and
+self
+.
+frame
+.
+is_transformable_to
+(
+frame_cls
+):
+return
+self
+.
+transform_to
+(
+attr
+)
+#
+Fail
+return
+self
+.
+__getattribute__
+(
+attr
+)
+f
+"’{
+self
+.
+__class__
+.
+__name__
+}’
+object
+has
+no
+attribute
+’{
+attr
+}’"
+)
+def
+__setattr__
+(
+self
+,
+attr
+,
+val
+):
+#
+This
+is
+to
+make
+anything
+available
+through
+__getattr__
+immutable
+if
+"
+_sky_coord_frame
+"
+in
+self
+.
+__dict__
+:
+if
+self
+.
+_is_name
+(
+attr
+):
+raise
+AttributeError
+(
+f
+"’{
+attr
+}’
+is
+immutable
+")
+if
+not
+attr
+.
+startswith
+("
+_
+")
+and
+hasattr
+(
+self
+.
+_sky_coord_frame
+,
+attr
+):
+setattr
+(
+self
+.
+_sky_coord_frame
+,
+attr
+,
+val
+)
+return
+</
+output
+>
+<
+budget
+>
+*
+Remaining
+turns
+:
+82
+*
+Remaining
+tokens
+:
+85000
+</
+budget
+>
+⬇
+The
+issue
+is
+that
+the
+line
+‘
+return
+self
+.
+__getattribute__
+(
+attr
+)‘
+is
+not
+properly
+replacing
+the
+original
+‘
+raise
+AttributeError
+‘
+line
+,
+and
+there
+’
+s
+an
+extra
+line
+with
+‘
+f
+"’{
+self
+.
+__class__
+.
+__name__
+}’
+object
+has
+no
+attribute
+’{
+attr
+}’"‘
+that
+’
+s
+causing
+a
+syntax
+error
+.
+I
+need
+to
+correctly
+replace
+the
+entire
+block
+.
+Let
+me
+try
+to
+fix
+this
+by
+restoring
+the
+original
+file
+and
+applying
+the
+change
+again
+carefully
+.
+⬇
+<
+tool
+:
+bash
+>
+git
+checkout
+/
+testbed
+/
+astropy
+/
+coordinates
+/
+sky_coordinate
+.
+py
+</
+tool
+>
+⬇
+<
+output
+>
+Updated
+1
+path
+from
+the
+index
+</
+output
+>
+<
+budget
+>
+*
+Remaining
+turns
+:
+81
+*
+Remaining
+tokens
+:
+84837
+</
+budget
+>
+⬇
+Now
+that
+the
+sky_coordinate
+.
+py
+file
+has
+been
+restored
+,
+I
+’
+ll
+apply
+the
+correct
+change
+using
+sed
+,
+making
+sure
+to
+replace
+the
+entire
+raise
+AttributeError
+block
+with
+the
+correct
+line
+.
+The
+original
+code
+at
+line
+898
+is
+:
+#
+Fail
+raise
+AttributeError
+(
+f
+"’{
+self
+.
+__class__
+.
+__name__
+}’
+object
+has
+no
+attribute
+’{
+attr
+}’"
+)
+I
+need
+to
+replace
+this
+with
+:
+#
+Fail
+return
+self
+.
+__getattribute__
+(
+attr
+)
+Let
+me
+use
+sed
+to
+do
+this
+correctly
+.
+⬇
+<
+tool
+:
+bash
+>
+sed
+-
+i
+’898,900
+c
+\
+#
+Fail
+\
+n
+return
+self
+.
+__getattribute__
+(
+attr
+)’
+/
+testbed
+/
+astropy
+/
+coordinates
+/
+sky_coordinate
+.
+py
+</
+tool
+>
+Figure 11.19
+:
+Example intermediate turns in SWE RL (bash-only). The agent learns to correct indentation mistakes from wrong edits by restoring the original file state using
+git checkout
+, followed by applying a correct edit using
+sed
+.
+⬇
+The
+code
+ran
+without
+errors
+,
+and
+‘
+model
+.
+dual_coef_
+‘
+was
+printed
+,
+which
+should
+be
+an
+empty
+CSR
+matrix
+.
+However
+,
+the
+output
+is
+empty
+,
+which
+suggests
+that
+‘
+model
+.
+dual_coef_
+‘
+is
+indeed
+an
+empty
+matrix
+.
+Now
+,
+I
+need
+to
+verify
+that
+this
+fix
+doesn
+’
+t
+break
+any
+other
+functionality
+.
+To
+do
+that
+,
+I
+’
+ll
+run
+the
+tests
+for
+the
+SVM
+module
+in
+sklearn
+to
+ensure
+that
+all
+tests
+pass
+.
+⬇
+<
+tool
+:
+bash
+>
+python
+-
+m
+pytest
+sklearn
+/
+svm
+/
+tests
+/
+</
+tool
+>
+⬇
+=============================
+test
+session
+starts
+==============================
+platform
+linux
+--
+Python
+3.6.13,
+pytest
+-6.2.4,
+py
+-1.11.0,
+pluggy
+-0.13.1
+rootdir
+:
+/
+testbed
+,
+configfile
+:
+setup
+.
+cfg
+collected
+135
+items
+sklearn
+/
+svm
+/
+tests
+/
+test_bounds
+.
+py
+...................
+[
+14%
+sklearn
+/
+svm
+/
+tests
+/
+test_sparse
+.
+py
+...............................
+[
+37%
+sklearn
+/
+svm
+/
+tests
+/
+test_svm
+.
+py
+..........................................
+[
+68%
+...........................................
+[100%
+=======================
+135
+passed
+,
+15
+warnings
+in
+4.31
+s
+=======================
+⬇
+All
+tests
+have
+passed
+,
+which
+suggests
+that
+the
+fix
+is
+correct
+and
+doesn
+’
+t
+break
+any
+existing
+functionality
+.
+Now
+,
+I
+need
+to
+create
+a
+patch
+file
+that
+includes
+the
+changes
+made
+to
+‘
+sklearn
+/
+svm
+/
+base
+.
+py
+‘.
+⬇
+<
+tool
+:
+bash
+>
+git
+diff
+sklearn
+/
+svm
+/
+base
+.
+py
+>
+svm_patch
+.
+patch
+</
+tool
+>
+⬇
+(
+No
+output
+)
+⬇
+The
+patch
+file
+has
+been
+created
+.
+Now
+,
+I
+’
+ll
+submit
+it
+as
+the
+final
+solution
+.
+⬇
+<
+tool
+:
+submit
+>
+./
+svm_patch
+.
+patch
+</
+tool
+>
+⬇
+The
+following
+patch
+content
+is
+marked
+as
+your
+final
+submission
+:
+<
+submission
+>
+diff
+--
+git
+a
+/
+sklearn
+/
+svm
+/
+base
+.
+py
+b
+/
+sklearn
+/
+svm
+/
+base
+.
+py
+index
+64
+cebe837
+..67
+dd8e38f
+100644
+---
+a
+/
+sklearn
+/
+svm
+/
+base
+.
+py
++++
+b
+/
+sklearn
+/
+svm
+/
+base
+.
+py
+@@
+-287,8
++287,11
+@@
+class
+BaseLibSVM
+(
+BaseEstimator
+,
+metaclass
+=
+ABCMeta
+):
+n_SV
+=
+self
+.
+support_vectors_
+.
+shape
+[0]
+dual_coef_indices
+=
+np
+.
+tile
+(
+np
+.
+arange
+(
+n_SV
+),
+n_class
+)
+-
+dual_coef_indptr
+=
+np
+.
+arange
+(0,
+dual_coef_indices
+.
+size
++
+1,
+-
+dual_coef_indices
+.
+size
+/
+n_class
+)
++
+if
+dual_coef_indices
+.
+size
+==
+0:
++
+dual_coef_indptr
+=
+np
+.
+array
+([0,
+0],
+dtype
+=
+np
+.
+int32
+)
++
+else
+:
++
+dual_coef_indptr
+=
+np
+.
+arange
+(0,
+dual_coef_indices
+.
+size
++
+1,
++
+dual_coef_indices
+.
+size
+/
+n_class
+)
+self
+.
+dual_coef_
+=
+sp
+.
+csr_matrix
+(
+(
+dual_coef_data
+,
+dual_coef_indices
+,
+dual_coef_indptr
+),
+(
+n_class
+,
+n_SV
+))
+</
+submission
+>
+Review
+the
+patch
+content
+and
+ensure
+it
+correctly
+resolves
+the
+issue
+.
+If
+necessary
+,
+you
+can
+continue
+refining
+the
+patch
+and
+submit
+an
+updated
+version
+.
+⬇
+The
+patch
+looks
+good
+.
+It
+adds
+a
+check
+for
+when
+‘
+dual_coef_indices
+‘
+is
+empty
+and
+handles
+it
+by
+setting
+‘
+dual_coef_indptr
+‘
+to
+‘[0,
+0]‘,
+which
+should
+correctly
+create
+an
+empty
+CSR
+matrix
+for
+‘
+self
+.
+dual_coef_
+‘.
+This
+should
+resolve
+the
+ZeroDivisionError
+issue
+when
+‘
+support_vectors_
+‘
+is
+empty
+.
+I
+’
+m
+confident
+that
+this
+patch
+is
+correct
+and
+should
+fix
+the
+problem
+without
+introducing
+any
+regressions
+,
+as
+all
+tests
+passed
+after
+the
+fix
+.
+⬇
+##
+Issue
+Summary
+The
+issue
+was
+a
+ZeroDivisionError
+in
+the
+‘
+_sparse_fit
+‘
+method
+of
+the
+SVM
+class
+when
+using
+sparse
+data
+and
+the
+‘
+support_vectors_
+‘
+attribute
+was
+empty
+.
+The
+error
+occurred
+because
+of
+a
+division
+by
+zero
+when
+calculating
+the
+‘
+dual_coef_indptr
+‘
+for
+the
+CSR
+matrix
+of
+dual
+coefficients
+.
+##
+Investigation
+and
+Reproduction
+...
+Figure 11.20
+:
+Example final turns in SWE RL, with details like budget notices omitted for brevity. The agent verifies that the patch does not break existing functionality, submits the patch, and produces a summary.
+⬇
+Okay
+,
+so
+the
+grep
+command
+shows
+that
+in
+django
+/
+db
+/
+models
+/
+fields
+/
+json
+.
+py
+,
+there
+are
+several
+instances
+of
+json
+.
+dumps
+being
+used
+.
+The
+ones
+in
+lines
+84
+and
+95
+are
+the
+ones
+in
+the
+get_prep_value
+and
+validate
+methods
+,
+respectively
+.
+These
+are
+the
+ones
+that
+might
+need
+to
+be
+modified
+to
+include
+ensure_ascii
+=
+False
+to
+fix
+the
+Unicode
+display
+issue
+in
+the
+admin
+.
+However
+,
+wait
+a
+minute
+.
+The
+get_prep_value
+method
+is
+used
+to
+prepare
+the
+value
+for
+the
+database
+,
+but
+the
+display_in
+the
+admin
+uses
+this
+method
+as
+well
+.
+If
+we
+modify
+get_prep_value
+to
+include
+ensure_ascii
+=
+False
+,
+that
+could
+affect
+how
+the
+data
+is
+stored
+in
+the
+database
+,
+but
+JSON
+fields
+should
+handle
+Unicode
+correctly
+regardless
+.
+However
+,
+...
+But
+let
+me
+think
+again
+.
+The
+display_for_field
+function
+in
+django
+/
+contrib
+/
+admin
+/
+utils
+.
+py
+calls
+get_prep_value
+on
+the
+JSONField
+instance
+.
+If
+get_prep_value
+returns
+a
+string
+with
+ensure_ascii
+=
+False
+,
+then
+the
+displayed
+JSON
+data
+will
+show
+Unicode
+characters
+correctly
+.
+So
+,
+modifying
+the
+get_prep_value
+method
+to
+include
+ensure_ascii
+=
+False
+in
+the
+json
+.
+dumps
+call
+should
+fix
+the
+issue
+.
+Similarly
+,
+the
+validate
+method
+’
+s
+json
+.
+dumps
+call
+might
+not
+need
+to
+change
+,
+but
+to
+be
+consistent
+,
+maybe
+it
+’
+s
+better
+to
+include
+ensure_ascii
+=
+False
+there
+as
+well
+,
+although
+it
+’
+s
+less
+critical
+for
+display
+purposes
+.
+Additionally
+,
+the
+lines
+122,
+383,
+and
+416
+use
+json
+.
+dumps
+,
+but
+these
+seem
+to
+be
+for
+different
+purposes
+(
+compiling
+JSON
+paths
+,
+etc
+.).
+However
+,
+if
+these
+are
+part
+of
+the
+serialization
+process
+for
+JSONField
+values
+,
+they
+might
+also
+need
+to
+include
+ensure_ascii
+=
+False
+to
+ensure
+consistent
+behavior
+.
+But
+without
+more
+context
+,
+it
+’
+s
+harder
+to
+say
+.
+The
+main
+issue
+is
+the
+admin
+display
+,
+which
+is
+handled
+by
+get_prep_value
+.
+Let
+me
+check
+the
+pull
+request
+again
+(
+https
+://
+github
+.
+com
+/
+adamchainz
+/
+django
+-
+mysql
+/
+pull
+/714)
+to
+see
+what
+changes
+were
+made
+.
+Since
+I
+can
+’
+t
+access
+the
+link
+,
+I
+’
+ll
+have
+to
+rely
+on
+the
+issue
+description
+,
+which
+mentions
+that
+the
+pull
+request
+fixed
+the
+problem
+by
+modifying
+how
+JSON
+data
+is
+serialized
+.
+Assuming
+that
+the
+fix
+involves
+adding
+ensure_ascii
+=
+False
+to
+the
+json
+.
+dumps
+calls
+in
+the
+JSONField
+class
+,
+I
+should
+modify
+the
+get_prep_value
+and
+possibly
+other
+relevant
+methods
+.
+Another
+thing
+to
+consider
+:
+the
+forms
+.
+JSONField
+in
+django
+/
+forms
+/
+fields
+.
+py
+might
+also
+need
+similar
+changes
+,
+especially
+in
+how
+it
+prepares
+values
+for
+display
+.
+Let
+me
+check
+that
+file
+as
+well
+.
+Looking
+at
+django
+/
+forms
+/
+fields
+.
+py
+,
+the
+JSONField
+class
+has
+a
+prepare_value
+method
+:
+def
+prepare_value
+(
+self
+,
+value
+):
+if
+value
+is
+None
+:
+return
+None
+if
+isinstance
+(
+value
+,
+str
+):
+return
+value
+return
+self
+.
+encoder
+.
+dumps
+(
+value
+)
+Here
+,
+self
+.
+encoder
+.
+dumps
+(
+value
+)
+is
+used
+.
+If
+the
+encoder
+is
+the
+default
+JSONEncoder
+,
+it
+would
+still
+use
+ASCII
+encoding
+.
+However
+,
+the
+encoder
+for
+JSONField
+can
+be
+customized
+,
+but
+the
+default
+might
+not
+set
+ensure_ascii
+=
+False
+.
+To
+ensure
+that
+the
+form
+field
+displays
+Unicode
+characters
+correctly
+,
+maybe
+this
+should
+also
+be
+modified
+to
+include
+ensure_ascii
+=
+False
+,
+but
+since
+the
+encoder
+is
+part
+of
+the
+field
+’
+s
+configuration
+,
+it
+might
+be
+better
+to
+handle
+it
+there
+.
+Alternatively
+,
+maybe
+the
+display_for_field
+function
+in
+the
+admin
+should
+handle
+JSONField
+values
+differently
+,
+but
+that
+might
+be
+more
+complex
+.
+Given
+the
+information
+,
+the
+most
+straightforward
+fix
+seems
+to
+be
+modifying
+the
+get_prep_value
+method
+in
+django
+/
+db
+/
+models
+/
+fields
+/
+json
+.
+py
+to
+include
+ensure_ascii
+=
+False
+in
+the
+json
+.
+dumps
+call
+.
+This
+would
+affect
+how
+JSON
+data
+is
+displayed
+in
+the
+admin
+,
+as
+that
+’
+s
+where
+get_prep_value
+is
+called
+.
+Let
+me
+proceed
+with
+that
+change
+and
+see
+if
+it
+resolves
+the
+issue
+.
+⬇
+<
+tool
+:
+edit
+>
+django
+/
+db
+/
+models
+/
+fields
+/
+json
+.
+py
+<<<<<<<
+SEARCH
+def
+get_prep_value
+(
+self
+,
+value
+):
+if
+value
+is
+None
+:
+return
+value
+return
+json
+.
+dumps
+(
+value
+,
+cls
+=
+self
+.
+encoder
+)
+=======
+def
+get_prep_value
+(
+self
+,
+value
+):
+if
+value
+is
+None
+:
+return
+value
+return
+json
+.
+dumps
+(
+value
+,
+cls
+=
+self
+.
+encoder
+,
+ensure_ascii
+=
+False
+)
+>>>>>>>
+REPLACE
+</
+tool
+>
+⬇
+File
+django
+/
+db
+/
+models
+/
+fields
+/
+json
+.
+py
+has
+been
+successfully
+updated
+at
+line
+81
+and
+now
+contains
+521
+lines
+.
+The
+surrounding
+lines
+after
+the
+edit
+are
+shown
+below
+:
+73
+try
+:
+74
+return
+json
+.
+loads
+(
+value
+,
+cls
+=
+self
+.
+decoder
+)
+75
+except
+json
+.
+JSONDecodeError
+:
+76
+return
+value
+77
+78
+def
+get_internal_type
+(
+self
+):
+79
+return
+’
+JSONField
+’
+80
+81
+def
+get_prep_value
+(
+self
+,
+value
+):
+82
+if
+value
+is
+None
+:
+83
+return
+value
+84
+return
+json
+.
+dumps
+(
+value
+,
+cls
+=
+self
+.
+encoder
+,
+ensure_ascii
+=
+False
+)
+85
+86
+def
+get_transform
+(
+self
+,
+name
+):
+87
+transform
+=
+super
+().
+get_transform
+(
+name
+)
+88
+if
+transform
+:
+89
+return
+transform
+90
+return
+KeyTransformFactory
+(
+name
+)
+91
+92
+def
+validate
+(
+self
+,
+value
+,
+model_instance
+):
+Review
+the
+changes
+and
+make
+sure
+they
+are
+as
+expected
+.
+Figure 11.21
+:
+Example of the
+CWM
+SWE RL agent engaging in long reasoning before modifying the file with the
+edit
+tool. For conciseness, some details (e.g., budget notices) are omitted and reasoning paragraphs are merged.
+⬇
+<|
+begin_of_text
+|><|
+trace_context_start
+|>
+def
+f
+(
+n
+):
+p
+=
+’’
+if
+n
+%
+p
++=
+’sn’
+else
+:
+return
+n
+*
+n
+for
+x
+in
+range
+(1,
+n
++1):
+if
+x
+%
+p
++=
+’to’
+else
+:
+p
++=
+’ts’
+return
+p
+def
+main
+():
+#
+<<
+START_OF_TRACE
+return
+f
+(1)
+<|
+frame_sep
+|><|
+call_sep
+|>{}
+<|
+action_sep
+|>
+def
+main
+():
+<|
+frame_sep
+|>
+<|return_sep|>
+----
+END
+OF
+PROMPT
+----
+<|
+action_sep
+|>
+return
+f
+(1)
+<|
+arg_sep
+|>
+"’snts’"
+<|
+frame_sep
+|>
+(a)
+Single-step execution trace prediction.
+⬇
+<|
+begin_of_text
+|><|
+trace_context_start
+|>
+def
+f
+(
+n
+):
+p
+=
+’’
+if
+n
+%
+p
++=
+’sn’
+else
+:
+return
+n
+*
+n
+for
+x
+in
+range
+(1,
+n
++1):
+if
+x
+%
+p
++=
+’to’
+else
+:
+p
++=
+’ts’
+return
+p
+def
+main
+():
+#
+<<
+START_OF_TRACE
+return
+f
+(1)
+<|
+frame_sep
+|><|
+call_sep
+|>{}<|
+action_sep
+|>
+def
+main
+():
+<|
+frame_sep
+|>
+----
+END
+OF
+PROMPT
+----
+<|
+line_sep
+|>{}<|
+action_sep
+|>
+return
+f
+(1)
+<|
+frame_sep
+|><|
+call_sep
+|>{
+"n"
+:
+"1"
+}<|
+action_sep
+|>
+def
+f
+(
+n
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"n"
+:
+".."
+}<|
+action_sep
+|>
+p
+=
+’’
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"n"
+:
+".."
+,
+"p"
+:
+"’’"
+}
+<|
+action_sep
+|>
+if
+n
+%
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"n"
+:
+".."
+,
+"p"
+:
+".."
+}
+<|
+action_sep
+|>
+p
++=
+’sn’
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"n"
+:
+".."
+,
+"p"
+:
+"’sn’"
+}
+<|
+action_sep
+|>
+for
+x
+in
+range
+(1,
+n
++1):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"n"
+:
+".."
+,
+"p"
+:
+".."
+,
+"x"
+:
+"1"
+}
+<|
+action_sep
+|>
+if
+x
+%
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"n"
+:
+".."
+,
+"p"
+:
+".."
+,
+"x"
+:
+".."
+}
+<|
+action_sep
+|>
+p
++=
+’ts’
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"n"
+:
+".."
+,
+"p"
+:
+"’snts’"
+,
+"x"
+:
+".."
+}
+<|
+action_sep
+|>
+for
+x
+in
+range
+(1,
+n
++1):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"n"
+:
+".."
+,
+"p"
+:
+".."
+,
+"x"
+:
+".."
+}
+<|
+action_sep
+|>
+return
+p
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+p
+<|
+arg_sep
+|>
+"’snts’"
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+f
+(1)
+<|
+arg_sep
+|>
+"’snts’"
+<|
+frame_sep
+|>
+(b)
+Full, line-by-line, execution trace prediction.
+Figure 11.22
+:
+Python execution trace prediction for CruxEval output prediction in (a) single-step mode and (b) full, line-by-line, mode.
+The predictions shown are generated by
+CWM
+.
+⬇
+<|
+begin_of_text
+|><|
+trace_context_start
+|>
+def
+f
+(
+nums
+):
+a
+=
+0
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+a
++=
+1
+return
+nums
+def
+main
+():
+#
+<<
+START_OF_TRACE
+return
+f
+([1,
+3,
+-1,
+1,
+-2,
+6])
+<|
+frame_sep
+|><|
+call_sep
+|>{}<|
+action_sep
+|>
+def
+main
+():
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|>
+---
+END
+OF
+PROMPT
+---
+<|
+line_sep
+|>{}<|
+action_sep
+|>
+return
+f
+([1,
+3,
+-1,
+1,
+-2,
+6])
+<|
+frame_sep
+|><|
+call_sep
+|>{
+"nums"
+:
+"[1,
+3,
+-1,
+1,
+-2,
+6]"
+}<|
+action_sep
+|>
+def
+f
+(
+nums
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+}<|
+action_sep
+|>
+a
+=
+0
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+"0"
+}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+".."
+,
+"i"
+:
+"0"
+}<|
+action_sep
+|>
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+"[1,
+1,
+3,
+-1,
+1,
+-2,
+6]"
+,
+"a"
+:
+".."
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+a
++=
+1
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+"1"
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+".."
+,
+"i"
+:
+"1"
+}<|
+action_sep
+|>
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+"[1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6]"
+,
+"a"
+:
+".."
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+a
++=
+1
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+"2"
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+".."
+,
+"i"
+:
+"2"
+}<|
+action_sep
+|>
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+"[1,
+1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6]"
+,
+"a"
+:
+".."
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+a
++=
+1
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+"3"
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+".."
+,
+"i"
+:
+"3"
+}<|
+action_sep
+|>
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+"[1,
+1,
+1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6]"
+,
+"a"
+:
+".."
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+a
++=
+1
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+"4"
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+".."
+,
+"i"
+:
+"4"
+}<|
+action_sep
+|>
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+"[1,
+1,
+1,
+1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6]"
+,
+"a"
+:
+".."
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+a
++=
+1
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+"5"
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+".."
+,
+"i"
+:
+"5"
+}<|
+action_sep
+|>
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+"[1,
+1,
+1,
+1,
+1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6]"
+,
+"a"
+:
+".."
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+a
++=
+1
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+"6"
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"nums"
+:
+".."
+,
+"a"
+:
+".."
+,
+"i"
+:
+".."
+}<|
+action_sep
+|>
+return
+nums
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+nums
+<|
+arg_sep
+|>
+"[1,
+1,
+1,
+1,
+1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6]"
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+f
+([1,
+3,
+-1,
+1,
+-2,
+6])<|
+arg_sep
+|>
+"[1,
+1,
+1,
+1,
+1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6]"
+<|
+frame_sep
+|>
+(a)
+CWM
+Execution Trace Prediction (correct prediction).
+⬇
+Given
+a
+python
+code
+function
+and
+an
+assert
+statement
+containing
+a
+specific
+input
+,
+provide
+the
+assertion
+with
+the
+exact
+literal
+output
+that
+the
+function
+returns
+with
+that
+input
+.
+Do
+not
+include
+any
+mathematical
+expressions
+or
+function
+calls
+-
+only
+the
+final
+literal
+value
+.
+Your
+response
+should
+be
+solely
+the
+assertion
+,
+enclosed
+within
+[
+ANSWER
+]
+and
+[/
+ANSWER
+]
+tags
+.
+[
+SKIPPED
+EXAMPLE
+FOR
+ILLUSTRATION
+PURPOSES
+]
+Python
+function
+:
+‘‘‘
+def
+f
+(
+nums
+):
+a
+=
+0
+for
+i
+in
+range
+(
+len
+(
+nums
+)):
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+a
++=
+1
+return
+nums
+assert
+f
+([1,
+3,
+-1,
+1,
+-2,
+6])
+==
+??
+‘‘‘
+----
+END
+OF
+PROMPT
+---
+Okay
+,
+let
+’
+s
+see
+.
+I
+need
+to
+figure
+out
+what
+the
+function
+f
+does
+when
+given
+the
+list
+[1,
+3,
+-1,
+1,
+-2,
+6].
+The
+function
+starts
+by
+initializing
+a
+variable
+a
+to
+0.
+Then
+it
+loops
+over
+the
+range
+of
+the
+length
+of
+nums
+.
+But
+wait
+,
+the
+length
+of
+nums
+is
+6
+initially
+.
+However
+,
+inside
+the
+loop
+,
+it
+’
+s
+inserting
+elements
+into
+nums
+,
+which
+will
+change
+the
+length
+of
+the
+list
+as
+the
+loop
+progresses
+.
+That
+might
+affect
+how
+many
+times
+the
+loop
+runs
+.
+Let
+me
+break
+it
+down
+step
+by
+step
+.
+The
+initial
+list
+is
+[1,
+3,
+-1,
+1,
+-2,
+6].
+The
+variable
+a
+starts
+at
+0.
+The
+loop
+runs
+for
+i
+in
+range
+(
+len
+(
+nums
+)),
+which
+is
+range
+(6)
+initially
+.
+So
+i
+will
+take
+values
+0
+through
+5.
+First
+iteration
+(
+i
+=0):
+-
+nums
+.
+insert
+(
+i
+,
+nums
+[
+a
+])
+->
+insert
+nums
+[0]
+(
+which
+is
+1)
+at
+position
+0.
+The
+list
+becomes
+[1,
+1,
+3,
+-1,
+1,
+-2,
+6].
+Then
+a
+increments
+to
+1.
+Second
+iteration
+(
+i
+=1):
+-
+Now
+,
+the
+list
+has
+7
+elements
+.
+But
+the
+loop
+was
+set
+up
+with
+range
+(6),
+so
+i
+is
+still
+1.
+We
+insert
+nums
+[
+a
+]
+(
+nums
+[1]
+is
+1)
+at
+position
+1.
+The
+list
+becomes
+[1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6].
+a
+becomes
+2.
+Third
+iteration
+(
+i
+=2):
+-
+Insert
+nums
+[2]
+(
+which
+is
+1)
+at
+position
+2.
+The
+list
+is
+now
+[1,
+1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6].
+a
+is
+3.
+Fourth
+iteration
+(
+i
+=3):
+-
+Insert
+nums
+[3]
+(3)
+at
+position
+3.
+The
+list
+becomes
+[1,
+1,
+1,
+1,
+3,
+3,
+-1,
+1,
+-2,
+6].
+a
+is
+4.
+Fifth
+iteration
+(
+i
+=4):
+-
+Insert
+nums
+[4]
+(-1)
+at
+position
+4.
+The
+list
+is
+[1,
+1,
+1,
+1,
+3,
+-1,
+3,
+-1,
+1,
+-2,
+6].
+a
+is
+5.
+Sixth
+iteration
+(
+i
+=5):
+-
+Insert
+nums
+[5]
+(1)
+at
+position
+5.
+The
+list
+becomes
+[1,
+1,
+1,
+1,
+3,
+1,
+-1,
+3,
+-1,
+1,
+-2,
+6].
+a
+is
+6.
+After
+these
+six
+iterations
+,
+the
+loop
+ends
+because
+i
+has
+gone
+through
+0
+to
+5.
+The
+final
+list
+is
+[1,
+1,
+1,
+1,
+3,
+1,
+-1,
+3,
+-1,
+1,
+-2,
+6].
+Wait
+,
+but
+let
+me
+double
+-
+check
+each
+step
+to
+make
+sure
+I
+didn
+’
+t
+make
+a
+mistake
+.
+Wait
+a
+minute
+,
+when
+we
+insert
+elements
+,
+the
+original
+elements
+shift
+to
+the
+right
+.
+Let
+me
+re
+-
+examine
+each
+step
+carefully
+.
+Original
+list
+:
+[1,
+3,
+-1,
+1,
+-2,
+6]
+First
+iteration
+(
+i
+=0,
+a
+=0):
+-
+Insert
+nums
+[0]
+(1)
+at
+position
+0.
+The
+list
+becomes
+[1,
+1,
+3,
+-1,
+1,
+-2,
+6].
+a
+becomes
+1.
+Second
+iteration
+(
+i
+=1,
+a
+=1):
+-
+Insert
+nums
+[1]
+(1)
+at
+position
+1.
+The
+list
+is
+now
+[1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6].
+a
+becomes
+2.
+Third
+iteration
+(
+i
+=2,
+a
+=2):
+-
+Insert
+nums
+[2]
+(1)
+at
+position
+2.
+The
+list
+becomes
+[1,
+1,
+1,
+1,
+3,
+-1,
+1,
+-2,
+6].
+a
+becomes
+3.
+Fourth
+iteration
+(
+i
+=3,
+a
+=3):
+-
+Insert
+nums
+[3]
+(3)
+at
+position
+3.
+The
+list
+is
+[1,
+1,
+1,
+1,
+3,
+3,
+-1,
+1,
+-2,
+6].
+a
+becomes
+4.
+Fifth
+iteration
+(
+i
+=4,
+a
+=4):
+-
+Insert
+nums
+[4]
+(-1)
+at
+position
+4.
+The
+list
+becomes
+[1,
+1,
+1,
+1,
+3,
+-1,
+3,
+-1,
+1,
+-2,
+6].
+a
+becomes
+5.
+Sixth
+iteration
+(
+i
+=5,
+a
+=5):
+-
+Insert
+nums
+[5]
+(1)
+at
+position
+5.
+The
+list
+is
+[1,
+1,
+1,
+1,
+3,
+1,
+-1,
+3,
+-1,
+1,
+-2,
+6].
+a
+becomes
+6.
+Yes
+,
+that
+seems
+correct
+.
+So
+the
+final
+list
+returned
+by
+the
+function
+is
+[1,
+1,
+1,
+1,
+3,
+1,
+-1,
+3,
+-1,
+1,
+-2,
+6].
+</
+think
+>
+[
+ANSWER
+]
+assert
+f
+([1,
+3,
+-1,
+1,
+-2,
+6])
+==
+[1,
+1,
+1,
+1,
+3,
+1,
+-1,
+3,
+-1,
+1,
+-2,
+6]
+[/
+ANSWER
+]
+(b)
+CWM
+Reasoning Trace (wrong prediction).
+Figure 11.23
+:
+CWM
+execution trace prediction (correct) vs. natural language reasoning trace (wrong). In this CruxEval example, a list is modified while iterating over it. In execution trace prediction mode the
+CWM
+model generates the correct answer
+[1, 1, 1, 1, 1, 1, 1, 3, -1, 1, -2, 6]
+. During reasoning in natural language, even though the
+CWM
+model recognizes the modification, it still predicts the wrong output.
+⬇
+<|
+begin_of_text
+|><|
+trace_context_start
+|>
+def
+f
+(
+sequence
+,
+value
+):
+i
+=
+max
+(
+sequence
+.
+index
+(
+value
+)
+-
+len
+(
+sequence
+)
+//
+3,
+0)
+result
+=
+’’
+for
+j
+,
+v
+in
+enumerate
+(
+sequence
+[
+i
+:]):
+if
+v
+==
+’+’
+:
+result
++=
+value
+else
+:
+result
++=
+sequence
+[
+i
++
+j
+]
+return
+result
+def
+main
+():
+#
+<<
+START_OF_TRACE
+return
+f
+(
+’hosu’
+,
+’o’
+)<|
+frame_sep
+|><|
+call_sep
+|>{}<|
+action_sep
+|>
+def
+main
+():
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|>
+---
+END
+OF
+PROMPT
+---
+<|
+line_sep
+|>{}<|
+action_sep
+|>
+return
+f
+(
+’hosu’
+,
+’o’
+)
+<|
+frame_sep
+|><|
+call_sep
+|>{
+"sequence"
+:
+"’hosu’"
+,
+"value"
+:
+"’o’"
+}<|
+action_sep
+|>
+def
+f
+(
+sequence
+,
+value
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+}<|
+action_sep
+|>
+i
+=
+max
+(
+sequence
+.
+index
+(
+value
+)
+-
+len
+(
+sequence
+)
+//
+3,
+0)
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i": "1"
+}<|
+action_sep
+|>
+result
+=
+’’
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+"’’"
+}<|
+action_sep
+|>
+for
+j
+,
+v
+in
+enumerate
+(
+sequence
+[
+i
+:]):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+".."
+,
+"j"
+:
+"0"
+,
+"v"
+:
+"’o’"
+}<|
+action_sep
+|>
+if
+v
+==
+’+’
+:
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+".."
+,
+"j"
+:
+".."
+,
+"v"
+:
+".."
+}<|
+action_sep
+|>
+result
++=
+sequence
+[
+i
++
+j
+]
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+"’o’"
+,
+"j"
+:
+".."
+,
+"v"
+:
+".."
+}<|
+action_sep
+|>
+for
+j
+,
+v
+in
+enumerate
+(
+sequence
+[
+i
+:]):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+".."
+,
+"j"
+:
+"1"
+,
+"v"
+:
+"’s’"
+}<|
+action_sep
+|>
+if
+v
+==
+’+’
+:
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+".."
+,
+"j"
+:
+".."
+,
+"v"
+:
+".."
+}<|
+action_sep
+|>
+result
++=
+sequence
+[
+i
++
+j
+]
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+"’os’"
+,
+"j"
+:
+".."
+,
+"v"
+:
+".."
+}<|
+action_sep
+|>
+for
+j
+,
+v
+in
+enumerate
+(
+sequence
+[
+i
+:]):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+".."
+,
+"j"
+:
+"2"
+,
+"v"
+:
+"’u’"
+}<|
+action_sep
+|>
+if
+v
+==
+’+’
+:
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+".."
+,
+"j"
+:
+".."
+,
+"v"
+:
+".."
+}<|
+action_sep
+|>
+result
++=
+sequence
+[
+i
++
+j
+]
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+"’osu’"
+,
+"j"
+:
+".."
+,
+"v"
+:
+".."
+}<|
+action_sep
+|>
+for
+j
+,
+v
+in
+enumerate
+(
+sequence
+[
+i
+:]):
+<|
+frame_sep
+|><|
+line_sep
+|>{
+"sequence"
+:
+".."
+,
+"value"
+:
+".."
+,
+"i"
+:
+".."
+,
+"result"
+:
+".."
+,
+"j"
+:
+".."
+,
+"v"
+:
+".."
+}<|
+action_sep
+|>
+return
+result
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+result
+<|
+arg_sep
+|>
+"’osu’"
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+f
+(
+’hosu’
+,
+’o’
+)<|
+arg_sep
+|>
+"’osu’"
+<|
+frame_sep
+|>
+(a)
+CWM
+Execution Trace Prediction (wrong prediction).
+⬇
+Given
+a
+python
+code
+function
+and
+an
+assert
+statement
+containing
+a
+specific
+input
+,
+provide
+the
+assertion
+with
+the
+exact
+literal
+output
+that
+the
+function
+returns
+with
+that
+input
+.
+Do
+not
+include
+any
+mathematical
+expressions
+or
+function
+calls
+-
+only
+the
+final
+literal
+value
+.
+Your
+response
+should
+be
+solely
+the
+assertion
+,
+enclosed
+within
+[
+ANSWER
+]
+and
+[/
+ANSWER
+]
+tags
+.
+[
+SKIPPED
+EXAMPLE
+FOR
+ILLUSTRATION
+PURPOSES
+]
+Python
+function
+:
+‘‘‘
+def
+f
+(
+sequence
+,
+value
+):
+i
+=
+max
+(
+sequence
+.
+index
+(
+value
+)
+-
+len
+(
+sequence
+)
+//
+3,
+0)
+result
+=
+’’
+for
+j
+,
+v
+in
+enumerate
+(
+sequence
+[
+i
+:]):
+if
+v
+==
+’+’:
+result
++=
+value
+else
+:
+result
++=
+sequence
+[
+i
++
+j
+]
+return
+result
+assert
+f
+(’
+hosu
+’,
+’
+o
+’)
+==
+??
+‘‘‘
+----
+END
+OF
+PROMPT
+---
+Okay
+,
+let
+’
+s
+see
+.
+I
+need
+to
+figure
+out
+what
+the
+function
+f
+returns
+when
+called
+with
+the
+arguments
+’
+hosu
+’
+and
+’
+o
+’.
+Let
+me
+start
+by
+understanding
+the
+function
+step
+by
+step
+.
+First
+,
+the
+function
+takes
+two
+parameters
+:
+sequence
+and
+value
+.
+In
+this
+case
+,
+sequence
+is
+the
+string
+’
+hosu
+’
+and
+value
+is
+’
+o
+’.
+The
+first
+line
+inside
+the
+function
+is
+:
+i
+=
+max
+(
+sequence
+.
+index
+(
+value
+)
+-
+len
+(
+sequence
+)
+//
+3,
+0).
+Let
+’
+s
+break
+this
+down
+.
+sequence
+.
+index
+(
+value
+)
+finds
+the
+index
+of
+the
+first
+occurrence
+of
+’
+o
+’
+in
+’
+hosu
+’.
+The
+string
+’
+hosu
+’
+is
+h
+-
+o
+-
+s
+-
+u
+,
+so
+the
+index
+of
+’
+o
+’
+is
+1.
+len
+(
+sequence
+)
+is
+the
+length
+of
+’
+hosu
+’,
+which
+is
+4.
+len
+(
+sequence
+)
+//
+3
+is
+4
+divided
+by
+3,
+which
+equals
+1
+(
+since
+integer
+division
+truncates
+).
+So
+,
+sequence
+.
+index
+(
+value
+)
+-
+len
+(
+sequence
+)
+//
+3
+is
+1
+-
+1
+=
+0.
+Then
+,
+max
+(0,
+0)
+is
+0.
+Therefore
+,
+i
+is
+0.
+Next
+,
+the
+function
+initializes
+an
+empty
+string
+result
+=
+’’.
+Then
+,
+there
+’
+s
+a
+for
+loop
+:
+for
+j
+,
+v
+in
+enumerate
+(
+sequence
+[
+i
+:]).
+Since
+i
+is
+0,
+sequence
+[
+i
+:]
+is
+the
+entire
+string
+’
+hosu
+’.
+So
+,
+the
+loop
+will
+iterate
+over
+each
+character
+in
+’
+hosu
+’
+along
+with
+their
+indices
+j
+.
+Let
+’
+s
+go
+through
+each
+iteration
+:
+1.
+j
+=
+0,
+v
+=
+’
+h
+’:
+-
+v
+is
+not
+’+’,
+so
+result
++=
+sequence
+[
+i
++
+j
+]
+which
+is
+sequence
+[0
++
+0]
+=
+’
+h
+’.
+Now
+result
+is
+’
+h
+’.
+2.
+j
+=
+1,
+v
+=
+’
+o
+’:
+-
+v
+is
+not
+’+’,
+so
+result
++=
+sequence
+[0
++
+1]
+=
+’
+o
+’.
+Now
+result
+is
+’
+ho
+’.
+3.
+j
+=
+2,
+v
+=
+’
+s
+’:
+-
+v
+is
+not
+’+’,
+so
+result
++=
+sequence
+[0
++
+2]
+=
+’
+s
+’.
+Now
+result
+is
+’
+hos
+’.
+4.
+j
+=
+3,
+v
+=
+’
+u
+’:
+-
+v
+is
+not
+’+’,
+so
+result
++=
+sequence
+[0
++
+3]
+=
+’
+u
+’.
+Now
+result
+is
+’
+hosu
+’.
+After
+the
+loop
+,
+the
+function
+returns
+result
+,
+which
+is
+’
+hosu
+’.
+Wait
+a
+minute
+,
+but
+the
+value
+parameter
+is
+’
+o
+’.
+The
+code
+checks
+if
+v
+==
+’+’,
+but
+in
+the
+sequence
+’
+hosu
+’,
+there
+are
+no
+’+’
+characters
+.
+So
+,
+the
+condition
+v
+==
+’+’
+is
+never
+true
+.
+Therefore
+,
+the
+code
+never
+appends
+the
+value
+’
+o
+’
+to
+the
+result
+.
+Instead
+,
+it
+appends
+the
+characters
+from
+the
+sequence
+starting
+at
+index
+i
+(
+which
+is
+0)
+for
+each
+character
+in
+the
+sequence
+.
+So
+,
+the
+result
+is
+just
+the
+original
+sequence
+’
+hosu
+’.
+Therefore
+,
+the
+function
+returns
+’
+hosu
+’
+when
+called
+with
+f
+(’
+hosu
+’,
+’
+o
+’).
+</
+think
+>
+[
+ANSWER
+]
+assert
+f
+(’
+hosu
+’,
+’
+o
+’)
+==
+’
+hosu
+’
+[/
+ANSWER
+]
+(b)
+CWM
+Reasoning Trace (correct prediction).
+Figure 11.24
+:
+CWM
+execution trace prediction (wrong) vs.
+CWM
+natural language reasoning trace (correct). In this CruxEval example, the
+CWM
+model with reasoning in natural language predicts the correct output
+’hosu’
+.
+In execution trace prediction mode, the
+CWM
+model predicts
+’osu’
+, because it fails to predict the value of the variable
+i
+in the first line correctly. It predicts 1, instead of the true value 0.
+⬇
+=====
+CONTEXT
+=====
+def
+count_letters
+(
+s
+,
+letter
+):
+n
+=
+0
+for
+c
+in
+s
+:
+n
++=
+int
+(
+c
+==
+letter
+)
+return
+n
+def
+format_answer
+(
+word
+,
+letter
+,
+count
+):
+parts
+=
+[
+"Found"
+,
+f
+"{count:04d}"
+,
+"occurrences
+of
+the
+letter"
+,
+letter
+,
+"in"
+,
+word
+]
+return
+"
+"
+.
+join
+(
+parts
+)
+def
+f
+(
+c
+):
+#
+<<
+START_OF_TRACE
+word
+=
+"strawberry"
+num
+=
+count_letters
+(
+word
+,
+c
+)
+ans
+=
+format_answer
+(
+word
+,
+c
+,
+num
+)
+return
+ans
+=====
+SESSION
+START
+=====
+->
+def
+f
+(
+c
+):
+>>
+print
+{
+’c’
+:
+"’r’"
+}
+>>
+next
+->
+word
+=
+"strawberry"
+>>
+next
+->
+num
+=
+count_letters
+(
+word
+,
+"r"
+)
+>>
+step
+->
+def
+count_letters
+(
+s
+,
+letter
+):
+>>
+next
+->
+n
+=
+0
+>>
+next
+->
+for
+c
+in
+s
+:
+>>
+next
+->
+n
++=
+int
+(
+c
+==
+letter
+)
+>>
+print
+{
+’s’
+:
+"’strawberry’"
+,
+’letter’
+:
+"’r’"
+,
+’n’
+:
+’0’
+,
+’c’
+:
+"’s’"
+}
+>>
+next
+->
+for
+c
+in
+s
+:
+>>
+next
+->
+n
++=
+int
+(
+c
+==
+letter
+)
+>>
+print
+{
+’s’
+:
+"’strawberry’"
+,
+’letter’
+:
+"’r’"
+,
+’n’
+:
+’0’
+,
+’c’
+:
+"’t’"
+}
+>>
+out
+->
+ans
+=
+format_answer
+(
+word
+,
+"r"
+,
+num
+)
+>>
+next
+->
+return
+ans
+>>
+next
+->
+return
+ans
+’Found
+0003
+occurrences
+of
+the
+letter
+r
+in
+strawberry’
+Figure 11.25
+:
+Interactive Python code debugging with
+CWM
+. As a show-case of
+CWM
+’s execution tracing capabilities (
+Section 2.2
+), we present an interactive debugger to steer execution trace predictions. From a given code context,
+CWM
+predicts the initial state of the trace (we selected this example based on the prediction of the
+c
+parameter). We can then prompt the model to generate the program state at the next line, or to directly predict the return value after a function is entered. In this example, we
+step
+into the
+count_letters
+function, step
+out
+of it after a few steps (implemented via continuous predictions until the function returns), and finally use
+next
+to predict the return value of
+format_answer
+without line-by-line unrolling. In-between, we
+print
+the local variables as generated by
+CWM
+.
+⬇
+<|
+begin_of_text
+|><|
+trace_context_start
+|>
+from
+mysterious_library
+import
+f
+assert
+f
+(2,
+2)
+==
+4
+assert
+f
+(3,
+7)
+==
+10
+assert
+f
+(-1,
+-8)
+==
+-9
+assert
+f
+(1,
+1)
+==
+2
+f
+(5,
+5)
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|>
+----
+END
+OF
+PROMPT
+---
+<|
+call_sep
+|>{}<|
+action_sep
+|>
+f
+(5,
+5)
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|><|
+line_sep
+|>{}<|
+action_sep
+|>
+f
+(5,
+5)
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|><|
+call_sep
+|>{"
+a
+":
+"5",
+"
+b
+":
+"5"}<|
+action_sep
+|>
+def
+f
+(
+a
+,
+b
+):
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+a
+":
+"..",
+"
+b
+":
+".."}<|
+action_sep
+|>
+return
+a
++
+b
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+a
++
+b
+<|
+arg_sep
+|>"10"<|
+frame_sep
+|>
+Figure 11.26
+:
+CWM
+predicting the execution traces of an unseen function
+f
+, imported from a library not included in the context. The expected semantics of
+f
+can be inferred from the assertions in the context.
+CWM
+correctly generates the traces of a function that sums two numbers. It is possible that the model acquired this capability of jointly tracing and generating code because for some tracing data we do not include the source context of third-party libraries.
+⬇
+Given
+the
+following
+problem
+:
+Three
+little
+pigs
+from
+all
+over
+the
+world
+are
+meeting
+for
+a
+convention
+!
+Every
+minute
+,
+a
+triple
+of
+3
+new
+pigs
+arrives
+on
+the
+convention
+floor
+.
+After
+the
+n
+-
+th
+minute
+,
+the
+convention
+ends
+.
+The
+big
+bad
+wolf
+has
+learned
+about
+this
+convention
+,
+and
+he
+has
+an
+attack
+plan
+.
+At
+some
+minute
+in
+the
+convention
+,
+he
+will
+arrive
+and
+eat
+exactly
+x
+pigs
+.
+Then
+he
+will
+get
+away
+.
+The
+wolf
+wants
+Gregor
+to
+help
+him
+figure
+out
+the
+number
+of
+possible
+attack
+plans
+that
+involve
+eating
+exactly
+x
+pigs
+for
+various
+values
+of
+x
+(1
+<=
+x
+<=
+3
+n
+).
+Two
+attack
+plans
+are
+considered
+different
+,
+if
+they
+occur
+at
+different
+times
+or
+if
+the
+sets
+of
+little
+pigs
+to
+eat
+are
+different
+.
+Note
+that
+all
+queries
+are
+independent
+,
+that
+is
+,
+the
+wolf
+does
+not
+eat
+the
+little
+pigs
+,
+he
+only
+makes
+plans
+!
+Input
+The
+first
+line
+of
+input
+contains
+two
+integers
+n
+and
+q
+(1
+<=
+n
+<=
+10^6,
+1
+<=
+q
+<=
+2*
+10^5),
+the
+number
+of
+minutes
+the
+convention
+lasts
+and
+the
+number
+of
+queries
+the
+wolf
+asks
+.
+Each
+of
+the
+next
+q
+lines
+contains
+a
+single
+integer
+x_i
+(1
+<=
+x_i
+<=
+3n
+),
+the
+number
+of
+pigs
+the
+wolf
+will
+eat
+in
+the
+i
+-
+th
+query
+.
+Output
+You
+should
+print
+q
+lines
+,
+with
+line
+i
+representing
+the
+number
+of
+attack
+plans
+if
+the
+wolf
+wants
+to
+eat
+x_i
+pigs
+.
+Since
+each
+query
+answer
+can
+be
+large
+,
+output
+each
+answer
+modulo
+10^9+7.
+Examples
+Input
+2
+3
+1
+5
+6
+Output
+9
+6
+1
+Input
+...
+Note
+In
+the
+example
+test
+,
+n
+=2.
+Thus
+,
+there
+are
+3
+pigs
+at
+minute
+1,
+and
+6
+pigs
+at
+minute
+2.
+There
+are
+three
+queries
+:
+x
+=1,
+x
+=5,
+and
+x
+=6.
+If
+the
+wolf
+wants
+to
+eat
+1
+pig
+,
+he
+can
+do
+so
+in
+3+6=9
+possible
+attack
+plans
+,
+depending
+on
+whether
+he
+arrives
+at
+minute
+1
+or
+2.
+If
+the
+wolf
+wants
+to
+eat
+5
+pigs
+,
+the
+wolf
+cannot
+arrive
+at
+minute
+1,
+since
+there
+aren
+’
+t
+enough
+pigs
+at
+that
+time
+.
+Therefore
+,
+the
+wolf
+has
+to
+arrive
+at
+minute
+2,
+and
+there
+are
+6
+possible
+attack
+plans
+.
+If
+the
+wolf
+wants
+to
+eat
+6
+pigs
+,
+his
+only
+plan
+is
+to
+arrive
+at
+the
+end
+of
+the
+convention
+and
+devour
+everybody
+.
+Remember
+to
+output
+your
+answers
+modulo
+10^9+7!
+Generate
+the
+code
+step
+by
+step
+.
+You
+MUST
+FIRST
+generate
+the
+execution
+traces
+for
+the
+code
+you
+want
+to
+generate
+given
+a
+specific
+input
+,
+and
+then
+if
+it
+’
+s
+correct
+extract
+the
+code
+.
+Your
+code
+in
+the
+final
+answer
+should
+be
+enclosed
+in
+triple
+backticks
+like
+so
+:
+python
+YOUR
+CODE
+HERE
+.
+Use
+the
+backticks
+for
+your
+code
+only
+.
+<|
+eot_id
+|><|
+start_header_id
+|>
+assistant
+⬇
+Okay
+,
+let
+’
+s
+see
+.
+The
+problem
+is
+about
+calculating
+the
+number
+of
+possible
+attack
+plans
+for
+the
+big
+bad
+wolf
+.
+The
+wolf
+can
+attack
+at
+any
+minute
+during
+the
+convention
+,
+and
+he
+eats
+exactly
+x
+pigs
+.
+We
+need
+to
+find
+the
+number
+of
+ways
+he
+can
+do
+this
+for
+various
+values
+of
+x
+.
+Let
+’
+s
+jointly
+synthethize
+and
+trace
+the
+code
+for
+this
+using
+the
+first
+provided
+input
+/
+output
+example
+.
+First
+,
+I
+will
+emit
+the
+execution
+traces
+and
+check
+that
+it
+arrives
+to
+the
+expected
+output
+.
+Then
+,
+if
+the
+output
+is
+correct
+,
+extracting
+the
+code
+is
+straightforward
+,
+I
+don
+’
+t
+need
+to
+think
+anymore
+and
+can
+proceed
+to
+the
+final
+answer
+.
+But
+if
+the
+execution
+is
+incorrect
+,
+I
+need
+to
+think
+and
+then
+predict
+the
+corrected
+traces
+again
+.
+Let
+’
+s
+start
+jointly
+tracing
+and
+synthethizing
+.
+The
+input
+will
+be
+f
+(2,
+3,
+[1,
+5,
+6]),
+and
+the
+output
+must
+be
+[9,
+6,
+1];
+otherwise
+it
+will
+mean
+I
+predicted
+the
+wrong
+code
+and
+need
+to
+start
+again
+.
+Let
+’
+s
+see
+:
+<|
+trace_context_start
+|>
+from
+solution
+import
+f
+assert
+f
+(2,
+3,
+[1,
+5,
+6])
+==
+[9,
+6,
+1]
+assert
+f
+(5,
+4,
+[2,
+4,
+6,
+8])
+==
+[225,
+2001,
+6014,
+6939]
+f
+(2,
+3,
+[1,
+5,
+6])
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|><|
+call_sep
+|>{}<|
+action_sep
+|>
+f
+(2,
+3,
+[1,
+5,
+6])
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|><|
+line_sep
+|>{}<|
+action_sep
+|>
+f
+(2,
+3,
+[1,
+5,
+6])
+#
+<<
+START_OF_TRACE
+<|
+frame_sep
+|><|
+call_sep
+|>{"
+n
+":
+"2",
+"
+q
+":
+"3",
+"
+queries
+":
+"[1,
+5,
+6]"}<|
+action_sep
+|>
+def
+f
+(
+n
+,
+q
+,
+queries
+):
+<|
+frame_sep
+|>
+----
+END
+OF
+PROMPT
+---
+<|
+line_sep
+|>{"
+n
+":
+"..",
+"
+q
+":
+"..",
+"
+queries
+":
+".."}<|
+action_sep
+|>
+MOD
+=
+10**9
++
+7
+<|
+frame_sep
+|><|
+line_sep
+|>{"
+n
+":
+"..",
+"
+q
+":
+"..",
+"
+queries
+":
+"..",
+"
+MOD
+":
+"1000000007"}<|
+action_sep
+|>
+max_x
+=
+3
+*
+n
+...
+<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+return
+ans
+<|
+arg_sep
+|>"[9,
+6,
+1]"<|
+frame_sep
+|><|
+return_sep
+|><|
+action_sep
+|>
+f
+(2,
+3,
+[1,
+5,
+6])
+#
+<<
+START_OF_TRACE
+<|
+arg_sep
+|>"[9,
+6,
+1]"<|
+frame_sep
+|>
+Okay
+,
+the
+execution
+trace
+is
+correct
+.
+Now
+,
+I
+can
+proceed
+to
+extract
+the
+code
+.
+⬇
+‘‘‘
+python
+def
+f
+(
+n
+,
+q
+,
+queries
+):
+MOD
+=
+10**9
++
+7
+max_x
+=
+3
+*
+n
+fact
+=
+[1]
+*
+(
+max_x
++
+1)
+for
+i
+in
+range
+(1,
+max_x
++
+1):
+fact
+[
+i
+]
+=
+fact
+[
+i
+-
+1]
+*
+i
+%
+MOD
+inv_fact
+=
+[1]
+*
+(
+max_x
++
+1)
+inv_fact
+[
+max_x
+]
+=
+pow
+(
+fact
+[
+max_x
+],
+MOD
+-
+2,
+MOD
+)
+for
+i
+in
+range
+(
+max_x
+-
+1,
+-1,
+-1):
+inv_fact
+[
+i
+]
+=
+inv_fact
+[
+i
++
+1]
+*
+(
+i
++
+1)
+%
+MOD
+def
+comb
+(
+n
+,
+k
+):
+if
+k
+<
+0
+or
+k
+>
+n
+:
+return
+0
+return
+fact
+[
+n
+]
+*
+inv_fact
+[
+k
+]
+*
+inv_fact
+[
+n
+-
+k
+]
+%
+MOD
+total
+=
+0
+for
+i
+in
+range
+(1,
+n
++
+1):
+total
++=
+3
+*
+i
+ans
+=
+[]
+for
+x
+in
+queries
+:
+res
+=
+0
+for
+i
+in
+range
+(1,
+n
++
+1):
+if
+3
+*
+i
+>=
+x
+:
+res
++=
+comb
+(3
+*
+i
+,
+x
+)
+ans
+.
+append
+(
+res
+%
+MOD
+)
+return
+ans
+‘‘‘
+Figure 11.27
+:
+CWM
+interspersing code generation and execution tracing.
+As indicated by “…”, we have omitted parts of
+CWM
+reasoning for clarity here.
+⬇
+You
+will
+be
+given
+a
+function
+f
+and
+a
+call
+to
+that
+function
+in
+the
+form
+f
+(??).
+This
+function
+might
+either
+terminate
+for
+all
+inputs
+or
+diverge
+for
+some
+input
+.
+If
+it
+terminates
+for
+all
+inputs
+,
+then
+answer
+"#
+T
+".
+Otherwise
+,
+if
+it
+diverges
+,
+provide
+an
+input
+such
+that
+executing
+f
+on
+the
+input
+leads
+to
+a
+diverging
+execution
+(
+i
+.
+e
+.,
+f
+does
+NOT
+halt
+with
+that
+input
+),
+followed
+by
+the
+comment
+"#
+NT
+".
+In
+case
+non
+-
+terminating
+inputs
+exist
+,
+there
+may
+be
+multiple
+possible
+answers
+,
+but
+you
+should
+only
+output
+one
+.
+Express
+your
+final
+answer
+with
+either
+"#
+T
+",
+if
+the
+function
+terminates
+for
+all
+inputs
+,
+or
+as
+function
+call
+to
+f
+containing
+the
+input
+that
+leads
+to
+the
+non
+-
+terminating
+execution
+followed
+by
+the
+comment
+"#
+NT
+".
+‘‘‘
+def
+f
+(
+x
+):
+def
+g
+(
+i
+):
+if
+i
+<=
+0:
+return
+0
+return
+g
+(
+g
+(
+g
+(
+i
+-2)
+-
+1))
++
+1
+def
+h
+(
+j
+):
+if
+j
+<=
+0:
+return
+0
+return
+h
+(
+g
+(
+j
+)-1)
+-
+1
+h
+(
+x
+)
+return
+None
+f
+(??)
+#
+Terminates
+?
+‘‘‘
+----
+END
+OF
+PROMPT
+---
+Okay
+,
+I
+need
+to
+figure
+out
+if
+the
+function
+f
+(
+x
+)
+terminates
+for
+all
+inputs
+or
+if
+there
+’
+s
+some
+input
+where
+it
+diverges
+.
+Let
+’
+s
+start
+by
+understanding
+what
+the
+function
+does
+.
+The
+function
+f
+(
+x
+)
+defines
+two
+helper
+functions
+,
+g
+(
+i
+)
+and
+h
+(
+j
+).
+It
+then
+calls
+h
+(
+x
+)
+and
+returns
+None
+.
+So
+,
+the
+termination
+of
+f
+(
+x
+)
+depends
+entirely
+on
+the
+behavior
+of
+h
+(
+x
+),
+which
+in
+turn
+depends
+on
+g
+(
+i
+).
+Let
+me
+break
+down
+the
+helper
+functions
+:
+First
+,
+g
+(
+i
+):
+If
+i
+is
+less
+than
+or
+equal
+to
+0,
+return
+0.
+Otherwise
+,
+return
+g
+(
+g
+(
+g
+(
+i
+-2)
+-
+1))
++
+1.
+Hmm
+,
+that
+’
+s
+a
+recursive
+function
+.
+Let
+’
+s
+see
+what
+happens
+when
+we
+call
+g
+with
+some
+positive
+integer
+.
+For
+example
+,
+if
+i
+is
+1:
+g
+(1)
+would
+call
+g
+(
+g
+(
+g
+(-1)
+-
+1))
++
+1.
+Since
+g
+(-1)
+is
+0,
+this
+becomes
+g
+(
+g
+(-1)
+-1)
+=
+g
+(0
+-1)
+=
+g
+(-1)
+=
+0.
+Then
+,
+adding
+1
+gives
+1.
+Wait
+,
+so
+g
+(1)
+would
+return
+1?
+Let
+me
+check
+that
+again
+.
+Wait
+,
+let
+’
+s
+step
+through
+g
+(1):
+g
+(1)
+calls
+g
+(
+g
+(
+g
+(1-2)
+-
+1))
++
+1
+1-2
+is
+-1,
+so
+g
+(-1)
+is
+0.
+Then
+,
+0
+-1
+is
+-1.
+So
+g
+(-1)
+is
+0
+again
+.
+Then
+,
+the
+innermost
+g
+is
+0,
+so
+we
+have
+g
+(0)
++1.
+Wait
+,
+no
+,
+let
+me
+re
+-
+express
+this
+.
+Wait
+,
+the
+expression
+is
+g
+(
+g
+(
+g
+(
+i
+-2)
+-
+1))
++
+1.
+So
+for
+i
+=1:
+i
+-2
+=
+-1.
+So
+g
+(-1)
+is
+0.
+Then
+,
+subtract
+1:
+0
+-1
+=
+-1.
+Then
+,
+call
+g
+(-1)
+again
+,
+which
+is
+0.
+Then
+,
+call
+g
+(0)
+which
+is
+0.
+Then
+add
+1.
+Wait
+,
+no
+,
+maybe
+I
+’
+m
+getting
+the
+order
+wrong
+.
+Wait
+,
+the
+innermost
+call
+is
+g
+(
+i
+-2).
+So
+for
+i
+=1,
+i
+-2
+is
+-1.
+So
+g
+(-1)
+returns
+0.
+Then
+subtract
+1:
+0
+-1
+=
+-1.
+Then
+,
+call
+g
+(-1)
+again
+,
+which
+returns
+0.
+Then
+,
+call
+g
+(0),
+which
+returns
+0.
+Then
+add
+1.
+Wait
+,
+that
+seems
+off
+.
+Let
+me
+write
+it
+step
+by
+step
+:
+g
+(1)
+=
+g
+(
+g
+(
+g
+(1-2)
+-
+1))
++
+1
+=
+g
+(
+g
+(
+g
+(-1)
+-
+1))
++
+1
+=
+g
+(
+g
+(0
+-
+1))
++
+1
+(
+since
+g
+(-1)
+is
+0)
+=
+g
+(
+g
+(-1))
++
+1
+=
+g
+(0)
++
+1
+(
+since
+g
+(-1)
+is
+0)
+=
+0
++
+1
+=
+1
+So
+g
+(1)
+returns
+1.
+That
+’
+s
+interesting
+.
+What
+about
+g
+(2)?
+g
+(2)
+=
+g
+(
+g
+(
+g
+(0)
+-
+1))
++
+1
+g
+(0)
+is
+0,
+so
+0
+-1
+=
+-1.
+Then
+g
+(-1)
+is
+0.
+Then
+g
+(0)
+is
+0.
+So
+0
++1
+=
+1.
+So
+g
+(2)
+is
+1.
+Wait
+,
+that
+’
+s
+the
+same
+as
+g
+(1).
+What
+about
+g
+(3)?
+g
+(3)
+=
+g
+(
+g
+(
+g
+(1)
+-
+1))
++
+1
+g
+(1)
+is
+1,
+so
+1
+-1
+=
+0.
+Then
+g
+(0)
+is
+0.
+Then
+g
+(0)
+is
+0.
+So
+0
++1
+=
+1.
+So
+g
+(3)
+is
+1.
+Hmm
+,
+so
+for
+i
+=1,2,3,
+g
+(
+i
+)
+returns
+1.
+What
+about
+i
+=4?
+g
+(4)
+=
+g
+(
+g
+(
+g
+(2)
+-
+1))
++
+1
+g
+(2)
+is
+1,
+so
+1
+-1
+=
+0.
+Then
+g
+(0)
+is
+0.
+Then
+g
+(0)
+is
+0.
+So
+0
++1
+=
+1.
+So
+g
+(4)
+is
+1.
+Wait
+,
+so
+it
+seems
+like
+for
+any
+i
+>
+0,
+g
+(
+i
+)
+returns
+1.
+Is
+that
+possible
+?
+Let
+me
+check
+i
+=5.
+g
+(5)
+=
+g
+(
+g
+(
+g
+(3)
+-
+1))
++
+1
+g
+(3)
+is
+1,
+so
+1
+-1
+=
+0.
+Then
+g
+(0)
+is
+0.
+Then
+g
+(0)
+is
+0.
+So
+0
++1
+=
+1.
+Yep
+,
+same
+result
+.
+So
+,
+in
+general
+,
+for
+any
+i
+>
+0,
+g
+(
+i
+)
+will
+compute
+to
+1.
+Because
+no
+matter
+what
+i
+is
+,
+when
+you
+subtract
+2
+enough
+times
+,
+you
+get
+to
+a
+point
+where
+the
+inner
+g
+call
+returns
+0,
+leading
+to
+a
+chain
+that
+ends
+up
+with
+0
++1
+=
+1.
+Wait
+,
+but
+what
+if
+i
+is
+even
+or
+odd
+?
+Let
+’
+s
+see
+.
+Let
+’
+s
+take
+i
+=6:
+g
+(6)
+=
+g
+(
+g
+(
+g
+(4)
+-
+1))
++
+1
+g
+(4)
+is
+1,
+so
+1
+-1
+=
+0.
+Then
+g
+(0)
+is
+0.
+Then
+g
+(0)
+is
+0.
+So
+0
++1
+=
+1.
+Same
+result
+.
+So
+regardless
+of
+the
+input
+i
+(
+as
+long
+as
+it
+’
+s
+positive
+),
+g
+(
+i
+)
+returns
+1.
+That
+’
+s
+a
+key
+insight
+.
+So
+g
+(
+i
+)
+is
+a
+function
+that
+returns
+0
+if
+i
+<=0,
+and
+1
+otherwise
+.
+Now
+,
+moving
+on
+to
+h
+(
+j
+):
+h
+(
+j
+)
+is
+defined
+as
+:
+If
+j
+<=
+0,
+return
+0.
+Otherwise
+,
+return
+h
+(
+g
+(
+j
+)
+-
+1)
+-
+1.
+But
+since
+g
+(
+j
+)
+is
+1
+for
+any
+j
+>
+0,
+then
+g
+(
+j
+)
+-1
+is
+0.
+So
+for
+any
+j
+>
+0,
+h
+(
+j
+)
+becomes
+h
+(0)
+-1.
+But
+h
+(0)
+is
+0,
+so
+h
+(
+j
+)
+=
+0
+-1
+=
+-1.
+Wait
+,
+but
+then
+h
+(
+j
+)
+for
+j
+>
+0
+would
+be
+-1.
+But
+then
+,
+if
+we
+call
+h
+again
+with
+that
+value
+?
+Wait
+,
+let
+’
+s
+step
+through
+h
+(
+j
+)
+for
+j
+>
+0.
+Let
+’
+s
+take
+j
+=1:
+h
+(1)
+=
+h
+(
+g
+(1)
+-
+1)
+-1
+g
+(1)
+is
+1,
+so
+1
+-1
+=
+0.
+Then
+h
+(0)
+is
+0.
+So
+0
+-1
+=
+-1.
+So
+h
+(1)
+returns
+-1.
+What
+about
+j
+=2?
+h
+(2)
+=
+h
+(
+g
+(2)
+-1)
+-1
+g
+(2)
+is
+1,
+so
+1
+-1
+=
+0.
+h
+(0)
+is
+0.
+0
+-1
+=
+-1.
+So
+h
+(2)
+is
+-1.
+Same
+for
+any
+j
+>
+0.
+So
+h
+(
+j
+)
+for
+j
+>
+0
+is
+always
+-1.
+So
+when
+f
+(
+x
+)
+calls
+h
+(
+x
+),
+if
+x
+is
+positive
+,
+h
+(
+x
+)
+returns
+-1,
+and
+then
+f
+returns
+None
+.
+If
+x
+is
+0
+or
+negative
+,
+h
+(
+x
+)
+returns
+0,
+and
+then
+f
+returns
+None
+.
+Wait
+,
+but
+this
+seems
+too
+straightforward
+.
+So
+in
+all
+cases
+,
+h
+(
+x
+)
+terminates
+.
+Because
+even
+if
+x
+is
+a
+very
+large
+number
+,
+h
+(
+x
+)
+just
+does
+a
+constant
+number
+of
+steps
+.
+For
+example
+,
+h
+(1000)
+would
+compute
+g
+(1000)
+which
+is
+1,
+subtract
+1
+to
+get
+0,
+then
+call
+h
+(0)
+which
+is
+0,
+subtract
+1
+to
+get
+-1.
+So
+it
+’
+s
+a
+finite
+number
+of
+steps
+.
+Therefore
+,
+h
+(
+x
+)
+always
+terminates
+,
+regardless
+of
+the
+input
+x
+.
+Therefore
+,
+the
+function
+f
+(
+x
+)
+will
+always
+terminate
+,
+no
+matter
+what
+input
+x
+is
+given
+.
+So
+the
+answer
+should
+be
+"#
+T
+".
+</
+think
+>
+<
+answer
+>
+#
+T
+Figure 11.28
+:
+CWM
+predicting termination. The prompt features a terminating example in HaltEval-prelim. After emitting a reasoning trace, which considers behavior on several specific inputs, CWM correctly predicts termination on all inputs.
+12
+RL algorithm
+Given a prompt
+x
+x
+, we perform
+G
+G
+rollouts, producing a set of trajectories (i.e., token sequences)
+$\{y_{1},y_{2},\ldots,y_{G}\}$
+.
+In general, rollouts are multi-turn, so the trajectories
+y
+i
+y_{i}
+consist of a prompt
+x
+x
+followed by a sequence of actions and observations. We use the binary mask
+M
+i
+,
+t
+M_{i,t}
+to signal whether token
+y
+i
+,
+t
+y_{i,t}
+was generated by the agent (
+M
+i
+,
+t
+=
+1
+M_{i,t}=1
+) or environment (initial prompt and later observations;
+M
+i
+,
+t
+=
+0
+M_{i,t}=0
+).
+The first input required by the PPO loss is an estimate of the advantage. We denote by
+R
+i
+R_{i}
+the total return (i.e., sum of undiscounted rewards) of trajectory
+i
+i
+.
+For a batch of
+G
+G
+trajectories, we compute the length-weighted mean return
+$\mu=\frac{1}{L}\sum_{i=1}^{G}R_{i}\times L_{i}$
+, where
+$L_{i}=\sum_{t}M_{i,t}$
+and
+$L=\sum_{i}L_{i}$
+is the total number of agent-generated tokens. The advantage is then
+$\hat{A}_{i}=R_{i}-\mu$
+.
+The PPO loss further requires the log probabilities of the trajectory under the behavior policy, often denoted
+π
+old
+\pi_{\text{old}}
+, in order to compute the importance ratio.
+One complicating factor here is that the workers continue rollouts in parallel to model updates (see Section 6.2).
+At a given point in time, any number of the
+G
+G
+rollouts in a batch may be in progress. Hence, the true behavior policy distribution is difficult to describe mathematically. Nevertheless, we use the notation
+$\log\pi_{\text{old}}(y_{i,t}\mid y_{i,<t})$
+to denote the token log probability produced by our inference backend at the moment token
+y
+i
+,
+t
+y_{i,t}
+was sampled, and use this quantity for importance weighting as described below.
+Finally, the PPO loss requires the policy log probabilities, which are computed on the trainer nodes. When a trainer receives a worker batch of
+G
+G
+trajectories associated with a prompt
+x
+x
+, it computes the advantages and adds the trajectories to a queue.
+Then, to produce a batch
+ℬ
+\mathcal{B}
+for training, trajectories are popped from the queue until a limit of
+N
+N
+tokens is reached. By keeping a fixed limit of
+N
+N
+tokens we reduce the variance in the batch size between different steps, and optimize the GPU utilization without over-allocating GPU memory.
+The trajectories are packed into a flat batch, padded to
+N
+N
+tokens, and forwarded to produce the log probabilities of the tokens
+log
+⁡
+π
+​
+(
+y
+i
+,
+t
+|
+y
+i
+,
+<
+t
+)
+\log\pi(y_{i,t}|y_{i,<t})
+.
+As a consequence, the trajectories associated with one prompt may be spread out over multiple gradient updates.
+Finally, the loss is calculated as
+$$\mathcal{J}(\theta)=\frac{1}{N}\sum_{y_{i},A_{i}\in\mathcal{B}}\sum_{t=1}^{|y_{i}|}M_{i,t}\min\left[\rho_{i,t}(\theta)\hat{A}_{i},\;\text{clip}\left(\rho_{i,t}(\theta),1-\varepsilon_{\text{low}},1+\varepsilon_{\text{high}}\right)\hat{A}_{i}\right],$$
+where, as noted before,
+ℬ
+\mathcal{B}
+is the trainer batch (not generally equal to the set of
+G
+G
+trajectories per prompt),
+M
+i
+,
+t
+∈
+{
+0
+,
+1
+}
+M_{i,t}\in\{0,1\}
+masks out environment-generated tokens,
+ε
+low
+\varepsilon_{\text{low}}
+and
+ε
+high
+\varepsilon_{\text{high}}
+are the PPO clipping thresholds, and
+N
+N
+is the maximum number of tokens in a batch (which we set to
+131 072
+131\,072
+, the maximum context size of our model).
+The importance ratio
+ρ
+i
+,
+t
+\rho_{i,t}
+is computed from the log probabilities
+log
+⁡
+π
+θ
+\log\pi_{\theta}
+and
+log
+⁡
+π
+old
+\log\pi_{\text{old}}
+(computed on the trainer and worker, respectively) as follows:
+$$\rho_{i,t}(\theta)=\exp\left(\log\pi_{\theta}(y_{i,t}\mid y_{i,<t})-\log\pi_{\text{old}}(y_{i,t}\mid y_{i,<t})\right).$$
+The gradient of $\rho_{i,t}(\theta)\hat{A}_{i}$ equals $\frac{\pi_{\theta}}{\pi_{\text{old}}}\nabla\log\pi_{\theta}\hat{A}_{i}$, which is the importance-weighted policy gradient estimator.
+Thus, the PPO loss can be understood as a clipped version of this.
+13
+Activ image-building pipeline
+The Activ pipeline, shown in Figure
+13.29
+, automatically builds executable repository images at scale by modifying their GitHub Actions workflows and running them locally using the
+act
+(Lee,
+2019
+)
+library within a virtual environment. Our approach builds on the insight that the execution environment of a GitHub Actions workflow running CI tests is a fully built environment with dependencies, and can therefore be captured as a standalone Docker image for later execution.
+Figure 13.29
+:
+Activ image building pipeline for a single repository. After cloning from GitHub, the repository’s GitHub Actions workflows and pytest
+conftest.py
+files (in Python repositories) are modified and copied into the outer Docker (or virtual machine) for isolated CI execution via
+act
+. Modified workflow jobs are executed in parallel within individual containers, until the
+container_id
+and built-environment state are captured from the target container that holds repository dependencies. Framework executability detection precedes capture to ensure targeting the correct container: for pytest repositories, this occurs implicitly when injected
+conftest.py
+code executes within a session-scoped fixture, with an optional function-scoped fixture available for Python execution tracing. Non-Python repositories use modified workflow steps to verify framework executability (such as
+Jest
+) before capture. Upon successful capture, an early exit is triggered and the resulting container is committed and pushed for standalone execution.
+As we require only a single successful build of the repository, the pipeline reduces complex cross-platform and framework build matrices into a single entry
+(Saavedra et al.,
+2024
+)
+, by selecting most-compatible Python versions and Ubuntu variants. The pipeline also modifies each repository’s GitHub Actions workflows to
+continue-on-error
+, ensuring pipeline completion when encountering noncritical failures. We also implement multiple early exit strategies to terminate the pipeline as soon as a built-environment state has been captured, progress has stalled, or a timeout is reached.
+We modify each GitHub Actions workflow to probe for available test frameworks by checking if a list of predefined frameworks are executable when the workflow is running. For each detected framework, the pipeline captures the corresponding build environment and executes container ID capture logic that is equivalent to the Python-specific pytest capture process, detailed below.
+The pipeline modifies (or adds)
+conftest.py
+pytest configuration files to each test directory in Python repositories. A session-scoped fixture is automatically injected during pytest execution to capture the build state of containers running unit tests. This fixture detects the ID of the container running unit tests for the
+docker commit
+of the repository’s current build state. The environment capture process further preserves the container’s build state by writing-out bash environment variables and creating archives of the mounted repository code and hosted toolcache dependencies to the container’s file-system for later restoration via a Docker
+entrypoint
+script.
+To achieve the scale required for our dataset, we run on an internal sandboxing platform to execute approximately 500 repositories in parallel within secure, isolated virtual environments.
+14
+Hyper-parameter scaling laws
+14.1
+Derivation of per-token compute formula
+Consider a transformer model with hidden dimension
+d
+d
+, sequence length
+S
+S
+, batch size
+B
+B
+, and
+L
+L
+layers.
+Linear layers.
+For a linear transformation of size
+N
+N
+, the forward pass requires
+2
+​
+N
+2N
+floating point operations (FLOP): one multiplication and one addition per weight–input pair. The backward pass is approximately twice as expensive, since gradients must be computed with respect to both the weights and the inputs. Thus, the total cost per linear layer is
+6
+​
+N
+6N
+FLOP.
+Self-attention.
+In multi-head self-attention, the two dominant operations are
+Q
+​
+K
+⊤
+QK^{\top}
+and
+softmax
+​
+(
+Q
+​
+K
+⊤
+)
+​
+V
+\mathrm{softmax}(QK^{\top})V
+. The FLOP cost of the softmax itself is negligible compared to these matrix multiplications. The forward pass of each multiplication costs
+2
+​
+B
+​
+S
+2
+​
+d
+2BS^{2}d
+FLOP, while the backward pass is about twice as expensive, contributing
+4
+​
+B
+​
+S
+2
+​
+d
+4BS^{2}d
+FLOP. Summing the forward and backward costs over both multiplications gives
+12
+​
+B
+​
+S
+2
+​
+d
+12BS^{2}d
+FLOP. Because causal attention only computes half of the entries of
+Q
+​
+K
+⊤
+QK^{\top}
+, the cost reduces to
+6
+​
+B
+​
+S
+2
+​
+d
+6BS^{2}d
+. Dividing by the number of tokens
+B
+​
+S
+BS
+gives the per-token cost
+6
+​
+S
+​
+d
+6Sd
+. Since each of the
+L
+L
+layers contains one self-attention block, the per-token cost for attention across all layers is
+6
+​
+S
+​
+d
+​
+L
+6SdL
+.
+14.2
+Quasi-random search for batch size and learning rate
+To estimate the optimal batch size (BS) and learning rate (LR) range, and how it evolves with scale, we performed a quasi-random search using Sobol sequences. At each scale, BS/LR candidates were generated by sampling two-dimensional Sobol sequences and rescaling them according to ranges that increase for BS and decrease for LR as the model scale increases.
+In Figure
+14.30
+, the gray points correspond to all BS/LR candidates evaluated at each scale, while the blue points indicate those within
+1
+%
+1\%
+of the best validation loss at that scale.
+Figure 14.30
+:
+Optimal range for batch size and learning rate across scales is quite large. However going beyond that range leads to rapidly degrading performance.
+15
+RL data decontamination
+We use MinHash LSH to decontaminate all our prompts in our RL datasets against the following evaluation benchmarks:
+•
+Math reasoning
+: AIME 2024/2025, HARP
+(Yue et al.,
+2024
+)
+, GSM8K test
+(Cobbe et al.,
+2021
+)
+, OmniMath (test)
+(Gao et al.,
+2025
+)
+, Math500
+(Hendrycks et al.,
+2021b
+)
+.
+•
+Code generation
+: HumanEval
+(Chen et al.,
+2021
+)
+, MBPP (valid/test)
+(Austin et al.,
+2021
+)
+, LiveCodeBench (20240801-20250501)
+(Jain et al.,
+2025a
+)
+.
+•
+Scientific reasoning
+: GPQA (Main/Extended/Diamond)
+(Rein et al.,
+2023
+)
+.
+•
+Commonsense and general reasoning
+: ARC Challenge/Easy (valid/test)
+(Clark et al.,
+2018
+)
+, CommonsenseQA (valid/test)
+(Talmor et al.,
+2019
+)
+, DROP
+(Dua et al.,
+2019
+)
+, PIQA (valid/test)
+(Bisk et al.,
+2020
+)
+, HellaSwag (valid/test)
+(Zellers et al.,
+2019
+)
+, SimpleQA
+(Wei et al.,
+2024
+)
+, OpenBookQA main/additional (valid/test)
+(Mihaylov et al.,
+2018
+)
+, WinoGrande 1.1 (dev/test)
+(Sakaguchi et al.,
+2020
+)
+.
+•
+Conversation and evaluation frameworks
+: ArenaHard
+(Li et al.,
+2024
+)
+, MTBench
+(Zheng et al.,
+2023
+)
+.
+We decontaminate our dataset of Dockerized executable repositories against SWE-bench Verified
+(Jimenez et al.,
+2024
+)
+by doing the following:
+•
+Removed all Docker images built from repositories in SWE-bench Verified.
+•
+Confirmed no remaining unexpected instance-level contamination by verifying that pairwise (dataset instance to SWE-bench Verified instance) Jaccard similarities between diff patch line sets remained below 0.2.
+16
+Mathematical expression comparison for RL
+Reinforcement learning on mathematical computation problems, both with numerical and symbolic answers, requires comparing the predicted answer to the ground truth answer contained in the dataset. We do this using the disjunction of two verifiers: a custom one described below and MathVerify
+(Kydlicek et al.,
+2025
+)
+. If any of them considers the expressions to be equivalent, the predicted answer is deemed correct.
+The custom verifier grows a set of equivalent expressions for both the predicted answer and the ground truth answer, and returns whether at the end, there is a nonempty intersection between those sets.
+Expressions are added based on various normalizations and transformations: string normalizations and replacements, normalization of unicode math symbols to LaTeX, normalization of LaTeX expressions, numerical computation with floating point numbers up to a certain precision, symbolic computations, simplifications and normalization using SymPy
+(Meurer et al.,
+2017
+)
+and recursion in this manner for structured objects such as matrices and real intervals.
+17
+Prompting guide
+Reserved tokens
+are used for general text and chat formatting, and are not intended to be encoded from user input.
+They include text sequence start and end markers, padding, chat message header delimiters, and an end of chat message token.
+•
+<|begin_of_text|>
+(128000): global text sequence start marker.
+•
+<|end_of_text|>
+(128001): global text sequence end marker.
+•
+<|pad|>
+(128004): padding token.
+•
+<|start_header_id|>
+(128006): start of chat message header.
+•
+<|end_header_id|>
+(128007): end of chat message header.
+•
+<|eot_id|>
+(128008): end of chat message.
+Trace prediction tokens
+are designed for predicting program execution traces and may be enabled when encoding user-controllable input to expose CWM’s trace prediction functionality.
+They include tokens for frame delimiting, action separation, function return/call, next line, exception, and argument separation, as well as a sentinel token for the start of the source code context for trace prediction (see below).
+•
+<|frame_sep|>
+(128100): start of trace sentinel, end of execution step.
+•
+<|action_sep|>
+(128101): start of source code line.
+•
+<|return_sep|>
+(128102): execution step: return from function scope.
+•
+<|call_sep|>
+(128103): execution step: enter function scope.
+•
+<|line_sep|>
+(128104): execution step: next line.
+•
+<|exception_sep|>
+(128105): execution step: exception.
+•
+<|arg_sep|>
+(128106): separator for return and exception values.
+•
+<|trace_context_start|>
+(128107): start of source code context for trace prediction.
+Chat format.
+A chat is structured as a list of messages, each with the following format:
+⬇
+<|
+start_header_id
+|>
+$ROLE
+<|
+end_header_id
+|>
+$CONTENT
+<|
+eot_id
+|>
+The
+$ROLE
+can be
+system
+,
+user
+,
+assistant
+, or
+tool: $TOOL
+.
+$CONTENT
+is the message content.
+The model is to be prompted with an assistant header and two following newline characters; the
+<|eot_id|>
+token marks the end of its reply and is thus the stop token for inference.
+The conversation’s first token is expected to be
+<|begin_of_text|>
+.
+Reasoning.
+CWM is a hybrid reasoning and non-reasoning model; reasoning mode is enabled via prompting. Reasoning mode is turned on by starting the system prompt with:
+⬇
+You
+are
+a
+helpful
+AI
+assistant
+.
+You
+always
+reason
+before
+responding
+,
+using
+the
+following
+format
+:
+<
+think
+>
+your
+internal
+reasoning
+</
+think
+>
+your
+external
+response
+The model should be prompted with a leading
+<think>\n
+, i.e., a prompt should end with (showing newline characters for clarity here):
+⬇
+<|
+start_header_id
+|>
+assistant
+<|
+end_header_id
+|>\
+n
+\
+n
+<
+think
+>\
+n
+The reasoning section will be closed with
+</think>
+, and any text produced afterwards is the answer to the preceding user input.
+Tool Use.
+The model performs tool calls with the following format:
+⬇
+<
+tool
+:
+$TOOL
+>
+$CONTENT
+</
+tool
+>
+Any available tools are to be announced in the system prompt. User code is responsible for detecting tool calls in model output and responding with a message marked with the respective role.
+An example tool output of the
+python
+tool could be:
+⬇
+<|
+start_header_id
+|>
+tool
+:
+python
+<|
+end_header_id
+|>
+completed
+.
+[
+stdout
+]
+$STDOUT_CONTENT
+[/
+stdout
+]
+[
+stderr
+]
+$STDERR_CONTENT
+[/
+stderr
+]<|
+eot_id
+|>
+Control is then handed back to the model for further processing.
+An example of how tools can be specified in the system prompt:
+⬇
+You
+have
+access
+to
+the
+following
+tools
+:
+<
+tool
+:
+bash
+>
+[
+command
+(
+s
+)]
+</
+tool
+>
+Executes
+bash
+command
+(
+s
+)
+[
+command
+(
+s
+)]
+in
+the
+current
+session
+.
+[
+command
+(
+s
+)]
+can
+be
+any
+non
+-
+interactive
+bash
+command
+(
+s
+)
+either
+single
+or
+multi
+-
+line
+.
+<
+tool
+:
+create
+>
+[
+path
+]
+[
+content
+]
+</
+tool
+>
+Creates
+a
+new
+file
+at
+[
+path
+]
+with
+[
+content
+],
+where
+[
+path
+]
+must
+not
+exist
+,
+but
+its
+parent
+directory
+must
+exist
+.
+Here, the model may invoke either the
+bash
+or the
+create
+tool.
+Trace Prediction.
+CWM is able to predict the execution of Python programs on a step-by-step basis using dedicated trace prediction tokens. The prompt requires a source code context,
+$CONTEXT
+, and a sentinel
+<|frame_sep|>
+token to induce trace prediction, structured as:
+⬇
+<|
+begin_of_text
+|><|
+trace_context_start
+|>
+$CONTEXT
+<|
+frame_sep
+|>
+In
+$CONTEXT
+, the entry point for trace prediction is marked with a
+<< START_OF_TRACE
+comment. An execution trace in CWM is a series of
+frames
+, with each frame consisting of an
+observation
+(local variables) and an
+action
+(source code line). There are four different types of frames, formatted as follows:
+⬇
+<|
+call_sep
+|>
+$LOCALS
+<|
+action_sep
+|>
+$SOURCE
+<|
+frame_sep
+|>
+<|
+line_sep
+|>
+$LOCALS
+<|
+action_sep
+|>
+$SOURCE
+<|
+frame_sep
+|>
+<|
+return_sep
+|><|
+action_sep
+|>
+$SOURCE
+<|
+arg_sep
+|>
+$VALUE
+<|
+frame_sep
+|>
+<|
+exception_sep
+|><|
+action_sep
+|>
+$SOURCE
+<|
+arg_sep
+|>
+$VALUE
+<|
+frame_sep
+|>
+The model produces an
+<|end_of_text|>
+token to denote the end of the execution, which is reached when exiting the scope of the trace’s entry point. Locals are formatted as JSON key-value pairs where values are always rendered as JSON strings, and
+$VALUE
+for return and exception frames is also a JSON-encoded string representation.
+18
+Formal mathematics datamix
+We use the following datasets of mathematics in the Lean 4
+(Moura and Ullrich,
+2021
+)
+theorem proving language.
+•
+LeanUniverse
+(Aram H. Markosyan,
+2024
+)
+, a dataset of (initial proof state, tactic, resulting proof state) triples from Lean’s mathematical library
+(mathlib Community,
+2020
+)
+and other open-source Lean repositories, as a form of code world modeling in Lean.
+•
+Goedel-Pset
+(Lin et al.,
+2025
+)
+, formatted as a mathematical statement and proof formalization dataset, where the task is to translate a mathematical problem and solution from natural language to Lean.
+•
+Mathematical statement formalization datasets, where the task is to translate a mathematical statement from natural language
+to a Lean theorem statement without proof: Compfiles
+(CompFiles authors,
+2025
+)
+, Lean Workbook
+(Ying et al.,
+2025
+)
+, miniF2F
+(Zheng et al.,
+2022
+)
+, ProofNet
+(Azerbayev et al.,
+2023
+)
+, Goedel-Pset.
+19
+RULER evaluation
+We compare
+CWM
+against Gemma3-27B and Qwen3-32B post-trained models. Results can be seen on
+Table 14
+. Results suggest that
+CWM
+outperforms Gemma3-27B under
+128
+128
+k sequence length, but falls short under
+32
+32
+k sequence length. Both
+CWM
+and Gemma3-27B achieve worse performance compared to Qwen3-32B. However, it is important to note that Qwen3-32B uses full attention across all layers, resulting in significantly higher computational costs, particularly for longer sequences.
+Therefore, we believe
+CWM
+represents a good trade-off between efficiency and model performance.
+Table 14
+:
+RULER results at 32k and 128k sequence length. Results are reported for
+CWM
+, Qwen3-32B and Gemma-3-27B.
+Context
+Gemma-3-27B
+Qwen3-32B
+CWM
+32k
+91.1
+94.4
+84.3
+128k
+66.0
+85.6
+69.7
+20
+Agent capabilities learnt during RL training
+In the context of SWE agentic tasks, we highlight in
+Figure 20.31
+two notable capabilities learnt by the SWE RL agent during the RL training stage.
+First,
+Figure 31(a)
+shows that the agent learns to test code more often over the course of RL training: while at the beginning of RL training the agent runs tests on at least one turn of a rollout for 57% of trajectories, after only 4,000 steps of RL training it runs tests on at least one turn of a rollout for 74% of trajectories.
+Second, through RL training the agent learns to better localize files relevant to solving the issue. We formalize it by defining the
+recall
+for a given task to be the percentage of files in the gold patch that were edited by the agent during a rollout:
+$$\text{Recall}=\frac{\left|\{\text{files edited by the agent}\}\cap\{\text{files edited in the gold patch}\}\right|}{\left|\{\text{files edited in the gold patch}\}\right|}.$$
+Figure 31(b)
+shows that the agent’s average recall increases from 58% at the start of RL training to over 66% after only 4,000 steps.
+(a)
+Percentage of trajectories where SWE RL performs tests on at least one turn increases over the course of RL training.
+(b)
+SWE RL learns to localize the relevant files over the course of RL training.
+Figure 20.31
+:
+Agentic capabilities learnt during
+CWM
+RL training.
\ No newline at end of file
diff --git a/research/notes/dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md b/research/notes/dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md
new file mode 100644
index 0000000000000000000000000000000000000000..8edcfe284c64e89508c3b5dbb214421329cf36e7
--- /dev/null
+++ b/research/notes/dapo-an-open-source-llm-reinforcement-learning-system-at-scale.md
@@ -0,0 +1,3949 @@
+---
+title: 'DAPO: An Open-Source LLM Reinforcement Learning System at Scale'
+id: dapo-an-open-source-llm-reinforcement-learning-system-at-scale
+tags:
+- deepread
+created: '2026-06-10T00:30:48.458505Z'
+source: https://arxiv.org/html/2503.14476
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:48.458357Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+DAPO: An Open-Source LLM Reinforcement Learning System at Scale
+[1] ByteDance Seed
+[2] Institute for AI Industry Research (AIR), Tsinghua University
+[3] The University of Hong Kong
+[4] SIA-Lab of Tsinghua AIR and ByteDance Seed
+\contribution
+Full author list in Contributions
+DAPO: An Open-Source LLM Reinforcement Learning System at Scale
+(March 17, 2025)
+Abstract
+Inference scaling empowers LLMs with unprecedented reasoning ability, with reinforcement learning as the core technique to elicit complex reasoning. However, key technical details of state-of-the-art reasoning LLMs are concealed (such as in OpenAI o1 blog and DeepSeek R1 technical report), thus the community still struggles to reproduce their RL training results.
+We propose the
+D
+ecoupled Clip and
+D
+ynamic s
+A
+mpling
+P
+olicy
+O
+ptimization (
+DAPO
+) algorithm, and fully open-source a state-of-the-art large-scale RL system that achieves 50 points on AIME 2024 using Qwen2.5-32B base model.
+Unlike previous works that withhold training details, we introduce four key techniques of our algorithm that make large-scale LLM RL a success. In addition, we open-source our training code, which is built on the
+verl
+framework
+1
+1
+1
+https://github.com/volcengine/verl
+, along with a carefully curated and processed dataset. These components of our open-source system enhance reproducibility and support future research in large-scale LLM RL.
+\correspondence
+,
+\checkdata
+[Project Page]
+https://dapo-sia.github.io/
+Figure 1
+:
+AIME 2024 scores of
+DAPO
+on the Qwen2.5-32B base model, outperforming the previous SoTA DeepSeek-R1-Zero-Qwen-32B using 50% training steps. The x-axis represents the gradient update steps.
+1
+Introduction
+Test-time scaling such as OpenAI’s o1
+[
+1
+]
+and DeepSeek’s R1
+[
+2
+]
+brings a profound paradigm shift to Large Language Models (LLMs)
+[
+3
+,
+4
+,
+5
+,
+6
+,
+7
+]
+. Test-time scaling enables longer Chain-of-Thought thinking and induces sophisticated reasoning behaviors, which makes the models superior in competitive math and coding tasks like AIME and Codeforces.
+The central technique driving the revolution is large-scale Reinforcement Learning (RL), which elicits complex reasoning behaviors such as self-verification and iterative refinement. However, the actual algorithm and key recipe for scalable RL training remains a myth, hidden from technical reports of existing reasoning models
+[
+1
+,
+2
+,
+8
+,
+9
+,
+10
+,
+11
+]
+. In this paper, we reveal significant obstacles in large-scale RL training and open-source a scalable RL system with fully open-sourced algorithm, training code and dataset that provides democratized solutions with industry-level RL results.
+We experiment over Qwen2.5-32B
+[
+12
+]
+as the pretrained model for RL. In our initial GRPO run, we achieved only 30 points on AIME — a performance significantly below DeepSeek’s RL (47 points). A thorough analysis reveals that the naive GRPO baseline suffers from several key issues such as entropy collapse, reward noise, and training instability. The broader community has encountered similar challenges in reproducing DeepSeek’s results
+[
+13
+,
+14
+,
+15
+,
+16
+,
+17
+,
+18
+,
+19
+]
+suggesting that critical training details may have been omitted in the R1 paper that are required to develop an industry-level, large-scale, and reproducible RL system.
+To close this gap, we release an open-source state-of-the-art system for large-scale LLM RL, which achieves 50 points on AIME 2024 based on Qwen2.5-32B model, outperforming previous state-of-the-art results achieved by DeepSeek-R1-Zero-Qwen-32B
+[
+2
+]
+(47 points) using 50% training steps (Figure
+1
+). We propose the
+D
+ecoupled Clip and
+D
+ynamic s
+A
+mpling
+P
+olicy
+O
+ptimization (
+DAPO
+) algorithm, and introduce 4 key techniques to make RL shine in the long-CoT RL scenario. Details are presented in Section
+3
+.
+1.
+Clip-Higher
+, which promotes the diversity of the system and avoids entropy collapse;
+2.
+Dynamic Sampling
+, which improves training efficiency and stability;
+3.
+Token-Level Policy Gradient Loss
+, which is critical in long-CoT RL scenarios;
+4.
+Overlong Reward Shaping
+, which reduces reward noise and stabilizes training.
+Our implementation is based on verl
+[
+20
+]
+. By fully releasing our state-of-the-art RL system including training code and data, we aim to reveal valuable insights to large-scale LLM RL that benefit the larger community.
+2
+Preliminary
+2.1
+Proximal Policy Optimization (PPO)
+PPO
+[
+21
+]
+introduces a clipped surrogate objective for policy optimization. By constraining the policy updates within a proximal region of the previous policy using clip, PPO stabilizes training and improves sample efficiency. Specifically, PPO updates the policy by maximizing the following objective:
+$$\mathcal{J}_{\text{PPO}}(\theta)=\mathbb{E}_{(q,a)\sim\mathcal{D},\,o_{\leq t}\sim\pi_{\theta_{\text{old}}}(\cdot\mid q)}\left[\min\left(\frac{\pi_{\theta}(o_{t}\mid q,o_{<t})}{\pi_{\theta_{\text{old}}}(o_{t}\mid q,o_{<t})}\hat{A}_{t},\ \text{clip}\left(\frac{\pi_{\theta}(o_{t}\mid q,o_{<t})}{\pi_{\theta_{\text{old}}}(o_{t}\mid q,o_{<t})},1-\varepsilon,1+\varepsilon\right)\hat{A}_{t}\right)\right],$$
+(1)
+where $(q,a)$ is a question-answer pair from the data distribution $\mathcal{D}$, $\varepsilon$ is the clipping range of importance sampling ratio, and $\hat{A}_{t}$ is an estimator of the advantage at time step $t$. Given the value function $V$ and the reward function $R$, $\hat{A}_{t}$ is computed using the Generalized Advantage Estimation (GAE) [22]:
+$$\hat{A}_{t}^{\text{GAE}(\gamma,\lambda)}=\sum_{l=0}^{\infty}(\gamma\lambda)^{l}\delta_{t+l},$$
+(2)
+where
+$$\delta_{l}=R_{l}+\gamma V(s_{l+1})-V(s_{l}),\quad 0\leq\gamma,\lambda\leq 1. \tag{3}$$
+2.2
+Group Relative Policy Optimization (GRPO)
+Compared to PPO, GRPO eliminates the value function and estimates the advantage in a group-relative manner. For a specific question-answer pair $(q,a)$, the behavior policy $\pi_{\theta_{\text{old}}}$ samples a group of $G$ individual responses $\{o_{i}\}_{i=1}^{G}$. Then, the advantage of the $i$-th response is calculated by normalizing the group-level rewards $\{R_{i}\}_{i=1}^{G}$:
+$$\hat{A}_{i,t}=\frac{R_{i}-\text{mean}(\{R_{i}\}_{i=1}^{G})}{\text{std}(\{R_{i}\}_{i=1}^{G})}. \tag{4}$$
+Similar to PPO, GRPO adopts a clipped objective, together with a directly imposed KL penalty term:
+$$\begin{aligned}
+\mathcal{J}_{\text{GRPO}}(\theta)&=\mathbb{E}_{(q,a)\sim\mathcal{D},\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot\mid q)}\\
+&\Bigg[\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|o_{i}|}\sum_{t=1}^{|o_{i}|}\Bigg(\min\Big(r_{i,t}(\theta)\hat{A}_{i,t},\ \text{clip}\Big(r_{i,t}(\theta),1-\varepsilon,1+\varepsilon\Big)\hat{A}_{i,t}\Big)-\beta D_{\text{KL}}(\pi_{\theta}\,\|\,\pi_{\text{ref}})\Bigg)\Bigg],
+\end{aligned} \tag{5}$$
+where
+$$r_{i,t}(\theta)=\frac{\pi_{\theta}(o_{i,t}\mid q,o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t}\mid q,o_{i,<t})}. \tag{6}$$
+It is also worth noting that GRPO computes the objective at the sample level. To be exact, GRPO first calculates the mean loss within each generated sequence, before averaging the loss across different samples. As we will discuss in Section 3.3, this difference may have an impact on the performance of the algorithm.
+(a)
+Accuracies on AIME.
+(b)
+Entropy of actor model.
+Figure 2
+:
+The accuracy on the AIME test set and the entropy of the actor model’s generated probabilities during the RL training process, both before and after applying
+Clip-Higher
+strategy.
+2.3
+Removing KL Divergence
+The KL penalty term is used to regulate the divergence between the online policy and the frozen reference policy. In the RLHF scenario [23], the goal of RL is to align the model behavior without diverging too far from the initial model. However, when training a long-CoT reasoning model, the model distribution can diverge significantly from the initial model, so this restriction is not necessary. Therefore, we exclude the KL term from our proposed algorithm.
+2.4
+Rule-based Reward Modeling
+The use of a reward model usually suffers from the reward hacking problem [24, 25, 26, 27, 28, 29]. Instead, we directly use the final accuracy of a verifiable task as the outcome reward, computed using the following rule:
+$$R(\hat{y},y)=\begin{cases}1,&\texttt{is\_equivalent}(\hat{y},y)\\-1,&\text{otherwise}\end{cases} \tag{7}$$
+where $y$ is the ground-truth answer and $\hat{y}$ is the predicted answer. This has proved to be an effective approach to activating the base model’s reasoning capability, as shown in multiple domains such as automated theorem proving [30, 31, 32, 33], computer programming [34, 35, 36, 37], and mathematics competition [2].
+3
+DAPO
+We propose the Decoupled Clip and Dynamic sAmpling Policy Optimization (DAPO) algorithm. DAPO samples a group of outputs $\{o_{i}\}_{i=1}^{G}$ for each question $q$ paired with the answer $a$, and optimizes the policy via the following objective:
+$$\begin{aligned}
+\mathcal{J}_{\text{DAPO}}(\theta)=\ &\mathbb{E}_{(q,a)\sim\mathcal{D},\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot\mid q)}\\
+&\Bigg[\frac{1}{\sum_{i=1}^{G}|o_{i}|}\sum_{i=1}^{G}\sum_{t=1}^{|o_{i}|}\min\Big(r_{i,t}(\theta)\hat{A}_{i,t},\ \text{clip}\Big(r_{i,t}(\theta),1-\varepsilon_{\text{low}},1+\varepsilon_{\text{high}}\Big)\hat{A}_{i,t}\Big)\Bigg]\\
+\text{s.t.}\quad &0<\Big|\{o_{i}\mid\texttt{is\_equivalent}(a,o_{i})\}\Big|<G,
+\end{aligned} \tag{8}$$
+where
+$$r_{i,t}(\theta)=\frac{\pi_{\theta}(o_{i,t}\mid q,o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t}\mid q,o_{i,<t})},\quad\hat{A}_{i,t}=\frac{R_{i}-\text{mean}(\{R_{i}\}_{i=1}^{G})}{\text{std}(\{R_{i}\}_{i=1}^{G})}. \tag{9}$$
+The full algorithm can be found in Algorithm 1. In this section, we introduce the key techniques associated with DAPO.
+3.1
+Raise the Ceiling: Clip-Higher
+In our initial experiments using naive PPO [21] or GRPO [38], we observed the entropy collapse phenomenon: the entropy of the policy decreases quickly as training progresses (Figure 2(b)). The sampled responses of certain groups tend to be nearly identical. This indicates limited exploration and an early deterministic policy, which can hinder the scaling process.
+We propose the Clip-Higher strategy to address this issue. Clipping over the importance sampling ratio is introduced in Clipped Proximal Policy Optimization (PPO-Clip) [21] to restrict the trust region and enhance the stability of RL. We identify that the upper clip can restrict the exploration of the policy: making an ‘exploitation’ token more probable is much easier, yet the probability of an unlikely ‘exploration’ token is too tightly bounded to be uplifted.
+Concretely, when $\varepsilon=0.2$ (the default value of most algorithms) and $\hat{A}_{i,t}>0$ (the system tries to increase the probability), consider two actions with probabilities $\pi_{\theta_{\text{old}}}(o_{i}\mid q)=0.01$ and $0.9$. The upper bounds of the increased probabilities $\pi_{\theta}(o_{i}\mid q)$ are $0.012$ and $1.08$, respectively $\left(\pi_{\theta_{\text{old}}}\cdot(1+\varepsilon)\right)$.
+This implies that ‘exploitation’ tokens with a higher probability (e.g., 0.9) are not constrained from reaching even larger probabilities such as 0.999. Conversely, for low-probability ‘exploration’ tokens, achieving a non-trivial increase in probability is considerably more challenging.
+Empirically, we also observe that the mean probability of up-clipped tokens is low: $\pi_{\theta}(o_{i}\mid q)<0.2$ (Figure 3(a)). This finding supports our intuition that the upper clipping threshold indeed restricts the probability increase of low-probability ‘exploration’ tokens, thereby potentially constraining the exploration of the system.
+Adhering to the Clip-Higher strategy, we decouple the lower and higher clipping ranges as $\varepsilon_{\text{low}}$ and $\varepsilon_{\text{high}}$, as highlighted in Equation 10:
+$$\begin{aligned}
+\mathcal{J}_{\text{DAPO}}(\theta)=\ &\mathbb{E}_{(q,a)\sim\mathcal{D},\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot\mid q)}\\
+&\Bigg[\frac{1}{\sum_{i=1}^{G}|o_{i}|}\sum_{i=1}^{G}\sum_{t=1}^{|o_{i}|}\min\Big(r_{i,t}(\theta)\hat{A}_{i,t},\ \text{clip}\Big(r_{i,t}(\theta),1-{\color{red}\varepsilon_{\text{low}}},1+{\color{red}\varepsilon_{\text{high}}}\Big)\hat{A}_{i,t}\Big)\Bigg]\\
+\text{s.t.}\quad &0<\Big|\{o_{i}\mid\texttt{is\_equivalent}(a,o_{i})\}\Big|<G.
+\end{aligned} \tag{10}$$
+We increase the value of $\varepsilon_{\text{high}}$ to leave more room for the increase of low-probability tokens. As shown in Figure 2, this adjustment effectively enhances the policy’s entropy and facilitates the generation of more diverse samples. We keep $\varepsilon_{\text{low}}$ as it is, because increasing it will suppress the probability of these tokens to $0$, resulting in the collapse of the sampling space.
+(a)
+Mean up-clipped probability.
+(b)
+The proportion of samples with an accuracy of 1.
+Figure 3
+:
+The mean up-clipped probability as well as the ratio of prompts with accuracy=1.
+3.2
+The More the Merrier: Dynamic Sampling
+Existing RL algorithms suffer from the gradient-decreasing problem when some prompts have accuracy equal to 1. For example, in GRPO, if all outputs $\{o_{i}\}_{i=1}^{G}$ of a particular prompt are correct and receive the same reward, the resulting advantage for this group is zero. A zero advantage results in zero policy gradients, shrinking the magnitude and increasing the noise sensitivity of the batch gradient, thereby degrading sample efficiency. Empirically, the number of samples with accuracy equal to 1 continues to increase, as shown in Figure 3(b). This means that the effective number of prompts in each batch keeps decreasing, which can lead to larger variance in the gradient and dampen the gradient signals for model training.
+To this end, we propose to over-sample and filter out prompts with accuracy equal to 1 or 0, as illustrated in Equation 11, leaving all prompts in the batch with effective gradients and keeping a consistent number of prompts. The sampling cost for each batch is dynamic. Before training, we keep sampling until the batch is fully filled with samples whose accuracy is neither 0 nor 1.
+$$\begin{aligned}
+\mathcal{J}_{\text{DAPO}}(\theta)=\ &\mathbb{E}_{(q,a)\sim\mathcal{D},\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot\mid q)}\\
+&\Bigg[\frac{1}{\sum_{i=1}^{G}|o_{i}|}\sum_{i=1}^{G}\sum_{t=1}^{|o_{i}|}\min\Big(r_{i,t}(\theta)\hat{A}_{i,t},\ \text{clip}\Big(r_{i,t}(\theta),1-\varepsilon_{\text{low}},1+\varepsilon_{\text{high}}\Big)\hat{A}_{i,t}\Big)\Bigg]\\
+\text{s.t.}\quad &{\color{red}0<\Big|\{o_{i}\mid\texttt{is\_equivalent}(a,o_{i})\}\Big|<G}.
+\end{aligned} \tag{11}$$
+Note that this strategy does not necessarily impede training efficiency, because the generation time is typically dominated by the generation of long-tail samples if the RL system is synchronized and the generation stage is not pipelined. Besides, we find that with dynamic sampling the experiment achieves the same performance faster, as shown in Figure 6.
+3.3
+Rebalancing Act: Token-Level Policy Gradient Loss
+The original GRPO algorithm employs a sample-level loss calculation, which involves first averaging the losses by token within each sample and then aggregating the losses across samples. In this approach, each sample is assigned an equal weight in the final loss computation. However, we find that this method of loss reduction introduces several challenges in the context of long-CoT RL scenarios.
+Since all samples are assigned the same weight in the loss calculation, tokens within longer responses (which contain more tokens) may have a disproportionately lower contribution to the overall loss, which can lead to two adverse effects.
+First, for high-quality long samples, this effect can impede the model’s ability to learn reasoning-relevant patterns within them.
+Second, we observe that excessively long samples often exhibit low-quality patterns such as gibberish and repetitive words. Thus, sample-level loss calculation, due to its inability to effectively penalize those undesirable patterns in long samples, leads to an unhealthy increase in entropy and response length, as shown in Figure 4(a) and Figure 4(b).
+(a)
+Entropy of actor model’s generation probabilities.
+(b)
+Average length of actor model-generated responses
+Figure 4
+:
+The entropy of the probability distribution of the actor model, as well as the changes in response length.
+We introduce a Token-level Policy Gradient Loss in the long-CoT RL scenario to address the above limitations:
+$$\begin{aligned}
+\mathcal{J}_{\text{DAPO}}(\theta)=\ &\mathbb{E}_{(q,a)\sim\mathcal{D},\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot\mid q)}\\
+&\Bigg[\frac{1}{{\color{red}\sum_{i=1}^{G}|o_{i}|}}{\color{red}\sum_{i=1}^{G}\sum_{t=1}^{|o_{i}|}}\min\Big(r_{i,t}(\theta)\hat{A}_{i,t},\ \text{clip}\Big(r_{i,t}(\theta),1-\varepsilon_{\text{low}},1+\varepsilon_{\text{high}}\Big)\hat{A}_{i,t}\Big)\Bigg],\\
+\text{s.t.}\quad &0<\Big|\{o_{i}\mid\texttt{is\_equivalent}(a,o_{i})\}\Big|<G.
+\end{aligned} \tag{12}$$
+In this setting, longer sequences can have more influence on the overall gradient update compared to shorter sequences.
+Moreover, from the perspective of individual tokens, if a particular generation pattern can lead to an increase or decrease in reward, it will be equally promoted or suppressed, regardless of the length of the response in which it appears.
+3.4
+Hide and Seek: Overlong Reward Shaping
+In RL training, we typically set a maximum length for generation, with overlong samples truncated accordingly. We find that improper reward shaping for truncated samples can introduce reward noise and significantly disrupt the training process.
+By default, we assign a punitive reward to truncated samples.
+This approach may introduce noise into the training process, as a sound reasoning process can be penalized solely due to its excessive length. Such penalties can potentially confuse the model regarding the validity of its reasoning process.
+To investigate the impact of this reward noise, we first apply an Overlong Filtering strategy which masks the loss of truncated samples. We find that this approach significantly stabilizes training and enhances performance, as demonstrated in Figure 5.
+(a)
+Performance on AIME.
+(b)
+Entropy of actor model.
+Figure 5
+:
+The accuracy of the actor model on AIME and the entropy of its generation probabilities, both before and after applying
+Overlong Reward Shaping
+strategy.
+Algorithm 1 DAPO: **D**ecoupled Clip and **D**ynamic s**A**mpling **P**olicy **O**ptimization
+Input: initial policy model $\pi_{\theta}$; reward model $R$; task prompts $\mathcal{D}$; hyperparameters $\varepsilon_{\mathtt{low}},\varepsilon_{\mathtt{high}}$
+1: for step = 1,…,M do
+2:     Sample a batch $\mathcal{D}_{b}$ from $\mathcal{D}$
+3:     Update the old policy model $\pi_{\theta_{old}}\leftarrow\pi_{\theta}$
+4:     Sample G outputs $\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|q)$ for each question $q\in\mathcal{D}_{b}$
+5:     Compute rewards $\{r_{i}\}_{i=1}^{G}$ for each sampled output $o_{i}$ by running $R$
+6:     Filter out $o_{i}$ and add the remaining to the dynamic sampling buffer (Dynamic Sampling, Equation 11)
+7:     if buffer size $n_{b}<N$:
+8:         continue
+9:     For each $o_{i}$ in the buffer, compute $\hat{A}_{i,t}$ for the $t$-th token of $o_{i}$ (Equation 9)
+10:     for iteration = 1, …, $\mu$ do
+11:         Update the policy model $\pi_{\theta}$ by maximizing the DAPO objective (Equation 8)
+Output: $\pi_{\theta}$
+Table 1
+:
+Furthermore, we propose Soft Overlong Punishment (Equation 13), a length-aware penalty mechanism designed to shape the reward for truncated samples.
+Specifically, when the response length exceeds the predefined maximum value, we define a punishment interval. Within this interval, the longer the response, the greater the punishment it receives.
+This penalty is added to the original rule-based correctness reward, thereby signaling to the model to avoid excessively long responses.
+$$
+R_{\text{length}}(y)=\begin{cases}
+0, & |y|\leq L_{\text{max}}-L_{\text{cache}}\\
+\frac{(L_{\text{max}}-L_{\text{cache}})-|y|}{L_{\text{cache}}}, & L_{\text{max}}-L_{\text{cache}}<|y|\leq L_{\text{max}}\\
+-1, & L_{\text{max}}<|y|
+\end{cases}
+\tag{13}
+$$
+3.5
+Dataset Transformation
+Our dataset is sourced from the web and official competition homepages through a combination of web scraping and manual annotation.
+The answers of math dataset typically come in a variety of formats, such as expression, formula and number, which makes it challenging to design comprehensive rules to parse them.
+To provide accurate reward signals using rules and minimize errors introduced by formula parsers, inspired by AIME, we select and transform the answers into integers, which are easy to parse.
+For example, if the original answer is expressed in the form of $\frac{a+\sqrt{b}}{c}$, we instruct the LLM to modify the question so that the expected answer becomes $a+b+c$.
+After selection and transformation, we obtained the DAPO-Math-17K dataset, which consists of 17K prompts, each paired with an integer as the answer.
+4
+Experiments
+4.1
+Training Details
+In this work, we focus specifically on mathematical tasks to evaluate our algorithm, which can be readily transferred to other tasks. We adopt the verl framework [20] for training. We use naive GRPO [38] as our baseline algorithm and estimate advantages using group reward normalization.
+For hyper-parameters, we utilize the AdamW [39] optimizer with a constant learning rate of $1\times 10^{-6}$, incorporating a linear warm-up over 20 rollout steps.
+For rollout, the prompt batch size is 512 and we sample 16 responses for each prompt. For training, the mini-batch size is set to 512, i.e., 16 gradient updates for each rollout step. For Overlong Reward Shaping, we set the expected maximum length as 16,384 tokens and allocate an additional 4,096 tokens as the soft punish cache. Therefore, the maximum number of tokens for generation is set to 20,480 tokens.
+As for the Clip-Higher mechanism, we set the clipping parameter $\varepsilon_{\text{low}}$ to 0.2 and $\varepsilon_{\text{high}}$ to 0.28, which effectively balances the trade-off between exploration and exploitation.
+For evaluation on AIME, we repeat the evaluation set 32 times and report avg@32 for result stability. The inference hyperparameters for evaluation are set to temperature 1.0 and top-p 0.7.
+Figure 6
+:
+The training progress before and after applying dynamic sampling on a baseline setting.
+4.2
+Main Results
+Experiments on AIME 2024 demonstrate that
+DAPO
+has successfully trained the Qwen-32B Base model into a powerful reasoning model, achieving performance superior to DeepSeek’s experiments on Qwen2.5-32B using the R1 approach.
+In Figure 1, we observe a substantial improvement of performance on AIME 2024, with accuracy increasing from near 0% to 50%. Notably, this improvement is achieved with only 50% of the training steps required by DeepSeek-R1-Zero-Qwen-32B.
+We analyze the contributions of each training technique in our methodology, as detailed in
+Table
+1
+.
+The observed improvements demonstrate the effectiveness of these techniques in RL training, each contributing several accuracy points in AIME 2024.
+Notably, given the vanilla GRPO setting, only 30% accuracy can be reached by training from a Qwen2.5-32B base model.
+For token-level loss, although it brings less performance improvement, we find it enhances training stability and makes the length increase more healthily.
+When applying
+Dynamic Sampling
+, although more data needs to be sampled due to the filtering out of zero-gradient data, the overall training time is not significantly affected.
+As shown in
+Figure
+6
+, although the number of sampling instances increases, the model’s convergence time is even reduced, due to fewer training steps required.
+Table 1: Main results of progressive techniques applied to DAPO
+
+| Model | $\textbf{AIME24}_{\text{avg@32}}$ |
+| --- | --- |
+| DeepSeek-R1-Zero-Qwen-32B | 47 |
+| Naive GRPO | 30 |
+| + Overlong Filtering | 36 |
+| + Clip-Higher | 38 |
+| + Soft Overlong Punishment | 41 |
+| + Token-level Loss | 42 |
+| + Dynamic Sampling (DAPO) | 50 |
+4.3
+Training Dynamics
+Reinforcement learning on large language models is not only a cutting-edge research direction but also an intrinsically complex systems engineering challenge, characterized by the interdependence of its various subsystems. Modifications to any single subsystem can propagate through the system, leading to unforeseen consequences due to the intricate interplay among these components. Even seemingly minor changes in initial conditions, such as variations in data and hyperparameters, can amplify through iterative reinforcement learning processes, yielding substantial deviations in outcomes. This complexity often confronts researchers with a dilemma: even after meticulous analysis and well-founded expectations that a modification will enhance specific aspects of the training process, the actual results frequently diverge from the anticipated trajectory. Therefore, monitoring of key intermediate results during experimentation is essential for swiftly identifying the sources of discrepancies and, ultimately, for refining the system.
+(a)
+Mean response length.
+(b)
+Reward score.
+(c)
+Generation entropy.
+(d)
+Mean probability.
+Figure 7
+:
+The metric curves of response length, reward score, generation entropy, and the mean probability of
+DAPO
+, which show the dynamics of RL training and serve as essential monitoring indicators to identify potential issues.
+•
+The Length of Generated Responses
+is a metric closely related to training stability and performance, as shown in
+Figure
+7(a)
+. The increase in length provides the model with a larger space for exploration, allowing more complex reasoning behaviors to be sampled and gradually reinforced through training. However, it is important to note that length does not always maintain a continuous upward trend during training. In some considerable periods, it can exhibit a trend of stagnation or even decline, which has also been demonstrated in
+[
+2
+]
+. We typically use length in conjunction with validation accuracy as indicators to assess whether an experiment is deteriorating.
+•
+The Dynamics of Reward
+during training has always been one of the crucial monitoring indicators in reinforcement learning, as shown in
+Figure
+7(b)
+. In the majority of our experiments, the trend of reward increase is relatively stable and does not fluctuate or decline significantly due to adjustments in experimental settings. This indicates that, given a reliable reward signal, language models can robustly fit the distribution of training set. However, we find that the final reward on the training set often exhibits little correlation with the accuracy on the validation set, which indicates overfitting to the training set.
+•
+The Entropy of the Actor Model and Generation Probability
+are related to the model’s exploration capability and are key metrics that we closely monitor in our experiments. Intuitively, the model’s entropy needs to be maintained within an appropriate range. An excessively low entropy indicates that the probability distribution is overly sharp, leading to a loss of exploration capability. Conversely, an excessively high entropy is often associated with issues of over-exploration such as gibberish and repetitive generation. For the generation probability, the situation is exactly the opposite. As demonstrated in
+Section
+3.1
+, by applying the Clip-Higher strategy, we effectively addressed the issue of entropy collapse. In subsequent experiments, we find that maintaining a slow upward trend in entropy is conducive to the improvement of model performance, shown in
+Figure
+7(c)
+and
+Figure
+7(d)
+.
+4.4
+Case Study
+Question: Given that the base $ABC$ of the tetrahedron $S-ABC$ is an equilateral triangle, the projection $H$ of point $A$ on the face $SBC$ is the orthocenter of $\triangle SBC$, the dihedral angle $H-AB-C$ is $30^{\circ}$, and $SA=2$, find the volume of the tetrahedron. The answer is in the form $\frac{k}{m}$, so give the value of $k+m$.
+Response:
+Let us consider the given problem: We need to determine …
+…
+Next, let $S$ have coordinates $(x_{0},y_{0},z_{0})$. Since $SA=2$, the distance formula gives us $\sqrt{(x_{0})^{2}+(y_{0})^{2}+(z_{0})^{2}}=2$, so we can write $x_{0}^{2}+y_{0}^{2}+z_{0}^{2}=4$. Now, remember that $H$ is the orthogonal projection of
+…
+However, wait a moment, let’s rethink about the dihedral angle involving planes in a more thoughtful geometric way.
+Consider the plane $\alpha_{1}=ABC$, which lies entirely on the xy coordinate plane (as $z=0$). Let the plane $\alpha_{2}=SBC$. The point $A$ projected perpendicularly to plane $\alpha_{2}$ lands on $H$. The line $l=AB$ …
+…
+Table 2
+:
+Emergence of Reflective Behavior in Reinforcement Learning
+During the RL training process, we observe an interesting phenomenon: the reasoning patterns of the actor model evolve dynamically over time. Specifically, the algorithm not only reinforces existing reasoning patterns that facilitate correct problem-solving but also gradually gives rise to entirely new modes of reasoning that were initially absent.
+This finding reveals the adaptability and exploration capability of RL algorithms and offers new insights into the learning mechanisms of the model.
+For example, in the early stages of model training, there was virtually no occurrence of checking and reflecting on previous reasoning steps.
+However, as training progresses, the model exhibits distinct behaviors of reflection and backtracking, as shown in
+Table
+2
+. This observation sheds light on further exploration into interpreting the emergence of reasoning abilities during RL, which we leave for future research.
+5
+Conclusion
+In this paper, we release a fully open-sourced system for large-scale LLM RL, including algorithm, code infrastructure, and dataset. The system achieves state-of-the-art large-scale LLM RL performance (AIME 50 using Qwen-32B pretrained model). We propose the **D**ecoupled Clip and **D**ynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm, and introduce 4 key techniques to make RL powerfully effective and efficient in the long-CoT RL scenario.
+Additionally, by open-sourcing the training code and dataset, we provide the broader research community and society with practical access to a scalable reinforcement learning solution, enabling all to benefit from these advancements.
+Contributions
+Project Lead
+Qiying Yu
+1,2,4
+Algorithm
+Qiying Yu
+1,2,4
+, Zheng Zhang
+1
+, Ruofei Zhu
+1
+, Yufeng Yuan
+1
+, Xiaochen Zuo
+1
+, Yu Yue
+1
+Infrastructure
+∗
+Weinan Dai
+1,2,4
+, Tiantian Fan
+1
+, Gaohong Liu
+1
+, Juncai Liu
+1
+, Lingjun Liu
+1
+, Xin Liu
+1
+, Haibin Lin
+1
+, Zhiqi Lin
+1
+, Bole Ma
+1
+, Guangming Sheng
+1,3
+, Yuxuan Tong
+1,2,4
+, Qiying Yu
+1,2,4
+, Chi Zhang
+1
+, Mofan Zhang
+1
+, Ru Zhang
+1
+, Wang Zhang
+1
+, Hang Zhu
+1
+, Jinhua Zhu
+1
+∗
+Last-Name in Alphabetical Order
+Dataset
+Jiaze Chen
+1
+, Jiangjie Chen
+1,4
+, Chengyi Wang
+1
+, Hongli Yu
+1,2,4
+, Yuxuan Song
+1,2,4
+, Xiangpeng Wei
+1
+, Qiying Yu
+1,2,4
+Supervision
+Hao Zhou
+2,4
+, Jingjing Liu
+2,4
+, Wei-Ying Ma
+2,4
+, Ya-Qin Zhang
+2,4
+, Lin Yan
+1,4
+, Mu Qiao
+1,4
+, Yonghui Wu
+1
+, Mingxuan Wang
+1,4
+Affiliation
+1
+ByteDance Seed
+2
+Institute for AI Industry Research (AIR), Tsinghua University
+3
+The University of Hong Kong
+4
+SIA-Lab of Tsinghua AIR and ByteDance Seed
+Acknowledgments
+We thank Zhengyin Du, Shengding Hu, Kai Shen, Tianyang Zhan, Zhen Xiao, Renjie Zheng, Li Han, Kaihua Jiang as well as other colleagues at ByteDance for their support for the
+DAPO
+project.
+References
+[1]
+OpenAI.
+Learning to reason with llms, 2024.
+[2]
+Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al.
+Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning.
+arXiv preprint arXiv:2501.12948
+, 2025.
+[3]
+OpenAI.
+GPT4 technical report.
+arXiv preprint arXiv:2303.08774
+, 2023.
+[4]
+Anthropic.
+Claude 3.5 sonnet, 2024.
+[5]
+Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al.
+Language models are few-shot learners.
+Advances in neural information processing systems
+, 33:1877–1901, 2020.
+[6]
+Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al.
+Palm: Scaling language modeling with pathways.
+Journal of Machine Learning Research
+, 24(240):1–113, 2023.
+[7]
+Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, et al.
+Deepseek-v3 technical report.
+arXiv preprint arXiv:2412.19437
+, 2024.
+[8]
+XAI.
+Grok 3 beta — the age of reasoning agents, 2024.
+[9]
+Google DeepMind.
+Gemini 2.0 flash thinking, 2024.
+[10]
+Qwen.
+Qwq-32b: Embracing the power of reinforcement learning, 2024.
+[11]
+Kimi Team, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al.
+Kimi k1.5: Scaling reinforcement learning with llms.
+arXiv preprint arXiv:2501.12599
+, 2025.
+[12]
+An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, et al.
+Qwen2.5 technical report.
+arXiv preprint arXiv:2412.15115
+, 2024.
+[13]
+Zhipeng Chen, Yingqian Min, Beichen Zhang, Jie Chen, Jinhao Jiang, Daixuan Cheng, Wayne Xin Zhao, Zheng Liu, Xu Miao, Yang Lu, et al.
+An empirical study on eliciting and improving r1-like reasoning models.
+arXiv preprint arXiv:2503.04548
+, 2025.
+[14]
+Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, Xiangyu Zhang, and Heung-Yeung Shum.
+Open-reasoner-zero: An open source approach to scaling reinforcement learning on the base model.
+https://github.com/Open-Reasoner-Zero/Open-Reasoner-Zero
+, 2025.
+[15]
+Jian Hu.
+Reinforce++: A simple and efficient approach for aligning large language models.
+arXiv preprint arXiv:2501.03262
+, 2025.
+[16]
+Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, et al.
+Process reinforcement through implicit rewards.
+arXiv preprint arXiv:2502.01456
+, 2025.
+[17]
+Jung Hyun Lee, June Yong Yang, Byeongho Heo, Dongyoon Han, and Kang Min Yoo.
+Token-supervised value models for enhancing mathematical reasoning capabilities of large language models.
+arXiv preprint arXiv:2407.12863
+, 2024.
+[18]
+Amirhossein Kazemnejad, Milad Aghajohari, Eva Portelance, Alessandro Sordoni, Siva Reddy, Aaron Courville, and Nicolas Le Roux.
+Vineppo: Unlocking rl potential for llm reasoning through refined credit assignment.
+arXiv preprint arXiv:2410.01679
+, 2024.
+[19]
+Yufeng Yuan, Yu Yue, Ruofei Zhu, Tiantian Fan, and Lin Yan.
+What’s behind ppo’s collapse in long-cot? value optimization holds the secret.
+arXiv preprint arXiv:2503.01491
+, 2025.
+[20]
+Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu.
+Hybridflow: A flexible and efficient rlhf framework.
+arXiv preprint arXiv:2409.19256
+, 2024.
+[21]
+John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017.
+[22]
+John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel.
+High-dimensional continuous control using generalized advantage estimation, 2018.
+[23]
+Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul F Christiano, Jan Leike, and Ryan Lowe.
+Training language models to follow instructions with human feedback.
+In S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh, editors,
+Advances in Neural Information Processing Systems
+, volume 35, pages 27730–27744. Curran Associates, Inc., 2022.
+[24]
+Dario Amodei, Chris Olah, Jacob Steinhardt, Paul Christiano, John Schulman, and Dan Mané.
+Concrete problems in ai safety, 2016.
+[25]
+Tom Everitt, Victoria Krakovna, Laurent Orseau, Marcus Hutter, and Shane Legg.
+Reinforcement learning with a corrupted reward channel, 2017.
+[26]
+Victoria Krakovna, Jonathan Uesato, Vladimir Mikulik, Matthew Rahtz, Tom Everitt, Ramana Kumar, Zac Kenton, Jan Leike, and Shane Legg.
+Specification gaming: the flip side of ai ingenuity, 2020.
+[27]
+Tom Everitt, Marcus Hutter, Ramana Kumar, and Victoria Krakovna.
+Reward tampering problems and solutions in reinforcement learning: A causal influence diagram perspective, 2021.
+[28]
+Leo Gao, John Schulman, and Jacob Hilton.
+Scaling laws for reward model overoptimization, 2022.
+[29]
+Lilian Weng.
+Reward hacking in reinforcement learning.
+lilianweng.github.io
+, Nov 2024.
+[30]
+Stanislas Polu and Ilya Sutskever.
+Generative language modeling for automated theorem proving, 2020.
+[31]
+Trieu H Trinh, Yuhuai Wu, Quoc V Le, He He, and Thang Luong.
+Solving olympiad geometry without human demonstrations.
+Nature
+, 625(7995):476–482, 2024.
+[32]
+Trieu Trinh and Thang Luong.
+Alphageometry: An olympiad-level ai system for geometry, 2024.
+[33]
+AlphaProof and AlphaGeometry Teams.
+Ai achieves silver-medal standard solving international mathematical olympiad problems, 2024.
+[34]
+Hung Le, Yue Wang, Akhilesh Deepak Gotmare, Silvio Savarese, and Steven Chu Hong Hoi.
+Coderl: Mastering code generation through pretrained models and deep reinforcement learning.
+Advances in Neural Information Processing Systems
+, 35:21314–21328, 2022.
+[35]
+Noah Shinn, Federico Cassano, Edward Berman, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao.
+Reflexion: Language agents with verbal reinforcement learning, 2023.
+[36]
+Xinyun Chen, Maxwell Lin, Nathanael Schärli, and Denny Zhou.
+Teaching large language models to self-debug, 2023.
+[37]
+Jonas Gehring, Kunhao Zheng, Jade Copet, Vegard Mella, Quentin Carbonneaux, Taco Cohen, and Gabriel Synnaeve.
+Rlef: Grounding code llms in execution feedback with reinforcement learning, 2025.
+[38]
+Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Mingchuan Zhang, YK Li, Y Wu, and Daya Guo.
+Deepseekmath: Pushing the limits of mathematical reasoning in open language models.
+arXiv preprint arXiv:2402.03300
+, 2024.
+[39]
+Ilya Loshchilov and Frank Hutter.
+Decoupled weight decay regularization.
+In
+International Conference on Learning Representations
+, 2019.
+Appendix
+6
+Dataset Transformation
+Here is an example of data transformation, where we demonstrate the original problem, the transformed problem, and the model outputs during the transformation process (the original is in markdown text, which has been rendered for readability).
+To ensure the model can complete this task with high accuracy, our goal is to have it perform comprehensive reasoning while avoiding hallucinations. We achieve this by providing a clear thinking framework that encourages thorough reasoning. Specifically, we employ Chain of Thought (CoT) reasoning along with four well-defined steps: extract the answer format, rewrite the problem statement, solve the modified problem and provide an integer as the final answer. For each step, we provide few-shot solutions or detailed guidelines to guide the model’s reasoning.
+Through this approach, we have observed that in most cases, the LLM can generate reformulations with both format and quality that are satisfactory.
+Original Problem
+Let $x$ and $y$ be real numbers such that $x^{2}+y^{2}-22x-16y+113=0.$ Determine the smallest possible value of $x.$
+Answer: $11-2\sqrt{6}$
+Transformed Problem
+Let $x$ and $y$ be real numbers such that $x^{2}+y^{2}-22x-16y+113=0.$ Determine the smallest possible value of $x$. The original answer is in the form $k-m\sqrt{n}$, where $k$, $m$, and $n$ are integers. Please find the value of $k+m+n$.
+Answer: $19$
+Model’s output (Markdown Text)
+Transformation Thought
+Process
+The original answer to the problem is an algebraic expression $11-2\sqrt{6}.$ We need to transform this into an integer form. First, note that the answer consists of an integer part and a square root term. According to the given rule, we replace the integer and square root components with undetermined coefficients. The original problem uses $x$ and $y$ as variables, so we choose other letters as undetermined coefficients. We express the answer in the form $k-m\sqrt{n}$ and require the student to find $k+m+n$.
+Answer Format
+$k-m\sqrt{n}$
+Transformed Problem
+Let $x$ and $y$ be real numbers such that $x^{2}+y^{2}-22x-16y+113=0.$ Determine the smallest possible value of $x$. The original answer is in the form $k-m\sqrt{n}$, where $k$, $m$, and $n$ are integers. Please find the value of $k+m+n$.
+Answer Calculation for the Transformed Problem
+- The original answer is: $11-2\sqrt{6}$.
+- The required format is: $k-m\sqrt{n}$.
+- Matching the coefficients: $k=11$, $m=2$, $n=6$.
+- The final result is: $k+m+n=11+2+6=19.$
+Final Answer
+$19$
+7
+Supplementary Case
+Question:
+Among the 900 residents of Aimeville, there are 195 who own a diamond ring, 367 who own a set of golf clubs, and 562 who own a garden spade. In addition, each of the 900 residents owns a bag of candy hearts. There are 437 residents who own exactly two of these things, and 234 residents who own exactly three of these things. Find the number of residents of Aimeville who own all four of these things.
+Response
+:
+Step by Step Solution:
+1. Let’s define the following variables:
+Let $A$ be the set of residents who own a diamond ring.
+Let $B$ be the set of residents who own a set of golf clubs.
+Let $C$ be the set of residents who own a garden spade.
+Let $D$ be the set of residents who own a bag of candy hearts. Since all 900 residents own a bag of candy hearts, $|D|=900$.
+Let $a_{i}$ denote the number of residents who own exactly $i$ out of the 4 items.
+Let’s use the principle of inclusion-exclusion for the union of 4 sets $A,B,C,D$.
+…
+$a_{4}=\frac{-219}{-4}=\frac{219}{4}=54.75$.
+Since $a_{4}$, the number of people owning all four items, must be a whole number, our current approach needs to be reconsidered in terms of the combinatorial counting of people owning different combinations of the items.
+Instead of directly using the inclusion-exclusion principle, we can use a different approach involving the number of people owning combinations of the items $A$, $B$, $C$, and $D$.
+Let $x_{i}$ be the number of people owning exactly $i$ item among $A$, $B$, $C$ (excluding the item $D$ which everyone owns).
+Then the total number of people owning exactly $k$ items among the $A$, $B$, $C$, and $D$ can be broken down:
+…
+Table 3
+:
+A case of reflective behavior emergence.
\ No newline at end of file
diff --git a/research/notes/deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-language-model.md b/research/notes/deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-language-model.md
new file mode 100644
index 0000000000000000000000000000000000000000..5aaaa82faf8cdd691eccc945ff851ca26c6ceae6
--- /dev/null
+++ b/research/notes/deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-language-model.md
@@ -0,0 +1,5435 @@
+---
+title: 'DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
+  Models'
+id: deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-language-model
+tags:
+- deepread
+created: '2026-06-10T00:30:50.422839Z'
+source: https://arxiv.org/html/2402.03300
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:50.422684Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models
+\reportnumber
+001
+\correspondingauthor
+∗
+Core contributors.
+†
+Work done during internship at DeepSeek-AI.
+DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models
+Zhihong Shao
+1,2∗†
+Peiyi Wang
+1,3∗†
+Qihao Zhu
+1,3∗†
+Runxin Xu
+1
+Junxiao Song
+1
+Xiao Bi
+1
+Haowei Zhang
+1
+Mingchuan Zhang
+1
+Y.K. Li
+1
+Y. Wu
+1
+Daya Guo
+1∗
+1
+DeepSeek-AI
+2
+Tsinghua University
+3
+Peking University
+{zhihongshao,wangpeiyi,zhuqh,guoday}@deepseek.com
+https://github.com/deepseek-ai/DeepSeek-Math
+Abstract
+Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline. Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO.
+Figure 1:
+Top1 accuracy of open-source models on the competition-level MATH benchmark
+(Hendrycks et al.,
+2021
+)
+without the use of external toolkits and voting techniques.
+1
+Introduction
+Large language models (LLM) have revolutionized the approach to mathematical reasoning in artificial intelligence, spurring significant advancements in both the quantitative reasoning benchmark
+(Hendrycks et al.,
+2021
+)
+and the geometry reasoning benchmark
+(Trinh et al.,
+2024
+)
+. Moreover, these models have proven instrumental in assisting humans in solving complex mathematical problems
+(Tao,
+2023
+)
+. However, cutting-edge models such as GPT-4
+(OpenAI,
+2023
+)
+and Gemini-Ultra
+(Anil et al.,
+2023
+)
+are not publicly available, and the currently accessible open-source models considerably trail behind in performance.
+In this study, we introduce DeepSeekMath, a domain-specific language model that significantly outperforms the mathematical capabilities of open-source models and approaches the performance level of GPT-4 on academic benchmarks.
+To achieve this, we create the DeepSeekMath Corpus, a large-scale high-quality pre-training corpus comprising 120B math tokens.
+This dataset is extracted from the Common Crawl (CC) using a fastText-based classifier
+(Joulin et al.,
+2016
+)
+. In the initial iteration, the classifier is trained using instances from OpenWebMath
+(Paster et al.,
+2023
+)
+as positive examples, while incorporating a diverse selection of other web pages to serve as negative examples. Subsequently, we employ the classifier to mine additional positive instances from the CC, which are further refined through human annotation. The classifier is then updated with this enhanced dataset to improve its performance. The evaluation results indicate that the large-scale corpus is of high quality, as our base model DeepSeekMath-Base 7B achieves 64.2% on GSM8K
+(Cobbe et al.,
+2021
+)
+and 36.2% on the competition-level MATH dataset
+(Hendrycks et al.,
+2021
+)
+, outperforming Minerva 540B
+(Lewkowycz et al.,
+2022a
+)
+. In addition, the DeepSeekMath Corpus is multilingual, so we notice an improvement in Chinese mathematical benchmarks
+(Wei et al.,
+2023
+; Zhong et al.,
+2023
+)
+.
+We believe that our experience in mathematical data processing is a starting point for the research community, and there is significant room for improvement in the future.
+DeepSeekMath-Base is initialized with DeepSeek-Coder-Base-v1.5 7B
+(Guo et al.,
+2024
+)
+, as we notice that starting from a code training model is a better choice compared to a general LLM. Furthermore, we observe the math training also improves model capability on MMLU
+(Hendrycks et al.,
+2020
+)
+and BBH benchmarks
+(Suzgun et al.,
+2022
+)
+, indicating it does not only enhance the model’s mathematical abilities but also amplifies general reasoning capabilities.
+After pre-training, we apply mathematical instruction tuning to DeepSeekMath-Base with chain-of-thought
+(Wei et al.,
+2022
+)
+, program-of-thought
+(Chen et al.,
+2022
+; Gao et al.,
+2023
+)
+, and tool-integrated reasoning
+(Gou et al.,
+2023
+)
+data.
+The resulting model DeepSeekMath-Instruct 7B beats all 7B counterparts and is comparable with 70B open-source instruction-tuned models.
+Furthermore, we introduce the Group Relative Policy Optimization (GRPO), a variant reinforcement learning (RL) algorithm of Proximal Policy Optimization (PPO)
+(Schulman et al.,
+2017
+)
+.
+GRPO foregoes the critic model, instead estimating
+the baseline from group scores, significantly reducing training resources.
+By solely using a subset of English instruction tuning data, GRPO obtains a substantial improvement over the strong DeepSeekMath-Instruct, including both in-domain (GSM8K: 82.9% $\rightarrow$ 88.2%, MATH: 46.8% $\rightarrow$ 51.7%) and out-of-domain mathematical tasks (e.g., CMATH: 84.6% $\rightarrow$ 88.8%) during the reinforcement learning phase.
+We also provide a unified paradigm to understand different methods, such as Rejection Sampling Fine-Tuning (RFT)
+(Yuan et al.,
+2023a
+)
+, Direct Preference Optimization (DPO)
+(Rafailov et al.,
+2023
+)
+, PPO and GRPO.
+Based on such a unified paradigm, we find that all these methods are conceptualized as either direct or simplified RL techniques.
+We also conduct extensive experiments, e.g., online v.s. offline training, outcome v.s. process supervision, single-turn v.s. iterative RL and so on, to deeply investigate the essential elements of this paradigm.
+At last, we explain why our RL boosts the performance of instruction-tuned models, and further summarize potential directions to achieve more effective RL based on this unified paradigm.
+1.1
+Contributions
+Our contribution includes scalable math pre-training, along with the exploration and analysis of reinforcement learning.
+Math Pre-Training at Scale
+•
+Our research provides compelling evidence that the publicly accessible Common Crawl data contains valuable information for mathematical purposes.
+By implementing a meticulously designed data selection pipeline, we successfully construct the DeepSeekMath Corpus, a high-quality dataset of 120B tokens from web pages filtered for mathematical content, which is almost 7 times the size of the math web pages used by Minerva
+(Lewkowycz et al.,
+2022a
+)
+and 9 times the size of the recently released OpenWebMath
+(Paster et al.,
+2023
+)
+.
+•
+Our pre-trained base model DeepSeekMath-Base 7B achieves comparable performance with Minerva 540B
+(Lewkowycz et al.,
+2022a
+)
+, indicating the number of parameters is not the only key factor in mathematical reasoning capability.
+A smaller model pre-trained on high-quality data could achieve strong performance as well.
+•
+We share our findings from math training experiments.
+Code training prior to math training improves models’ ability to solve mathematical problems both with and without tool use.
+This offers a partial answer to the long-standing question:
+does code training improve reasoning abilities?
+We believe it does, at least for mathematical reasoning.
+•
+Although training on arXiv papers is common, especially in many math-related papers, it brings no notable improvements on all mathematical benchmarks adopted in this paper.
+Exploration and Analysis of Reinforcement Learning
+•
+We introduce Group Relative Policy Optimization (GRPO), an efficient and effective reinforcement learning algorithm. GRPO foregoes the critic model, instead estimating the baseline from group scores, significantly reducing training resources compared to Proximal Policy Optimization (PPO).
+•
+We demonstrate that GRPO significantly enhances the performance of our instruction-tuned model DeepSeekMath-Instruct, by solely using the instruction-tuning data.
+Furthermore, we observe enhancements in the out-of-domain performance during the reinforcement learning process.
+•
+We provide a unified paradigm to understand different methods, such as RFT, DPO, PPO, and GRPO. We also conduct extensive experiments, e.g., online v.s. offline training, outcome v.s. process supervision, single-turn v.s. iterative reinforcement learning, and so on to deeply investigate the essential elements of this paradigm.
+•
+Based on our unified paradigm, we explore the reasons behind the effectiveness of reinforcement learning, and summarize several potential directions to achieve more effective reinforcement learning of LLMs.
+1.2
+Summary of Evaluations and Metrics
+•
+English and Chinese Mathematical Reasoning
+:
+We conduct comprehensive assessments of our models on English and Chinese benchmarks, covering mathematical problems from grade-school level to college level.
+English benchmarks include GSM8K
+(Cobbe et al.,
+2021
+)
+, MATH
+(Hendrycks et al.,
+2021
+)
+, SAT
+(Azerbayev et al.,
+2023
+)
+, OCW Courses
+(Lewkowycz et al.,
+2022a
+)
+, MMLU-STEM
+(Hendrycks et al.,
+2020
+)
+.
+Chinese benchmarks include MGSM-zh
+(Shi et al.,
+2023
+)
+, CMATH
+(Wei et al.,
+2023
+)
+, Gaokao-MathCloze
+(Zhong et al.,
+2023
+)
+, and Gaokao-MathQA
+(Zhong et al.,
+2023
+)
+.
+We evaluate models’ ability to generate self-contained text solutions without tool use, and also the ability to solve problems using Python.
+On English benchmarks, DeepSeekMath-Base is competitive with the closed-source Minerva 540B
+(Lewkowycz et al.,
+2022a
+)
+, and surpasses all open-source base models (e.g., Mistral 7B
+(Jiang et al.,
+2023
+)
+and Llemma-34B
+(Azerbayev et al.,
+2023
+)
+), regardless of whether they’ve undergone math pre-training or not, often by a significant margin.
+Notably, DeepSeekMath-Base is superior on Chinese benchmarks, likely because we don’t follow previous works
+(Lewkowycz et al.,
+2022a
+; Azerbayev et al.,
+2023
+)
+to collect English-only math pre-training data, and also include high-quality non-English ones.
+With mathematical instruction tuning and reinforcement learning, the resulting DeepSeekMath-Instruct and DeepSeekMath-RL demonstrate strong performance, obtaining an accuracy of over 50% on the competition-level MATH dataset for the first time within the open-source community.
+•
+Formal Mathematics
+:
+We evaluate DeepSeekMath-Base using the informal-to-formal theorem proving task from
+(Jiang et al.,
+2022
+)
+on miniF2F
+(Zheng et al.,
+2021
+)
+with Isabelle
+(Wenzel et al.,
+2008
+)
+chosen to be the proof assistant.
+DeepSeekMath-Base demonstrates strong few-shot autoformalization performance.
+•
+Natural Language Understanding, Reasoning, and Code
+:
+To build a comprehensive profile of models’ general understanding, reasoning, and coding capabilities, we evaluate DeepSeekMath-Base on the Massive Multitask Language Understanding (MMLU) benchmark
+(Hendrycks et al.,
+2020
+)
+which encompasses 57 multiple-choice tasks covering diverse subjects, BIG-Bench Hard (BBH)
+(Suzgun et al.,
+2022
+)
+which consists of 23 challenging tasks that mostly require multi-step reasoning to solve, as well as HumanEval
+(Chen et al.,
+2021
+)
+and MBPP
+(Austin et al.,
+2021
+)
+which are widely used to evaluate code language models.
+Math pre-training benefits both language understanding and reasoning performance.
+2
+Math Pre-Training
+2.1
+Data Collection and Decontamination
+In this section, we will outline the process of constructing the DeepSeekMath Corpus from Common Crawl.
+As depicted in Figure
+2
+, we present an iterative pipeline that demonstrates how to systematically gather a large-scale mathematical corpus from Common Crawl, starting with a seed corpus (e.g., a small but high-quality collection of math-related dataset).
+It’s worth noting that this approach is also applicable to other domains, such as coding.
+Figure 2:
+An iterative pipeline that collects mathematical web pages from Common Crawl.
+First, we choose OpenWebMath
+(Paster et al.,
+2023
+)
+, a collection of high-quality mathematical web texts, as our initial seed corpus.
+Using this corpus, we train a fastText model
+(Joulin et al.,
+2016
+)
+to recall more OpenWebMath-like mathematical web pages.
+Specifically, we randomly select 500,000 data points from the seed corpus as positive training examples and another 500,000 web pages from Common Crawl as negative ones.
+We employ an open-source library (fastText, https://fasttext.cc) for training, configuring the vector dimension to 256, learning rate to 0.1, the maximum length of word n-gram to 3, the minimum number of word occurrences to 3, and the number of training epochs to 3.
+To reduce the size of the original Common Crawl, we employ URL-based deduplication and near-deduplication techniques, resulting in 40B HTML web pages.
+We then recall mathematical web pages from deduplicated Common Crawl with the fastText model.
+To filter out low-quality mathematical content, we rank the collected pages according to their scores predicted by the fastText model, and only preserve the top-ranking ones.
+The volume of data preserved is assessed through pre-training experiments on the top 40B, 80B, 120B, and 160B tokens.
+In the first iteration, we choose to keep the top 40B tokens.
+After the first iteration of data collection, numerous mathematical web pages remain uncollected, mainly because the fastText model is trained on a set of positive examples that lacks sufficient diversity.
+We therefore identify additional mathematical web sources to enrich the seed corpus, so that we can optimize the fastText model.
+Specifically, we first organize the entire Common Crawl into disjoint domains;
+a domain is defined as web pages sharing the same base URL.
+For each domain, we calculate the percentage of web pages that are collected in the first iteration.
+Domains where over 10% of the web pages have been collected are classified as math-related (e.g.,
+mathoverflow.net
+).
+Subsequently, we manually annotate the URLs associated with mathematical content within these identified domains (e.g.,
+mathoverflow.net/questions
+).
+Web pages linked to these URLs, yet uncollected, will be added to the seed corpus.
+This approach enables us to gather more positive examples, thereby training an improved fastText model capable of recalling more mathematical data in the subsequent iteration.
+After four iterations of data collection, we end up with 35.5M mathematical web pages, totaling 120B tokens.
+In the fourth iteration, we notice that nearly 98% of the data has already been collected in the third iteration, so we decide to cease data collection.
+To avoid benchmark contamination, we follow
+Guo et al. (
+2024
+)
+to filter out web pages containing questions or answers from English mathematical benchmarks such as GSM8K
+(Cobbe et al.,
+2021
+)
+and MATH
+(Hendrycks et al.,
+2021
+)
+and Chinese benchmarks such as CMATH
+(Wei et al.,
+2023
+)
+and AGIEval
+(Zhong et al.,
+2023
+)
+.
+The filtering criteria are as follows: any text segment containing a 10-gram string that matches exactly with any sub-string from the evaluation benchmarks is removed from our math training corpus.
+For benchmark texts that are shorter than 10 grams but have at least 3 grams, we employ exact matching to filter out contaminated web pages.
+2.2
+Validating the Quality of the DeepSeekMath Corpus
+We run pre-training experiments to investigate how the DeepSeekMath Corpus compares with the recently released math-training corpora:
+•
+MathPile
+(Wang et al.,
+2023c
+)
+: a multi-source corpus (8.9B tokens) aggregated from textbooks, Wikipedia, ProofWiki, CommonCrawl, StackExchange, and arXiv, with the majority (over 85%) sourced from arXiv;
+•
+OpenWebMath
+(Paster et al.,
+2023
+)
+: CommonCrawl data filtered for mathematical content, totaling 13.6B tokens;
+•
+Proof-Pile-2
+(Azerbayev et al.,
+2023
+)
+: a mathematical corpus consisting of OpenWebMath, AlgebraicStack (10.3B tokens of mathematical code), and arXiv papers (28.0B tokens).
+When experimenting on Proof-Pile-2, we follow
+Azerbayev et al. (
+2023
+)
+to use an arXiv:Web:Code ratio of 2:4:1.
+2.2.1
+Training Setting
+We apply math training to a general pre-trained language model with 1.3B parameters, which shares the same framework as the DeepSeek LLMs
+(DeepSeek-AI,
+2024
+)
+, denoted as DeepSeek-LLM 1.3B.
+We separately train a model on each mathematical corpus for 150B tokens. All experiments are conducted using the efficient and light-weight HAI-LLM
+(High-flyer,
+2023
+)
+training framework.
+Following the training practice of DeepSeek LLMs, we use the AdamW optimizer (Loshchilov and Hutter, 2017) with $\beta_{1}=0.9$, $\beta_{2}=0.95$, and $\mathrm{weight\_decay}=0.1$, along with a multi-step learning rate schedule where the learning rate reaches the peak after 2,000 warmup steps, decreases to 31.6% of the peak after 80% of the training process, and further decreases to 10.0% of the peak after 90% of the training process.
+We set the maximum value of learning rate to 5.3e-4, and use a batch size of 4M tokens with a 4K context length.
+
+English Benchmarks: GSM8K, MATH, OCW, SAT, MMLU STEM. Chinese Benchmarks: CMATH, Gaokao MathCloze, Gaokao MathQA.
+
+| Math Corpus | Size | GSM8K | MATH | OCW | SAT | MMLU STEM | CMATH | Gaokao MathCloze | Gaokao MathQA |
+|---|---|---|---|---|---|---|---|---|---|
+| No Math Training | N/A | 2.9% | 3.0% | 2.9% | 15.6% | 19.5% | 12.3% | 0.8% | 17.9% |
+| MathPile | 8.9B | 2.7% | 3.3% | 2.2% | 12.5% | 15.7% | 1.2% | 0.0% | 2.8% |
+| OpenWebMath | 13.6B | 11.5% | 8.9% | 3.7% | 31.3% | 29.6% | 16.8% | 0.0% | 14.2% |
+| Proof-Pile-2 | 51.9B | 14.3% | 11.2% | 3.7% | 43.8% | 29.2% | 19.9% | 5.1% | 11.7% |
+| DeepSeekMath Corpus | 120.2B | 23.8% | 13.6% | 4.8% | 56.3% | 33.1% | 41.5% | 5.9% | 23.6% |
+
+Table 1:
+Performance of DeepSeek-LLM 1.3B trained on different mathematical corpora, evaluated using few-shot chain-of-thought prompting.
+Corpus sizes are calculated using our tokenizer with a vocabulary size of 100K.
+Figure 3:
+Benchmark curves of DeepSeek-LLM 1.3B trained on different mathematical corpora.
+2.2.2
+Evaluation Results
+The DeepSeekMath Corpus is of high quality, covers multilingual mathematical content, and is the largest in size.
+•
+High-quality
+:
+We evaluate downstream performance on 8 mathematical benchmarks using few-shot chain-of-thought prompting
+Wei et al. (
+2022
+)
+.
+As shown in Table
+1
+, there is a clear performance lead of the model trained on the DeepSeekMath Corpus. Figure
+3
+shows that the model trained on the DeepSeekMath Corpus demonstrates better performance than Proof-Pile-2 at 50B tokens (1 full epoch of Proof-Pile-2), indicating the average quality of DeepSeekMath Corpus is higher.
+•
+Multilingual
+:
+The DeepSeekMath Corpus encompasses data in multiple languages, predominantly featuring English and Chinese as the two most represented languages.
+As shown in Table
+1
+, training on the DeepSeekMath Corpus enhances mathematical reasoning performance in both English and Chinese.
+In contrast, existing mathematical corpora, which are primarily English-centric, show limited improvement and may even hinder performance in Chinese mathematical reasoning.
+•
+Large-scale
+:
+The DeepSeekMath Corpus is several times larger than existing mathematical corpora.
+As depicted in Figure
+3
+, DeepSeek-LLM 1.3B, when trained on the DeepSeekMath Corpus, shows a steeper learning curve along with more lasting improvements.
+In contrast, the baseline corpora are much smaller, and have already been repeated multiple rounds during training, with the resulting model performance quickly reaching a plateau.
+2.3
+Training and Evaluating DeepSeekMath-Base 7B
+In this section, we introduce DeepSeekMath-Base 7B, a base model with strong reasoning abilities, especially in mathematics.
+Our model is initialized with DeepSeek-Coder-Base-v1.5 7B
+(Guo et al.,
+2024
+)
+and trained for 500B tokens. The distribution of the data is as follows: 56% is from the DeepSeekMath Corpus, 4% from AlgebraicStack, 10% from arXiv, 20% is Github code, and the remaining 10% is natural language data from Common Crawl in both English and Chinese.
+We mainly adopt the training setting specified in Section
+2.2.1
+, except that we set the maximum value of the learning rate to 4.2e-4 and use a batch size of 10M tokens.
+We conduct a comprehensive assessment of the mathematical capabilities of DeepSeekMath-Base 7B, focusing on its ability to produce self-contained mathematical solutions without relying on external tools, solve mathematical problems using tools, and conduct formal theorem proving.
+Beyond mathematics, we also provide a more general profile of the base model, including its performance of natural language understanding, reasoning, and programming skills.
+Mathematical Problem Solving with Step-by-Step Reasoning
+We evaluate DeepSeekMath-Base’s performance of solving mathematical problems using few-shot chain-of-thought prompting
+(Wei et al.,
+2022
+)
+, across eight benchmarks in English and Chinese.
+These benchmarks encompass quantitative reasoning (e.g., GSM8K
+(Cobbe et al.,
+2021
+)
+, MATH
+(Hendrycks et al.,
+2021
+)
+, and CMATH
+(Wei et al.,
+2023
+)
+) and multiple-choice problems (e.g., MMLU-STEM
+(Hendrycks et al.,
+2020
+)
+and Gaokao-MathQA
+(Zhong et al.,
+2023
+)
+), covering diverse fields of mathematics from elementary to college-level complexity.
+As shown in Table
+2
+, DeepSeekMath-Base 7B leads in performance across all eight benchmarks among the open-source base models (including the widely-used general model Mistral 7B
+(Jiang et al.,
+2023
+)
+and the recently released Llemma 34B
+(Azerbayev et al.,
+2023
+)
+which underwent math training on Proof-Pile-2
+(Azerbayev et al.,
+2023
+)
+).
+Notably, on the competition-level MATH dataset, DeepSeekMath-Base surpasses existing open-source base models by over 10% absolute, and outperforms Minerva 540B
+(Lewkowycz et al.,
+2022a
+)
+, a closed-source base model 77 times larger which builds on PaLM
+(Lewkowycz et al.,
+2022b
+)
+and is further trained on mathematical texts.
+
+English Benchmarks: GSM8K, MATH, OCW, SAT, MMLU STEM. Chinese Benchmarks: CMATH, Gaokao MathCloze, Gaokao MathQA.
+
+| Model | Size | GSM8K | MATH | OCW | SAT | MMLU STEM | CMATH | Gaokao MathCloze | Gaokao MathQA |
+|---|---|---|---|---|---|---|---|---|---|
+| **Closed-Source Base Model** | | | | | | | | | |
+| Minerva | 7B | 16.2% | 14.1% | 7.7% | - | 35.6% | - | - | - |
+| Minerva | 62B | 52.4% | 27.6% | 12.0% | - | 53.9% | - | - | - |
+| Minerva | 540B | 58.8% | 33.6% | 17.6% | - | 63.9% | - | - | - |
+| **Open-Source Base Model** | | | | | | | | | |
+| Mistral | 7B | 40.3% | 14.3% | 9.2% | 71.9% | 51.1% | 44.9% | 5.1% | 23.4% |
+| Llemma | 7B | 37.4% | 18.1% | 6.3% | 59.4% | 43.1% | 43.4% | 11.9% | 23.6% |
+| Llemma | 34B | 54.0% | 25.3% | 10.3% | 71.9% | 52.9% | 56.1% | 11.9% | 26.2% |
+| DeepSeekMath-Base | 7B | 64.2% | 36.2% | 15.4% | 84.4% | 56.5% | 71.7% | 20.3% | 35.3% |
+
+Table 2:
+Comparisons between DeepSeekMath-Base 7B and strong base models on English and Chinese mathematical benchmarks.
+Models are evaluated with chain-of-thought prompting.
+Minerva results are quoted from
+Lewkowycz et al. (
+2022a
+)
+.
+Mathematical Problem Solving with Tool Use
+We evaluate program-aided mathematical reasoning on GSM8K and MATH using few-shot program-of-thought prompting
+(Chen et al.,
+2022
+; Gao et al.,
+2023
+)
+.
+Models are prompted to solve each problem by writing a Python program where libraries such as
+math
+and
+sympy
+can be utilized for intricate computations.
+The execution result of the program is evaluated as the answer.
+As shown in Table
+3
+, DeepSeekMath-Base 7B outperforms the prior state-of-the-art Llemma 34B.
+
+Problem Solving w/ Tools: GSM8K+Python, MATH+Python. Informal-to-Formal Proving: miniF2F-valid, miniF2F-test.
+
+| Model | Size | GSM8K+Python | MATH+Python | miniF2F-valid | miniF2F-test |
+|---|---|---|---|---|---|
+| Mistral | 7B | 48.5% | 18.2% | 18.9% | 18.0% |
+| CodeLlama | 7B | 27.1% | 17.2% | 16.3% | 17.6% |
+| CodeLlama | 34B | 52.7% | 23.5% | 18.5% | 18.0% |
+| Llemma | 7B | 41.0% | 18.6% | 20.6% | 22.1% |
+| Llemma | 34B | 64.6% | 26.3% | 21.0% | 21.3% |
+| DeepSeekMath-Base | 7B | 66.9% | 31.4% | 25.8% | 24.6% |
+
+Table 3:
+Few-shot evaluation of base models’ ability to solve mathematical problems using tools and the ability to conduct informal-to-formal theorem proving in Isabelle.
+Formal Mathematics
+Formal proof automation is beneficial to ensure the accuracy and reliability of mathematical proofs and enhance efficiency, with increasing attention in recent years.
+We evaluate DeepSeekMath-Base 7B on the task of informal-to-formal proving from
+(Jiang et al.,
+2022
+)
+which is to generate a formal proof based on an informal statement, a formal counterpart of the statement, and an informal proof.
+We evaluate on miniF2F
+(Zheng et al.,
+2021
+)
+, a benchmark for formal Olympiad-level mathematics, and generate a formal proof in Isabelle for each problem with few-shot prompting.
+Following
+Jiang et al. (
+2022
+)
+, we leverage models to generate proof sketches, and execute the off-the-shelf automated prover Sledgehammer
+(Paulson,
+2010
+)
+to fill in the missing details.
+As shown in Table
+3
+, DeepSeekMath-Base 7B demonstrates strong performance in proof autoformalization.
+
+| Model | Size | MMLU | BBH | HumanEval (Pass@1) | MBPP (Pass@1) |
+|---|---|---|---|---|---|
+| Mistral | 7B | 62.4% | 55.7% | 28.0% | 41.4% |
+| DeepSeek-Coder-Base-v1.5$^{\dagger}$ | 7B | 42.9% | 42.9% | 40.2% | 52.6% |
+| DeepSeek-Coder-Base-v1.5 | 7B | 49.1% | 55.2% | 43.2% | 60.4% |
+| DeepSeekMath-Base | 7B | 54.9% | 59.5% | 40.9% | 52.6% |
+
+Table 4:
+Evaluation on natural language understanding, reasoning, and code benchmarks.
+DeepSeek-Coder-Base-v1.5$^{\dagger}$ is the checkpoint right before learning rate decay, which is used to train DeepSeekMath-Base.
+On MMLU and BBH, we use few-shot chain-of-thought prompting.
+On HumanEval and MBPP, we evaluate model performance under the zero-shot setting and a few-shot setting, respectively.
+Natural Language Understanding, Reasoning, and Code
+We evaluate model performance of natural language understanding on MMLU
+(Hendrycks et al.,
+2020
+)
+, reasoning on BBH
+(Suzgun et al.,
+2022
+)
+, and coding capabilities on HumanEval
+(Chen et al.,
+2021
+)
+and MBPP
+(Austin et al.,
+2021
+)
+. As shown in Table
+4
+, DeepSeekMath-Base 7B exhibits significant enhancements in performance on MMLU and BBH over its precursor, DeepSeek-Coder-Base-v1.5
+(Guo et al.,
+2024
+)
+, illustrating the positive impact of math training on language understanding and reasoning.
+Additionally, by including code tokens for continual training, DeepSeekMath-Base 7B effectively maintains the performance of DeepSeek-Coder-Base-v1.5 on the two coding benchmarks.
+Overall, DeepSeekMath-Base 7B significantly outperforms the general model Mistral 7B
+(Jiang et al.,
+2023
+)
+on the three reasoning and coding benchmarks.
+3
+Supervised Fine-Tuning
+3.1
+SFT Data Curation
+We construct a mathematical instruction-tuning dataset covering English and Chinese problems from different mathematical fields and of varying complexity levels:
+problems are paired with solutions in chain-of-thought (CoT)
+(Wei et al.,
+2022
+)
+, program-of-thought (PoT)
+(Chen et al.,
+2022
+; Gao et al.,
+2023
+)
+, and tool-integrated reasoning format
+(Gou et al.,
+2023
+)
+.
+The total number of training examples is 776K.
+•
+English mathematical datasets
+:
+We annotate GSM8K and MATH problems with tool-integrated solutions, and adopt a subset of MathInstruct
+(Yue et al.,
+2023
+)
+along with the training set of Lila-OOD
+(Mishra et al.,
+2022
+)
+where problems are solved with CoT or PoT.
+Our English collection covers diverse fields of mathematics, e.g., algebra, probability, number theory, calculus, and geometry.
+•
+Chinese mathematical datasets
+:
+We collect Chinese K-12 mathematical problems spanning 76 sub-topics such as linear equations, with solutions annotated in both CoT and tool-integrated reasoning format.
+3.2
+Training and Evaluating DeepSeekMath-Instruct 7B
+In this section, we introduce DeepSeekMath-Instruct 7B which undergoes mathematical instruction tuning based on DeepSeekMath-Base.
+Training examples are randomly concatenated until reaching a maximum context length of 4K tokens.
+We train the model for 500 steps with a batch size of 256 and a constant learning rate of 5e-5.
+We evaluate models’ mathematical performance both without and with tool use, on 4 quantitative reasoning benchmarks in English and Chinese.
+We benchmark our model against the leading models of the time:
+•
+Closed-source models
+include:
+(1) the GPT family among which GPT-4 (OpenAI, 2023) and GPT-4 Code Interpreter (https://openai.com/blog/chatgpt-plugins##code-interpreter) are the most capable ones,
+(2) Gemini Ultra and Pro (Anil et al., 2023),
+(3) Inflection-2 (Inflection AI, 2023),
+(4) Grok-1 (https://x.ai/model-card),
+as well as models recently released by Chinese companies including
+(5) Baichuan-3 (https://www.baichuan-ai.com),
+(6) the latest GLM-4 (https://open.bigmodel.cn/dev/api#glm-4) from the GLM family (Du et al., 2022).
+These models are for general purposes, most of which have undergone a series of alignment procedures.
+•
+Open-source models
+include:
+general models like (1) DeepSeek-LLM-Chat 67B
+(DeepSeek-AI,
+2024
+)
+, (2) Qwen 72B
+(Bai et al.,
+2023
+)
+, (3) SeaLLM-v2 7B
+(Nguyen et al.,
+2023
+)
+, and (4) ChatGLM3 6B
+(ChatGLM3 Team,
+2023
+)
+,
+as well as models with enhancements in mathematics including
+(5) InternLM2-Math 20B (https://github.com/InternLM/InternLM-Math) which builds on InternLM2 and underwent math training followed by instruction tuning,
+(6) Math-Shepherd-Mistral 7B which applies PPO training (Schulman et al., 2017) to Mistral 7B (Jiang et al., 2023) with a process-supervised reward model,
+(7) the WizardMath series
+(Luo et al.,
+2023
+)
+which improves mathematical reasoning in Mistral 7B and Llama-2 70B
+(Touvron et al.,
+2023
+)
+using evolve-instruct (i.e., a version of instruction tuning that uses AI-evolved instructions) and PPO training with training problems primarily sourced from GSM8K and MATH,
+(8) MetaMath 70B
+(Yu et al.,
+2023
+)
+which is Llama-2 70B fine-tuned on an augmented version of GSM8K and MATH,
+(9) ToRA 34B
+Gou et al. (
+2023
+)
+which is CodeLlama 34B fine-tuned to do tool-integrated mathematical reasoning,
+(10) MAmmoTH 70B
+(Yue et al.,
+2023
+)
+which is Llama-2 70B instruction-tuned on MathInstruct.
+
+English Benchmarks: GSM8K, MATH. Chinese Benchmarks: MGSM-zh, CMATH.
+
+| Model | Size | GSM8K | MATH | MGSM-zh | CMATH |
+|---|---|---|---|---|---|
+| **Chain-of-Thought Reasoning** | | | | | |
+| *Closed-Source Model* | | | | | |
+| Gemini Ultra | - | 94.4% | 53.2% | - | - |
+| GPT-4 | - | 92.0% | 52.9% | - | 86.0% |
+| Inflection-2 | - | 81.4% | 34.8% | - | - |
+| GPT-3.5 | - | 80.8% | 34.1% | - | 73.8% |
+| Gemini Pro | - | 86.5% | 32.6% | - | - |
+| Grok-1 | - | 62.9% | 23.9% | - | - |
+| Baichuan-3 | - | 88.2% | 49.2% | - | - |
+| GLM-4 | - | 87.6% | 47.9% | - | - |
+| *Open-Source Model* | | | | | |
+| InternLM2-Math | 20B | 82.6% | 37.7% | - | - |
+| Qwen | 72B | 78.9% | 35.2% | - | - |
+| Math-Shepherd-Mistral | 7B | 84.1% | 33.0% | - | - |
+| WizardMath-v1.1 | 7B | 83.2% | 33.0% | - | - |
+| DeepSeek-LLM-Chat | 67B | 84.1% | 32.6% | 74.0% | 80.3% |
+| MetaMath | 70B | 82.3% | 26.6% | 66.4% | 70.9% |
+| SeaLLM-v2 | 7B | 78.2% | 27.5% | 64.8% | - |
+| ChatGLM3 | 6B | 72.3% | 25.7% | - | - |
+| WizardMath-v1.0 | 70B | 81.6% | 22.7% | 64.8% | 65.4% |
+| DeepSeekMath-Instruct | 7B | 82.9% | 46.8% | 73.2% | 84.6% |
+| DeepSeekMath-RL | 7B | 88.2% | 51.7% | 79.6% | 88.8% |
+| **Tool-Integrated Reasoning** | | | | | |
+| *Closed-Source Model* | | | | | |
+| GPT-4 Code Interpreter | - | 97.0% | 69.7% | - | - |
+| *Open-Source Model* | | | | | |
+| InternLM2-Math | 20B | 80.7% | 54.3% | - | - |
+| DeepSeek-LLM-Chat | 67B | 86.7% | 51.1% | 76.4% | 85.4% |
+| ToRA | 34B | 80.7% | 50.8% | 41.2% | 53.4% |
+| MAmmoTH | 70B | 76.9% | 41.8% | - | - |
+| DeepSeekMath-Instruct | 7B | 83.7% | 57.4% | 72.0% | 84.3% |
+| DeepSeekMath-RL | 7B | 86.7% | 58.8% | 78.4% | 87.6% |
+
+Table 5:
+Performance of Open- and Closed-Source models with both Chain-of-Thought and Tool-Integrated Reasoning on English and Chinese Benchmarks.
+Scores in
+gray
+denote majority votes with 32 candidates; the others are Top1 scores.
+DeepSeekMath-RL 7B beats all open-source models from 7B to 70B, as well as the majority of closed-source models. Although DeepSeekMath-RL 7B is only further trained on chain-of-thought-format instruction tuning data of GSM8K and MATH, it improves over DeepSeekMath-Instruct 7B on all benchmarks.
+As shown in Table
+5
+, under the evaluation setting where tool use is disallowed, DeepSeekMath-Instruct 7B demonstrates strong performance of step-by-step reasoning.
+Notably, on the competition-level MATH dataset, our model surpasses all open-source models and the majority of proprietary models (e.g., Inflection-2 and Gemini Pro) by at least 9% absolute.
+This is true even for models that are substantially larger (e.g., Qwen 72B) or have been specifically enhanced through math-focused reinforcement learning (e.g., WizardMath-v1.1 7B).
+While DeepSeekMath-Instruct rivals the Chinese proprietary models GLM-4 and Baichuan-3 on MATH, it still underperforms GPT-4 and Gemini Ultra.
+Under the evaluation setting where models are allowed to integrate natural language reasoning and program-based tool use for problem solving, DeepSeekMath-Instruct 7B approaches an accuracy of 60% on MATH, surpassing all existing open-source models.
+On the other benchmarks, our model is competitive with DeepSeek-LLM-Chat 67B, the prior state-of-the-art that is 10 times larger.
+4
+Reinforcement Learning
+4.1
+Group Relative Policy Optimization
+Reinforcement learning (RL) has been proven to be effective in further improving the mathematical reasoning ability of LLMs after the Supervised Fine-Tuning (SFT) stage
+(Wang et al.,
+2023b
+; Luo et al.,
+2023
+)
+.
+In this section, we introduce our efficient and effective RL algorithm, Group Relative Policy Optimization (GRPO).
+4.1.1
+From PPO to GRPO
+Proximal Policy Optimization (PPO)
+(Schulman et al.,
+2017
+)
+is an actor-critic RL algorithm that is widely used in the RL fine-tuning stage of LLMs
+(Ouyang et al.,
+2022
+)
+. In particular, it optimizes LLMs by maximizing the following surrogate objective:
+\footnotesize\mathcal{J}_{PPO}(\theta)=\mathbb{E}{[q\sim P(Q),o\sim\pi_{\theta_{old}}(O|q)]}\frac{1}{|o|}\sum_{t=1}^{|o|}\min\left[\frac{\pi_{\theta}(o_{t}|q,o_{<t})}{\pi_{\theta_{old}}(o_{t}|q,o_{<t})}A_{t},\text{clip}\left(\frac{\pi_{\theta}(o_{t}|q,o_{<t})}{\pi_{\theta_{old}}(o_{t}|q,o_{<t})},1-\varepsilon,1+\varepsilon\right)A_{t}\right],
+(1)
+where $\pi_{\theta}$ and $\pi_{\theta_{old}}$ are the current and old policy models, and $q,o$ are questions and outputs sampled from the question dataset and the old policy $\pi_{\theta_{old}}$, respectively.
+$\varepsilon$ is a clipping-related hyper-parameter introduced in PPO for stabilizing training.
+$A_{t}$ is the advantage, which is computed by applying Generalized Advantage Estimation (GAE) (Schulman et al., 2015), based on the rewards $\{r_{\geq t}\}$ and a learned value function $V_{\psi}$. Thus, in PPO, a value function needs to be trained alongside the policy model. To mitigate over-optimization of the reward model, the standard approach is to add a per-token KL penalty from a reference model in the reward at each token (Ouyang et al., 2022), i.e.,
+r_{t}=r_{\varphi}(q,o_{\leq t})-\beta\log\frac{\pi_{\theta}(o_{t}|q,o_{<t})}{\pi_{ref}(o_{t}|q,o_{<t})},
+(2)
+where $r_{\varphi}$ is the reward model, $\pi_{ref}$ is the reference model, which is usually the initial SFT model, and $\beta$ is the coefficient of the KL penalty.
+Figure 4:
+Demonstration of PPO and our GRPO. GRPO foregoes the value model, instead estimating the baseline from group scores, significantly reducing training resources.
+As the value function employed in PPO is typically another model of comparable size to the policy model, it brings a substantial memory and computational burden. Additionally, during RL training, the value function is treated as a baseline in the calculation of the advantage for variance reduction. In the LLM context, however, usually only the last token is assigned a reward score by the reward model, which may complicate the training of a value function that is accurate at each token. To address this, as shown in Figure
+4
+, we propose Group Relative Policy Optimization (GRPO), which obviates the need for additional value function approximation as in PPO, and instead uses the average reward of multiple sampled outputs, produced in response to the same question, as the baseline. More specifically, for each question
+$q$, GRPO samples a group of outputs $\{o_{1},o_{2},\cdots,o_{G}\}$ from the old policy $\pi_{\theta_{old}}$ and then optimizes the policy model by maximizing the following objective:
+\footnotesize\begin{split}\mathcal{J}_{GRPO}(\theta)&=\mathbb{E}{[q\sim P(Q),\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{old}}(O|q)]}\\
+&\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|o_{i}|}\sum_{t=1}^{|o_{i}|}\left\{\min\left[\frac{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta_{old}}(o_{i,t}|q,o_{i,<t})}\hat{A}_{i,t},\text{clip}\left(\frac{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta_{old}}(o_{i,t}|q,o_{i,<t})},1-\varepsilon,1+\varepsilon\right)\hat{A}_{i,t}\right]-\beta\mathbb{D}_{KL}\left[\pi_{\theta}||\pi_{ref}\right]\right\},\end{split}
+(3)
+where $\varepsilon$ and $\beta$ are hyper-parameters, and $\hat{A}_{i,t}$ is the advantage calculated based on relative rewards of the outputs inside each group only, which will be detailed in the following subsections. The group-relative way in which GRPO calculates the advantages aligns well with the comparative nature of reward models, as reward models are typically trained on datasets of comparisons between outputs on the same question. Also note that, instead of adding a KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss, avoiding complicating the calculation of $\hat{A}_{i,t}$. And different from the KL penalty term used in Equation (2), we estimate the KL divergence with the following unbiased estimator (Schulman, 2020):
+\small\mathbb{D}_{KL}\left[\pi_{\theta}||\pi_{ref}\right]=\frac{\pi_{ref}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}-\log\frac{\pi_{ref}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}-1,
+(4)
+which is guaranteed to be non-negative.
+Algorithm 1
+Iterative Group Relative Policy Optimization
+Input: initial policy model $\pi_{\theta_{\text{init}}}$; reward models $r_{\varphi}$; task prompts $\mathcal{D}$; hyperparameters $\varepsilon$, $\beta$, $\mu$
+1: policy model $\pi_{\theta}\leftarrow\pi_{\theta_{\text{init}}}$
+2: for iteration = 1, …, I do
+3:     reference model $\pi_{ref}\leftarrow\pi_{\theta}$
+4:     for step = 1, …, M do
+5:         Sample a batch $\mathcal{D}_{b}$ from $\mathcal{D}$
+6:         Update the old policy model $\pi_{\theta_{old}}\leftarrow\pi_{\theta}$
+7:         Sample $G$ outputs $\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{old}}(\cdot\mid q)$ for each question $q\in\mathcal{D}_{b}$
+8:         Compute rewards $\{r_{i}\}_{i=1}^{G}$ for each sampled output $o_{i}$ by running $r_{\varphi}$
+9:         Compute $\hat{A}_{i,t}$ for the $t$-th token of $o_{i}$ through group relative advantage estimation.
+10:        for GRPO iteration = 1, …, $\mu$ do
+11:            Update the policy model $\pi_{\theta}$ by maximizing the GRPO objective (Equation 21)
+12:    Update $r_{\varphi}$ through continuous training using a replay mechanism.
+Output: $\pi_{\theta}$
+4.1.2
+Outcome Supervision RL with GRPO
+Formally, for each question $q$, a group of outputs $\{o_{1},o_{2},\cdots,o_{G}\}$ is sampled from the old policy model $\pi_{\theta_{old}}$. A reward model is then used to score the outputs, yielding $G$ rewards $\mathbf{r}=\{r_{1},r_{2},\cdots,r_{G}\}$ correspondingly. Subsequently, these rewards are normalized by subtracting the group average and dividing by the group standard deviation. Outcome supervision provides the normalized reward at the end of each output $o_{i}$ and sets the advantages $\hat{A}_{i,t}$ of all tokens in the output as the normalized reward, i.e., $\hat{A}_{i,t}=\widetilde{r}_{i}=\frac{r_{i}-{\rm mean}(\mathbf{r})}{{\rm std}(\mathbf{r})}$, and then optimizes the policy by maximizing the objective defined in Equation (3).
+4.1.3
+Process Supervision RL with GRPO
+Outcome supervision only provides a reward at the end of each output, which may not be sufficient and efficient to supervise the policy in complex mathematical tasks. Following Wang et al. (2023b), we also explore process supervision, which provides a reward at the end of each reasoning step. Formally, given the question $q$ and $G$ sampled outputs $\{o_{1},o_{2},\cdots,o_{G}\}$, a process reward model is used to score each step of the outputs, yielding corresponding rewards: $\mathbf{R}=\{\{r_{1}^{index(1)},\cdots,r_{1}^{index(K_{1})}\},\cdots,\{r_{G}^{index(1)},\cdots,r_{G}^{index(K_{G})}\}\}$, where $index(j)$ is the end token index of the $j$-th step, and $K_{i}$ is the total number of steps in the $i$-th output. We also normalize these rewards with the average and the standard deviation, i.e., $\widetilde{r}_{i}^{index(j)}=\frac{r_{i}^{index(j)}-{\rm mean}(\mathbf{R})}{{\rm std}(\mathbf{R})}$.
+Subsequently, the process supervision calculates the advantage of each token as the sum of the normalized rewards from the following steps, i.e., $\hat{A}_{i,t}=\sum_{index(j)\geq t}\widetilde{r}_{i}^{index(j)}$,
+and then optimizes the policy by maximizing the objective defined in Equation (3).
+4.1.4
+Iterative RL with GRPO
+As the reinforcement learning training process progresses, the old reward model may not be sufficient to supervise the current policy model.
+Therefore, we also explore the iterative RL with GRPO.
+As shown in Algorithm
+1
+, in iterative GRPO, we generate new training sets for the reward model based on the sampling results from the policy model and continually train the old reward model using a replay mechanism that incorporates 10% of historical data.
+Then, we set the reference model as the policy model, and continually train the policy model with the new reward model.
+4.2
+Training and Evaluating DeepSeekMath-RL
+We conduct RL based on DeepSeekMath-Instruct 7B.
+The training data of RL are chain-of-thought-format questions related to GSM8K and MATH from the SFT data, which consists of around 144K questions.
+We exclude other SFT questions to investigate the impact of RL on benchmarks that lack data throughout the RL phase.
+We construct the training set of reward models following
+(Wang et al.,
+2023b
+)
+.
+We train our initial reward model based on the DeepSeekMath-Base 7B with a learning rate of 2e-5.
+For GRPO, we set the learning rate of the policy model as 1e-6. The KL coefficient is 0.04. For each question, we sample $64$ outputs. The max length is set to 1024, and the training batch size is 1024.
+The policy model only has a single update following each
+exploration stage.
+We evaluate DeepSeekMath-RL 7B on benchmarks following DeepSeekMath-Instruct 7B.
+For DeepSeekMath-RL 7B, GSM8K and MATH with chain-of-thought reasoning can be regarded as in-domain tasks and all the other benchmarks can be regarded as out-of-domain tasks.
+Table
+5
+demonstrates the performance of open- and closed-source models with both chain-of-thought and tool-integrated reasoning on English and Chinese benchmarks. We find that:
+1) DeepSeekMath-RL 7B attains accuracies of 88.2% and 51.7% on GSM8K and MATH, respectively, utilizing chain-of-thought reasoning. This performance surpasses that of all open-source models in the 7B to 70B range, as well as the majority of closed-source models.
+2) Crucially, DeepSeekMath-RL 7B is only trained on chain-of-thought-format instruction tuning data of GSM8K and MATH, starting from DeepSeekMath-Instruct 7B. Despite the constrained scope of its training data, it outperforms DeepSeekMath-Instruct 7B across all evaluation metrics, showcasing the effectiveness of reinforcement learning.
+5
+Discussion
+In this section, we will share our findings in pre-training and RL experiments.
+5.1
+Lessons Learnt in Pre-Training
+We first share our experience in pre-training. Unless otherwise specified, we will adhere to the training settings outlined in Section
+2.2.1
+. It is worth noting that, when referring to the DeepSeekMath Corpus in this section, we use an 89B-token dataset from the second iteration of the data collection process.
+5.1.1
+Code Training Benefits Mathematical Reasoning
+A popular yet unverified hypothesis suggests that code training improves reasoning.
+We attempt to offer a partial response to this, particularly within the mathematical domain:
+code training improves models’ ability to do mathematical reasoning both with and without tool use.
+To study how code training affects mathematical reasoning, we experimented with the following two-stage training and one-stage training settings:
+Two-Stage Training
+•
+Code Training for 400B Tokens $\rightarrow$ Math Training for 150B Tokens:
+We train DeepSeek-LLM 1.3B for 400B code tokens followed by 150B math tokens;
+•
+General Training for 400B Tokens $\rightarrow$ Math Training for 150B Tokens:
+As a control experiment, we also experiment with general tokens (sampled from a large-scale general corpus created by DeepSeek-AI) instead of code tokens in the first stage of training, in an attempt to investigate the advantages of code tokens over general tokens in improving mathematical reasoning.
+One-Stage Training
+•
+Math Training for 150B Tokens
+:
+We train DeepSeek-LLM 1.3B for 150B math tokens;
+•
+Training on a mixture of 400B Code Tokens and 150B Math Tokens
+:
+Math training following code training degrades coding performance.
+We investigate whether code tokens, when mixed with math tokens for one-stage training, would still improve mathematical reasoning and also alleviate the problem of catastrophic forgetting.
+Training Setting
+Training Tokens
+w/o Tool Use
+w/ Tool Use
+General
+Code
+Math
+GSM8K
+MATH
+CMATH
+GSM8K+Python
+MATH+Python
+No Continual Training
+–
+–
+–
+2.9%
+3.0%
+12.3%
+2.7%
+2.3%
+Two-Stage Training
+Stage 1: General Training
+400B
+–
+–
+2.9%
+3.2%
+14.8%
+3.3%
+2.3%
+Stage 2: Math Training
+–
+–
+150B
+19.1%
+14.4%
+37.2%
+14.3%
+6.7%
+Stage 1: Code Training
+–
+400B
+–
+5.9%
+3.6%
+19.9%
+12.4%
+10.0%
+Stage 2: Math Training
+–
+–
+150B
+21.9%
+15.3%
+39.7%
+17.4%
+9.4%
+One-Stage Training
+Math Training
+–
+–
+150B
+20.5%
+13.1%
+37.6%
+11.4%
+6.5%
+Code & Math Mixed Training
+–
+400B
+150B
+17.6%
+12.1%
+36.3%
+19.7%
+13.5%
+Table 6:
+Investigation of how code affects mathematical reasoning under different training settings.
+We experiment with DeepSeek-LLM 1.3B, and evaluate its mathematical reasoning performance without and with tool use via few-shot chain-of-thought prompting and few-shot program-of-thought prompting, respectively.
+Training Setting
+Training Tokens
+MMLU
+BBH
+HumanEval (Pass@1)
+MBPP (Pass@1)
+General
+Code
+Math
+No Continual Training
+–
+–
+–
+24.5%
+28.1%
+12.2%
+13.0%
+Two-Stage Training
+Stage 1: General Training
+400B
+–
+–
+25.9%
+27.7%
+15.2%
+13.6%
+Stage 2: Math Training
+–
+–
+150B
+33.1%
+32.7%
+12.8%
+13.2%
+Stage 1: Code Training
+–
+400B
+–
+25.0%
+31.5%
+25.0%
+40.0%
+Stage 2: Math Training
+–
+–
+150B
+36.2%
+35.3%
+12.2%
+17.0%
+One-Stage Training
+Math Training
+–
+–
+150B
+32.3%
+32.5%
+11.6%
+13.2%
+Code & Math Mixed Training
+–
+400B
+150B
+33.5%
+35.6%
+29.3%
+39.4%
+Table 7:
+Investigation of how different settings of code and math training affect model performance of language understanding, reasoning, and coding.
+We experiment with DeepSeek-LLM 1.3B.
+We evaluate the models on MMLU and BBH using few-shot chain-of-thought prompting.
+On HumanEval and MBPP, we conduct zero-shot and few-shot evaluations, respectively.
+Results
+Table
+6
+and Table
+7
+demonstrate the downstream performance under different training settings.
+Code training benefits program-aided mathematical reasoning, both under the two-stage training and one-stage training settings.
+As shown in Table
+6
+, under the two-stage training setting, code training alone already significantly enhances the ability to solve GSM8K and MATH problems using Python.
+Math training in the second stage yields further improvements.
+Interestingly, under the one-stage training setting, mixing code tokens and math tokens effectively mitigates the issue of catastrophic forgetting that arises from two-stage training, and also synergizes coding (Table
+7
+) and program-aided mathematical reasoning (Table
+6
+).
+Code training also improves mathematical reasoning without tool use.
+Under the two-stage training setting, the initial stage of code training already results in moderate enhancements.
+It also boosts the efficiency of the subsequent math training, eventually leading to the best performance.
+However, combining code tokens and math tokens for one-stage training compromises mathematical reasoning without tool use.
+One conjecture is that DeepSeek-LLM 1.3B, due to its limited scale, lacks the capacity to fully assimilate both code and mathematical data simultaneously.
+Model
+Size
+ArXiv Corpus
+English Benchmarks
+Chinese Benchmarks
+GSM8K
+MATH
+OCW
+SAT
+MMLU
+STEM
+CMATH
+Gaokao
+MathCloze
+Gaokao
+MathQA
+DeepSeek-LLM
+1.3B
+No Math Training
+2.9%
+3.0%
+2.9%
+15.6%
+19.5%
+12.3%
+0.8%
+17.9%
+MathPile
+2.7%
+3.3%
+2.2%
+12.5%
+15.7%
+1.2%
+0.0%
+2.8%
+ArXiv-RedPajama
+3.3%
+3.4%
+4.0%
+9.4%
+9.0%
+7.4%
+0.8%
+2.3%
+DeepSeek-Coder-Base-v1.5
+7B
+No Math Training
+29.0%
+12.5%
+6.6%
+40.6%
+38.1%
+45.9%
+5.9%
+21.1%
+MathPile
+23.6%
+11.5%
+7.0%
+46.9%
+35.8%
+37.9%
+4.2%
+25.6%
+ArXiv-RedPajama
+28.1%
+11.1%
+7.7%
+50.0%
+35.2%
+42.6%
+7.6%
+24.8%
+Table 8:
+Effect of math training on different arXiv datasets.
+Model performance is evaluated with few-shot chain-of-thought prompting.
+ArXiv Corpus
+miniF2F-valid
+miniF2F-test
+No Math Training
+20.1%
+21.7%
+MathPile
+16.8%
+16.4%
+ArXiv-RedPajama
+14.8%
+11.9%
+Table 9:
+Effect of math training on different arXiv corpora, the base model being DeepSeek-Coder-Base-v1.5 7B.
+We evaluate informal-to-formal proving in Isabelle.
+5.1.2
+ArXiv Papers Seem Ineffective in Improving Mathematical Reasoning
+ArXiv papers are commonly included as a component of math pre-training data
+(Lewkowycz et al.,
+2022a
+; Polu and Sutskever,
+2020
+; Azerbayev et al.,
+2023
+; Wang et al.,
+2023c
+)
+.
+However, detailed analysis regarding their impact on mathematical reasoning has not been extensively conducted.
+Perhaps counter-intuitively, according to our experiments, arXiv papers seem ineffective in improving mathematical reasoning.
+We experiment with models of different sizes, including DeepSeek-LLM 1.3B and DeepSeek-Coder-Base-v1.5 7B
+(Guo et al.,
+2024
+)
+, using arXiv corpora that underwent varied processing pipelines:
+•
+MathPile
+(Wang et al.,
+2023c
+)
+:
+an 8.9B-token corpus developed with cleaning and filtering heuristic rules, over 85% of which are scientific arXiv papers;
+•
+ArXiv-RedPajama
+(Computer,
+2023
+)
+:
+the entirety of arXiv LaTeX files with preambles, comments, macros, and bibliographies removed, totaling 28.0B tokens.
+In our experiments, we separately train DeepSeek-LLM 1.3B for 150B tokens and DeepSeek-Coder-Base-v1.5 7B for 40B tokens on each arXiv corpus. It seems that arXiv papers are ineffective in improving mathematical reasoning.
+When trained on an arXiv-only corpus, both models display no notable improvements or even deterioration across various mathematical benchmarks of different complexities employed in this study.
+These benchmarks include quantitative reasoning datasets like GSM8K and MATH (Table
+8
+), multiple-choice challenges like MMLU-STEM (Table
+8
+), and formal mathematics like miniF2F (Table
+9
+).
+However, this conclusion has its limitations and should be taken with a grain of salt.
+We have not yet studied:
+•
+The impact of arXiv tokens on specific math-related tasks not included in this research, such as informalization of theorems which is to convert formal statements or proofs to their informal versions;
+•
+The effect of arXiv tokens when combined with other types of data;
+•
+Whether the benefits of arXiv papers would manifest themselves at a larger model scale.
+Thus, further exploration is required, which we leave for future studies.
+5.2
+Insights of Reinforcement Learning
+5.2.1
+Towards a Unified Paradigm
+In this section, we provide a unified paradigm to analyze different training methods, such as SFT, RFT, DPO, PPO, GRPO, and further conduct experiments to explore the factors of the unified paradigm.
+Generally, the gradient with respect to the parameter $\theta$ of a training method can be written as:
+\nabla_{\theta}\mathcal{J}_{{\color{red}\mathcal{A}}}(\theta)=\mathbb{E}[\underbrace{(q,o)\sim{\color{red}\mathcal{D}}}_{Data\ Source}]\left(\frac{1}{|o|}\sum_{t=1}^{|o|}\underbrace{GC_{\mathcal{A}}(q,o,t,{\color{red}\pi_{rf}})}_{Gradient\ Coefficient}\nabla_{\theta}\log\pi_{\theta}(o_{t}|q,o_{<t})\right).
+(5)
+There exist three key components:
+1) Data Source $\mathcal{D}$, which determines the training data;
+2) Reward Function $\pi_{rf}$, which is the source of the training reward signal;
+3) Algorithm $\mathcal{A}$, which processes the training data and the reward signal into the gradient coefficient $GC$ that determines the magnitude of the penalty or reinforcement for the data. We analyze several representative methods based on such a unified paradigm:
+Methods | Data Source | Reward Function | Gradient Coefficient
+SFT | $q,o\sim P_{sft}(Q,O)$ | - | 1
+RFT | $q\sim P_{sft}(Q)$, $o\sim\pi_{sft}(O\mid q)$ | Rule | Equation 10
+DPO | $q\sim P_{sft}(Q)$, $o^{+},o^{-}\sim\pi_{sft}(O\mid q)$ | Rule | Equation 14
+Online RFT | $q\sim P_{sft}(Q)$, $o\sim\pi_{\theta}(O\mid q)$ | Rule | Equation 10
+PPO | $q\sim P_{sft}(Q)$, $o\sim\pi_{\theta}(O\mid q)$ | Model | Equation 18
+GRPO | $q\sim P_{sft}(Q)$, $\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta}(O\mid q)$ | Model | Equation 21
+Table 10:
+The data source and gradient coefficient of different methods.
+$P_{sft}$ denotes the data distribution of supervised fine-tuning datasets.
+$\pi_{\theta_{sft}}$ and $\pi_{\theta}$ denote the supervised fine-tuned model and the real-time policy model during the online training process, respectively.
+•
+Supervised Fine-tuning (SFT)
+: SFT fine-tunes a pretrained model on human-selected SFT data.
+•
+Rejection Sampling Fine-tuning (RFT)
+: RFT further fine-tunes the SFT model on the filtered outputs sampled from the SFT model based on SFT questions. RFT filters the outputs based on the correctness of their answers.
+•
+Direct Preference Optimization (DPO)
+: DPO further refines the SFT model by fine-tuning it on augmented outputs sampled from the SFT model, using pair-wise DPO loss.
+•
+Online Rejection Sampling Fine-tuning (Online RFT)
+: Different from RFT, Online RFT initiates the policy model using the SFT model and refines it by fine-tuning with the augmented outputs sampled from the real-time policy model.
+•
+PPO/GRPO
+: PPO/GRPO initializes the policy model using the SFT model and reinforces it with the outputs sampled from the real-time policy model.
+We summarize the components of these methods in Table
+10
+.
+Please refer to Appendix
+A.1
+for a more detailed derivation process.
+Figure 5:
+Performance of the DeepSeekMath-Instruct 1.3B model, which was further trained using various methods, on two benchmarks.
+Figure 6:
+Performance of iterative reinforcement learning with DeepSeekMath-Instruct 7B on two benchmarks.
+Observation about Data Source
+We divide the data source into two categories, online sampling, and offline sampling.
+Online sampling denotes that the training data is from the exploration results of the real-time training policy model, while offline sampling denotes that the training data is from the sampling results of the initial SFT model.
+RFT and DPO follow the offline style, while Online RFT and GRPO follow the online style.
+As shown in Figure
+5
+,
+we find that the Online RFT significantly outperforms RFT on two benchmarks.
+Specifically, Online RFT is comparable to RFT in the early stage of training but gains an absolute advantage in the later stage, demonstrating the superiority of online training.
+This is intuitive, as in the initial stage, the actor and the SFT model exhibit close resemblance, with the sampled data revealing only minor differences. In the later stage, however, the data sampled from the actor will exhibit more significant differences, and real-time data sampling will offer greater advantages.
+Observation about Gradient Coefficient
+The algorithm processes the reward signal to the gradient coefficient to update the model parameter.
+We divide the reward function into ‘Rule’ and ‘Model’ in our experiments.
+Rule refers to judging the quality of a response based on the correctness of the answer, and Model denotes that we train a reward model to score each response. The training data of the reward model is based on the rule judgment.
+Equations
+10
+and
+21
+highlight a key difference between GRPO and Online RFT: GRPO uniquely adjusts its gradient coefficient based on the reward value provided by the reward model. This allows for differential reinforcement and penalization of responses according to their varying magnitudes. In contrast, Online RFT lacks this feature; it does not penalize incorrect responses and uniformly reinforces all responses with correct answers at the same level of intensity.
+As demonstrated in Figure
+5
+, GRPO surpasses online RFT, thereby highlighting the efficiency of altering positive and negative gradient coefficients. In addition, GRPO+PS shows superior performance compared to GRPO+OS, indicating the benefits of using fine-grained, step-aware gradient coefficients.
+Furthermore, we explore iterative RL; in our experiments, we conduct two rounds of iteration. As shown in Figure
+6
+, we notice that the iterative RL significantly improves the performance, especially at the first iteration.
+Figure 7:
+The Maj@K and Pass@K of SFT and RL DeepSeekMath 7B on GSM8K and MATH (temperature $0.7$). It was noted that RL enhances Maj@K but not Pass@K.
+5.2.2
+Why RL Works?
+In this paper, we conduct reinforcement learning based on a subset of instruction tuning data, and it achieves significant performance enhancement upon the instruction tuning model. To further explain why reinforcement learning works, we evaluate the Pass@K and Maj@K accuracy of the Instruct and RL models on two benchmarks.
+As shown in Figure
+7
+, RL enhances Maj@K’s performance but not Pass@K. These findings indicate that RL enhances the model’s overall performance by rendering the output distribution more robust, in other words,
+it seems that the improvement is attributed to boosting the correct response from TopK rather than the enhancement of fundamental capabilities.
+Similarly,
+(Wang et al.,
+2023a
+)
+identified a
+misalignment problem
+in reasoning tasks within the SFT model, showing that the reasoning performance of SFT models can be improved through a series of preference alignment strategies
+(Yuan et al.,
+2023b
+; Song et al.,
+2023
+; Wang et al.,
+2023a
+)
+.
+5.2.3
+How to Achieve More Effective RL?
+We demonstrate RL works pretty well in mathematical reasoning tasks. We also provide a unified paradigm to understand different representative training methods.
+Within this paradigm, all methods are conceptualized as either direct or simplified RL techniques.
+As summarized in Equation
+5
+, there exist three key components: Data Source, Algorithm, and Reward Function. We provide some potential future directions about the three components.
+Data Source
+Data source is the raw material of all training methods.
+In the context of RL, we specifically refer to the data source as the unlabeled questions with the outputs sampled from the policy model. In this paper, we only use the questions from the instruction tuning stage and a naive nucleus sampling to sample outputs. We think this is a potential reason that
+our RL pipeline only improves the Maj@K performance. In the future, we will explore our RL pipeline on out-of-distribution question prompts, in conjunction with
+advanced sampling (decoding) strategies
+, like those based on tree-search methods
+(Yao et al.,
+2023
+)
+.
+Also, the
+efficient inference techniques
+(Xia et al.,
+2023
+; Leviathan et al.,
+2023
+; Kwon et al.,
+2023
+; Xia et al.,
+2024
+)
+, which determine the exploration efficiency of policy models, also play an exceedingly important role.
+Algorithms
+Algorithms turn the data and the reward signal into the gradient coefficient that is used to update the model parameters.
+Based on Equation
+5
+, to some extent, all methods now fully
+TRUST
+the signal of the reward function to increase or decrease the conditional probability of a certain token.
+However, it is impossible to ensure the reward signal is always reliable, especially in extremely complex tasks. For example, even the PRM800K dataset (Lightman et al., 2023), which has been carefully annotated by well-trained annotators, still contains approximately 20% incorrect annotations (see https://github.com/openai/prm800k/issues/12#issuecomment-1728491852). To this end, we will explore reinforcement learning algorithms that are robust against noisy reward signals. We believe such
+WEAK-TO-STRONG
+(Burns et al.,
+2023
+)
+alignment methods will bring a fundamental change to the learning algorithms.
+Reward Function
+Reward function is the source of the training signal.
+In RL, the reward function is usually the neural reward model.
+We think there exist three important directions for reward models:
+1)
+How to enhance the generalization ability of the reward model.
+The reward model must be effectively generalized to handle out-of-distribution questions and advanced decoding outputs; otherwise, reinforcement learning may merely stabilize the distribution of LLMs rather than improve their fundamental capabilities;
+2)
+How to reflect the uncertainty of reward model.
+The uncertainty could potentially act as a linking bridge between the weak reward model and the weak-to-strong learning algorithms;
+3)
+How to efficiently build high-quality process reward models
+that can provide fine-grained training signals for the reasoning process
+(Lightman et al.,
+2023
+; Wang et al.,
+2023b
+)
+.
+6
+Conclusion, Limitation, and Future Work
+We present DeepSeekMath, which outperforms all open-source models on the competition-level MATH benchmark and approaches the performance of closed models.
+DeepSeekMath is initialized with DeepSeek-Coder-v1.5 7B and undergoes continual training for 500B tokens, with a significant component of the training data being 120B math tokens sourced from Common Crawl.
+Our extensive ablation study shows web pages offer significant potential for high-quality mathematical data, while arXiv may not be as beneficial as we expected.
+We introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), which can notably improve mathematical reasoning capabilities with less memory consumption.
+The experiment results show that GRPO is effective even if DeepSeekMath-Instruct 7B has reached a high score on benchmarks.
+We also provide a unified paradigm to understand a series of methods and summarize several potential directions for more effective reinforcement learning.
+Although DeepSeekMath achieves impressive scores on quantitative reasoning benchmarks, its capabilities in geometry and theorem proving are relatively weaker than those of closed models.
+For instance, in our dry run, the model cannot handle problems related to triangles and ellipses, which may indicate data selection bias in pre-training and fine-tuning. In addition, restricted by the model scale, DeepSeekMath is worse than GPT-4 on few-shot capability.
+GPT-4 could improve its performance with few-shot inputs, while DeepSeekMath shows similar performance in zero-shot and few-shot evaluation.
+In the future, we will further improve our engineered data selection pipeline to construct a higher-quality pre-training corpus.
+In addition, we will explore the potential directions (Section
+5.2.3
+) for more effective reinforcement learning of LLMs.
+References
+Anil et al. (2023)
+R. Anil, S. Borgeaud, Y. Wu, J. Alayrac, J. Yu, R. Soricut, J. Schalkwyk, A. M. Dai, A. Hauth, K. Millican, D. Silver, S. Petrov, M. Johnson, I. Antonoglou, J. Schrittwieser, A. Glaese, J. Chen, E. Pitler, T. P. Lillicrap, A. Lazaridou, O. Firat, J. Molloy, M. Isard, P. R. Barham, T. Hennigan, B. Lee, F. Viola, M. Reynolds, Y. Xu, R. Doherty, E. Collins, C. Meyer, E. Rutherford, E. Moreira, K. Ayoub, M. Goel, G. Tucker, E. Piqueras, M. Krikun, I. Barr, N. Savinov, I. Danihelka, B. Roelofs, A. White, A. Andreassen, T. von Glehn, L. Yagati, M. Kazemi, L. Gonzalez, M. Khalman, J. Sygnowski, and et al.
+Gemini: A family of highly capable multimodal models.
+CoRR
+, abs/2312.11805, 2023.
+10.48550/ARXIV.2312.11805
+.
+URL
+https://doi.org/10.48550/arXiv.2312.11805
+.
+Austin et al. (2021)
+J. Austin, A. Odena, M. Nye, M. Bosma, H. Michalewski, D. Dohan, E. Jiang, C. Cai, M. Terry, Q. Le, et al.
+Program synthesis with large language models.
+arXiv preprint arXiv:2108.07732
+, 2021.
+Azerbayev et al. (2023)
+Z. Azerbayev, H. Schoelkopf, K. Paster, M. D. Santos, S. McAleer, A. Q. Jiang, J. Deng, S. Biderman, and S. Welleck.
+Llemma: An open language model for mathematics.
+arXiv preprint arXiv:2310.10631
+, 2023.
+Bai et al. (2023)
+J. Bai, S. Bai, Y. Chu, Z. Cui, K. Dang, X. Deng, Y. Fan, W. Ge, Y. Han, F. Huang, et al.
+Qwen technical report.
+arXiv preprint arXiv:2309.16609
+, 2023.
+Burns et al. (2023)
+C. Burns, P. Izmailov, J. H. Kirchner, B. Baker, L. Gao, L. Aschenbrenner, Y. Chen, A. Ecoffet, M. Joglekar, J. Leike, et al.
+Weak-to-strong generalization: Eliciting strong capabilities with weak supervision.
+arXiv preprint arXiv:2312.09390
+, 2023.
+ChatGLM3 Team (2023)
+ChatGLM3 Team.
+Chatglm3 series: Open bilingual chat llms, 2023.
+URL
+https://github.com/THUDM/ChatGLM3
+.
+Chen et al. (2021)
+M. Chen, J. Tworek, H. Jun, Q. Yuan, H. P. de Oliveira Pinto, J. Kaplan, H. Edwards, Y. Burda, N. Joseph, G. Brockman, A. Ray, R. Puri, G. Krueger, M. Petrov, H. Khlaaf, G. Sastry, P. Mishkin, B. Chan, S. Gray, N. Ryder, M. Pavlov, A. Power, L. Kaiser, M. Bavarian, C. Winter, P. Tillet, F. P. Such, D. Cummings, M. Plappert, F. Chantzis, E. Barnes, A. Herbert-Voss, W. H. Guss, A. Nichol, A. Paino, N. Tezak, J. Tang, I. Babuschkin, S. Balaji, S. Jain, W. Saunders, C. Hesse, A. N. Carr, J. Leike, J. Achiam, V. Misra, E. Morikawa, A. Radford, M. Knight, M. Brundage, M. Murati, K. Mayer, P. Welinder, B. McGrew, D. Amodei, S. McCandlish, I. Sutskever, and W. Zaremba.
+Evaluating large language models trained on code.
+CoRR
+, abs/2107.03374, 2021.
+URL
+https://arxiv.org/abs/2107.03374
+.
+Chen et al. (2022)
+W. Chen, X. Ma, X. Wang, and W. W. Cohen.
+Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks.
+CoRR
+, abs/2211.12588, 2022.
+10.48550/ARXIV.2211.12588
+.
+URL
+https://doi.org/10.48550/arXiv.2211.12588
+.
+Cobbe et al. (2021)
+K. Cobbe, V. Kosaraju, M. Bavarian, M. Chen, H. Jun, L. Kaiser, M. Plappert, J. Tworek, J. Hilton, R. Nakano, et al.
+Training verifiers to solve math word problems.
+arXiv preprint arXiv:2110.14168
+, 2021.
+Computer (2023)
+T. Computer.
+Redpajama: an open dataset for training large language models, Oct. 2023.
+URL
+https://github.com/togethercomputer/RedPajama-Data
+.
+DeepSeek-AI (2024)
+DeepSeek-AI.
+Deepseek LLM: scaling open-source language models with longtermism.
+CoRR
+, abs/2401.02954, 2024.
+10.48550/ARXIV.2401.02954
+.
+URL
+https://doi.org/10.48550/arXiv.2401.02954
+.
+Du et al. (2022)
+Z. Du, Y. Qian, X. Liu, M. Ding, J. Qiu, Z. Yang, and J. Tang.
+Glm: General language model pretraining with autoregressive blank infilling.
+In
+Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
+, pages 320–335, 2022.
+Gao et al. (2023)
+L. Gao, A. Madaan, S. Zhou, U. Alon, P. Liu, Y. Yang, J. Callan, and G. Neubig.
+PAL: program-aided language models.
+In A. Krause, E. Brunskill, K. Cho, B. Engelhardt, S. Sabato, and J. Scarlett, editors,
+International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA
+, volume 202 of
+Proceedings of Machine Learning Research
+, pages 10764–10799. PMLR, 2023.
+URL
+https://proceedings.mlr.press/v202/gao23f.html
+.
+Gou et al. (2023)
+Z. Gou, Z. Shao, Y. Gong, Y. Shen, Y. Yang, M. Huang, N. Duan, and W. Chen.
+Tora: A tool-integrated reasoning agent for mathematical problem solving.
+CoRR
+, abs/2309.17452, 2023.
+10.48550/ARXIV.2309.17452
+.
+URL
+https://doi.org/10.48550/arXiv.2309.17452
+.
+Guo et al. (2024)
+D. Guo, Q. Zhu, D. Yang, Z. Xie, K. Dong, W. Zhang, G. Chen, X. Bi, Y. Wu, Y. K. Li, F. Luo, Y. Xiong, and W. Liang.
+Deepseek-coder: When the large language model meets programming – the rise of code intelligence, 2024.
+Hendrycks et al. (2020)
+D. Hendrycks, C. Burns, S. Basart, A. Zou, M. Mazeika, D. Song, and J. Steinhardt.
+Measuring massive multitask language understanding.
+arXiv preprint arXiv:2009.03300
+, 2020.
+Hendrycks et al. (2021)
+D. Hendrycks, C. Burns, S. Kadavath, A. Arora, S. Basart, E. Tang, D. Song, and J. Steinhardt.
+Measuring mathematical problem solving with the math dataset.
+arXiv preprint arXiv:2103.03874
+, 2021.
+High-flyer (2023)
+High-flyer.
+Hai-llm: 高效且轻量的大模型训练工具, 2023.
+URL
+https://www.high-flyer.cn/en/blog/hai-llm
+.
+Inflection AI (2023)
+Inflection AI.
+Inflection-2, 2023.
+URL
+https://inflection.ai/inflection-2
+.
+Jiang et al. (2022)
+A. Q. Jiang, S. Welleck, J. P. Zhou, W. Li, J. Liu, M. Jamnik, T. Lacroix, Y. Wu, and G. Lample.
+Draft, sketch, and prove: Guiding formal theorem provers with informal proofs.
+arXiv preprint arXiv:2210.12283
+, 2022.
+Jiang et al. (2023)
+A. Q. Jiang, A. Sablayrolles, A. Mensch, C. Bamford, D. S. Chaplot, D. d. l. Casas, F. Bressand, G. Lengyel, G. Lample, L. Saulnier, et al.
+Mistral 7b.
+arXiv preprint arXiv:2310.06825
+, 2023.
+Joulin et al. (2016)
+A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, and T. Mikolov.
+Fasttext. zip: Compressing text classification models.
+arXiv preprint arXiv:1612.03651
+, 2016.
+Kwon et al. (2023)
+W. Kwon, Z. Li, S. Zhuang, Y. Sheng, L. Zheng, C. H. Yu, J. E. Gonzalez, H. Zhang, and I. Stoica.
+Efficient memory management for large language model serving with pagedattention.
+In
+Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles
+, 2023.
+Leviathan et al. (2023)
+Y. Leviathan, M. Kalman, and Y. Matias.
+Fast inference from transformers via speculative decoding.
+In
+International Conference on Machine Learning
+, pages 19274–19286. PMLR, 2023.
+Lewkowycz et al. (2022a)
+A. Lewkowycz, A. Andreassen, D. Dohan, E. Dyer, H. Michalewski, V. Ramasesh, A. Slone, C. Anil, I. Schlag, T. Gutman-Solo, et al.
+Solving quantitative reasoning problems with language models.
+Advances in Neural Information Processing Systems
+, 35:3843–3857, 2022a.
+Lewkowycz et al. (2022b)
+A. Lewkowycz, A. Andreassen, D. Dohan, E. Dyer, H. Michalewski, V. V. Ramasesh, A. Slone, C. Anil, I. Schlag, T. Gutman-Solo, Y. Wu, B. Neyshabur, G. Gur-Ari, and V. Misra.
+Solving quantitative reasoning problems with language models.
+In S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh, editors,
+Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022
+, 2022b.
+URL
+http://papers.nips.cc/paper_files/paper/2022/hash/18abbeef8cfe9203fdf9053c9c4fe191-Abstract-Conference.html
+.
+Lightman et al. (2023)
+H. Lightman, V. Kosaraju, Y. Burda, H. Edwards, B. Baker, T. Lee, J. Leike, J. Schulman, I. Sutskever, and K. Cobbe.
+Let’s verify step by step.
+arXiv preprint arXiv:2305.20050
+, 2023.
+Loshchilov and Hutter (2017)
+I. Loshchilov and F. Hutter.
+Decoupled weight decay regularization.
+arXiv preprint arXiv:1711.05101
+, 2017.
+Luo et al. (2023)
+H. Luo, Q. Sun, C. Xu, P. Zhao, J. Lou, C. Tao, X. Geng, Q. Lin, S. Chen, and D. Zhang.
+Wizardmath: Empowering mathematical reasoning for large language models via reinforced evol-instruct.
+arXiv preprint arXiv:2308.09583
+, 2023.
+Mishra et al. (2022)
+S. Mishra, M. Finlayson, P. Lu, L. Tang, S. Welleck, C. Baral, T. Rajpurohit, O. Tafjord, A. Sabharwal, P. Clark, and A. Kalyan.
+LILA: A unified benchmark for mathematical reasoning.
+In Y. Goldberg, Z. Kozareva, and Y. Zhang, editors,
+Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, EMNLP 2022, Abu Dhabi, United Arab Emirates, December 7-11, 2022
+, pages 5807–5832. Association for Computational Linguistics, 2022.
+10.18653/V1/2022.EMNLP-MAIN.392
+.
+URL
+https://doi.org/10.18653/v1/2022.emnlp-main.392
+.
+Nguyen et al. (2023)
+X. Nguyen, W. Zhang, X. Li, M. M. Aljunied, Q. Tan, L. Cheng, G. Chen, Y. Deng, S. Yang, C. Liu, H. Zhang, and L. Bing.
+Seallms - large language models for southeast asia.
+CoRR
+, abs/2312.00738, 2023.
+10.48550/ARXIV.2312.00738
+.
+URL
+https://doi.org/10.48550/arXiv.2312.00738
+.
+OpenAI (2023)
+OpenAI.
+GPT4 technical report.
+arXiv preprint arXiv:2303.08774
+, 2023.
+Ouyang et al. (2022)
+L. Ouyang, J. Wu, X. Jiang, D. Almeida, C. Wainwright, P. Mishkin, C. Zhang, S. Agarwal, K. Slama, A. Ray, et al.
+Training language models to follow instructions with human feedback.
+Advances in Neural Information Processing Systems
+, 35:27730–27744, 2022.
+Paster et al. (2023)
+K. Paster, M. D. Santos, Z. Azerbayev, and J. Ba.
+Openwebmath: An open dataset of high-quality mathematical web text.
+CoRR
+, abs/2310.06786, 2023.
+10.48550/ARXIV.2310.06786
+.
+URL
+https://doi.org/10.48550/arXiv.2310.06786
+.
+Paulson (2010)
+L. C. Paulson.
+Three years of experience with sledgehammer, a practical link between automatic and interactive theorem provers.
+In R. A. Schmidt, S. Schulz, and B. Konev, editors,
+Proceedings of the 2nd Workshop on Practical Aspects of Automated Reasoning, PAAR-2010, Edinburgh, Scotland, UK, July 14, 2010
+, volume 9 of
+EPiC Series in Computing
+, pages 1–10. EasyChair, 2010.
+10.29007/TNFD
+.
+URL
+https://doi.org/10.29007/tnfd
+.
+Polu and Sutskever (2020)
+S. Polu and I. Sutskever.
+Generative language modeling for automated theorem proving.
+CoRR
+, abs/2009.03393, 2020.
+URL
+https://arxiv.org/abs/2009.03393
+.
+Rafailov et al. (2023)
+R. Rafailov, A. Sharma, E. Mitchell, S. Ermon, C. D. Manning, and C. Finn.
+Direct preference optimization: Your language model is secretly a reward model.
+2023.
+Schulman (2020)
+J. Schulman.
+Approximating kl divergence, 2020.
+URL
+http://joschu.net/blog/kl-approx.html
+.
+Schulman et al. (2015)
+J. Schulman, P. Moritz, S. Levine, M. Jordan, and P. Abbeel.
+High-dimensional continuous control using generalized advantage estimation.
+arXiv preprint arXiv:1506.02438
+, 2015.
+Schulman et al. (2017)
+J. Schulman, F. Wolski, P. Dhariwal, A. Radford, and O. Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017.
+Shi et al. (2023)
+F. Shi, M. Suzgun, M. Freitag, X. Wang, S. Srivats, S. Vosoughi, H. W. Chung, Y. Tay, S. Ruder, D. Zhou, D. Das, and J. Wei.
+Language models are multilingual chain-of-thought reasoners.
+In
+The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1-5, 2023
+. OpenReview.net, 2023.
+URL
+https://openreview.net/pdf?id=fR3wGCk-IXp
+.
+Song et al. (2023)
+F. Song, B. Yu, M. Li, H. Yu, F. Huang, Y. Li, and H. Wang.
+Preference ranking optimization for human alignment.
+arXiv preprint arXiv:2306.17492
+, 2023.
+Suzgun et al. (2022)
+M. Suzgun, N. Scales, N. Schärli, S. Gehrmann, Y. Tay, H. W. Chung, A. Chowdhery, Q. V. Le, E. H. Chi, D. Zhou, et al.
+Challenging big-bench tasks and whether chain-of-thought can solve them.
+arXiv preprint arXiv:2210.09261
+, 2022.
+Tao (2023)
+T. Tao.
+Embracing change and resetting expectations, 2023.
+URL
+https://unlocked.microsoft.com/ai-anthology/terence-tao/
+.
+Touvron et al. (2023)
+H. Touvron, L. Martin, K. Stone, P. Albert, A. Almahairi, Y. Babaei, N. Bashlykov, S. Batra, P. Bhargava, S. Bhosale, D. Bikel, L. Blecher, C. Canton-Ferrer, M. Chen, G. Cucurull, D. Esiobu, J. Fernandes, J. Fu, W. Fu, B. Fuller, C. Gao, V. Goswami, N. Goyal, A. Hartshorn, S. Hosseini, R. Hou, H. Inan, M. Kardas, V. Kerkez, M. Khabsa, I. Kloumann, A. Korenev, P. S. Koura, M. Lachaux, T. Lavril, J. Lee, D. Liskovich, Y. Lu, Y. Mao, X. Martinet, T. Mihaylov, P. Mishra, I. Molybog, Y. Nie, A. Poulton, J. Reizenstein, R. Rungta, K. Saladi, A. Schelten, R. Silva, E. M. Smith, R. Subramanian, X. E. Tan, B. Tang, R. Taylor, A. Williams, J. X. Kuan, P. Xu, Z. Yan, I. Zarov, Y. Zhang, A. Fan, M. Kambadur, S. Narang, A. Rodriguez, R. Stojnic, S. Edunov, and T. Scialom.
+Llama 2: Open foundation and fine-tuned chat models.
+CoRR
+, abs/2307.09288, 2023.
+10.48550/arXiv.2307.09288
+.
+URL
+https://doi.org/10.48550/arXiv.2307.09288
+.
+Trinh et al. (2024)
+T. H. Trinh, Y. Wu, Q. V. Le, H. He, and T. Luong.
+Solving olympiad geometry without human demonstrations.
+Nature
+, 625(7995):476–482, 2024.
+Wang et al. (2023a)
+P. Wang, L. Li, L. Chen, F. Song, B. Lin, Y. Cao, T. Liu, and Z. Sui.
+Making large language models better reasoners with alignment.
+arXiv preprint arXiv:2309.02144
+, 2023a.
+Wang et al. (2023b)
+P. Wang, L. Li, Z. Shao, R. Xu, D. Dai, Y. Li, D. Chen, Y. Wu, and Z. Sui.
+Math-shepherd: Verify and reinforce llms step-by-step without human annotations.
+CoRR, abs/2312.08935
+, 2023b.
+Wang et al. (2023c)
+Z. Wang, R. Xia, and P. Liu.
+Generative AI for math: Part I - mathpile: A billion-token-scale pretraining corpus for math.
+CoRR
+, abs/2312.17120, 2023c.
+10.48550/ARXIV.2312.17120
+.
+URL
+https://doi.org/10.48550/arXiv.2312.17120
+.
+Wei et al. (2022)
+J. Wei, X. Wang, D. Schuurmans, M. Bosma, B. Ichter, F. Xia, E. H. Chi, Q. V. Le, and D. Zhou.
+Chain-of-thought prompting elicits reasoning in large language models.
+In
+NeurIPS
+, 2022.
+URL
+http://papers.nips.cc/paper_files/paper/2022/hash/9d5609613524ecf4f15af0f7b31abca4-Abstract-Conference.html
+.
+Wei et al. (2023)
+T. Wei, J. Luan, W. Liu, S. Dong, and B. Wang.
+Cmath: Can your language model pass chinese elementary school math test?, 2023.
+Wenzel et al. (2008)
+M. Wenzel, L. C. Paulson, and T. Nipkow.
+The isabelle framework.
+In O. A. Mohamed, C. A. Muñoz, and S. Tahar, editors,
+Theorem Proving in Higher Order Logics, 21st International Conference, TPHOLs 2008, Montreal, Canada, August 18-21, 2008. Proceedings
+, volume 5170 of
+Lecture Notes in Computer Science
+, pages 33–38. Springer, 2008.
+10.1007/978-3-540-71067-7_7
+.
+URL
+https://doi.org/10.1007/978-3-540-71067-7_7
+.
+Xia et al. (2023)
+H. Xia, T. Ge, P. Wang, S.-Q. Chen, F. Wei, and Z. Sui.
+Speculative decoding: Exploiting speculative execution for accelerating seq2seq generation.
+In H. Bouamor, J. Pino, and K. Bali, editors,
+Findings of the Association for Computational Linguistics: EMNLP 2023
+, pages 3909–3925, Singapore, Dec. 2023. Association for Computational Linguistics.
+10.18653/v1/2023.findings-emnlp.257
+.
+URL
+https://aclanthology.org/2023.findings-emnlp.257
+.
+Xia et al. (2024)
+H. Xia, Z. Yang, Q. Dong, P. Wang, Y. Li, T. Ge, T. Liu, W. Li, and Z. Sui.
+Unlocking efficiency in large language model inference: A comprehensive survey of speculative decoding.
+arXiv preprint arXiv:2401.07851
+, 2024.
+Yao et al. (2023)
+S. Yao, D. Yu, J. Zhao, I. Shafran, T. L. Griffiths, Y. Cao, and K. Narasimhan.
+Tree of thoughts: Deliberate problem solving with large language models.
+arXiv preprint arXiv:2305.10601
+, 2023.
+Yu et al. (2023)
+L. Yu, W. Jiang, H. Shi, J. Yu, Z. Liu, Y. Zhang, J. T. Kwok, Z. Li, A. Weller, and W. Liu.
+Metamath: Bootstrap your own mathematical questions for large language models.
+CoRR
+, abs/2309.12284, 2023.
+10.48550/ARXIV.2309.12284
+.
+URL
+https://doi.org/10.48550/arXiv.2309.12284
+.
+Yuan et al. (2023a)
+Z. Yuan, H. Yuan, C. Li, G. Dong, C. Tan, and C. Zhou.
+Scaling relationship on learning mathematical reasoning with large language models.
+arXiv preprint arXiv:2308.01825
+, 2023a.
+Yuan et al. (2023b)
+Z. Yuan, H. Yuan, C. Tan, W. Wang, S. Huang, and F. Huang.
+Rrhf: Rank responses to align language models with human feedback without tears.
+arXiv preprint arXiv:2304.05302
+, 2023b.
+Yue et al. (2023)
+X. Yue, X. Qu, G. Zhang, Y. Fu, W. Huang, H. Sun, Y. Su, and W. Chen.
+Mammoth: Building math generalist models through hybrid instruction tuning.
+CoRR
+, abs/2309.05653, 2023.
+10.48550/ARXIV.2309.05653
+.
+URL
+https://doi.org/10.48550/arXiv.2309.05653
+.
+Zheng et al. (2021)
+K. Zheng, J. M. Han, and S. Polu.
+Minif2f: a cross-system benchmark for formal olympiad-level mathematics.
+arXiv preprint arXiv:2109.00110
+, 2021.
+Zhong et al. (2023)
+W. Zhong, R. Cui, Y. Guo, Y. Liang, S. Lu, Y. Wang, A. Saied, W. Chen, and N. Duan.
+AGIEval: A human-centric benchmark for evaluating foundation models.
+CoRR
+, abs/2304.06364, 2023.
+10.48550/arXiv.2304.06364
+.
+URL
+https://doi.org/10.48550/arXiv.2304.06364
+.
+Appendix A
+Appendix
+A.1
+Analysis of Reinforcement Learning
+We provide the detailed derivation of the data source and gradient coefficient (algorithm and reward function) across various methods, including SFT, RFT, Online RFT, DPO, PPO, and GRPO.
+A.1.1
+Supervised Fine-tuning
+The objective of Supervised Fine-tuning is maximizing the following objective:
+$$
+\mathcal{J}_{SFT}(\theta)=\mathbb{E}[q,o\sim P_{sft}(Q,O)]\left(\frac{1}{|o|}\sum_{t=1}^{|o|}\log\pi_{\theta}(o_{t}|q,o_{<t})\right). \tag{6}
+$$
+The gradient of $\mathcal{J}_{SFT}(\theta)$ is:
+$$
+\nabla_{\theta}\mathcal{J}_{SFT}(\theta)=\mathbb{E}[q,o\sim P_{sft}(Q,O)]\left(\frac{1}{|o|}\sum_{t=1}^{|o|}\nabla_{\theta}\log\pi_{\theta}(o_{t}|q,o_{<t})\right). \tag{7}
+$$
+Data Source: The dataset employed for SFT. Reward Function: This can be regarded as human selection. Gradient Coefficient: always set to 1.
+A.1.2
+Rejection Sampling Fine-tuning
+Rejection Sampling Fine-tuning first samples multiple outputs from the supervised fine-tuned LLMs for each question, and then trains LLMs on the sampled outputs with the correct answer.
+Formally, the objective of RFT is to maximize the following objectives:
+$$
+\mathcal{J}_{RFT}(\theta)=\mathbb{E}[q\sim P_{sft}(Q),o\sim\pi_{sft}(O|q)]\left(\frac{1}{|o|}\sum_{t=1}^{|o|}\mathbb{I}(o)\log\pi_{\theta}(o_{t}|q,o_{<t})\right). \tag{8}
+$$
+The gradient of $\mathcal{J}_{RFT}(\theta)$ is:
+$$
+\nabla_{\theta}\mathcal{J}_{RFT}(\theta)=\mathbb{E}[q\sim P_{sft}(Q),o\sim\pi_{sft}(O|q)]\left(\frac{1}{|o|}\sum_{t=1}^{|o|}\mathbb{I}(o)\nabla_{\theta}\log\pi_{\theta}(o_{t}|q,o_{<t})\right). \tag{9}
+$$
+Data Source: question in SFT dataset with outputs sampled from SFT model. Reward Function: Rule (whether the answer is correct or not). Gradient Coefficient:
+$$
+GC_{RFT}(q,o,t)=\mathbb{I}(o)=\begin{cases}1 & \text{the answer of } o \text{ is correct}\\ 0 & \text{the answer of } o \text{ is incorrect}\end{cases} \tag{10}
+$$
+A.1.3
+Online Rejection Sampling Fine-tuning
+The only difference between RFT and Online RFT is that the outputs of Online RFT are sampled from the real-time policy model $\pi_{\theta}$, rather than from the SFT model $\pi_{\theta_{sft}}$. Therefore, the gradient of online RFT is:
+$$
+\nabla_{\theta}\mathcal{J}_{OnRFT}(\theta)=\mathbb{E}[q\sim P_{sft}(Q),o\sim\pi_{\theta}(O|q)]\left(\frac{1}{|o|}\sum_{t=1}^{|o|}\mathbb{I}(o)\nabla_{\theta}\log\pi_{\theta}(o_{t}|q,o_{<t})\right). \tag{11}
+$$
+A.1.4
+Direct Preference Optimization (DPO)
+The objective of DPO is:
+$$
+\mathcal{J}_{DPO}(\theta)=\mathbb{E}[q\sim P_{sft}(Q),o^{+},o^{-}\sim\pi_{sft}(O|q)]\log\sigma\left(\beta\frac{1}{|o^{+}|}\sum_{t=1}^{|o^{+}|}\log\frac{\pi_{\theta}(o^{+}_{t}|q,o^{+}_{<t})}{\pi_{\text{ref}}(o^{+}_{t}|q,o^{+}_{<t})}-\beta\frac{1}{|o^{-}|}\sum_{t=1}^{|o^{-}|}\log\frac{\pi_{\theta}(o^{-}_{t}|q,o^{-}_{<t})}{\pi_{\text{ref}}(o^{-}_{t}|q,o^{-}_{<t})}\right) \tag{12}
+$$
+The gradient of $\mathcal{J}_{DPO}(\theta)$ is:
+$$
+\begin{aligned}
+\nabla_{\theta}\mathcal{J}_{DPO}(\theta)=\mathbb{E}[q\sim P_{sft}(Q),o^{+},o^{-}\sim\pi_{sft}(O|q)]&\left(\frac{1}{|o^{+}|}\sum_{t=1}^{|o^{+}|}GC_{DPO}(q,o,t)\nabla_{\theta}\log\pi_{\theta}(o^{+}_{t}|q,o^{+}_{<t})\right.\\
+-&\left.\frac{1}{|o^{-}|}\sum_{t=1}^{|o^{-}|}GC_{DPO}(q,o,t)\nabla_{\theta}\log\pi_{\theta}(o^{-}_{t}|q,o^{-}_{<t})\right)
+\end{aligned} \tag{13}
+$$
+Data Source: question in SFT dataset with outputs sampled from SFT model.
+Reward Function: human preference in the general domain (can be ‘Rule’ in mathematical tasks).
+Gradient Coefficient:
+$$
+GC_{DPO}(q,o,t)=\sigma\left(\beta\log\frac{\pi_{\theta}(o^{-}_{t}|q,o^{-}_{<t})}{\pi_{\text{ref}}(o^{-}_{t}|q,o^{-}_{<t})}-\beta\log\frac{\pi_{\theta}(o^{+}_{t}|q,o^{+}_{<t})}{\pi_{\text{ref}}(o^{+}_{t}|q,o^{+}_{<t})}\right) \tag{14}
+$$
+A.1.5
+Proximal Policy Optimization (PPO)
+The objective of PPO is:
+$$
+\mathcal{J}_{PPO}(\theta)=\mathbb{E}[q\sim P_{sft}(Q),o\sim\pi_{\theta_{old}}(O|q)]\frac{1}{|o|}\sum_{t=1}^{|o|}\min\left[\frac{\pi_{\theta}(o_{t}|q,o_{<t})}{\pi_{\theta_{old}}(o_{t}|q,o_{<t})}A_{t},\text{clip}\left(\frac{\pi_{\theta}(o_{t}|q,o_{<t})}{\pi_{\theta_{old}}(o_{t}|q,o_{<t})},1-\varepsilon,1+\varepsilon\right)A_{t}\right]. \tag{15}
+$$
+To simplify the analysis, it is assumed that the model only has a single update following each exploration stage, thereby ensuring that $\pi_{\theta_{old}}=\pi_{\theta}$.
+In this case, we can remove the $\min$ and ${\rm clip}$ operations:
+$$
+\mathcal{J}_{PPO}(\theta)=\mathbb{E}[q\sim P_{sft}(Q),o\sim\pi_{\theta_{old}}(O|q)]\frac{1}{|o|}\sum_{t=1}^{|o|}\frac{\pi_{\theta}(o_{t}|q,o_{<t})}{\pi_{\theta_{old}}(o_{t}|q,o_{<t})}A_{t}. \tag{16}
+$$
+The gradient of $\mathcal{J}_{PPO}(\theta)$ is:
+$$
+\nabla_{\theta}\mathcal{J}_{PPO}(\theta)=\mathbb{E}[q\sim P_{sft}(Q),o\sim\pi_{\theta_{old}}(O|q)]\frac{1}{|o|}\sum_{t=1}^{|o|}A_{t}\nabla_{\theta}\log\pi_{\theta}(o_{t}|q,o_{<t}) \tag{17}
+$$
+Data Source: question in SFT dataset with outputs sampled from policy model.
+Reward Function: reward model.
+Gradient Coefficient:
+$$
+GC_{PPO}(q,o,t,\pi_{\theta_{rm}})=A_{t}, \tag{18}
+$$
+where $A_{t}$ is the advantage, which is computed by applying Generalized Advantage Estimation (GAE) (Schulman et al., 2015), based on the rewards $\{r_{\geq t}\}$ and a learned value function $V_{\psi}$.
+A.1.6
+Group Relative Policy Optimization (GRPO)
+The objective of GRPO is (assume $\pi_{\theta_{old}}=\pi_{\theta}$ for simplified analysis):
+$$
+\begin{aligned}
+\mathcal{J}_{GRPO}(\theta)&=\mathbb{E}[q\sim P_{sft}(Q),\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{old}}(O|q)]\\
+&\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|o_{i}|}\sum_{t=1}^{|o_{i}|}\left[\frac{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta_{old}}(o_{i,t}|q,o_{i,<t})}\hat{A}_{i,t}-\beta\left(\frac{\pi_{ref}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}-\log\frac{\pi_{ref}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}-1\right)\right].
+\end{aligned} \tag{19}
+$$
+The gradient of $\mathcal{J}_{GRPO}(\theta)$ is:
+$$
+\begin{aligned}
+\nabla_{\theta}\mathcal{J}_{GRPO}(\theta)&=\mathbb{E}[q\sim P_{sft}(Q),\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{old}}(O|q)]\\
+&\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|o_{i}|}\sum_{t=1}^{|o_{i}|}\left[\hat{A}_{i,t}+\beta\left(\frac{\pi_{ref}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}-1\right)\right]\nabla_{\theta}\log\pi_{\theta}(o_{i,t}|q,o_{i,<t}).
+\end{aligned} \tag{20}
+$$
+Data Source: question in SFT dataset with outputs sampled from policy model.
+Reward Function: reward model.
+Gradient Coefficient:
+$$
+GC_{GRPO}(q,o,t,\pi_{\theta_{rm}})=\hat{A}_{i,t}+\beta\left(\frac{\pi_{ref}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta}(o_{i,t}|q,o_{i,<t})}-1\right), \tag{21}
+$$
+where $\hat{A}_{i,t}$ is computed based on the group reward scores.
\ No newline at end of file
diff --git a/research/notes/diloco-distributed-low-communication-training-of-language-models.md b/research/notes/diloco-distributed-low-communication-training-of-language-models.md
new file mode 100644
index 0000000000000000000000000000000000000000..dcc5d3aa4751a6db5412748876750d77e65be765
--- /dev/null
+++ b/research/notes/diloco-distributed-low-communication-training-of-language-models.md
@@ -0,0 +1,2320 @@
+---
+title: 'DiLoCo: Distributed Low-Communication Training of Language Models'
+id: diloco-distributed-low-communication-training-of-language-models
+tags:
+- deepread
+created: '2026-06-10T00:30:45.791157Z'
+source: https://arxiv.org/html/2311.08105
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:45.790933Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+DiLoCo: Distributed Low-Communication Training of Language Models
+Corresponding author: douillard@google.com
+DiLoCo: Distributed Low-Communication Training of Language Models
+Arthur Douillard
+Google DeepMind
+Qixuan Feng
+Google DeepMind
+Andrei A. Rusu
+Google DeepMind
+Rachita Chhaparia
+Google DeepMind
+Yani Donchev
+Google DeepMind
+Adhiguna Kuncoro
+Google DeepMind
+Marc’Aurelio Ranzato
+Google DeepMind
+Arthur Szlam
+Google DeepMind
+Jiajun Shen
+Google DeepMind
+Abstract
+Large language models (LLM) have become a critical component in many applications of machine learning. However, standard approaches to training LLM require a large number of tightly interconnected accelerators, with devices exchanging gradients and other intermediate states at each optimization step. While it is difficult to build and maintain a single computing cluster hosting many accelerators, it might be easier to find several computing clusters each hosting a smaller number of devices. In this work, we propose a distributed optimization algorithm, Distributed Low-Communication (DiLoCo), that enables training of language models on islands of devices that are poorly connected. The approach is a variant of federated averaging, where the number of inner steps is large, the inner optimizer is AdamW, and the outer optimizer is Nesterov momentum. On the widely used C4 dataset, we show that DiLoCo on 8 workers performs as well as fully synchronous optimization while communicating 500 times less. DiLoCo exhibits great robustness to the data distribution of each worker. It is also robust to resources becoming unavailable over time, and vice versa, it can seamlessly leverage resources that become available during training.
+keywords:
+large-scale, language modeling, distributed learning
+1
+Introduction
+Language models have shown remarkable ability to generalize to new tasks, and are at the heart of a multitude of new applications of machine learning. Because performance has scaled with model size, practitioners train increasingly larger models on increasingly large data. Nevertheless, at a high level, the basic training approach remains standard mini-batch back-propagation of the error.
+At modern scale, training via standard back-propagation poses unprecedented engineering and infrastructure challenges. To start, several thousands of devices need to be powered and be placed at the same physical location; and interconnected with high-bandwidth cables to minimize latency. Careful software engineering is required to orchestrate the passage of gradients, parameters and intermediate states between these devices at each optimization step, keeping all devices fully utilized. Furthermore, the more devices that are used for each synchronous training step, the more chances there are that one of them fails, risking halting training, or introducing subtle numerical issues. Moreover, the current paradigm poorly leverages heterogeneous devices, that might have different speed and topology. In the simplest terms, it is difficult to co-locate and tightly synchronize a large number of accelerators.
+In this work, we take inspiration from literature on Federated Learning, to address the above mentioned difficulties. In Federated Learning, there are $k$ workers, each operating on their own island of devices, each consuming a certain partition of the data, and each updating a model replica. Such workers perform some amount of work locally, and then exchange gradients every $H$ steps to get their model replica back in sync.
+We propose a variant of the popular Federated Averaging (FedAvg) algorithm (McMahan et al., 2017), or a particular instantiation with a momentum-based optimizer as in the FedOpt algorithm (Reddi et al., 2021), whereby the number of inner steps is large, the inner optimizer is replaced with AdamW, and the outer optimizer with Nesterov Momentum for best performance. This combination enables us to address the shortcomings mentioned above, because a) while each worker requires co-located devices their number is roughly $k$ times smaller than the total, b) workers need not communicate at each and every single step but only every $H$ steps which can be in the order of hundreds or even thousands, and c) while devices within an island need to be homogeneous, different islands can operate with different device types. We dub this approach Distributed Low-Communication (DiLoCo) training.
+In a large-batch training setting with overtraining, our empirical validation on the C4 dataset (Raffel et al., 2020) demonstrates that DiLoCo can achieve even better performance (as measured in perplexity) than a fully synchronous model, while communicating 500 times less. DiLoCo is capable of effectively utilizing several islands of devices at training time, despite a low bandwidth connectivity among these islands. Finally, at inference time the resulting model has the same size and speed as a model trained in fully synchronous mode.
+Our experiments show that DiLoCo is robust against different data distributions used by local workers and frequency of global parameter updates. Finally, DiLoCo exhibits robustness to island failure, and nicely leverages islands whenever these become available.
+2
+DiLoCo
+Figure 1: DiLoCo: First, a pretrained model $\theta^{(0)}$ is replicated $k$ times (in this illustration $k=4$) and each worker $\theta^{(1)}_{i}$ trains a model replica on its own shard of data for $H$ steps independently and in parallel. Afterwards, workers average their outer gradients and an outer optimizer updates the global copy of the parameters $\theta^{(1)}$. This will then be re-dispatched to the workers. The process repeats $T$ times (in this illustration only the first two iterations are displayed). Each replica can be trained in different locations of the world, with different accelerators.
+We assume that we have a base model architecture (e.g., a transformer) with parameters $\theta$. We denote a training dataset as $\mathcal{D}=\{(\mathbf{x},\mathbf{y}),...\}$ with $\mathbf{x}$ and $\mathbf{y}$ being respectively the input data and target. In language modeling (Vaswani et al., 2017), the input is a sequence of tokens and the target is the input sequence shifted by one. When the dataset is split across multiple shards, we denote the $i$-th shard with $\mathcal{D}_{i}$.
+DiLoCo training proceeds as outlined in Algorithm 1 (Reddi et al., 2021), and illustrated in Figure 1. First, we start from an initial model with parameters $\theta^{(0)}$, which can be initialized at random or using a pretrained model (see subsection 3.1). We also have $k$ workers each capable of training a model replica and $k$ shards of data, one for each worker.
+There are two optimization processes. There is an *outer* optimization (line 1, 12, and 14 in Algorithm 1), which consists of $T$ outer steps. At each outer step $t$, outer gradients from each worker are gathered, averaged and sent to an outer optimizer (`OuterOpt`) to update the shared copy of the parameters. Afterwards, this shared copy of the parameters is re-dispatched to each local worker (line 3).
+Within each phase, each worker (line 3) performs *independently and in parallel* its own inner optimization (lines 4 to 9) for $H$ steps using an inner optimizer, denoted by `InnerOpt`. Each worker samples data from its own shard (line 5), and updates its own local copy of the parameters (line 8). Note that the inner optimization consists of $H\gg 1$ steps; for instance, several hundred steps. Therefore, communication across workers is minimal.
+Once all workers have completed their inner optimization step, the delta in parameters space spanning multiple inner steps is computed per worker and averaged in the *outer gradient* (line 12) which is used to update the shared copy of the parameters (line 14), which is then used as starting point for the next round of inner optimizations. This is the only time when communication among workers is required, and it happens once every $H$ inner optimization steps. In total, a worker trains for $N=T\times H$ inner steps.
+In our work, we use as *inner* optimizer (`InnerOpt`) AdamW (Kingma and Ba, 2014; Loshchilov and Hutter, 2019), which is the most widely used optimizer for transformer language models.
+As for the *outer* optimizer (`OuterOpt`) we use Nesterov momentum (Sutskever et al., 2013) because it gave the best convergence empirically (see Figure 6). When `OuterOpt` is SGD, then the outer optimizer is equivalent to classical Federated Averaging (McMahan et al., 2017). If the total number of outer optimization steps $T$ is further set to 1, then DiLoCo reduces to “souping” (Wortsman et al., 2021). Finally, if the number of inner optimization steps $H$ is set to 1 and `InnerOpt` is SGD, DiLoCo is equivalent to large-batch training with data-parallelism.
+Overall, DiLoCo can be interpreted as a data parallelism method that requires very little communication, and therefore, it can scale to workers that are poorly connected, e.g., workers placed in very distant geographic regions. Workers could of course use standard data and model parallelism for their inner optimization.
+Algorithm 1: DiLoCo Algorithm
+Require: Initial model $\theta^{(0)}$
+Require: $k$ workers
+Require: Data shards $\{\mathcal{D}_{1},\dots,\mathcal{D}_{k}\}$
+Require: Optimizers `InnerOpt` and `OuterOpt`
+1: for outer step $t=1\ldots T$ do
+2:     for worker $i=1\ldots k$ do
+3:         $\theta_{i}^{(t)}\leftarrow\theta^{(t-1)}$
+4:         for inner step $h=1\ldots H$ do
+5:             $x\sim\mathcal{D}_{i}$
+6:             $\mathcal{L}\leftarrow f(x,\theta_{i}^{(t)})$
+7:             $\triangleright$ Inner optimization:
+8:             $\theta_{i}^{(t)}\leftarrow\texttt{InnerOpt}(\theta_{i}^{(t)},\nabla_{\mathcal{L}})$
+9:         end for
+10:     end for
+11:     $\triangleright$ Averaging outer gradients:
+12:     $\Delta^{(t)}\leftarrow\frac{1}{k}\sum_{i=1}^{k}(\theta^{(t-1)}-\theta_{i}^{(t)})$
+13:     $\triangleright$ Outer optimization:
+14:     $\theta^{(t)}\leftarrow\texttt{OuterOpt}(\theta^{(t-1)},\Delta^{(t)})$
+15: end for
+3
+Experiments
+| Hyperparameter | 60M | 150M | 400M |
+|---|---|---|---|
+| Number of layers | 3 | 12 | 12 |
+| Hidden dim | 896 | 896 | 1536 |
+| Number of heads | 16 | 16 | 12 |
+| K/V size | 64 | 64 | 128 |
+| Vocab size | 32,000 | 32,000 | 32,000 |
+
+Table 1: Model Configuration for the three evaluated sizes. All are based on the transformer architecture, chinchilla-style (Hoffmann et al., 2022).
+Figure 2: Main result: After pretraining a 150M baseline for 24,000 training steps on C4, we compare networks finetuned for an additional 64,000 steps (teal using the same batch size, and purple using 8 times bigger batch size), and a transformer model trained from scratch (red). DiLoCo (blue) using 8 workers yields lower perplexity, even compared to the baseline using 8 times bigger batch size, while being 8 times faster in wall-clock time and communicating 500 times less.
+| Model | Communication | Time | Compute & Data | Perplexity |
+|---|---|---|---|---|
+| Baseline | 0 | $1\times$ | $1\times$ | 16.23 |
+| Baseline, $8\times$ batch size with data parallelism | $8\times N$ | $1\times$ | $8\times$ | 15.30 |
+| Baseline, $8\times$ batch size with microbatching | 0 | $8\times$ | $8\times$ | 15.30 |
+| Baseline, $8\times$ updates | 0 | $8\times$ | $8\times$ | 14.72 |
+| DiLoCo | $8\times N/H$ | $1\times$ | $8\times$ | 15.02 |
+
+Table 2: Trade-offs of various training algorithms: We compare four baselines vs DiLoCo across their communication cost, time spent, and compute & data used. For the same time and amount of compute, we can compare the second baseline and DiLoCo. The former communicates gradients at each time step ($N$ total steps), while DiLoCo communicates $H=500$ times less (and is amenable to distributed training) while also reaching better generalization performance. Note that $T=N/H$ (see Algorithm 1).
+In this section we report the main experiments validating DiLoCo. We consider a language modeling task on the C4 dataset, a dataset derived from Common Crawl (Raffel et al., 2020). We report perplexity on the validation set against number of steps used at training time, which is a good proxy for wall clock time since communication across workers is rather infrequent. The total number of steps is set to 88,000. We consider three model sizes, all decoder-only transformers adapted from the Chinchilla architecture (Hoffmann et al., 2022). Their respective configuration is described in Table 1. We perform experiments both in the i.i.d. and non-i.i.d. settings, meaning when the data distribution of the shards $\mathcal{D}_{i}$ is the same for all $i$ and when these are different like in heterogeneous federated learning. Since the latter is a more challenging use case, we use this setting by default except when indicated otherwise. Similarly, by default all training experiments start from a transformer language model pretrained for 24,000 steps on the same training set, refer to subsection 3.1 for further details.
+In our experiments we have searched over the hyper-parameters of the outer optimizer (e.g. learning rate, momentum, etc.). We use a sequence length of 1,024 tokens and a batch of size 512 but otherwise we left unchanged the inner optimization and model architecture. We list all the hyper-parameters in the appendix (Table 5).
+In Figure 2, we show the performance through time of DiLoCo (in blue with $k=8$ replicas in the non-i.i.d. data setting) when each worker performs $T=128$ times $H=500$ inner steps (64,000 steps in total). In this experiment, DiLoCo starts from a model $\theta^{(0)}$ pretrained for 24,000 steps.
+There are four baselines. The first baseline is a model trained from scratch for 88,000 steps (in red), the second starts from a model pretrained for 24,000 steps and performs an additional 64,000 steps (in teal). The third baseline starts from the same pre-trained model, but during finetuning uses an $8\times$ bigger batch size (in purple). The fourth baseline is running the standard batch size for $8\times$ the number of updates.
+We compare in Table 2 all baselines with respect to the communication cost, time spent training, and the amount of compute & data used. Increasing the batch size can be done in two manners: with data parallelism (second row) at the cost of increased communication, or with microbatching (third row) at the cost of longer training time. DiLoCo (last row) doesn’t increase training time, communicates $H=500\times$ less than the second baseline (and is thus amenable to distributed training across compute islands), while also reaching better generalization. Increasing by $8\times$ the number of updates improves perplexity over our method, but at the cost of being $8\times$ slower.
+3.1
+Ablations
+We perform extensive ablations of DiLoCo to better understand its capabilities and stress-test its limits.
+Figure 3: Impact of number of pretraining steps in a non-i.i.d. setting. DiLoCo can be initialized from a pretrained model $\theta^{(0)}$, or even from scratch with minimal (-0.1 PPL) degradation of model quality. The vertical dashed lines indicate the transition between pretraining and DiLoCo training.
+Number of Pretraining Steps
+For all experiments here we perform 88,000 training steps. A subset of those steps are done during the pretraining stage, and the remainder with DiLoCo. In Figure 3, we study the impact of the number of pretraining steps on the final generalization performance in a non-i.i.d. data regime. Specifically, we compare no pretraining (in teal), pretraining of 12k (in purple), 24k (in red), and 48k (in orange) steps.
+We highlight the pretrain’s ending and DiLoCo’s beginning with vertical dashed lines. Note that as we keep the total amount of steps (wall-clock time) fixed, few or no pretraining steps will result in more compute spent overall.
+In general, we observe that starting DiLoCo before 24k steps achieves a similar final PPL, demonstrating the robustness of the approach. Interestingly, performance is not degraded even when starting from a randomly initialized network. This result contradicts the findings of prior work on *post local-SGD* (Lin et al., 2020) and its large-scale study on a vision classification task (Ortiz et al., 2021).
+The attentive reader may also note spikes in perplexity after the vertical dashed lines: a warm-up of the inner learning rate is the culprit. Despite the transient spike, such warm up is ultimately beneficial, as previously noted also in the *continual pretraining* setting by Gupta et al. (2023).
+Figure 4: Varying the communication frequency every $H=\{50,100,250,500,1000,2000\}$ steps in a non-i.i.d setting.
+Communication frequency
+In order to scale up distributed training across a set of poorly connected machines, the frequency of communication needs to be reduced. Doing a single communication at the training’s end (Wortsman et al., 2022a) is sub-optimal. Most works instead consider communicating every $H\leq 20$ steps (Ortiz et al., 2021), which is too frequent for many distributed learning applications.
+In Figure 4, we vary the communication frequency for a 150M transformer, in the non-i.i.d. data regime, from $H=50$ steps (in teal) to $H=2000$ steps (in green). In general, we observe that communicating more frequently improves generalization performance.
+However, communicating more frequently than $H=500$ steps leads to diminishing returns. Moreover, the performance degradation is very mild up to $H=1000$ steps. For instance, when $H=1000$ the perplexity increases by only $2.9\%$ relative to $H=50$, despite communicating $20\times$ less. Based on these considerations, for all remaining experiments we choose $H=500$ as this strikes a good trade-off between generalization performance and communication cost.
+i.i.d. vs non-i.i.d. data regimes
+According to Gao et al. (2022), the distribution of the data across replicas can have a significant impact on generalization. In this ablation study we assess the effect that different data distributions have on the convergence of DiLoCo.
+Similarly to Gururangan et al. (2023), we create the non-i.i.d. setting by clustering with $k$-Means the entire training set using a pretrained model’s last layer features. The i.i.d. setting is a random partitioning of the data. We showcase in Figure 5 the performance of DiLoCo with $k=8$ workers/shards in a non-i.i.d. setting (in blue) and i.i.d setting (in red). Despite the latter converging faster early on in training, the final generalization performance of the two settings is comparable. Intuitively, we would expect the non-i.i.d. setting to yield worse performance because each worker might produce very different outer gradients, but DiLoCo exhibits very strong robustness. The reason why this might be happening is further investigated in the appendix (subsection 6.2).
+Figure 5: i.i.d. vs non-i.i.d. data regimes: DiLoCo converges faster in the i.i.d. setting but towards the end both data regimes attain similar generalization, highlighting the robustness of DiLoCo.
+Number of replicas
+We now investigate the impact of the number of replicas/clusters in Table 3, assuming there are as many workers as there are shards of data. The results in Table 3 show that increasing the number of replicas improves generalization performance, but with diminishing returns when there are more than 8 workers. This finding applies to both i.i.d. and non-i.i.d. settings. Unlike what is reported in prior work in the vision domain on ImageNet (Ortiz et al., 2021), we do not observe significant performance degradation by increasing the number of replicas.
+| Number of replicas | i.i.d | non-i.i.d |
+|---|---|---|
+| 1 | 16.23 | 16.23 |
+| 4 | 15.23 | 15.18 |
+| 8 | 15.08 | 15.02 |
+| 16 | 15.02 | 14.91 |
+| 64 | 14.95 | 14.96 |
+
+Table 3: Impact of the number of replicas/clusters on the evaluation perplexity for a fixed amount of inner steps per replica. With more replicas, the model consumes more data and uses more compute overall, although this requires very infrequent communication (once every 500 inner steps).
+| Model Size | Relative (%) | Absolute (PPL) |
+|---|---|---|
+| 60M | 4.33% | 1.01 |
+| 150M | 7.45% | 1.21 |
+| 400M | 7.49% | 1.01 |
+
+Table 4: Varying the model size: For each model size, we report the improvement of DiLoCo over the baseline (using a single worker). DiLoCo uses 8 workers and non-i.i.d. shards.
+Model size
+In Table 4 we vary the model size. We train models of size 60, 150 and 400 million parameters. We consider the usual setting where data distribution is non i.i.d. and all workers start from a model (of the same size) pretrained for 24,000 steps. Hyper-parameters were tuned on the 150M model, which may be sub-optimal for the other model sizes. We observe a monotonic improvement of performance as the model size increases. We surmise that (1) in an overtrained setting with large amount of steps, larger models are more efficient at fitting the same amount of data, and (2) as the linear connectivity literature (Ilharco et al., 2022) suggests, larger models are less subject to interference when averaging their parameters.
+Outer Optimizers
+We experimented with various outer optimizers (see L14 of Algorithm 1). For each, we tuned their momentum if any, and their outer learning rate. We found that using as outer optimizer SGD (equivalent to FedAvg (McMahan et al., 2017)) or Adam (eq. to FedOpt (Reddi et al., 2021)) performed poorly, as shown in Figure 6. Adam was particularly unstable with a high second order momentum norm. We alleviated the issue by increasing the $\epsilon$ factor to 0.1. We found Nesterov optimizer (Sutskever et al., 2013) (see FedMom in (Huo et al., 2020)) to perform the best. In particular, the setting with outer learning rate equal to 0.7 and outer momentum equal to 0.9 is very robust, and it is adopted for all our experiments throughout. We hypothesize that the Nesterov’s gradient correction is particularly helpful with the outer gradients that span hundreds of training steps.
+We also considered decaying the outer learning rate with a cosine scheduling but it resulted in similar performance.
+Since we decay the inner learning rate, the outer gradient norm gets naturally smaller over the course of training, removing the need to further decay the outer learning rate.
+Figure 6: Outer Optimizers: Comparison of different outer optimizers.
+(a) Number of replicas per training steps.
+(b) Perplexity across training steps.
+Figure 7: Adaptive compute: We vary the number of replicas (i.e., the amount of compute) across time. Models generalize equally well for the same total amount of compute, regardless of how this is made available over time.
+Adaptive compute pool
+The total amount of compute any given user has, is rarely constant over time. For instance, a preemptible machine, that is regularly killed, is a cheaper alternative to a dedicated server. Similarly, university’s computing clusters often use *karma* systems to balance compute among all users, but this means that resources available to each user vary over time. Finally, a collaborative system like Petals (Borzunov et al., 2022) or Diskin et al. (2021) where individual users provide their own devices to the shared compute pool is subject to extreme pool resizing depending on how many people participate at any given time.
+In this study, we then explore the performance of DiLoCo when the amount of compute varies throughout training. In our case, the amount of compute is varied by changing the number of replicas used in an i.i.d. setting. In Figure 7, we show the validation perplexity through time when using different schedules of compute allocation. *Constant local* (in green) and *Constant Distributed* (in blue) use a constant amount of replicas: respectively 1 (baseline) and 8 (standard DiLoCo setting). *Doubling Compute* (in teal) and *Halving Compute* (in purple) use respectively 4 and 8 replicas during the first half of the training, and then 8 and 4. *Ramping Up* (in red) and *Ramping Down* (in orange) ramps up (respectively ramps down) the compute from 1 to 8 (resp. from 8 to 1).
+We observe that the factor determining the ultimate generalization ability of the model is the total amount of compute given to DiLoCo, but this is robust to how the budget is spread over time. For instance, *Doubling Compute* and *Halving Compute* use as much compute in total and achieve similar performance. Similarly, *Ramping Up* and *Ramping Down* obtain similar performance despite the different budgeting schedule, and their generalization is worse than other baselines using more total compute. In conclusion, model quality is affected by the total amount of compute, but not as much by how such compute is allocated over time.
+Asynchronous Communication
+In DiLoCo all workers communicate their outer-gradients after $H$ inner optimization steps.
+In practice, it might happen that a worker gets rebooted or that the network might lose packets. In these cases, communication is not feasible.
+In Figure 8, we simulate such inability to communicate by randomly dropping outer gradients with probability equal to 0% (in teal), 10% (in purple), 30% (in red), to 50% (in orange). When an outer gradient is dropped, the local worker continues training for the following $H$ steps starting from its own parameters $\theta^{(t)}_{i}$ (as opposed to the shared parameters $\theta^{(t)}$).
+In both i.i.d. and non-i.i.d. data settings, a higher probability of being dropped results in more unstable learning with transient spikes in perplexity. However, even in the extreme non-i.i.d setting where each replica has 50% probability of dropping communication, the degradation of perplexity relative to perfect communication is only
+$2.1\%$
+. Consequently, with robustness to communication failure, the need for a synchronization barrier is less critical and thus training can be accelerated without having to wait for all replicas.
+(a)
+i.i.d.
+data regime.
+(b)
+non-i.i.d.
+data regime.
+Figure 8
+:
+Asynchronous communication
+: we drop communication of outer gradients of each replica with a certain probability. If a replica is dropped, it continues training without synchronizing its parameters.
+Accelerating a single worker
+Recent works have shown that linear connectivity can also
+accelerate
+the convergence of a non-distributed model. For instance, the Lookahead optimizer
+(Zhang et al.,
+2019
+)
+proposes to take an outer step of outer SGD using a single replica, which is equivalent to interpolating between the starting and end points of a phase in parameter space.
+Figure 9
+shows that DiLoCo applied to a
+single
+replica/cluster (
+$k=1$
+but
+$H \gg 1$
+) can improve both convergence speed and final generalization performance at null communication cost. Specifically, every
+$H=500$
+inner steps, we compute the only outer gradient as described in
+Algorithm 1
+, and then (locally) update the parameters using the outer optimizer. This experiment further corroborates the robustness and wide applicability of DiLoCo.
+Figure 9
+:
+Accelerating a single worker
+: DiLoCo applied to a single replica
+$k=1$
+provides both faster and better generalization.
+4
+Related Work
+In this section we review relevant work from the literature, limiting the discussion to only few representative works given the large body of literature.
+We cover the literature of distributed learning, specifically local SGD and federated learning. We also relate to recent works done on linear mode connectivity which inspired much of our work.
+4.1
+Local SGD and Federated Learning
+Several communities have proposed and studied local SGD. To the best of our knowledge, the first instantiation was in
+McMahan et al. (
+2017
+)
+who introduced the concept of federated learning and local SGD as a way to enable learning on a network of mobile devices which retain private access to their own data. In this work, the outer optimization consists of a mere parameter averaging step. This was later extended to more powerful outer optimizers by
+Wang et al. (
+2020
+); Reddi et al. (
+2021
+)
+; this work inspired our use of Nesterov momentum in the outer optimization.
+Lin et al. (
+2020
+)
+considered local SGD as a way to improve generalization when learning with large batch sizes.
+Stich (
+2019
+)
+instead focused on local SGD because of its ability to limit communication in distributed learning, a perspective we share also in our work. To the best of our knowledge, only FedMom
+(Huo et al.,
+2020
+)
+considers Nesterov as the outer optimizer as we did. While they also tackle a language modeling task, the setting is much smaller (1-layer LSTM), with only 2 replicas, and rather frequent communication (every 20 inner steps). In our work instead, we consider a larger setting with up to a 400M transformer language model, across up to 64 replicas, and up to
+$100\times$
+less communication. Furthermore, we use AdamW as inner optimizer while they used SGD.
+Ortiz et al. (
+2021
+)
+is one of the few works in federated learning / local SGD body of literature that has validated on a large-scale setting. They consider ImageNet
+(Deng et al.,
+2009
+)
+with Resnet50 and Resnet101
+(He et al.,
+2015
+)
+, and found that local SGD struggles at scale. In particular, they reported that fewer inner steps (e.g.,
+$H=8$
+), no pretraining, and a relatively large number of replicas (
+$k \geq 16$
+) degrade generalization. Thus the authors conclude that "
+local SGD encounters challenges at scale.
+". Instead, we show in
+section 3
+that DiLoCo can robustly operate while communicating
+$125\times$
+less (
+$H=1000$
+), even without pretraining, and using up to
+$4\times$
+more replicas (
+$k=64$
+) both in the i.i.d. and non-i.i.d. settings. Recently, multiple works
+(Presser,
+2020
+; Diskin et al.,
+2021
+; Ryabinin et al.,
+2021
+)
+also applied Local SGD for language models but without outer optimization.
+4.2
+Linear Mode Connectivity
+The field of linear mode connectivity studies how to linearly interpolate between several models in parameters space, to yield a single model with the best capabilities of all models combined
+(Frankle et al.,
+2020
+; Wortsman et al.,
+2021
+)
+. A surprising result from this field is the relative easiness to find a linear interpolation between several models where all intermediary points have a low loss, avoiding any
+loss barrier
+. Specifically,
+Wortsman et al. (
+2022c
+)
+started from a pretrained model, finetuned different replicas on various tasks or choice of hyperparameters
+(Wortsman et al.,
+2022b
+)
+, and then averaged the resulting parameters. Originally proposed in the vision domain, this method has then been used also in NLP
+(Li et al.,
+2022
+)
+, RLHF
+(Ramé et al.,
+2023a
+)
+, noisy data
+(Rebuffi et al.,
+2022
+)
+, and OOD
+(Ramé et al.,
+2023b
+)
+. Recently, several works studied other ways to alleviate loss barriers
+(Jordan et al.,
+2023
+; Stoica et al.,
+2023
+; Jin et al.,
+2023
+)
+. While we didn’t apply any of these methods to DiLoCo, they are complementary and could be used in future works.
+The majority of works on linear connectivity considers only averaging once all replicas have been fully finetuned, while we exploit the linear mode connectivity
+during
+training. There are however notable exceptions: BTM
+(Li et al.,
+2022
+)
+and PAPA
+(Jolicoeur-Martineau et al.,
+2023
+)
+are roughly equivalent to our framework but use as outer optimizer
+$\texttt{OuterOpt}=\texttt{SGD(lr=1.0)}$
+.
+The former communicates very little because each replica is fully finetuned on a task before synchronization. The latter communicates every few steps and with at most 10 replicas. Finally,
+Kaddour (
+2022
+)
+only considers a few previous checkpoints of the same model trained on a single task, and does not re-use it for training. Git-theta
+(Kandpal et al.,
+2023
+)
+argues that linear mode connectivity can facilitate collaboration by merging models trained by different teams
+(Diskin et al.,
+2021
+)
+on various tasks; we show that DiLoCo is actually capable of doing so
+during
+training, even when the data of each worker is different.
+5
+Limitations
+Our work has several limitations, which constitute avenues for future work. First, we only considered a single task, namely language modeling, and a single architecture, a transformer. Other datasets, domains (
+e.g.
+vision), and other architectures (
+e.g.
+, CNNs which are known to be more sensitive to linear mode connectivity
+(Jordan et al.,
+2023
+)
+) should also be considered.
+Second, we have presented results at the scale of 60 to 400 million parameters. However, at the time of writing state-of-the-art language models use 3 orders of magnitude more parameters. Therefore, it would be interesting to see how DiLoCo works at larger scale. Our initial extrapolations indicate that DiLoCo might perform even better at larger scales, because there is less interference during the outer-gradient averaging step. However, this hypothesis should be validated empirically.
+Third, the version of DiLoCo presented here assumes that all workers are homogeneous. However, in practice workers might operate at wildly different speeds. In these cases, waiting for all workers to perform the same number of steps is rather inefficient. Another avenue of future work is then to extend DiLoCo to the asynchronous setting, whereby workers update the global parameter without ever waiting for any other worker.
+Fourth, DiLoCo exhibits diminishing returns beyond 8 workers. Another avenue of future research is to improve the algorithm to better leverage any additional compute that might be available.
+Finally,
+DiLoCo attains fast convergence in terms of wall-clock time. However, the distributed nature of the computation reduces the FLOP and data efficiency of the model, as shown by the
+$8\times$
+updates row in Table
+2
+. At a high level, this is because the outer updates have effectively too large a batch size; but naively reducing the outer-update batch size would result in the workers being destabilized because their batch-size is too small.
+Therefore, another avenue of future research is on balancing wall-clock time efficiency with compute efficiency and data efficiency, among other quantities of interest. In particular, we believe
+asynchronous
+variants of local SGD may allow distributed training with relatively more data-efficient updates.
+6
+Conclusion
+In this work we study the problem of how to distribute training of large-scale transformer language models when not all devices are co-located, and the network between the various machines may have low bandwidth. To address this problem, we propose DiLoCo, a variant of Federated Averaging whereby the outer optimizer is replaced with Nesterov momentum, the inner optimizer is AdamW (the
+de facto
+standard optimizer for transformer language models), and the number of inner optimization steps is large (our default value is 500). The latter is crucial to reduce communication, and it means that workers only need to send data once every 500 steps. Practically speaking, while standard mini-batch methods relying on data and model parallelism require sending data every few hundred milliseconds, DiLoCo does so only every few minutes. Therefore, if each communication step takes a lot of time, DiLoCo converges much faster in terms of wall-clock time.
+Our empirical validation demonstrates the robustness of DiLoCo on several fronts, from the type of data distribution each worker consumes, to the number of inner optimization steps, and number of workers which can even change over time.
+In conclusion,
+DiLoCo is a robust and effective way to distribute training of transformer language models when there are several available machines but poorly connected
+. Of course, it remains to be seen whether these findings generalize to models of larger scale, or to other domains and architecture types.
+\nobibliography
+*
+References
+Borzunov et al. (2022)
+Alexander Borzunov, Dmitry Baranchuk, Tim Dettmers, Max Ryabinin, Younes Belkada, Artem Chumachenko, Pavel Samygin, and Colin Raffel.
+Petals: Collaborative inference and fine-tuning of large models.
+arXiv preprint library
+, 2022.
+Deng et al. (2009)
+Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei.
+Imagenet: A large-scale hierarchical image database.
+Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)
+, 2009.
+Diskin et al. (2021)
+Michael Diskin, Alexey Bukhtiyarov, Max Ryabinin, Lucile Saulnier, Quentin Lhoest, Anton Sinitsin, Dmitry Popov, Dmitry Pyrkin, Maxim Kashirin, Alexander Borzunov, Albert Villanova del Moral, Denis Mazur, Ilia Kobelev, Yacine Jernite, Thomas Wolf, and Gennady Pekhimenko.
+Distributed deep learning in open collaborations.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2021.
+Frankle et al. (2020)
+Jonathan Frankle, Gintare Karolina Dziugaite, Daniel M. Roy, and Michael Carbin.
+Linear mode connectivity and the lottery ticket hypothesis.
+International Conference on Machine Learning (ICML)
+, 2020.
+Gao et al. (2022)
+Dashan Gao, Xin Yao, and Qiang Yang.
+A survey on heterogeneous federated learning.
+arXiv preprint library
+, 2022.
+Gu et al. (2023)
+Xinran Gu, Kaifeng Lyu, Longbo Huang, and Sanjeev Arora.
+Why (and when) does local sgd generalize better than sgd?
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2023.
+Gupta et al. (2023)
+Kshitij Gupta, Benjamin Thérien, Adam Ibrahim, Mats L. Richter, Quentin Anthony, Eugene Belilovsky, Irina Rish, and Timothée Lesort.
+Continual pre-training of large language models: How to (re)warm your model?
+arXiv preprint library
+, 2023.
+Gururangan et al. (2023)
+Suchin Gururangan, Margaret Li, Mike Lewis, Weijia Shi, Tim Althoff, Noah A. Smith, and Luke Zettlemoyer.
+Scaling expert language models with unsupervised domain discovery.
+arXiv preprint library
+, 2023.
+He et al. (2015)
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.
+Deep residual learning for image recognition.
+Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)
+, 2015.
+Hoffmann et al. (2022)
+Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, Tom Hennigan, Eric Noland, Katie Millican, George van den Driessche, Bogdan Damoc, Aurelia Guy, Simon Osindero, Karen Simonyan, Erich Elsen, Jack W. Rae, Oriol Vinyals, and Laurent Sifre.
+Training compute-optimal large language models.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2022.
+Huo et al. (2020)
+Zhouyuan Huo, Qian Yang, Bin Gu, Lawrence Carin, and Heng Huang.
+Faster on-device training using new federated momentum algorithm.
+arXiv preprint library
+, 2020.
+Ilharco et al. (2022)
+Gabriel Ilharco, Mitchell Wortsman, Samir Yitzhak Gadre, Shuran Song, Hannaneh Hajishirzi, Simon Kornblith, Ali Farhadi, and Ludwig Schmidt.
+Patching open-vocabulary models by interpolating weights.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2022.
+Jin et al. (2023)
+Xisen Jin, Xiang Ren, Daniel Preotiuc-Pietro, and Pengxiang Cheng.
+Dataless knowledge fusion by merging weights of language models.
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2023.
+Jolicoeur-Martineau et al. (2023)
+Alexia Jolicoeur-Martineau, Emy Gervais, Kilian Fatras, Yan Zhang, and Simon Lacoste-Julien.
+Population parameter averaging (papa).
+arXiv preprint library
+, 2023.
+Jordan et al. (2023)
+Keller Jordan, Hanie Sedghi, Olga Saukh, Rahim Entezari, and Behnam Neyshabur.
+Repair: Renormalizing permuted activations for interpolation repair.
+arXiv preprint library
+, 2023.
+Kaddour (2022)
+Jean Kaddour.
+Stop wasting my time! saving days of imagenet and bert training with latest weight averaging.
+Advances in Neural Information Processing Systems (NeurIPS) Workshop
+, 2022.
+Kandpal et al. (2023)
+Nikhil Kandpal, Brian Lester, Mohammed Muqeeth, Anisha Mascarenhas, Monty Evans, Vishal Baskaran, Tenghao Huang, Haokun Liu, and Colin Raffel.
+Git-theta: A git extension for collaborative development of machine learning models.
+arXiv preprint library
+, 2023.
+Kingma and Ba (2014)
+Diederik P. Kingma and Jimmy Ba.
+Adam: A method for stochastic optimization.
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2014.
+Li et al. (2022)
+Margaret Li, Suchin Gururangan, Tim Dettmers, Mike Lewis, Tim Althoff, Noah A. Smith, and Luke Zettlemoyer.
+Branch-train-merge: Embarrassingly parallel training of expert language models.
+arXiv preprint library
+, 2022.
+Lin et al. (2020)
+Tao Lin, Sebastian U. Stich, Kumar Kshitij Patel, and Martin Jaggi.
+Don’t use large mini-batches, use local sgd.
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2020.
+Loshchilov and Hutter (2019)
+Ilya Loshchilov and Frank Hutter.
+Decoupled weight decay regularization.
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2019.
+McMahan et al. (2017)
+H. Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, and Blaise Agüera y Arcas.
+Communication-efficient learning of deep networks from decentralized data.
+International Conference on Artificial Intelligence and Statistics (AISTATS)
+, 2017.
+Ortiz et al. (2021)
+Jose Javier Gonzalez Ortiz, Jonathan Frankle, Mike Rabbat, Ari Morcos, and Nicolas Ballas.
+Trade-offs of local sgd at scale: An empirical study.
+arXiv preprint library
+, 2021.
+Presser (2020)
+Shawn Presser.
+Swarm training, 2020.
+URL
+https://battle.shawwn.com/swarm-training-v01a.pdf
+.
+Raffel et al. (2020)
+Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu.
+Exploring the limits of transfer learning with a unified text-to-text transformer.
+Journal of Machine Learning Research
+, 2020.
+Ramé et al. (2023a)
+Alexandre Ramé, Guillaume Couairon, Mustafa Shukor, Corentin Dancette, Jean-Baptiste Gaya, Laure Soulier, and Matthieu Cord.
+Rewarded soups: towards pareto-optimal alignment by interpolating weights fine-tuned on diverse rewards.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2023a.
+Ramé et al. (2023b)
+Alexandre Ramé, Matthieu Kirchmeyer, Thibaud Rahier, Alain Rakotomamonjy, Patrick Gallinari, and Matthieu Cord.
+Diverse weight averaging for out-of-distribution generalization.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2023b.
+Rebuffi et al. (2022)
+Sylvestre-Alvise Rebuffi, Francesco Croce, and Sven Gowal.
+Revisiting adapters with adversarial training.
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2022.
+Reddi et al. (2021)
+Sashank Reddi, Zachary Charles, Manzil Zaheer, Zachary Garrett, Keith Rush, Jakub Konečný, Sanjiv Kumar, and H. Brendan McMahan.
+Adaptive federated optimization.
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2021.
+Ryabinin et al. (2021)
+Max Ryabinin, Eduard Gorbunov, Vsevolod Plokhotnyuk, and Gennady Pekhimenko.
+Moshpit sgd: Communication-efficient decentralized training on heterogeneous unreliable devices.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2021.
+Stich (2019)
+Sebastian U. Stich.
+Local SGD converges fast and communicates little.
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2019.
+Stoica et al. (2023)
+George Stoica, Daniel Bolya, Jakob Bjorner, Taylor Hearn, and Judy Hoffman.
+Zipit! merging models from different tasks without training.
+arXiv preprint library
+, 2023.
+Sutskever et al. (2013)
+Ilya Sutskever, James Martens, George Dahl, and Geoffrey Hinton.
+On the importance of initialization and momentum in deep learning.
+International Conference on Machine Learning (ICML)
+, 2013.
+Tang et al. (2023)
+Zhenheng Tang, Shaohuai Shi, Wei Wang, Bo Li, and Xiaowen Chu.
+Communication-efficient distributed deep learning: A comprehensive survey.
+arXiv preprint library
+, 2023.
+Vaswani et al. (2017)
+Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin.
+Attention is all you need.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2017.
+Wang et al. (2020)
+Jianyu Wang, Vinayak Tantia, Nicolas Ballas, and Michael Rabbat.
+Slowmo: Improving communication-efficient distributed sgd with slow momentum.
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2020.
+Wortsman et al. (2021)
+Mitchell Wortsman, Maxwell Horton, Carlos Guestrin, Ali Farhadi, and Mohammad Rastegari.
+Learning neural network subspaces.
+International Conference on Machine Learning (ICML)
+, 2021.
+Wortsman et al. (2022a)
+Mitchell Wortsman, Suchin Gururangan, Shen Li, Ali Farhadi, Ludwig Schmidt, Michael Rabbat, and Ari S. Morcos.
+lo-fi: distributed fine-tuning without communication.
+arXiv preprint library
+, 2022a.
+Wortsman et al. (2022b)
+Mitchell Wortsman, Gabriel Ilharco, Samir Ya Gadre, Rebecca Roelofs, Raphael Gontijo-Lopes, Ari S Morcos, Hongseok Namkoong, Ali Farhadi, Yair Carmon, Simon Kornblith, and Ludwig Schmidt.
+Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time.
+International Conference on Machine Learning (ICML)
+, 2022b.
+Wortsman et al. (2022c)
+Mitchell Wortsman, Gabriel Ilharco, Jong Wook Kim, Mike Li, Simon Kornblith, Rebecca Roelofs, Raphael Gontijo-Lopes, Hannaneh Hajishirzi, Ali Farhadi, Hongseok Namkoong, and Ludwig Schmidt.
+Robust fine-tuning of zero-shot models.
+Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)
+, 2022c.
+Yadav et al. (2023)
+Prateek Yadav, Derek Tam, Leshem Choshen, Colin Raffel, and Mohit Bansal.
+Resolving interference when merging models.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2023.
+Zhang et al. (2019)
+Michael R. Zhang, James Lucas, Geoffrey Hinton, and Jimmy Ba.
+Lookahead optimizer: k steps forward, 1 step back.
+Advances in Neural Information Processing Systems (NeurIPS)
+, 2019.
+Acknowledgements
+We would like to thank Ross Hemsley, Bo Liu, Amal Rannen-Triki, and Jack Rae for their valuable feedback.
+Supplementary Materials
+6.1
+Implementation Details
+Hyperparameter
+Value
+Inner Learning rate
+$4e^{-4}$
+Number of warmup steps
+$1{,}000$
+Weight decay
+$0.1$
+Batch Size
+512
+Sequence length
+$1{,}024$
+Outer Optimizer
+SGD, SGDM,
+Nesterov
+, Adam
+Inner Optimizer
+AdamW
+Outer SGD learning rate
+1.0, 0.7,
+0.5
+, 0.3, 0.1
+Outer SGDM learning rate
+1.0, 0.7, 0.5,
+0.3
+, 0.1
+Outer SGDM momentum
+0.9
+Outer Nesterov learning rate
+1.0,
+0.7
+, 0.5, 0.3, 0.1
+Outer Nesterov momentum
+0.95,
+0.9
+, 0.8
+Outer Adam learning rate
+1.0, 0.7, 0.5,
+0.3
+, 0.1
+Outer Adam beta1
+0.9
+Outer Adam beta2
+0.999,
+0.95
+Outer Adam epsilon
+1.0,
+$\mathbf{10^{-1}}$, $10^{-3}$, $10^{-5}$, $10^{-7}$
+Communication frequency
+$H$
+50, 100, 250,
+500
+,
+$1{,}000$
+,
+$2{,}000$
+Number of pretraining steps
+0,
+$12{,}000$
+,
+$\mathbf{24{,}000}$
+,
+$48{,}000$
+Number of replicas
+4,
+8
+, 16, 64
+Data regimes
+i.i.d.,
+non-i.i.d
+Table 5
+:
+Optimization Hyperparameters
+evaluated in this work. Chosen values for main experiments are highlighted in bold.
+Hyperparameters
+We displayed in
+Table 1
+the architectural differences between the 60M, 150M, and 400M models we evaluated. In
+Table 5
+, we outline the optimization hyperparameters considered for this study, and highlight in bold the values chosen for the main experiments. We detail extensively the impact of each hyperparameter in
+subsection 3.1
+.
+Inner Optimizer States
+In all experiments, the inner optimizer,
+InnerOpt
+, is AdamW
+(Loshchilov and Hutter,
+2019
+)
+as standard practice when training transformer language models. Each replica in our method has a separate Adam state (
+e.g.
+first and second momentum). DiLoCo synchronizes the parameters of the model, but we also considered synchronizing the inner optimizer states. It did not lead to significant improvements while significantly increasing the communication cost (
+$\times 3$
+more data to transmit). Therefore, we let each model replica own their own version of optimizer states. Similar findings were found in the literature where SGDM or Nesterov momentum were used as inner optimizers
+(Wang et al.,
+2020
+; Ortiz et al.,
+2021
+)
+.
+Weighted Average of Outer Gradients
+In line 12 of
+Algorithm 1
+, we perform a uniform average of every model replica’s outer gradient. There are also other strategies, such as
+greedy soup
+(Wortsman et al.,
+2022b
+)
+where model replicas are selected sequentially to minimize validation loss,
+or
+disjoint merge
+(Yadav et al.,
+2023
+)
+which uses a sign-based heuristic. The first strategy is too time-costly in our setting. We tried the latter, but got slightly worse results. Thus, for the random i.i.d. data regime we use a uniform average. For the non-i.i.d. data regime, we rescale each outer gradient by the number of examples in its shard. While at
+$k=4$
+, all clusters are quite balanced, imbalance can be striking at
+$k=64$
+and giving more importance to larger clusters is beneficial.
+Infrastructure
+The empirical validation of this work was performed on machines hosting 16 A100 GPUs. These machines were not necessarily co-located in the same geographic region. The outer optimization step is performed on a CPU server connected to the local machines.
+% of pruned values
+Perplexity
+Relative change
+0%
+15.02
+0%
+25%
+15.01
+-0.06%
+50%
+15.08
++0.39%
+75%
+15.27
++1.66%
+Table 6
+:
+Pruning outer gradients
+using a per-neuron sign pruning
+(Yadav et al.,
+2023
+)
+.
+6.2
+Experiments & Ablations
+(a)
+i.i.d.
+data regime.
+(b)
+non-i.i.d.
+data regime.
+Figure 10
+:
+Cosine Similarity between Outer Gradients
+: The line is the average similarity among the
+$k=8$
+replicas’ outer gradients, the shaded area is the standard deviation. This is almost null in the case of i.i.d. shards.
+Pruning outer gradients
+Although DiLoCo communicates infrequently, when communication is required the network might get saturated, particularly when there are lots of workers, or when the model replicas are large. We thus explored pruning of outer gradients in order to reduce the need for high-bandwidth networks.
+We consider the simplest pruning technique, sign-based pruning following
+Yadav et al. (
+2023
+)
+. More efficient methods could be explored in the future
+(Tang et al.,
+2023
+)
+, particularly those leveraging structured sparsity. In
+Table 6
+, we prune between 25% to 75% of the individual outer gradients per replica before averaging them. Pruning up to 50% of the individual values resulted in negligible loss of performance (
+$+0.39\%$
+perplexity). Therefore, DiLoCo’s communication efficiency can be further improved using standard compression techniques.
+Figure 11
+:
+Outer Gradients similarity versus number of replicas
+: in a non-i.i.d. data regime increasing the number of replicas/clusters (
+$k=4\rightarrow 8$
+) produces more dissimilar outer gradients.
+Cosine Similarity of outer gradients
+Our empirical validation shows remarkable robustness of DiLoCo to the number of inner optimization steps and data distribution of the shards. Why does DiLoCo converge even when performing 500 inner steps? And why does using shards with different data distributions not harm performance at all?
+To shed light on these questions, we have gathered statistics of the outer gradients returned by workers.
+In particular, we calculate the average cosine similarity between outer gradients returned by workers while varying the number of inner optimization steps (
+$H=\{250,500,1000\}$
+) for both the i.i.d. (in
+10(a)
+) and non-i.i.d. (in
+10(b)
+) settings.
+The former regime has close to no variance compared to the latter, since all shards have the same data distribution and therefore outer-gradients are much more correlated. For both data regimes, perhaps unintuitively, similarity is inversely proportional to the communication frequency however. We surmise that when the number of inner steps is larger (up to some extent) model replicas converge towards a similar general direction
+(Gu et al.,
+2023
+)
+averaging out the noise of stochastic gradient descent.
+Interestingly, as the learning rate anneals to 0 towards the end of training, the outer gradients similarity increases in the i.i.d. case but in the non-i.i.d. case only the variance increases. Since shards have a different distribution, each local optimization seems to fall in a different nearby loss basin. However, the averaging of such more orthogonal gradients grants beneficial generalization as the non-i.i.d. version of DiLoCo tends to generalize at a better rate towards the end of training as can be seen in
+Figure 5
+.
+Lastly, in the non-i.i.d. setting we expect that the larger the number of shards the more distinctive their distribution, and therefore, the less correlated the corresponding outer gradients.
+Figure 11
+shows precisely this trend when going from
+$k=4$
+to
+$k=8$
+shards.
+We also found that in this setting the averaged outer gradient’s norm is inversely proportional to the square root of the number of replicas.
\ No newline at end of file
diff --git a/research/notes/evolving-deeper-llm-thinking.md b/research/notes/evolving-deeper-llm-thinking.md
new file mode 100644
index 0000000000000000000000000000000000000000..e110ab46b772c3de857ccb715f5b8c493685eaf2
--- /dev/null
+++ b/research/notes/evolving-deeper-llm-thinking.md
@@ -0,0 +1,3884 @@
+---
+title: Evolving Deeper LLM Thinking
+id: evolving-deeper-llm-thinking
+tags:
+- deepread
+created: '2026-06-10T00:25:08.642966Z'
+source: https://arxiv.org/html/2501.09891
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:25:08.642655Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Evolving Deeper LLM Thinking
+Evolving Deeper LLM Thinking
+Kuang-Huei Lee
+First author contribution
+Senior author contribution
+Google DeepMind
+Ian Fischer
+First author contribution
+Google DeepMind
+Yueh-Hua Wu
+Work done as a student researcher at Google DeepMind
+UC San Diego
+Dave Marwood
+Google DeepMind
+Shumeet Baluja
+Google DeepMind
+Dale Schuurmans
+Google DeepMind
+University of Alberta
+Xinyun Chen
+Google DeepMind
+Abstract
+We explore an evolutionary search strategy for scaling inference time compute
+in Large Language Models.
+The proposed approach, Mind Evolution, uses a language model to generate,
+recombine and refine candidate responses.
+The proposed approach avoids the need to formalize the underlying inference
+problem whenever a solution evaluator is available.
+Controlling for inference cost, we find that Mind Evolution significantly
+outperforms other inference strategies such as Best-of-N and Sequential Revision
+in natural language planning tasks.
+In the TravelPlanner and Natural Plan benchmarks, Mind Evolution solves more
+than 98% of the problem instances using Gemini 1.5 Pro
+without the use of a formal solver.
+1
+Introduction
+How can a large language model (LLM) be guided to
+think deeper
+about
+a complex problem and leverage inference time compute to improve its problem
+solving ability?
+Prior research has investigated various strategies for leveraging
+inference time compute,
+such as chain-of-thought
+[
+41
+,
+21
+]
+,
+self-consistency
+[
+39
+]
+,
+sequential revision based on feedback
+[
+36
+,
+30
+,
+8
+,
+19
+,
+1
+]
+,
+and search guided by auxiliary verifiers or evaluators
+[
+43
+]
+.
+When a solution evaluator is available,
+search strategies have
+an advantage of being able to reliably improve problem solving ability with increased compute.
+For example, methods
+such as Best-of-N
+[
+4
+,
+24
+,
+25
+]
+and tree search
+[
+37
+]
+naturally exploit additional compute
+to explore a larger set of solution candidates,
+thereby increasing the probability of finding a successful solution.
+Figure 1
+:
+Mind Evolution is a genetic-based evolutionary search strategy that operates
+in natural language space.
+The figure illustrates how Mind Evolution evolves a population of solution
+candidates toward higher quality candidates for a travel planning task.
+The candidate population is improved through an iterative process,
+where an LLM is used to recombine and refine candidates in each iteration.
+To better exploit inference time compute,
+we propose an evolutionary search strategy for LLMs that combines free-flowing
+stochastic exploration with large-scale iterative refinement.
+We refer to this approach as
+Mind Evolution
+.
+As illustrated in Figure 1, Mind Evolution is a genetic search
+strategy that evolves a diverse population of candidate solutions,
+leveraging an LLM to generate, recombine and refine solution
+candidates based on feedback from an evaluator.
+The overall process is analogous to combining divergent thinking
+(free-flowing parallel idea exploration) with convergent thinking
+(idea evaluation and selection),
+considered as hallmarks of intelligent problem solving
+behavior
+[
+14
+]
+.
+Unlike Best-of-N, which searches broadly by generating independent candidates
+for evaluation,
+Mind Evolution searches both broadly and deeply, exploring a diverse set of
+candidates and refining the most promising alternatives.
+Unlike sequential reasoning approaches,
+such as self-refinement or tree search
+[
+37
+,
+25
+]
+,
+which require evaluation of individual reasoning steps,
+Mind Evolution performs global refinement of complete solutions,
+and therefore only requires a global solution evaluator
+rather than a stepwise process reward.
+Also, typical of evolutionary methods, Mind Evolution can be easily parallelized.
+There has been prior work on combining evolutionary search with LLMs,
+primarily in the literature on evolutionary program generation
+[
+34
+,
+17
+,
+29
+,
+23
+,
+6
+]
+.
+However, this prior work focuses on searching through formal program spaces,
+using guidance from execution feedback or code explanation.
+By contrast, Mind Evolution is not restricted to searching in a formal space.
+This allows Mind Evolution to be applied to problems that
+are not formalized, or remain difficult to formalize,
+as long as a programmatic solution evaluator is available.
+In particular, we focus on natural language planning tasks
+where candidate solutions can still be automatically parsed, evaluated
+and critiqued using an implementable oracle evaluator.
+This approach exploits the observation that it is often easier to evaluate
+the quality of a candidate solution than it is to generate good solutions
+for a given problem
+[
+11
+]
+.
+In the domain of natural language planning,
+we consider the TravelPlanner
+[
+42
+]
+and
+Natural Plan
+[
+47
+]
+benchmarks,
+where constraint satisfaction problems are expressed in natural language
+without any explicit formalization of the underlying
+objectives, constraints or variables.
+These problems require a set of interconnected decisions
+that satisfy a set of global and local constraints.
+For example, in TravelPlanner, a travel plan should be produced
+that respects various accommodation and dining constraints,
+while also considering budget limitations and other preferences,
+all expressed solely in natural language.
+To date, LLMs have yet to achieve good performance on these tasks
+without the aid of formal solvers
+[
+16
+]
+.
+For example, Gemini 1.5 Flash and o1-preview only achieve a success rate of
+5.6% and 11.7% on TravelPlanner respectively,
+while for the Meeting Planning domain in Natural Plan,
+they respectively only achieve 20.8% and 44.2%.
+Even exploiting Best-of-N over 800 independently generated responses,
+Gemini 1.5 Flash still only achieves 55.6% success on
+TravelPlanner and 69.4% on Meeting Planning.
+In this paper, we show that exploration and refinement with evolutionary search
+can notably improve problem solving ability.
+In particular, when controlling for inference time compute,
+Mind Evolution allows Gemini 1.5 Flash to achieve a 95.6% success rate on
+TravelPlanner and 85.0% on Meeting Planning.
+We further experiment with a two-stage approach,
+where any unsolved problem instances are subsequently tackled by
+Mind Evolution with Gemini 1.5 Pro,
+which leads to 100% success on TravelPlanner and 98.4% on Meeting Planning.
+All of the experiments in this paper only use off-the-shelf LLMs
+without any finetuning.
+To our knowledge, the only prior work that achieves comparable performance on
+the TravelPlanner benchmark is
+[
+16
+]
+,
+which leverages an auxiliary formal solver and requires the LLM to
+first translate a given problem instance into an equivalent formalization.
+In general, it takes significant effort and expertise to correctly
+formalize a problem expressed in natural language;
+prompting an LLM to correctly perform such a translation
+requires at least as much domain expertise.
+Mind Evolution removes this constraint by directly optimizing solutions in the space of natural language.
+Finally, we introduce a new benchmark problem, StegPoet,
+that involves encoding a hidden message in a generated essay, story or poem.
+This form of steganography
+[
+33
+]
+is difficult to formalize and solve,
+yet a hidden message detector can still be implemented to programmatically
+guide the search.
+Our motivation is to demonstrate the applicability of
+search beyond natural language domains that can be easily formalized.
+We find that Mind Evolution allows Gemini 1.5 Pro to
+achieve a success rate of 87% in this task.
+2
+Related Work
+Pairing LLMs with Evolutionary Search
+In addition to the program generation studies discussed in Section 1, several recent works have explored combining LLMs and evolution for numerical optimization
+[
+26
+,
+3
+]
+and combinatorial optimization
+[
+28
+,
+44
+]
+.
+The problem spaces we tackle in this work, such as natural language planning, can also be viewed as combinatorial optimization problems – optimizing plans subject to constraints specified in natural language.
+In contrast to these previous studies, we focus on evolving solutions in natural language spaces instead of formal spaces.
+This removes the requirement of task formalization, which requires significant effort and expert knowledge for each task instance.
+Other works have also applied evolutionary search to prompt optimization, with the goal of improving performance on target tasks
+[
+45
+,
+10
+,
+15
+]
+.
+Among these, EvoAgent
+[
+45
+]
+also evaluated their approach on the TravelPlanner benchmark.
+In contrast to our work, which performs evolutionary search directly on plans, EvoAgent evolves new LLM agents to form a multi-agent system for problem solving. Their best success rate on the TravelPlanner validation set was 7.2% with GPT-4, while our approach achieved over 95% with Gemini 1.5 Flash.
+Pairing LLMs with Evaluators
+In this work, we evaluate solutions with program-based evaluators during the evolutionary search.
+The idea of integrating execution-based evaluators in the inference loop has been widely adopted in the literature of code generation, where the execution environment provides feedback for the LLM to fix bugs in the generated code
+[
+7
+,
+22
+,
+27
+,
+46
+,
+8
+,
+17
+,
+29
+,
+23
+,
+6
+,
+36
+]
+.
+Other prior work has also considered using learned verifiers, reward models, or self-evaluation for response refinement
+[
+20
+,
+30
+]
+, search
+[
+37
+,
+4
+,
+9
+,
+43
+,
+35
+]
+, and improving model learning
+[
+40
+,
+25
+,
+32
+,
+1
+]
+.
+These approaches can often be applied to wider domains and free-form solutions,
+but learned feedback models or self-evaluators can be noisy and are not perfectly reliable.
+We leave consideration of such approximate feedback mechanisms for future work.
+3
+Method
+Mind Evolution employs a genetic search strategy,
+combined with an LLM and a tailored set of prompts,
+to orchestrate an efficient search for solutions to
+natural language planning tasks.
+Before describing Mind Evolution in detail,
+we first provide a brief overview of language-based genetic algorithms.
+3.1
+Language-based Genetic Algorithm Overview
+Genetic algorithms
+[
+18
+,
+12
+,
+31
+]
+are a meta-heuristic inspired by natural selection.
+In a genetic algorithm, a population of candidate solutions is evolved
+toward populations that contain a greater proportion of higher quality
+individuals with respect to a target optimization objective.
+Such an objective is also often referred to as the “fitness” function.
+Each individual candidate has a genetic representation that can be
+mutated and recombined with others.
+Evolutionary search usually begins with a population of independently generated
+candidate solutions.
+In each generation, the fitness of every individual is evaluated
+with respect to the target objective.
+Candidates are then stochastically selected for reproduction
+based on their fitness (“selection”).
+In reproduction, the genetic representations of selected parents
+are combined (“crossover”) and potentially altered (“mutation”)
+to produce new child solutions.
+Such a process creates the next generation of children,
+which then enter the population.
+Population fitness generally increases over successive generations,
+as parents with greater fitness are more likely to be selected for
+recombination.
+Island Model
+To sustain diversity in an evolving population it is also helpful to introduce
+an island model
+[
+38
+,
+5
+]
+,
+where distinct sub-populations (“islands”) are created and evolved
+independently between “migration” and “island reset” events
+that occur at specified frequencies.
+For a migration operation, the solutions on one island are stochastically
+chosen based on fitness to migrate to an adjacent island.
+For an Island Reset operation, the populations on islands with low overall
+fitness are replaced by strong solutions from the global population,
+which also has a selection effect.
+The island model has been adopted in recent successful efforts,
+such as FunSearch
+[
+34
+]
+.
+Language-based Genetic Representation
+The individual candidates in a language-based genetic algorithm are
+represented by natural language.
+This allows the strong language understanding and generation capabilities of
+an LLM to be leveraged to implement powerful recombination
+(crossover and mutation) and island reset operations through prompting.
+3.2
+Mind Evolution
+
+| Parameter | Default Value | Description |
+| --- | --- | --- |
+| $N_{\text{gens}}$ | 10 | The maximum number of generations to search for a solution. |
+| $N_{\text{island}}$ | 4 | How many independent populations to evolve. |
+| $N_{\text{convs}}$ | 5 | How many conversations per island. |
+| $N_{\text{seq}}$ | 4 | How many turns per conversation. |
+| $N_{\text{reset interval}}$ | 3 | How frequently to reset islands in generations. |
+| $N_{\text{reset}}$ | 2 | How many islands to reset. Lowest mean score islands are chosen. |
+| $N_{\text{top}}$ | 5 | How many starting parents to transfer to islands when reset. |
+| $N_{\text{candidate}}$ | 15 | How many candidate parents to consider when resetting islands with the LLM. |
+| $N_{\text{parent}}$ | 5 | Maximum number of parents a conversation can have. |
+| $Pr_{\text{no parents}}$ | 1/6 | Probability of a conversation having no parents. |
+| $N_{\text{emigrate}}$ | 5 | How many plans to emigrate to the next island after each island. |
+| $N_{\text{retries}}$ | 5 | How many times to try to generate a plan before giving up at each turn. |
+
+Table 1:
+Definition of hyperparameters in Mind Evolution.
+Unless otherwise specified, the experiments in this work use the default values.
+The product of the first four hyperparameters gives the maximum number of candidate solutions generated (800 in the default setting).
+Figure 1 illustrates the design of Mind Evolution,
+with its hyperparameters listed in Table 1.
+The core components of Mind Evolution are:
+1.
+the specific choices for the selection and migration operations;
+2.
+the set of prompts that implement the initialization, recombination
+(crossover and mutation), and island reset operations with an LLM;
+3.
+the fitness function that evaluates the quality of a given solution
+and optionally provides feedback on issues detected.
+The overall evolution process is repeated until a valid solution is found,
+or until $N_{\text{gens}}$ generations have been completed,
+after which the best scoring candidate is returned.
+Fitness Evaluation
+As discussed in Section 1,
+we implement a fitness function for each problem domain,
+where candidate solutions are parsed and evaluated programmatically.
+In principle, any function that can evaluate solution quality can be used,
+including LLM evaluation.
+The evaluation function plays three key roles in Mind Evolution:
+(1) scoring solutions by measuring the optimization objective, if any;
+(2) verifying whether the solution satisfies given constraints;
+and (3) providing corresponding textual feedback.
+For example, the evaluation function for the Meeting Planning task scores a
+proposed plan and provides textual feedback based on how many constraints are
+violated
+(e.g. meetings conflict with existing schedules),
+how many valid meeting events are included in the schedule,
+and whether the plan follows the required format
+(see Section A.2 for more details).
+We have found that using textual feedback is important empirically,
+as shown in our ablation study in Section 4.4.
+Note that for many classical search problems (e.g., NP-complete problems),
+verifying solutions can be much easier than solving the problem
+[
+11
+]
+.
+Similarly, we observe that it is possible to write an evaluation
+function for the natural language planning tasks we consider.
+The ability to check the correctness of a candidate solution does not
+obviously lead to the ability to generate a valid solution in the tasks we consider.
+That is,
+implementing an evaluation function is not equivalent to solving the task.
+Population Initialization
+Given a target problem, we independently sample $N_{\text{convs}}$ initial
+solutions by prompting an LLM with a description of the problem,
+any information needed for solving the problem, and relevant instructions.
+If $N_{\text{seq}}>1$,
+each of these initial solutions is then evaluated and refined sequentially
+through $N_{\text{seq}}-1$ additional turns of the
+“Refinement through Critical Conversation” process explained below.
+In total, this initialization procedure generates
+$N_{\text{convs}}\times N_{\text{seq}}$ candidate solutions,
+which form the initial population on the first island for the first generation.
+Refinement through Critical Conversation (RCC)
+Given a candidate solution
+(or a set of candidate solutions for the process of recombination)
+we leverage an LLM to generate an improved solution
+by organizing a critical conversation between
+a “critic” character and an “author” character,
+as illustrated in Figure 2.
+Separating these two roles is intended to improve the critical thinking
+ability of an LLM.
+Each conversational turn is structured as a prompt-driven
+process, where solutions are refined based on critical feedback,
+similar to Reflexion [36].
+In particular,
+the critic first analyzes the candidate solution(s) provided as input,
+interprets the textual evaluation feedback,
+and suggests ways to correct any issues presented in the feedback.
+The author then proposes a single refined solution based on the input
+candidate(s), the subsequent evaluation(s), and the critic’s analyses.
+The specific prompts used to drive these conversations
+are given in Section A.1.
+An ablation study in Section 4.4 shows that the critic’s analysis
+step provides substantial performance improvements.
+Figure 2
+:
+Illustrating the Refinement through Critical Conversation (RCC) process,
+where an initial solution is first proposed, then evaluated and subjected to
+feedback from a critic, after which an author proposes a refined solution
+and the process iterates.
+Selection
+To produce the next generation of an island,
+we follow Boltzmann tournament selection [13],
+where $0$ to $N_{\text{parent}}$ parents are stochastically sampled from the
+population according to a probability distribution
+that is derived from a softmax transformation of their fitness scores.
+In this way, higher-performing solutions are more likely to be selected for
+reproduction, while other candidates can still be occasionally selected for
+diversity.
+Crossover and Mutation
+We implement the crossover and mutation operations as a single recombination step,
+where an LLM is instructed to improve a given set of parents
+using the RCC process described above (Figure 2).
+In particular, for recombination we sample $1$ to $N_{\text{parent}}$ parents
+and alter Step (b) in Figure 2
+to first incorporate the evaluation results of the parents, then apply the critic to all parents
+and propose the revised solution as an “initial solution” for the next generation.
+Then, if $N_{\text{seq}}>1$, we continue to follow Steps (c)(d)(e) to sequentially generate
+$N_{\text{seq}}-1$ child solutions by refining each previous child using the RCC process.
+For each generation on each island,
+$N_{\text{convs}}\times N_{\text{seq}}$ child solutions are added to the island population,
+with duplicate solutions removed.
+For selection, we follow a Boltzmann tournament instead of explicitly retiring
+candidate solutions, except when performing an Island Reset below.
+Migration between Islands
+Between migration events, each island population is evolved independently.
+During a migration,
+the top $N_{\text{emigrate}}$ solutions are cloned from the current Island $i$
+to the next Island $i+1$
+after completing the generation on the current island
+(we update the populations on the islands sequentially from $1$ to $N_{\text{island}}$).
+Migration is performed cyclically between the islands,
+so emigrants from Island $N_{\text{island}}$ arrive at Island $1$.
+We have found that this form of cyclic migration accelerates the overall
+evolution process.
+Island Reset
+Island reset happens every $N_{\text{reset interval}}$ generations.
+During an Island Reset event, the top performers are first selected from the
+global population, the populations on $N_{\text{reset}}$ islands with the
+lowest average scores are retired, and the selected top performers are cloned
+onto the reset islands.
+To select top performers, we explore two approaches:
+(1) directly select the top $N_{\text{top}}$ candidates according to fitness;
+and
+(2) first select the top $N_{\text{candidate}}$ candidates according to fitness,
+then prompt the LLM to select $N_{\text{top}}$ good candidates from this pool
+that are substantially different from each other.
+The ablation study in Section 4.4 shows that the latter strategy,
+using an LLM for Island Reset, achieves better performance.
+4
+Experiments
+Tasks
+We evaluate Mind Evolution on three benchmark
+natural language planning domains:
+two tasks from Natural Plan [47],
+including Trip Planning (Section 4.2)
+and Meeting Planning (Section 4.3),
+and the TravelPlanner [42] benchmark
+(Section 4.1).
+(We omit the Calendar Scheduling task from Natural Plan,
+since these problems can be solved by enumeration.)
+Implementation details for each task are provided in Appendix A,
+including the prompts (Section A.1)
+and evaluation functions used (Section A.2).
+Models
+We use Gemini 1.5 Flash (gemini-1.5-flash-001) as the default LLM
+in our experiments below.
+The hyperparameters used when applying Mind Evolution to Flash
+are specified in Table 1.
+In addition to evaluating Mind Evolution with the Flash model,
+we also investigate a two-stage approach,
+where the Gemini 1.5 Pro model (gemini-1.5-pro-exp-0827)
+is used to tackle problems that are not solved
+within the $N_{\text{gens}}$ generation limit.
+Such a two-stage approach provides better cost-efficiency than using the
+Pro model on every problem instance.
+When applying Mind Evolution to the Pro model we alter the hyperparameters
+from those specified in Table 1 to:
+$N_{\text{convs}}=8$, $N_{\text{seq}}=3$, $N_{\text{parent}}=10$, $Pr_{\text{no parents}}=1/5$.
+Baselines
+For each task, we compare Mind Evolution to three baseline search strategies
+that use the same solution evaluator and task-specific prompts:
+1.
+1-Pass
+,
+where a solution is proposed using a single forward pass of the LLM.
+2.
+Best-of-N
+[
+4
+]
+,
+where up to 800 candidate solutions are independently generated until a
+successful solution is found
+(the same upper bound as Mind Evolution).
+3.
+Sequential-Revision+
+,
+where 10 candidate solutions are proposed independently,
+then revised separately for 80 turns using the
+RCC process (
+Figure
+˜
+2
+).
+Note that 10 independent threads of 80-turn refinements are used instead of
+a single 800-turn refinement, because we rarely observe improvements after 80
+turns.
+This baseline is similar to running 10 trials of multi-turn
+Reflexion
+[
+36
+]
+.
+Additionally, for reference,
+we also include an additional 1-Pass baseline that uses OpenAI o1-preview.
+Metrics
+We measure Success Rate as the percentage of problem instances
+that are solved completely within a benchmark domain,
+separating the validation and test sets.
+(Note that the Success rate is referred to as Solve Rate in
+Natural Plan
+[
+47
+]
+and Final Pass Rate in TravelPlanner
+[
+42
+]
+.)
+To assess the cost of inference compute we report
+the number of LLM calls,
+the number of input and output tokens,
+and the total API cost of calling the LLM.
+(These costs are given in US Dollars, using prices from October 2024 when the
+experiments were conducted.
+The base rates are listed in
+Appendix
+˜
+D
+.)
+Note that assessing computational cost is particularly important when evaluating
+search strategies like Mind Evolution, since search is more expensive than
+generating a single solution.
+These statistics can help researchers and developers understand the cost-benefit trade-offs when using search to enhance LLM problem solving ability.
+
+| | Set | Success Rate | LLM Calls | Input Tokens | Output Tokens | API Cost (Oct 2024) |
+| --- | --- | --- | --- | --- | --- | --- |
+| **TravelPlanner [42]** | | | | | | |
+| 1-Pass | val | 10/180 = 5.6% | 1 | 0.009M | 0.001M | US$0.001 |
+| (o1-preview 1-Pass) | val | 21/180 = 11.7% | 1 | 0.008M | 0.008M | US$0.601 |
+| Best-of-N | val | 100/180 = 55.6% | 472 | 4.44M | 0.47M | US$0.47 |
+| Sequential-Revision+ | val | 149/180 = 82.8% | 280 | 35.53M | 0.29M | US$2.75 |
+| Mind Evolution | val | 172/180 = **95.6%** | 174 | 3.10M | 0.18M | US$0.29 |
+| (+pro) | val | 180/180 = **100%** | (257) | (3.25M) | (0.19M) | (US$0.54) |
+| Mind Evolution | test | 952/1000 = **95.2%** | 167 | 3.02M | 0.18M | US$0.28 |
+| (+pro) | test | 999/1000 = **99.9%** | (67) | (3.05M) | (0.18M) | (US$0.33) |
+| **Natural Plan [47] Trip Planning** | | | | | | |
+| 1-Pass | val | 66/320 = 20.6% | 1 | 0.002M | 0.001M | <US$0.001 |
+| (o1-preview 1-Pass) | val | 116/320 = 36.2% | 1 | 0.002M | 0.008M | US$0.53 |
+| Best-of-N | val | 247/320 = 77.2% | 274 | 0.61M | 0.18M | US$0.10 |
+| Sequential-Revision+ | val | 238/320 = 74.4% | 391 | 41.57M | 0.38M | US$3.23 |
+| Mind Evolution | val | 308/320 = **96.2%** | 168 | 1.48M | 0.19M | US$0.17 |
+| (+pro) | val | 320/320 = **100%** | (111) | (1.51M) | (0.19M) | (US$0.22) |
+| Mind Evolution | test | 1204/1280 = **94.1%** | 196 | 1.78M | 0.22M | US$0.20 |
+| (+pro) | test | 1275/1280 = **99.6%** | (211) | (1.86M) | (0.24M) | (US$0.37) |
+| **Natural Plan [47] Meeting Planning** | | | | | | |
+| 1-Pass | val | 104/500 = 20.8% | 1 | 0.007M | 0.001M | US$0.001 |
+| (o1-preview 1-Pass) | val | 221/500 = 44.2% | 1 | 0.006M | 0.006M | US$0.47 |
+| Best-of-N | val | 347/500 = 69.4% | 444 | 3.99M | 0.31M | US$0.39 |
+| Sequential-Revision+ | val | 310/500 = 62.0% | 484 | 32.16M | 0.40M | US$2.53 |
+| Mind Evolution | val | 425/500 = **85.0%** | 406 | 5.35M | 0.41M | US$0.52 |
+| (+pro) | val | 492/500 = **98.4%** | (890) | (13.36M) | (0.91M) | (US$2.55) |
+| Mind Evolution | test | 419/500 = **83.8%** | 394 | 5.24M | 0.40M | US$0.51 |
+| (+pro) | test | 491/500 = **98.2%** | (828) | (12.25M) | (0.83M) | (US$2.34) |
+
+Table 2:
+Experimental results on benchmark natural language planning tasks.
+“(+pro)” denotes the two-stage results, where we use Gemini 1.5 Pro to solve the problems that were not solved in experiments using Gemini 1.5 Flash.
+Number of LLM calls, token counts, and API cost are averaged across the validation or test problem set, and they are calculated only on the remaining problems for the “(+pro)” experiments.
+Here, we also show OpenAI o1-preview results as a reference.
+4.1
+TravelPlanner
+TravelPlanner
+[
+42
+]
+is a natural language planning
+benchmark that simulates the problem of organizing a trip plan for a user
+who expresses preferences and constraints.
+We focus on the sole-planning mode
+(see
+[
+42
+]
+for details),
+where each problem instance consists of a list of options regarding
+accommodation, restaurants, attractions and transportation,
+plus additional constraints that specify user preferences for budget, cuisine,
+etc.
+A plan is evaluated based on whether it satisfies the user preferences and
+commonsense constraints.
+Table 2 gives detailed results that compare
+the overall Success Rate and computational cost of Mind Evolution versus the baseline strategies.
+In terms of Success Rate, Mind Evolution clearly outperforms the baseline
+strategies, achieving over 95%.
+By comparison, Sequential-Revision+ provides a reasonable baseline, achieving
+almost 83%, while Best-of-N struggles, achieving only 55.6%.
+Overall,
+these results demonstrate a clear advantage of an evolutionary strategy that
+combines a broad search, through stochastic exploration, with a deep
+search that leverages an LLM for solution refinement.
+Considering the two-stage approach, where Mind Evolution uses
+Gemini 1.5 Pro for any unsolved problems,
+we find that nearly the entire dataset can be solved,
+achieving a 100% success rate on validation and
+99.9% on test problems respectively.
+The only work we are aware of that comes close to this success rate is
+[
+16
+]
+, which uses GPT-4 for auto-formalization then leverages
+a formal solver to achieve 98.9% and 97.0% on validation and test
+respectively.
+Mind Evolution achieves comparable results without requiring a formal solver.
+Finally, we note that the TravelPlanner dataset is organized into
+three levels of difficulty (Easy, Medium, Hard) and three trip durations
+(3 days, 5 days, 7 days), rendering 9 different problem classes.
+Figure 3 presents a breakdown of the success rates
+achieved across these different categories,
+showing that the success rates of 1-Pass and Best-of-N decline when planning
+for more travel days, but the trend is less clear for Mind Evolution and
+Sequential-Revision+, both of which iteratively refine proposed solutions.
+Figure 3
+:
+Success rate on the validation set of the TravelPlanner benchmark,
+organized by problem instance difficulty and the number of travel days.
+4.2
+Natural Plan – Trip Planning
+The Trip Planning task
+[
+47
+]
+involves finding an itinerary that consists of a sequence of cities to visit
+and number of days in each that satisfies flight connectivity
+and scheduling constraints –
+see
+Table
+˜
+3
+for a problem instance.
+We split the benchmark into 320 validation and 1,280 test instances
+(described in more detail in
+Appendix
+˜
+B
+).
+The results in
+Table
+˜
+2
+again show that Mind Evolution strongly outperforms the baselines on this task,
+achieving 96.2% on the validation and 94.1% on the test instances.
+Table
+˜
+2
+also shows a qualitative comparison between the
+results produced by Mind Evolution and the baseline strategies.
+Note that Best-of-N performs better in this scenario (77.2%), even beating
+Sequential-Revision+ (74.4%).
+We find that for the two-stage approach,
+Mind Evolution achieves 100% on the validation set and 99.6% on the test set.
+These findings again highlight the benefit of evolutionary search
+versus simple sampling and sequential refinement.
+Finally,
+we note that the difficulty of this task varies with the number of cities to
+visit, ranging from 3 to 10.
+Figure
+˜
+4
+shows a breakdown of the Success Rate in terms
+of number of cities, where the relative advantage of Mind Evolution appears to
+increase as the number of cities grows.
+Figure 4
+:
+Success rate on the validation set of the Trip Planning benchmark per number of cities to visit.
+Q: You plan to visit 5 European cities for 16 days in total. You only take direct flights to commute between cities. You want to spend 5 days in Madrid. From day 3 to day 7, there is a annual show you want to attend in Madrid. You plan to stay in Zurich for 3 days. You would like to visit Frankfurt for 3 days. You would like to visit Santorini for 6 days. You are going to attend a wedding in Santorini between day 7 and day 12. You want to spend 3 days in Riga.
+Here are the cities that have direct flights:
+Zurich and Riga, Frankfurt and Riga, Santorini and Zurich, Madrid and Zurich, Frankfurt and Zurich, Madrid and Santorini, Frankfurt and Madrid.
+Find a trip plan of visiting the cities for 16 days by taking direct flights to commute between them.
+Method
+Answer
+1-Pass
+Madrid
+(Day 1-7)
+Santorini (Day 7-12)
+Zurich (Day 12-14)
+Frankfurt (Day 14-16)
+Riga
+(Day 16-19)
+7 days for Madrid instead of 5; 4 days for Riga instead of 3; 19 days in total instead of 16.
+Best-of-N
+Madrid
+(Day 1-7)
+Santorini (Day 7-12)
+Zurich (Day 12-14)
+Frankfurt (Day 14-16)
+Riga
+(Day 16-16)
+7 days for Madrid instead of 5; 1 day for Riga instead of 3.
+Sequential-Revision+
+Zurich (Day 1-3)
+Frankfurt (Day 3-5)
+Riga (Day 5-7)
+Santorini (Day 7-12)
+Madrid
+(Day 12-16)
+omitted the show in Madrid (Day 3-7); no direct flight from Riga to Santorini.
+Mind Evolution (ours)
+Frankfurt (Day 1-3)
+Madrid (Day 3-7)
+Santorini (Day 7-12)
+Zurich (Day 12-14)
+Riga (Day 14-16)
+Table 3:
+An example problem instance from the Trip Planning task in Natural Plan,
+with the predicted plans from Mind Evolution and the baselines.
+1-Pass and Best-of-N both make mistakes on number of days to stay,
+but satisfy the requirements of being in Madrid and Santorini on specific days.
+The Sequential-Revision+ plan omits the annual show in Madrid and plans a non-existent
+flight, but is correct in the number of days.
+In contrast, the Mind Evolution plan satisfies all specified requirements.
+4.3
+Natural Plan – Meeting Planning
+For the Meeting Planning task a sequence of meetings should be scheduled
+to maximize the number of meetings between individuals subject to
+availability, location and travel time constraints
+[
+47
+]
+.
+This task differs from TravelPlanner and Trip Planning
+in that not every meeting can be scheduled for every problem instance,
+implying that it is not possible to know whether an optimal solution
+has been reached.
+Therefore,
+to obtain the results shown in
+Table
+˜
+2
+,
+we allow the searches to proceed until the upper bounds on iteration counts
+have been reached.
+For this task, we split the set of instances into 500 validation and 500 test
+instances (see
+Appendix
+˜
+B
+for details).
+The results shown in Table 2
+continue to demonstrate a significant performance advantage for Mind Evolution over
+baseline strategies, achieving an 85.0% Success Rate on the validation set
+and 83.8% on the test set.
+Notably, the two-stage approach using Gemini 1.5 Pro achieves
+success rates of 98.4% and 98.2% on validation and test respectively.
+Finally,
+Figure
+˜
+5
+shows the breakdown of success rates
+by the number of people to schedule meetings with.
+In this case, we find that Mind Evolution sustains a significant advantage in
+success rate as the number of people increases.
+Figure 5
+:
+Success rate on the validation set of the Meeting Planning benchmark per number of people to meet with.
+4.4
+Analysis and Ablation Studies
+To understand how Mind Evolution’s performance scales, and how the different
+components affect its behavior,
+we provide additional measurements and ablations to gain additional insight.
+Scaling
+Regarding scaling,
+Figure
+˜
+6
+reports the Success Rate achieved by Mind Evolution across
+the planning tasks as a function of the number of generations.
+These results clearly show steady improvement for
+Mind Evolution as the number of generations is increased.
+To compare the scaling of Mind Evolution to that of the baseline search methods,
+we also plot the Success Rate and average task evaluation scores
+as a function of the number of candidate solutions generated
+by each strategy
+(Figures
+7
+–
+9
+).
+The task evaluation scores are calculated by penalizing unsatisfied constraints
+and suboptimality of the objective value,
+hence the maximum score that can be achieved in any problem instance is zero
+(see
+Section
+˜
+A.2
+for details).
+In
+Appendix
+˜
+D
+, we provide another perspective on the
+cost-benefit trade-offs in terms of the specific API costs incurred.
+Figures
+7
+–
+9
+show the results for the TravelPlanner, Trip Planning and Meeting Planning
+tasks respectively.
+In each case,
+we see that the overall success rates and average task evaluation scores
+improve monotonically with an increasing number of proposed solutions
+across all search methods.
+These plots also show that Mind Evolution is consistently more effective than
+the baseline strategies with respect to the number of candidate solutions
+needed to achieve a specified level of success rate
+(or average task performance).
+We note that Best-of-N appears to be significantly underperforming on
+TravelPlanner.
+We hypothesize that this occurs because this task involves implicit
+commonsense constraints
+(e.g., a trip plan should return to the origin city,
+a restaurant cannot be visited twice, etc.),
+which are not given in the problem instance but instead learned from evaluation
+feedback, which Best-of-N does not leverage.
+Figure 6
+:
+Success rate on the validation set for each natural language planning benchmark at each generation of Mind Evolution.
+Figure 7
+:
+TravelPlanner success rates and evaluation scores as the number of candidate solutions is increased.
+Figure 8
+:
+Trip Planning success rates and evaluation scores as the number of candidate solutions is increased.
+Figure 9
+:
+Meeting Planning success rates and evaluation scores as the number of candidate solutions is increased.
+Ablations
+We also conducted a set of ablations to study the contribution of the
+different components of Mind Evolution.
+Table
+˜
+4
+shows that
+using the critic step in the RCC process
+(
+Figure
+˜
+2
+in
+Section
+˜
+3.2
+)
+and textual feedback from the evaluation functions
+are the most critical to performance,
+although the other components also make meaningful contributions to performance.
+Critic
+✓
+✓
+✓
+✓
+S/Q Prompts
+✓
+✓
+✓
+Textual Feedback
+✓
+✓
+Reset with LLM
+✓
+Success Rate
+46.1%
+71.1%
+76.1%
+91.1%
+95.6%
+Table 4
+:
+An ablation study of Mind Evolution components on the TravelPlanner
+validation set.
+Each column in the table shows an experiment where ✓ indicates
+whether a component is used.
+If “Critic” is disabled, we skip the critic step in
+Figure
+˜
+2
+and go straight to the author step.
+“S/Q Prompts” stands for Strategy/Question prompts,
+which are additional task-specific instructions in the critical thinking
+prompts (see
+Section
+˜
+A.1
+for details).
+If “Textual Feedback” is disabled, we do not include evaluation feedback
+in the prompts.
+If “Reset with LLM” is disabled, we directly select global elites by their
+evaluation scores in island reset events, rather than use an LLM to choose,
+as described in
+Section
+˜
+3.2
+.
+Succ. Rate
+w/ island model ($N_{\text{island}}=4$, $N_{\text{convs}}=5$)
+87.5%
+w/o island model ($N_{\text{island}}=1$, $N_{\text{convs}}=20$)
+77.4%
+$N_{\text{convs}}=10, N_{\text{gens}}=5$
+82.5%
+$N_{\text{convs}}=5, N_{\text{gens}}=10$ (default)
+87.5%
+$N_{\text{convs}}=4, N_{\text{gens}}=13$
+85.0%
+Table 5:
+Hyperparameter studies on the Trip Planning problem instances with 10 cities.
+The first two rows show the difference between enabling and disabling
+the island model.
+The bottom three rows illustrate a trade-off between the number of candidates
+per generation versus the number of generations.
+(Note that the bottom row ($N_{\text{convs}}=4, N_{\text{gens}}=13$) produces
+slightly more than 800 solutions (832).)
+Task
+Results (Cipher and Poem)
+⬇
+Message
+To
+Encode
+(
+M
+):
+10,20,30,40,
+50,60,70,80,
+90,100,10,20
+Style
+:
+Poem
+Inspiration
+:
+Shel
+Silverstein
+Words
+Between
+(
+B
+):
+4
+Topic
+:
+"
+The
+Fun
+of
+Walking
+"
+⬇
+10
+:
+rooster
+;
+20
+:
+flowers
+;
+30
+:
+bright
+;
+40
+:
+flames
+;
+50
+:
+cherry
+;
+60
+:
+crimson
+;
+70
+:
+sunset
+;
+80
+:
+ruby
+;
+90
+:
+scarlet
+;
+100
+:
+burning
+;
+⬇
+I
+like
+to
+walk
+,
+I
+like
+to
+stride
+,
+With
+ROOSTER
+crows
+and
+FLOWERS
+by
+my
+side
+.
+I
+like
+to
+walk
+,
+I
+like
+to
+roam
+,
+Past
+BRIGHT
+green
+fields
+and
+FLAMES
+at
+home
+.
+I
+like
+to
+walk
+,
+I
+like
+to
+stroll
+,
+To
+see
+the
+CHERRY
+trees
+and
+a
+CRIMSON
+hole
+.
+I
+like
+to
+walk
+,
+I
+like
+to
+ramble
+,
+To
+watch
+the
+SUNSET
+and
+hold
+a
+RUBY
+’
+s
+gamble
+.
+I
+like
+to
+walk
+,
+I
+like
+to
+wander
+,
+To
+watch
+a
+SCARLET
+leaf
+,
+and
+a
+fire
+,
+BURNING
+,
+under
+.
+I
+like
+to
+walk
+,
+I
+like
+to
+stride
+,
+With
+ROOSTER
+crows
+and
+FLOWERS
+by
+my
+side
+.
+Figure 10
+:
+StegPoet example.
+Example of the encoding of a StegPoet problem instance (left)
+and a correct solution (right) that includes the number-to-word cipher and
+a poem in the style of a children’s poetry author.
+Note that $|M|=12$ in this instance.
+We added capitalization to the code words to highlight them.
+| Method | Set | Success Rate | Input Tokens | Output Tokens | API Cost (Oct 2024) |
+|---|---|---|---|---|---|
+| 1-Pass | val | 0/101 = 0.0% | 0.002M | <0.001M | <$0.001 |
+| Best-of-N | val | 1/101 = 1.0% | 1.56M | 0.25M | $0.19 |
+| Sequential-Revision+ | val | 20/101 = 19.8% | 41.69M | 0.24M | $3.20 |
+| Mind Evolution | val | 47/101 = **46.5%** | 3.56M | 0.20M | $0.33 |
+| (+pro) | val | 88/101 = **87.1%** | 3.74M | 0.22M | $0.65 |
+| Mind Evolution | test | 106/245 = **43.3%** | 3.63M | 0.22M | $0.34 |
+| (+pro) | test | 194/245 = **79.2%** | 3.84M | 0.24M | $0.72 |
+Table 6
+:
+Experimental results on StegPoet.
+Price and token counts are averages per problem.
+All results use Gemini 1.5 Flash, except
+(+pro)
+, which solves the problems that were not solved in the Flash runs, using Gemini 1.5 Pro.
+Figure 11
+:
+Histogram of Success Rate for each difficulty level.
+1-Pass returns valid responses, but fails to solve any of the problems,
+so it is not visible in the histogram.
+To assess hyperparameter sensitivity, we investigated the Trip Planning task
+in greater detail, choosing the harder setting with 10 cities
+to better reveal differences in performance.
+(Similar results are also observed on the harder problem instances
+from the other benchmark tasks.)
+In
+Table
+˜
+5
+,
+the top two rows compare the effect of including or excluding the island
+model from the evolutionary search, controlling for the same number
+(800) of candidate solutions.
+These results show that the island model significantly improves the performance
+of Mind Evolution.
+The bottom three rows compare
+the effect of increasing the number of candidate solutions per generation
+versus having more generations
+while controlling for a similar number of candidates considered overall.
+In this case, it appears that deeper evolutionary search indeed has benefits,
+although it is also important to continue exploring broadly in each generation.
+5
+A Challenging New Task: StegPoet
+We introduce a challenging new task, StegPoet,
+where a hidden message should be steganographically encoded
+[
+33
+]
+into a piece of creative writing.
+Even though the problem is difficult to formalize,
+it remains amenable to programmatic verification,
+which makes it addressable by the methods considered in this paper.
+In this task, a hidden message ($M$) expressed by a sequence of numbers
+should be encoded in a piece of creative text about a particular topic,
+expressed in the form of an essay, story or poem.
+The goal is to both provide a number-to-word substitution cipher
+and a generated text that uses the cipher to encode the message.
+Figure
+˜
+10
+gives an example.
+We impose an additional constraint that there must be, on average,
+$B$ words between successive cipher words in the generated text,
+which ensures
+that simply listing the cipher words as the text portion does not qualify as
+a solution when $B>0$.
+The difficulty of this problem varies along four axes:
+1.
+Difficulty increases with the length of the hidden message, $M$.
+We set $10\leq|M|\leq 30$.
+2.
+The repetition of the numbers in $M$.
+The more repetition, the more stringent the constraints.
+3.
+The “closeness” of the repeated numbers to each other.
+Each form of writing dictates how much repetition of the same word and
+proximity of occurrence is acceptable.
+The LLM must balance adherence to the form with the need to correctly
+encode the message.
+4.
+Empirically, as $B$ (the mean distance between cipher words) grows, the problem becomes more difficult.
+In our tests, $3\leq B\leq 7$.
+We divide the problem instances into
+a validation split of 101 instances and a test split of 245 instances.
+See
+Appendix
+˜
+F
+for additional details about the StegPoet evaluation.
+Detailed performance results for Mind Evolution and the baseline strategies
+are given in
+Table
+˜
+6
+,
+while
+Figure
+˜
+11
+shows performance per difficulty level.
+Here the two-stage Mind Evolution (+pro) achieves 87.1% on validation and 79.2% on test.
+Best-of-N only manages to solve 1% of the validation tasks.
+6
+Conclusion
+We have presented Mind Evolution, an evolutionary search approach for solving
+challenging natural language planning problems,
+by scaling inference-time compute for stochastic exploration and iterative
+refinement.
+An evaluation on the TravelPlanner and Natural Plan natural language planning
+benchmarks, as well as a new benchmark StegPoet introduced in this paper,
+demonstrates that Mind Evolution significantly outperforms Best-of-N
+and sequential revision.
+To our knowledge, this is the first approach that is able to achieve such a
+level of success on these tasks without explicitly leveraging a formal solver.
+Limitations
+The main limitation of the current work is the focus on natural language
+planning problems where proposed solutions can be programmatically
+evaluated and critiqued.
+In future work, we aim to extend beyond this limitation by developing
+LLM-based evaluators that would enable broader applications.
+Acknowledgement
+The authors thank Sergio Guadarrama and Doina Precup for supporting this work.
+We also thank Sirui Xie, John Canny, and the Google DeepMind FunSearch team for valuable discussion.
+References
+Bai et al. [2022]
+Y. Bai, S. Kadavath, S. Kundu, A. Askell, J. Kernion, A. Jones, A. Chen,
+A. Goldie, A. Mirhoseini, C. McKinnon, et al.
+Constitutional AI: Harmlessness from AI feedback.
+arXiv preprint arXiv:2212.08073
+, 2022.
+Berger et al. [2021]
+B. Berger, M. S. Waterman, and Y. W. Yu.
+Levenshtein distance, sequence comparison and biological database
+search.
+IEEE Transactions on Information Theory
+, 67(6):3287–3294, 2021.
+10.1109/TIT.2020.2996543
+.
+Brahmachary et al. [2024]
+S. Brahmachary, S. M. Joshi, A. Panda, K. Koneripalli, A. K. Sagotra, H. Patel,
+A. Sharma, A. D. Jagtap, and K. Kalyanaraman.
+Large language model-based evolutionary optimizer: Reasoning with
+elitism.
+arXiv preprint arXiv:2403.02054
+, 2024.
+Brown et al. [2024]
+B. Brown, J. Juravsky, R. Ehrlich, R. Clark, Q. V. Le, C. Ré, and
+A. Mirhoseini.
+Large language monkeys: Scaling inference compute with repeated
+sampling.
+arXiv preprint arXiv:2407.21787
+, 2024.
+Cantú-Paz et al. [1998]
+E. Cantú-Paz et al.
+A survey of parallel genetic algorithms.
+Calculateurs paralleles, reseaux et systems repartis
+,
+10(2):141–171, 1998.
+Chen et al. [2023a]
+A. Chen, D. M. Dohan, and D. R. So.
+EvoPrompting: Language models for code-level neural architecture
+search.
+In
+Proceedings of the 37th International Conference on Neural
+Information Processing Systems
+, pages 7787–7817, 2023a.
+Chen et al. [2023b]
+B. Chen, F. Zhang, A. Nguyen, D. Zan, Z. Lin, J.-G. Lou, and W. Chen.
+CodeT: Code generation with generated tests.
+In
+The Eleventh International Conference on Learning
+Representations
+, 2023b.
+URL
+https://openreview.net/forum?id=ktrw68Cmu9c
+.
+Chen et al. [2024]
+X. Chen, M. Lin, N. Schärli, and D. Zhou.
+Teaching large language models to self-debug.
+In
+The Twelfth International Conference on Learning
+Representations
+, 2024.
+URL
+https://openreview.net/forum?id=KuPixIqPiq
+.
+Cobbe et al. [2021]
+K. Cobbe, V. Kosaraju, M. Bavarian, M. Chen, H. Jun, L. Kaiser, M. Plappert,
+J. Tworek, J. Hilton, R. Nakano, et al.
+Training verifiers to solve math word problems.
+arXiv preprint arXiv:2110.14168
+, 2021.
+Fernando et al. [2023]
+C. Fernando, D. Banarse, H. Michalewski, S. Osindero, and T. Rocktäschel.
+Promptbreeder: Self-referential self-improvement via prompt
+evolution.
+arXiv preprint arXiv:2309.16797
+, 2023.
+Garey and Johnson [1979]
+M. R. Garey and D. S. Johnson.
+Computers and Intractability: A Guide to the Theory of NP
+Completeness
+.
+W. H. Freeman & Co., 1979.
+Goldberg [1989]
+D. E. Goldberg.
+Genetic Algorithms in Search, Optimization, and Machine
+Learning
+.
+Addison Wesley, 1989.
+Goldberg [1990]
+D. E. Goldberg.
+A note on Boltzmann tournament selection for genetic algorithms and
+population-oriented simulated annealing.
+Complex Systems
+, 4:445–460, 1990.
+Guilford [1967]
+J. P. Guilford.
+The Nature of Human Intelligence
+.
+1967.
+Guo et al. [2023]
+Q. Guo, R. Wang, J. Guo, B. Li, K. Song, X. Tan, G. Liu, J. Bian, and Y. Yang.
+Connecting large language models with evolutionary algorithms yields
+powerful prompt optimizers.
+arXiv preprint arXiv:2309.08532
+, 2023.
+Hao et al. [2024]
+Y. Hao, Y. Chen, Y. Zhang, and C. Fan.
+Large language models can plan your travels rigorously with formal
+verification tools.
+arXiv preprint arXiv:2404.11891
+, 2024.
+Hemberg et al. [2024]
+E. Hemberg, S. Moskal, and U.-M. O’Reilly.
+Evolving code with a large language model.
+Genetic Programming and Evolvable Machines
+, 25(2):21, 2024.
+Holland [1975]
+J. H. Holland.
+Adaptation in Natural and Artificial Systems
+.
+University of Michigan Press, Ann Arbor, MI, 1975.
+second edition, 1992.
+Kim et al. [2023]
+G. Kim, P. Baldi, and S. McAleer.
+Language models can solve computer tasks.
+arXiv preprint arXiv:2303.17491
+, 2023.
+Kirchner et al. [2024]
+J. H. Kirchner, Y. Chen, H. Edwards, J. Leike, N. McAleese, and Y. Burda.
+Prover-verifier games improve legibility of LLM outputs.
+arXiv preprint arXiv:2407.13692
+, 2024.
+Kojima et al. [2022]
+T. Kojima, S. S. Gu, M. Reid, Y. Matsuo, and Y. Iwasawa.
+Large language models are zero-shot reasoners.
+Advances in Neural Information Processing Systems
+,
+35:22199–22213, 2022.
+Le et al. [2022]
+H. Le, Y. Wang, A. D. Gotmare, S. Savarese, and S. C. H. Hoi.
+CodeRL: Mastering code generation through pretrained models and
+deep reinforcement learning.
+Advances in Neural Information Processing Systems
+,
+35:21314–21328, 2022.
+Lehman et al. [2023]
+J. Lehman, J. Gordon, S. Jain, K. Ndousse, C. Yeh, and K. O. Stanley.
+Evolution through large models.
+In
+Handbook of Evolutionary Machine Learning
+, pages 331–366.
+Springer, 2023.
+Liang et al. [2024]
+Z. Liang, Y. Liu, T. Niu, X. Zhang, Y. Zhou, and S. Yavuz.
+Improving LLM reasoning through scaling inference computation with
+collaborative verification.
+arXiv preprint arXiv:2410.05318
+, 2024.
+Lightman et al. [2023]
+H. Lightman, V. Kosaraju, Y. Burda, H. Edwards, B. Baker, T. Lee, J. Leike,
+J. Schulman, I. Sutskever, and K. Cobbe.
+Let’s verify step by step.
+arXiv preprint arXiv:2305.20050
+, 2023.
+Liu et al. [2023a]
+F. Liu, X. Lin, Z. Wang, S. Yao, X. Tong, M. Yuan, and Q. Zhang.
+Large language model for multi-objective evolutionary optimization.
+arXiv preprint arXiv:2310.12541
+, 2023a.
+Liu et al. [2023b]
+J. Liu, Y. Zhu, K. Xiao, Q. FU, X. Han, Y. Wei, and D. Ye.
+RLTF: Reinforcement learning from unit test feedback.
+Transactions on Machine Learning Research
+, 2023b.
+ISSN 2835-8856.
+URL
+https://openreview.net/forum?id=hjYmsV6nXZ
+.
+Liu et al. [2024]
+S. Liu, C. Chen, X. Qu, K. Tang, and Y.-S. Ong.
+Large language models as evolutionary optimizers.
+In
+2024 IEEE Congress on Evolutionary Computation (CEC)
+, pages
+1–8. IEEE, 2024.
+Liventsev et al. [2023]
+V. Liventsev, A. Grishina, A. Härmä, and L. Moonen.
+Fully autonomous programming with large language models.
+In
+Proceedings of the Genetic and Evolutionary Computation
+Conference
+, pages 1146–1155, 2023.
+Madaan et al. [2024]
+A. Madaan, N. Tandon, P. Gupta, S. Hallinan, L. Gao, S. Wiegreffe, U. Alon,
+N. Dziri, S. Prabhumoye, Y. Yang, et al.
+Self-refine: Iterative refinement with self-feedback.
+Advances in Neural Information Processing Systems
+, 36, 2024.
+Mitchell [1998]
+M. Mitchell.
+An Introduction to Genetic Algorithms
+.
+MIT press, 1998.
+Park et al. [2023]
+J. S. Park, J. O’Brien, C. J. Cai, M. R. Morris, P. Liang, and M. S. Bernstein.
+Generative agents: Interactive simulacra of human behavior.
+In
+Proceedings of the 36th Annual ACM Symposium on User
+Interface Software and Technology
+, pages 1–22, 2023.
+Provos and Honeyman [2003]
+N. Provos and P. Honeyman.
+Hide and seek: An introduction to steganography.
+IEEE security & privacy
+, 1(3):32–44,
+2003.
+Romera-Paredes et al. [2024]
+B. Romera-Paredes, M. Barekatain, A. Novikov, M. Balog, M. P. Kumar, E. Dupont,
+F. J. Ruiz, J. S. Ellenberg, P. Wang, O. Fawzi, et al.
+Mathematical discoveries from program search with large language
+models.
+Nature
+, 625(7995):468–475, 2024.
+Setlur et al. [2024]
+A. Setlur, C. Nagpal, A. Fisch, X. Geng, J. Eisenstein, R. Agarwal, A. Agarwal,
+J. Berant, and A. Kumar.
+Rewarding progress: Scaling automated process verifiers for LLM
+reasoning.
+arXiv preprint arXiv:2410.08146
+, 2024.
+Shinn et al. [2024]
+N. Shinn, F. Cassano, A. Gopinath, K. Narasimhan, and S. Yao.
+Reflexion: Language agents with verbal reinforcement learning.
+Advances in Neural Information Processing Systems
+, 36, 2024.
+Snell et al. [2024]
+C. Snell, J. Lee, K. Xu, and A. Kumar.
+Scaling LLM test-time compute optimally can be more effective than
+scaling model parameters.
+arXiv preprint arXiv:2408.03314
+, 2024.
+Tanese [1989]
+R. Tanese.
+Distributed genetic algorithms for function optimization
+.
+University of Michigan, 1989.
+Wang et al. [2023]
+X. Wang, J. Wei, D. Schuurmans, Q. V. Le, E. H. Chi, S. Narang, A. Chowdhery,
+and D. Zhou.
+Self-consistency improves chain of thought reasoning in language
+models.
+In
+The Eleventh International Conference on Learning
+Representations
+, 2023.
+URL
+https://openreview.net/forum?id=1PL1NIMMrw
+.
+Wang et al. [2024]
+Z. Wang, Y. Li, Y. Wu, L. Luo, L. Hou, H. Yu, and J. Shang.
+Multi-step problem solving through a verifier: An empirical analysis
+on model-induced process supervision.
+arXiv preprint arXiv:2402.02658
+, 2024.
+Wei et al. [2022]
+J. Wei, X. Wang, D. Schuurmans, M. Bosma, F. Xia, E. Chi, Q. V. Le, D. Zhou,
+et al.
+Chain-of-thought prompting elicits reasoning in large language
+models.
+Advances in Neural Information Processing Systems
+,
+35:24824–24837, 2022.
+Xie et al. [2024]
+J. Xie, K. Zhang, J. Chen, T. Zhu, R. Lou, Y. Tian, Y. Xiao, and Y. Su.
+Travelplanner: A benchmark for real-world planning with language
+agents.
+arXiv preprint arXiv:2402.01622
+, 2024.
+Yao et al. [2023]
+S. Yao, D. Yu, J. Zhao, I. Shafran, T. L. Griffiths, Y. Cao, and K. Narasimhan.
+Tree of thoughts: Deliberate problem solving with large language
+models.
+In
+Proceedings of the 37th International Conference on Neural
+Information Processing Systems
+, pages 11809–11822, 2023.
+Ye et al. [2024]
+H. Ye, J. Wang, Z. Cao, F. Berto, C. Hua, H. Kim, J. Park, and G. Song.
+ReEvo: Large language models as hyper-heuristics with reflective
+evolution.
+arXiv preprint arXiv:2402.01145
+, 2024.
+Yuan et al. [2024]
+S. Yuan, K. Song, J. Chen, X. Tan, D. Li, and D. Yang.
+EvoAgent: Towards automatic multi-agent generation via evolutionary
+algorithms.
+arXiv preprint arXiv:2406.14228
+, 2024.
+Zhang et al. [2023]
+K. Zhang, D. Wang, J. Xia, W. Y. Wang, and L. Li.
+ALGO: Synthesizing algorithmic programs with LLM-generated oracle
+verifiers.
+In
+Proceedings of the 37th International Conference on Neural
+Information Processing Systems
+, pages 54769–54784, 2023.
+Zheng et al. [2024]
+H. S. Zheng, S. Mishra, H. Zhang, X. Chen, M. Chen, A. Nova, L. Hou, H.-T.
+Cheng, Q. V. Le, E. H. Chi, et al.
+NATURAL PLAN: Benchmarking LLMs on natural language planning.
+arXiv preprint arXiv:2406.04520
+, 2024.
+Appendix A
+Implementation Details
+Here we describe the implementation details of Mind Evolution.
+The code will be made available.
+A.1
+Prompt Design
+We first use Meeting Planning as an example to illustrate the structure of the prompts used.
+The prompts, as well as the model responses when parent solutions are given, are shown in Figures
+12
+-
+16
+.
+The prompts begin with general instructions
+and a general problem definition,
+few-shot examples, then a task description.
+The few-shot examples help the LLM understand the problem and generate
+solutions closer to the desired formats.
+For TravelPlanner, we take two 3-day example plans from the training set and
+use them across all tasks (3-7 days).
+For Trip Planning, we take two example plans from the few-shot examples
+provided by the benchmark and use them across all tasks.
+For Meeting Planning, we use the 5-shot examples provided by the benchmark
+for each task.
+After the task description,
+we include parent solutions with corresponding evaluation feedback,
+followed by critical thinking instructions
+(in Figures
+14
+–
+15
+).
+These instructions lead the LLM to improve the parent solutions,
+following the Refinement through Critical Conversation (RCC) process described in
+Section
+˜
+3.2
+.
+The critical thinking instructions include problem-specific
+Strategy/Question prompts based on findings in each
+validation set (ablated in
+Section
+˜
+4.4
+).
+In the model responses, one can see that the LLM follows the critical
+thinking instructions in playing the critic role to analyze the parent
+solutions, and playing the author role to propose a new solution.
+We also give an example of the prompt and a model response
+for TravelPlanner,
+which has the same structure,
+in Figures
+17
+–
+22
+.
+A.2
+Evaluation Functions
+In this work, solutions are evaluated programmatically with a function.
+As described in
+Section
+˜
+3.2
+,
+an evaluation function has three main roles:
+(1) scoring solutions by measuring the optimization objective, if any;
+(2) verifying whether the solution satisfies given constraints;
+and (3) providing corresponding textual feedback.
+Specifically, we score natural language plans by penalizing the constraints
+that are not satisfied, the objectives that are not maximized,
+and for not following the required solution format.
+Thus the maximum score for all tasks is zero.
+We also provide textual feedback that describes how the constraints are not
+satisfied and how the objectives are not maximized.
+TravelPlanner
+Our evaluation function for TravelPlanner is modified from the TravelPlanner
+evaluation code
+[
+42
+]
+.
+The evaluation code expects travel plans in JSON format.
+We modify the original evaluation code to make it output a
+cumulative score that reflects all the constraints that are not satisfied,
+instead of simply answering whether or not a plan satisfies all the constraints.
+We also make it provide textual feedback for the violated constraints.
+In the TravelPlanner validation set, the constraints are provided in both user
+query text and a structured JSON format.
+However, in the test set, the constraints are only described in user query text.
+To make it easier for the evaluation function to consider the constraints,
+we extract them from user query into JSON using Gemini 1.5 Flash.
+For example, to extract the requested cuisines, we prompt Gemini with
+“Look at the following text and tell me if there are any cuisine requirements
+on the upcoming trip…”
+multiple times, and formulate the final answer via majority voting.
+To verify the reliability of this approach, we tested on the validation set
+and found complete agreement between the JSON extracted from user query and
+the provided JSON.
+In addition, we uploaded our test solutions to the TravelPlanner evaluation
+server, and found that the results agree with the official evaluation.
+Trip Planning
+Similar to TravelPlanner, the Trip Planning evaluation function expects plans
+in JSON format.
+Since Trip Planning user queries are programmatically generated,
+we can parse the constraints specified in user queries.
+These constraints include number of days to stay in a city,
+specific days to be in a city (e.g., for events), and whether there are
+flights between cities.
+Our evaluation function scores a plan by the constraints that are not satisfied
+and whether it conforms with the desired JSON format,
+while also providing corresponding textual feedback.
+Meeting Planning
+The Meeting Planning evaluation function also expects plans in JSON.
+Constraints are also provided in structured JSON format.
+Unlike TravelPlanner and Trip Planning, Meeting Planning has an optimization
+objective – the number of friends to meet with.
+We modify the original evaluation function to score a proposed plan
+by how many people are not going to be met with;
+whether it conflicts with the schedules of other people;
+whether it includes meetings with the same person more than once;
+whether any part of the plan conflicts with other parts;
+whether it follows the desired format as instructed.
+In Figures
+23
+–
+24
+we present the evaluation function that implements the simple logic described
+above as an example.
+Figure 12
+:
+Example Meeting Planning prompt and model response with parent solutions given (Part 1)
+Figure 13
+:
+Example Meeting Planning prompt and model response with parent solutions given (Part 2)
+Figure 14
+:
+Example Meeting Planning prompt and model response with parent solutions given (Part 3)
+Figure 15
+:
+Example Meeting Planning prompt and model response with parent solutions given (Part 4)
+Figure 16
+:
+Example Meeting Planning prompt and model response with parent solutions given (Part 5)
+Figure 17
+:
+Example TravelPlanner prompt and model response with parent solutions given (Part 1)
+Figure 18
+:
+Example TravelPlanner prompt and model response with parent solutions given (Part 2)
+Figure 19
+:
+Example TravelPlanner prompt and model response with parent solutions given (Part 3)
+Figure 20
+:
+Example TravelPlanner prompt and model response with parent solutions given (Part 4)
+Figure 21
+:
+Example TravelPlanner prompt and model response with parent solutions given (Part 5)
+Figure 22
+:
+Example TravelPlanner prompt and model response with parent solutions given (Part 6)
+⬇
+import
+datetime
+from
+typing
+import
+Any
+,
+Sequence
+def
+meeting_plan_eval
+(
+plan
+:
+list
+[
+str
+],
+start_location
+:
+str
+,
+initial_time
+:
+str
+,
+friend_schedules
+:
+dict
+[
+str
+,
+Any
+],
+distance_matrix
+:
+dict
+[
+str
+,
+Any
+]):
+"""
+Evaluate
+meeting
+plan
+.
+Args
+:
+plan
+:
+a
+list
+of
+planned
+steps
+,
+such
+as
+[’
+You
+start
+at
+Russian
+Hill
+at
+9:00
+AM
+.’,
+’
+You
+travel
+to
+Marina
+District
+in
+7
+minutes
+and
+arrive
+at
+9:07
+AM
+.’,
+’
+You
+wait
+until
+3:45
+PM
+.’,
+’
+You
+meet
+James
+for
+75
+minutes
+from
+3:45
+PM
+to
+5:00
+PM
+.’]
+start_location
+:
+Your
+initial
+location
+initial_time
+:
+the
+initial
+time
+,
+such
+as
+10:30
+AM
+friend_schedules
+:
+friend
+’
+s
+location
+,
+available
+time
+,
+the
+amount
+of
+time
+for
+the
+meeting
+,
+such
+as
+{’
+Stephanie
+’:
+{’
+location
+’:
+’
+Mission
+District
+’,
+’
+start_time
+’:
+’10:30
+AM
+’,
+’
+end_time
+’:
+’1:30
+PM
+’,
+’
+meeting_time
+’:
+120}}
+distance_matrix
+:
+Distances
+between
+locations
+,
+such
+as
+{’
+Marina
+District
+’:
+{’
+Mission
+District
+’:
+20},
+’
+Mission
+District
+’:
+{’
+Marina
+District
+’:
+19}}
+"""
+met_with
+=
+{}
+score
+=
+0.0
+feedback
+=
+[]
+cur_location
+=
+start_location
+cur_time
+=
+datetime
+.
+datetime
+.
+strptime
+(
+initial_time
+,
+"%
+assert
+isinstance
+(
+plan
+,
+list
+)
+for
+step
+in
+plan
+:
+try
+:
+if
+step
+.
+startswith
+("
+You
+start
+"):
+continue
+elif
+step
+.
+startswith
+("
+You
+travel
+"):
+destination
+=
+step
+.
+split
+("
+travel
+to
+")[1].
+split
+("
+in
+")[0].
+strip
+()
+cur_time
+=
+cur_time
++
+datetime
+.
+timedelta
+(
+minutes
+=
+distance_matrix
+[
+cur_location
+][
+destination
+]
+)
+cur_location
+=
+destination
+elif
+step
+.
+startswith
+("
+You
+wait
+"):
+raw_end_time
+=
+step
+.
+split
+("
+wait
+until
+")[1].
+split
+(".")[0].
+strip
+()
+end_time
+=
+None
+try
+:
+end_time
+=
+datetime
+.
+datetime
+.
+strptime
+(
+raw_end_time
+,
+"%
+except
+ValueError
+:
+score
+-=
+2
+feedback
+.
+append
+(
+f
+"\"{
+step
+}\"
+is
+invalid
+because
+the
+time
+format
+doesn
+’
+t
+follow
+the
+examples
+.")
+if
+end_time
+<=
+cur_time
+:
+end_time_str
+=
+end_time
+.
+strftime
+("%
+score
+-=
+2
+feedback
+.
+append
+(
+f
+"\"{
+step
+}\"
+is
+invalid
+because
+but
+the
+previous
+step
+already
+ends
+at
+{
+end_time_str
+}
+and
+you
+cannot
+go
+backwards
+in
+time
+.")
+cur_time
+=
+end_time
+elif
+step
+.
+startswith
+("
+You
+meet
+"):
+Figure 23
+:
+The Meeting Planning evaluation function (part 1).
+⬇
+person
+=
+step
+.
+split
+("
+meet
+")[1].
+split
+("
+for
+")[0].
+strip
+()
+if
+person
+in
+met_with
+:
+score
+-=
+2
+feedback
+.
+append
+(
+f
+"\"{
+step
+}\"
+is
+invalid
+because
+you
+would
+be
+meeting
+with
+{
+person
+}
+more
+than
+once
+.")
+met_with
+[
+person
+]
+=
+1
+new_time
+=
+cur_time
++
+datetime
+.
+timedelta
+(
+minutes
+=
+friend_schedules
+[
+person
+]["
+meeting_time
+"]
+)
+loc
+=
+friend_schedules
+[
+person
+]["
+location
+"]
+start_time
+=
+friend_schedules
+[
+person
+]["
+start_time
+"]
+end_time
+=
+friend_schedules
+[
+person
+]["
+end_time
+"]
+start_time_str
+=
+start_time
+.
+strftime
+("%
+end_time_str
+=
+end_time
+.
+strftime
+("%
+if
+cur_location
+==
+loc
+and
+cur_time
+>=
+start_time
+and
+new_time
+<=
+end_time
+:
+score
++=
+1
+cur_time
+=
+new_time
+else
+:
+score
+-=
+2
+feedback
+.
+append
+(
+f
+"\"{
+step
+}\"
+is
+invalid
+because
+it
+doesn
+’
+t
+match
+the
+schedule
+of
+{
+person
+},
+who
+will
+be
+at
+{
+loc
+}
+from
+{
+start_time_str
+}
+to
+{
+end_time_str
+}.")
+else
+:
+raise
+ValueError
+("
+Unknown
+plan
+format
+")
+except
+Exception
+:
+score
+-=
+10
+feedback
+.
+append
+(
+f
+"\"{
+step
+}\"
+is
+invalid
+because
+the
+format
+doesn
+’
+t
+follow
+the
+examples
+.")
+all_names
+=
+set
+(
+friend_schedules
+.
+keys
+())
+not_met_with
+=
+",
+".
+join
+(
+list
+(
+all_names
+-
+set
+(
+met_with
+.
+keys
+())))
+return
+score
+,
+feedback
+Figure 24
+:
+The Meeting Planning evaluation function (part 2).
+Appendix B
+Data Splits
+TravelPlanner
+TravelPlanner has 45 training tasks, 180 validation tasks,
+and 1,000 test tasks in the original benchmark.
+Natural Plan – Trip Planning
+The Trip Planning benchmark has 1,600 example tasks.
+There are eight different difficulty levels, ranging from 3 to 10 cities.
+Each difficulty level has 200 examples.
+We split the dataset into validation and test sets by putting the first
+40 examples from each difficulty level into validation, and the last
+160 examples into test, giving 320 examples in validation
+(which we used for prompt development) and 1,280 for test.
+In
+Figure
+˜
+4
+,
+we show the performance at each difficulty level.
+Natural Plan – Meeting Planning
+The Meeting Planning benchmark has 1,000 example tasks.
+There are ten different difficulty levels, ranging from meeting one to ten
+different friends.
+Each difficulty level has 100 examples.
+We split the dataset into validation and test sets by putting the first
+50 examples from each difficulty level into validation,
+and the last 50 examples into test, giving 500 examples in validation
+(which we used for prompt development) and 500 for test.
+In
+Figure
+˜
+5
+,
+we show the performance at each difficulty level.
+Appendix C
+GPT Results
+Table
+7
+presents the results of Mind Evolution using
+GPT-4o-mini with the same sets of prompts.
+Specifically, with 1-pass inference, GPT-4o-mini also struggles at planning
+tasks, achieving 0% on TravelPlanner, 9.1% success rate on Trip Planning,
+and 20.2% success rate on Meeting Planning.
+Again, Mind Evolution significantly improves the performance by over
+$100\%$
+relatively across different benchmarks.
+Success Rate
+TravelPlanner
+[
+42
+]
+$79.4\%$
+Natural Plan
+[
+47
+]
+Trip Planning
+$48.1\%$
+Natural Plan
+[
+47
+]
+Meeting Planning
+$86.4\%$
+Table 7
+:
+Mind Evolution with GPT-4o-Mini results on validation sets.
+Appendix D
+Model Pricing and API Cost Curves
+Table
+˜
+8
+shows the API pricing of different models used in our
+evaluation (Tables
+2
+), at the time of writing
+(October 2024).
+Model
+Input Token
+Output Token
+Gemini 1.5 Flash
+$
+0.075
+0.075
+/M
+$
+0.30
+0.30
+/M
+Gemini 1.5 Pro
+$
+1.25
+1.25
+/M
+$
+5.00
+5.00
+/M
+GPT-4o-Mini
+$
+0.15
+0.15
+$
+0.60
+0.60
+OpenAI o1-preview
+$
+15.00
+15.00
+/M
+$
+60.00
+60.00
+/M
+Table 8:
+Pricing at the time of writing (October 2024).
+These differences serve as a proxy for real computational cost differences
+among models.
+Figure
+˜
+25
+gives insight into the scaling properties
+of the various strategies in terms of their API cost, which is also a linear combination of the input token counts and the output token counts, weighted by base rate (
+Table
+˜
+8
+).
+(a)
+TravelPlanner
+(b)
+Trip Planning
+(c)
+Meeting Planning
+Figure 25
+:
+API cost per task (Oct 2024) vs. Success Rate
+on the validation sets with Gemini 1.5 Flash. API cost is also a linear combination of input token counts and output token counts. Note that Sequential-Revision+ curves are cut short. As the conversations can run up to 80 turns, it usually consumes a lot more tokens than the other methods.
+Appendix E
+Additional Examples
+In addition to
+Table
+˜
+3
+,
+we present qualitative examples of TravelPlanner and Meeting Planning in
+Table
+˜
+9
+and
+Table
+˜
+10
+,
+respectively.
+Q: Could you construct a 3-day journey for two people from Chicago to Albany
+that takes place from March 22nd to March 24th, 2022?
+Our budget is $2,300. We require accommodations that allow smoking and should
+ideally be entire rooms.
+We will not be self-driving during this trip. On the subject of cuisine,
+we’re open to any suggestions you might have.
+[Transportation Options] Taxi: N/A, Self-Driving: N/A, Flights: F3732604
+[…]
+[Accommodation Options in Albany] 2Br Union square Newly furnished
+(minimum 30 nights),
+[…]
+[…]
+Method
+Answer
+1-Pass
+[Day 1] city: ’from Chicago to Albany’, accommodation: ’Spacious room with huge bay window & natural light, Albany’, attraction: ’-’, breakfast: ’-’, dinner: ’Kaiser, Albany’, lunch: ’-’, transportation’: ’Flight Number: F3983351, from Chicago to Albany, Departure Time: 20:20, Arrival Time: 23:04’
+[Day 2] city: ’Albany’, accommodation: ’Spacious room with huge bay window & natural light, Albany’, attraction: ’Albany Pine Bush Preserve, Albany;New York State Museum, Albany;’, breakfast: ’Desi Villa, Albany’, lunch: ’Just Parkash, Albany’, dinner: ’Kaiser, Albany’, ’transportation’: ’-’
+[Day 3] city: ’from Albany to Chicago’, accommodation: ’-’, attraction: ’Hear about Here, Albany;’, breakfast: ’Cafe Coffee Day The Square, Albany’, lunch: ’PiccoLicko, Albany’, dinner: ’StarvinḾarvin, Albany’, transportation: ’Flight Number: F3813300, from Albany to Chicago, Departure Time: 16:40, Arrival Time: 17:52’
+The total cost exceeds the budget of $2300 by $114, […]
+Best-of-N
+[Day 1] city: ’from Chicago to Albany’, accommodation: ’Mid Town East Side Apartment, Albany’, attraction: ’New York State Museum, Albany’, breakfast: ’-’, dinner: ’Kaiser, Albany’, lunch: ’-’, transportation’: ’Taxi, from Chicago to Albany, duration: 12 hours 53 mins, distance: 723 km, cost: 532’
+[Day 2] city: ’Albany’, accommodation: ’Mid Town East Side Apartment, Albany’, attraction: ’Albany Institute of History & Art, Albany’, breakfast: ’Echoes Satyaniketan, Albany’, lunch: ’Cafe Coffee Day The Square, Albany’, dinner: ’Urban Punjab, Albany’, ’transportation’: ’-’
+[Day 3] city: ’from Albany to Chicago’, accommodation: ’-’, attraction: ’Historic Cherry Hill, Albany’, breakfast: ’Just Parkash, Albany’, lunch: ’BKś The Juice Bar, Albany’, dinner: ’-’, transportation: ’Flight Number: F4008387, from Albany to Chicago, Departure Time: 05:05, Arrival Time: 06:11’
+Taxi from Chicago to Albany on day 1 is not a valid option, […]
+Sequential Revisions+
+[Day 1] city: ’from Chicago to Albany’, accommodation: ’2Br Union square Newly furnished’, Albany’, attraction: ’-’, breakfast: ’-’, dinner: ’-’, lunch: ’-’, transportation’: ’Flight Number: F3983350, from Chicago to Albany, Departure Time: 14:29, Arrival Time: 17:03’
+[Day 2] city: ’Albany’, accommodation: ’2Br Union square Newly furnished’, attraction: ’New York State Museum, Albany;Albany Pine Bush Preserve, Albany’, breakfast: ’Cafe Coffee Day, Albany’, lunch: ’PiccoLicko, Albany’, dinner: ’Kaiser, Albany’, ’transportation’: ’-’
+[Day 3] city: ’from Albany to Chicago’, accommodation: ’-’, attraction: ’The McPherson Legacy to the City of Albany - Robert Burns Statue, Albany’, breakfast: ’-’, lunch: ’-’, dinner: ’-’, transportation: ’Flight Number: F3732604, from Albany to Chicago, Departure Time: 05:53, Arrival Time: 07:24’
+’2Br Union square Newly furnished’ requires staying at least 30 nights, […]
+Mind Evolution (ours)
+[Day 1] city: ’from Chicago to Albany’, accommodation: ’Mid Town East Side Apartment, Albany’, attraction: ’-’, breakfast: ’-’, dinner: ’-’, lunch: ’-’, transportation’: ’Flight Number: F3983350, from Chicago to Albany, Departure Time: 14:29, Arrival Time: 17:03’
+[Day 2] city: ’Albany’, accommodation: ’Mid Town East Side Apartment, Albany’, attraction: ’Albany Institute of History & Art, Albany’, breakfast: ’StarvinḾarvin, Albany’, lunch: ’Cafe Coffee Day The Square, Albany’, dinner: ’PiccoLicko, Albany’, ’transportation’: ’-’
+[Day 3] city: ’from Albany to Chicago’, accommodation: ’-’, attraction: ’The McPherson Legacy to the City of Albany - Robert Burns Statue, Albany’, breakfast: ’-’, lunch: ’-’, dinner: ’-’, transportation: ’Flight Number: F4008387, from Albany to Chicago, Departure Time: 05:05, Arrival Time: 06:11’
+Table 9:
+An example TravelPlanner task and the solutions proposed by Mind Evolution and the baseline methods.
+Q: You are visiting San Francisco for the day and want to meet as many friends as possible. Solve the problem by considering various different schedules and picking the best one to optimize your goals.
+Travel distances (in minutes):
+The Castro to Sunset District: 17. The Castro to Presidio: 20. The Castro to Bayview: 19. The Castro to Chinatown: 20. The Castro to Mission District: 7. Sunset District to The Castro: 17. Sunset District to Presidio: 16. Sunset District to Bayview: 22. Sunset District to Chinatown: 30. Sunset District to Mission District: 24. Presidio to The Castro: 21. Presidio to Sunset District: 15. Presidio to Bayview: 31. Presidio to Chinatown: 21. Presidio to Mission District: 26. Bayview to The Castro: 20. Bayview to Sunset District: 23. Bayview to Presidio: 31. Bayview to Chinatown: 18. Bayview to Mission District: 13. Chinatown to The Castro: 22. Chinatown to Sunset District: 29. Chinatown to Presidio: 19. Chinatown to Bayview: 22. Chinatown to Mission District: 18. Mission District to The Castro: 7. Mission District to Sunset District: 24. Mission District to Presidio: 25. Mission District to Bayview: 15. Mission District to Chinatown: 16.
+CONSTRAINTS:
+You arrive at The Castro at 9:00AM. Michelle will be at Sunset District from 6:30PM to 8:30PM. You’d like to meet Michelle for a minimum of 120 minutes. Amanda will be at Presidio from 9:30PM to 10:00PM. You’d like to meet Amanda for a minimum of 30 minutes. Sandra will be at Bayview from 10:00AM to 2:30PM. You’d like to meet Sandra for a minimum of 90 minutes. Kevin will be at Chinatown from 6:15PM to 7:15PM. You’d like to meet Kevin for a minimum of 45 minutes. Mark will be at Mission District from 12:30PM to 1:45PM. You’d like to meet Mark for a minimum of 75 minutes.
+Method
+Answer
+1-Pass
+[’You start at The Castro at 9:00AM’, ’You travel to Bayview in 19 minutes and arrive at 9:19AM’, ’You meet Sandra for 90 minutes from 10:00AM to 11:30AM’, ’You travel to Mission District in 13 minutes and arrive at 11:43AM’, ’You meet Mark for 75 minutes from 12:30PM to 1:45PM’, ’You travel to Sunset District in 24 minutes and arrive at 2:09PM’, ’You wait until 6:30PM’, ’You meet Michelle for 120 minutes from 6:30PM to 8:30PM’, ’You travel to Chinatown in 30 minutes and arrive at 8:30PM’, ’You wait until 6:15PM’, ’You meet Kevin for 45 minutes from 6:15PM to 7:00PM’, ’You travel to Presidio in 19 minutes and arrive at 7:19PM’, ’You wait until 9:30PM’, ’You meet Amanda for 30 minutes from 9:30PM to 10:00PM’]
+’Meeting Mark for 75 minutes from 12:30PM to 1:45PM’ doesn’t match the schedule of Mark, who will be at Mission District from 12:30PM to 01:45PM
+[…]
+Best-of-N
+[’You start at The Castro at 9:00AM’, ’You travel to Bayview in 19 minutes and arrive at 9:19AM’, ’You wait until 10:00AM’, ’You meet Sandra for 90 minutes from 10:00AM to 11:30AM’, ’You travel to Mission District in 13 minutes and arrive at 11:43AM’, ’You wait until 12:30PM’, ’You meet Mark for 75 minutes from 12:30PM to 1:45PM’, ’You travel to Sunset District in 24 minutes and arrive at 2:09PM’, ’You wait until 6:30PM’, ’You meet Michelle for 120 minutes from 6:30PM to 8:30PM’, ’You travel to Chinatown in 30 minutes and arrive at 8:30PM ’, ’You wait until 6:15PM’, ’You meet Kevin for 45 minutes from 6:15PM to 7:00PM’, ’You travel to Presidio in 19 minutes and arrive at 7:19PM’, ’You wait until 9:30PM’, ’You meet Amanda for 30 minutes from 9:30PM to 10:00PM’]
+’Waiting until 6:15PM’ is invalid because the previous step ends after 06:15PM.
+Sequential Revisions+
+[’You start at The Castro at 9:00AM’, ’You travel to Mission District in 7 minutes and arrive at 9:07AM’, ’You wait until 12:30PM’, ’You meet Mark for 30 minutes from 12:30PM to 1:00PM’, ’You travel to Sunset District in 17 minutes and arrive at 1:17PM’, ’You wait until 6:30PM’, ’You meet Michelle for 30 minutes from 6:30PM to 7:00PM’, ’You travel to Presidio in 16 minutes and arrive at 7:16PM’, ’You wait until 9:30PM’, ’You meet Amanda for 30 minutes from 9:30PM to 10:00PM’]
+Not meeting with Kevin and Sandra.
+Mind Evolution (ours)
+[’You start at The Castro at 9:00AM’, ’You travel to Bayview in 19 minutes and arrive at 9:19AM’, ’You wait until 10:00AM’, ’You meet Sandra for 90 minutes from 10:00AM to 11:30AM’, ’You travel to Mission District in 13 minutes and arrive at 11:43AM’, ’You wait until 12:30PM’, ’You meet Mark for 75 minutes from 12:30PM to 1:45PM’, ’You travel to Chinatown in 16 minutes and arrive at 2:01PM’, ’You wait until 6:15PM’, ’You meet Kevin for 45 minutes from 6:15PM to 7:00PM’, ’You travel to Presidio in 19 minutes and arrive at 7:19PM’, ’You wait until 9:30PM’, ’You meet Amanda for 30 minutes from 9:30PM to 10:00PM’]
+Not meeting with Michelle, but this is a best possible plan.
+Table 10:
+An example Meeting Planning task and the solutions proposed by Mind Evolution and the baseline methods.
+Appendix F
+Additional Details for StegPoet
+The prompt design used for StegPoet is given in
+Figure
+˜
+26
+.
+StegPoet Evaluation
+Each proposed solution should contain a cipher and text component.
+The first step is to calculate what is encoded in the text by finding all
+the cipher strings;
+this is done via simple capitalization-agnostic character-matches.
+We refer to the actual encoded string as $M^{\prime}$.
+If $M=M^{\prime}$, the problem is solved correctly.
+The numeric evaluation of a proposed solution is computed as follows:
+1.
+Invalid if the text or cipher component cannot be parsed or violates constraints.
+(a)
+Words in the cipher cannot be subsets of each other (e.g., origin and original).
+(b)
+Words in the cipher cannot be repeated.
+(c)
+Words in the cipher should be at least 4 characters long.
+(d)
+Words in the cipher should contain only alphabetic characters.
+2.
+What is the first position, $i$, in which $M_{i}\neq M^{\prime}_{i}$?
+This is the integer part of the score.
+3.
+Compute the Levenshtein distance between $M$ and $M^{\prime}$.
+Levenshtein distance is often used in information theory and linguistics to
+measure the difference between two sequences
+[
+2
+]
+.
+This is scaled between (0,1) and added to the integer component above.
+Additionally, textual feedback, without numeric penalties, is also provided in
+the revision request made to the LLM.
+1.
+A clearly marked list of what $M^{\prime}$ was found.
+2.
+A list of number mappings missing from the cipher, or unnecessary numbers specified in the cipher.
+3.
+If a word appears an incorrect number of times (too few or too many) in the text, it is indicated, along with the error.
+4.
+An annotated copy of the text is returned.
+The annotations indicate where the cipher-keywords were found (they are shown asterisked), and the first error is indicated.
+5.
+If the text encodes the cipher correctly, but also encodes extra words, that is indicated.
+6.
+If everything in $M^{\prime}$ is correct, but $|M^{\prime}|<|M|$, it is indicated as such.
+For this task, we experimented with many different genre forms
+(poetry, short-story fiction, essay, monologue, etc.),
+as well as inspirations from contemporary to classic writers.
+For this task, you are the world’s best poet, linguist and hidden code creator!
+You strive to write in the style of shel silverstein.
+I would like you to come up with a 1:1 mapping from numbers to words for the list of numbers
+demarcated by <HIDDEN-MESSAGE START> and <HIDDEN-MESSAGE END>.
+Then, cleverly hide the numbers in an poem about the fun of walking.
+
+SPECIFICS:
+* You can choose the words.
+* A number cannot have more than one word assigned to it.
+* A word cannot be used twice in the mapping.
+* Create a substitution-ENCODING-CIPHER for each number and then put
+those words into the poem in the order that they appear in the hidden message.
+* If you use a word in the ENCODING-CIPHER in the poem, then it can only be used
+to encode a number.  Nowhere else.  Otherwise it will be mistaken as part of the message.
+* The numbers must be encoded in the order that they appear in the hidden message.
+
+<HIDDEN-MESSAGE START>
+90 70 50 40 40 10 50 80 70 900 50 30
+<HIDDEN-MESSAGE END>
+
+Here are some guidelines to follow.
+* The poem must NOT contain any numbers.
+* Numbers should not be encoded with a word or words that directly represent the number.
+For example, DO NOT represent "2" : "Dual".  Similarly, "4" : "four" is not acceptable.
+* In general, numbers should not be represented by words that are numeric in nature.
+* Numbers should not be represented by hyphenated words or using any
+other non alphabetic character.  They should be encoded with single words.
+* The words used for ciphers should not be subsets of each other.
+* Your poem should make sense, be understandable, and be in the style of shel silverstein.
+* Your poem should not just be a list of words that look like jibberish.
+Try to have at least 4 words between your cipher words.
+
+Absolutely crucial:
+* Important — make sure that each number in the hidden message is represented in the poem.
+* DO NOT SKIP ANY numbers. The numbers must be encoded in the SAME order as they are given to you.
+* PRECISION IS OF THE UTMOST IMPORTANCE.
+* Note that we did not use common words such as ’and’ and ’the’ etc. to encode the numbers,
+as such words may be difficult to use only at the specific times
+required to encode the hidden-message.
+* You should  not use common words to encode the numbers in the cipher.
+* The words you use in the cipher should be at least 4 characters long.
+* Though you may have to use the same word multiple times, try to avoid using
+the same sentence or phrase multiple times.
+
+Please begin and end your poem with <POEM START> and <POEM END>.
+
+FORMATTING INSTRUCTIONS:
+* Please begin your answer by providing the mapping from the numbers in the hidden message
+to the words you select.
+* I’ve included 3 examples of successful ENCODING-CIPHERs below.
+* VERY, VERY IMPORTANT:  You should come up with your own creative ENCODING-CIPHERs!
+* Note the semicolon that is required after each cipher entry in the ENCODING-CIPHER section.
+This must be present!
+
+EXAMPLE #1:
+
+Can you please hide the message demarcated by <HIDDEN-MESSAGE START> and <HIDDEN-MESSAGE END>
+into a poem about computers.
+
+<HIDDEN-MESSAGE START>
+77 22 33 40 44 77 50 66 55 5 40 40 3 70 8
+<HIDDEN-MESSAGE END>
+
+<ENCODING-CIPHER START>
+"22" : "computers";
+"33" : "become";
+"44" : "vital";
+"55" : "them";
+"66" : "need";
+"77" : "everyday";
+"40" : "more";
+"50" : "need";
+"70" : "certain";
+"3" : "grow";
+"5" : "exist";
+"8" : "future";
+<ENCODING-CIPHER END>
+
+<POEM START>
+Everyday, computers become more vital to our lives.
+Everyday, we need them to exist more and more.
+That will grow, for certain, in the future.
+<POEM END>
+Figure 26
+:
+An example initial prompt for StegPoet. Only 1 of 3 examples is shown.
\ No newline at end of file
diff --git a/research/notes/github-lasgroupsdpo-reinforcement-learning-via-self-distillation-sdpo-github.md b/research/notes/github-lasgroupsdpo-reinforcement-learning-via-self-distillation-sdpo-github.md
new file mode 100644
index 0000000000000000000000000000000000000000..0f1147289d2641fc253f068e13f8d388e721a6e1
--- /dev/null
+++ b/research/notes/github-lasgroupsdpo-reinforcement-learning-via-self-distillation-sdpo-github.md
@@ -0,0 +1,524 @@
+---
+title: 'GitHub - lasgroup/SDPO: Reinforcement Learning via Self-Distillation (SDPO)
+  · GitHub'
+id: github-lasgroupsdpo-reinforcement-learning-via-self-distillation-sdpo-github
+tags:
+- deepread
+created: '2026-06-10T00:26:53.580382Z'
+source: https://github.com/lasgroup/SDPO
+source_domain: github.com
+fetched_at: '2026-06-10T00:26:53.580224Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: ground_truth
+content_type: code
+deprecated: false
+---
+
+GitHub - lasgroup/SDPO: Reinforcement Learning via Self-Distillation (SDPO) · GitHub
+Skip to content
+You signed in with another tab or window.
+Reload
+to refresh your session.
+You signed out in another tab or window.
+Reload
+to refresh your session.
+You switched accounts on another tab or window.
+Reload
+to refresh your session.
+Dismiss alert
+lasgroup
+/
+SDPO
+Public
+Notifications
+You must be signed in to change notification settings
+Fork
+106
+Star
+937
+main
+Branches
+Tags
+Go to file
+Code
+Open more actions menu
+Folders and files
+Name
+Name
+Last commit message
+Last commit date
+Latest commit
+History
+7 Commits
+7 Commits
+baseline_multiturn
+baseline_multiturn
+data
+data
+datasets
+datasets
+docker
+docker
+docs
+docs
+examples
+examples
+experiments
+experiments
+figures
+figures
+scripts
+scripts
+tests
+tests
+training
+training
+verl
+verl
+.git-blame-ignore-revs
+.git-blame-ignore-revs
+.gitignore
+.gitignore
+.gitmodules
+.gitmodules
+.readthedocs.yaml
+.readthedocs.yaml
+Dockerfile
+Dockerfile
+Dockerfile.gh200
+Dockerfile.gh200
+INSTALL.md
+INSTALL.md
+LICENSE
+LICENSE
+README.md
+README.md
+pyproject.toml
+pyproject.toml
+requirements-cuda.txt
+requirements-cuda.txt
+requirements-full.txt
+requirements-full.txt
+requirements-gh200.txt
+requirements-gh200.txt
+requirements-npu.txt
+requirements-npu.txt
+requirements-test.txt
+requirements-test.txt
+requirements.txt
+requirements.txt
+requirements_sglang.txt
+requirements_sglang.txt
+run_local_grpo.sh
+run_local_grpo.sh
+run_local_sdpo.sh
+run_local_sdpo.sh
+run_local_test.sh
+run_local_test.sh
+setup.py
+setup.py
+View all files
+Repository files navigation
+Reinforcement Learning via Self-Distillation (SDPO)
+📖 Introduction
+•
+📊 Main Results
+•
+🚀 Getting Started
+Usage Documentation
+•
+Citation
+📖 Introduction
+Large language models are increasingly post-trained with reinforcement learning in verifiable domains such as code and math. Yet, current methods for reinforcement learning with verifiable rewards (RLVR) learn only from a scalar outcome reward per attempt, creating a severe credit-assignment bottleneck. Many verifiable environments actually provide rich textual feedback, such as runtime errors or judge evaluations, that explain
+why
+an attempt failed. We formalize this setting as
+Reinforcement Learning with Rich Feedback
+(RLRF):
+We propose Self-Distilled Policy Optimization (SDPO)
+, a reinforcement learning framework that augments on-policy optimization with self-distillation from the model’s own high-reward trajectories.
+SDPO converts tokenized feedback into a dense learning signal without any external teacher or explicit reward model. SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy. In this way, SDPO leverages the model's ability to retrospectively identify its own mistakes in-context.
+📊 Main Results
+Learning without Rich Environment Feedback
+When environment feedback is sparse or rule-based, standard reinforcement learning methods struggle to propagate learning signals efficiently. SDPO addresses this by reusing high-reward rollouts as implicit feedback, providing dense supervision even in the absence of rich environment feedback.
+Training progression of Olmo3-7B-Instruct on Chemistry. We report the average accuracy across 16 samples per question and a rolling average of response lengths over 5 steps. We report GRPO with the optimal hyperparameters for this model and task. We run each configuration for 3 seeds and report standard errors as shaded areas.
+Comparison of SDPO and GRPO on reasoning-related benchmarks.
+We report the highest achieved avg@16 within 1 hour and 5 hours of wall-clock training time, respectively.
+Both SDPO and on-policy GRPO perform one gradient step per generation batch, while GRPO performs 4 off-policy mini batch steps. We select optimal hyperparameters for SDPO and baselines based on 5h accuracy. Each run is performed on a node with 4 NVIDIA GH200 GPUs. Together with initialization and validation, each run takes approximately 6 hours.
+Learning with Rich Environment Feedback
+In settings where environments provide structured or textual feedback, SDPO naturally incorporates this information into self-distillation. By conditioning future attempts on both successful demonstrations and feedback from failed attempts, SDPO achieves faster convergence and more stable training.
+SDPO with rich environment feedback.
+Left: SDPO benefits from denser credit assignment (logit > token > sequence-level) and consistently outperforms GRPO when rich feedback is available.
+Right: The self-teacher improves throughout training, and the final student substantially surpasses the initial teacher. Error bars show variability across seeds.
+Solving Hard Questions via Test-Time Self-Distillation
+SDPO also enables
+test-time self-distillation
+. By generating multiple candidate solutions, identifying high-quality responses, and reusing them as demonstrations, the model can iteratively refine its outputs at inference time.  This leads to substantial gains on hard reasoning tasks without additional training.
+Test-time self-distillation on hard coding problems.
+SDPO solves questions that neither the base model nor multi-turn interaction can solve, achieving higher solution discovery rates across generation budgets.
+🚀 Getting Started
+System Requirements
+Operating System:
+Linux (Tested on SLES 15 SP5 and Ubuntu 22.04)
+Hardware:
+NVIDIA GPUs (CUDA compatible)
+Python:
+3.12 (Tested on 3.12.3)
+CUDA Driver:
+Compatible with the PyTorch version installed (see below).
+Installation
+Option 1: Docker (Recommended for HPC/GH200 Clusters)
+For NVIDIA GH200 (aarch64) clusters with CUDA 13.1, we provide a pre-configured Dockerfile based on the NGC vLLM container.
+Build and deploy:
+#
+Build the image
+podman build
+.
+-f Dockerfile.gh200 -t sdpo-gh200
+#
+Export for cluster use (enroot/squashfs)
+enroot import -x mount -o sdpo-gh200.sqsh podman://localhost/sdpo-gh200:latest
+Note
+The Docker images use
+requirements-gh200.txt
+which contains pinned versions from
+requirements-full.txt
+, excluding packages pre-installed in the NGC vLLM container (torch, vllm, flash-attn, xformers, triton).
+Option 2: Local Installation
+Install PyTorch:
+For Ampere/Hopper (RTX 30/40, H100):
+pip install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+For Blackwell (RTX 50, RTX PRO 2000 Blackwell):
+pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+Install SDPO and Dependencies:
+#
+Install core dependencies (pinned versions)
+pip install -r requirements.txt
+#
+Install SDPO (verl) in editable mode
+pip install -e
+.
+#
+Install Flash Attention 2 (compiled from source)
+pip install flash-attn --no-build-isolation
+Optional: Install SGLang/vLLM for high-throughput inference:
+pip install -r requirements_sglang.txt
+Requirement Files
+File
+Description
+requirements.txt
+Core dependencies with pinned versions
+requirements-gh200.txt
+For NGC vLLM container (excludes pre-installed packages)
+requirements-full.txt
+Complete pip freeze from working environment
+requirements_sglang.txt
+SGLang/vLLM stack for local inference
+requirements-cuda.txt
+Flash Attention (for non-Docker installs)
+vLLM Version Note:
+# vllm==0.8.4       # GH200 cluster
+# vllm>=0.12.0      # Blackwell (RTX 50 series, B100/B200) - NOT FULLY TESTED
+Warning
+Blackwell architecture support (RTX 50 series, B100/B200) has not been fully tested.
+Tip
+For reproducibility, use
+requirements-full.txt
+which contains the exact versions from a tested environment.
+Note
+For more specific instructions on
+verl
+architecture and advanced configuration, refer to the
+official verl repository
+.
+Data Preparation
+The data is already loaded and split into train and test sets in the
+datasets
+directory. You can proceed to
+preprocessing
+the data.
+If you want to load and process the data yourself, you can run the following command:
+Data Loading
+The detailed instructions for loading the data are provided in
+data/README.md
+.
+One example is provided below:
+python data/load_dataset.py \
+    --dataset_name Chemistry \
+    --output_path datasets/sciknoweval/chemistry.json
+To split the data into train and test sets, run the following command:
+python data/split_tasks.py \
+    --json_path datasets/sciknoweval/chemistry.json \
+    --output_dir datasets/sciknoweval/chemistry \
+    --test_ratio 0.1 \
+    --seed 42
+For
+LiveCodeBenchv6
+, to split the
+unit tests
+into train and test sets, run the following command:
+python data/split_tests.py \
+    --json_path datasets/lcb_v6.json \
+    --output_dir datasets/lcb_v6
+Data Preprocessing
+Our implementation uses the
+parquet
+format for the data. To preprocess the data, run the following command:
+python data/preprocess.py \
+    --data_source DATASET_PATH
+DATASET_PATH
+should contain the
+train.json
+and
+test.json
+files.
+Configuration
+Before running experiments, adapt the paths in
+verl/trainer/config/user.yaml
+to your environment:
+vars
+:
+dir
+:
+/path/to/your/SDPO
+#
+Path to the SDPO repository
+log_dir
+:
+/path/to/your/logs
+#
+Directory for logs
+ckpt_dir
+:
+/path/to/your/checkpoints
+#
+Directory for model checkpoints
+Training
+Reproducing Results (Without Rich Environment Feedback)
+Run the following commands to reproduce the results without rich environment feedback.
+GRPO baseline:
+bash experiments/generalization/run_baseline_grpo_all.sh
+SDPO:
+bash experiments/generalization/run_sdpo_all.sh
+Reproducing Results (With Rich Environment Feedback)
+Run the following commands to reproduce the results with rich environment feedback.
+GRPO baseline:
+bash experiments/rich_feedback/run_baseline_grpo.sh
+SDPO:
+bash experiments/rich_feedback/run_sdpo.sh
+Multi-turn Baseline of Section 5
+Prepare the data by splitting it into individual tasks:
+export MY_DATA_SPLITS_DIR=lcb_v6
+export MY_DATA_SINGLES_DIR=lcb_v6_singles
+bash data/prepare_data_splits.sh datasets/lcb_v6.json
+Run the multi-turn baseline for, e.g., question 120:
+python baseline_multiturn/multiturn.py --data-dir=lcb_v6_singles/q_120 --run-name multiturn_q120
+Or, for all hard questions:
+bash experiments/ttt/run_multiturn_all.sh
+Usage Documentation
+This section documents the configuration options added by SDPO on top of the base verl framework.
+Policy Loss Configuration
+Located at
+actor.policy_loss
+in the config.
+loss_mode
+(str, default:
+"vanilla"
+): Loss function mode. Set to
+"sdpo"
+to enable self-distillation. Options:
+vanilla
+,
+sdpo
+.
+Self-Distillation Configuration
+Located at
+actor.self_distillation
+in the config. Only active when
+actor.policy_loss.loss_mode = "sdpo"
+.
+Core Settings
+full_logit_distillation
+(bool, default:
+True
+): Whether to use full-logit KL distillation.
+alpha
+(float, default:
+0.5
+): KL interpolation coefficient.
+0.0
+= forward KL,
+1.0
+= reverse KL,
+0.5
+= JSD.
+success_reward_threshold
+(float, default:
+1.0
+): Minimum sequence reward to be considered a successful demonstration.
+teacher_regularization
+(str, default:
+"ema"
+): Teacher regularization mode. Options:
+ema
+,
+trust-region
+. Note: if
+ema
+is used, the model on the
+RefWorker
+is updated as an exponential moving average.
+trust-region
+requires
+use_fused_kernels = False
+.
+teacher_update_rate
+(float, default:
+0.05
+): EMA update rate for teacher weights, or trust-region mixing coefficient.
+distillation_topk
+(int | None, default:
+100
+): If set, use top-k logits for distillation instead of full distribution.
+distillation_add_tail
+(bool, default:
+True
+): Whether to add a tail bucket for top-k distillation.
+is_clip
+(float | None, default:
+2.0
+): Clip value for importance sampling ratio.
+None
+disables IS weighting.
+Reprompting Settings
+max_reprompt_len
+(int, default:
+10240
+): Maximum token length of the reprompted prompt.
+reprompt_truncation
+(str, default:
+"right"
+): Truncation method for reprompted prompts. Options:
+left
+,
+right
+,
+error
+.
+dont_reprompt_on_self_success
+(bool, default:
+True
+): If
+True
+, don't use a sample's own successful response as demonstration.
+remove_thinking_from_demonstration
+(bool, default:
+True
+): Whether to remove
+<think>...</think>
+tags from demonstrations.
+Template Settings
+reprompt_template
+(str): Main template for reprompting. Placeholders:
+{prompt}
+,
+{solution}
+,
+{feedback}
+.
+solution_template
+(str): Template for the solution section. Placeholder:
+{successful_previous_attempt}
+.
+feedback_template
+(str): Template for the feedback section. Placeholder:
+{feedback_raw}
+.
+Feedback Settings
+include_environment_feedback
+(bool, default:
+True
+): Whether to include environment feedback (e.g., test errors) in reprompting.
+environment_feedback_only_without_solution
+(bool, default:
+True
+): If
+True
+, only use feedback when no successful solution is available.
+Citation
+If you find this work helpful, please cite us.
+@article
+{
+hubotter2026reinforcement
+,
+title
+=
+{
+Reinforcement Learning via Self-Distillation
+}
+,
+author
+=
+{
+Hübotter, Jonas and Lübeck, Frederike and Behric, Lejs and Baumann, Anton and Bagatella, Marco and Marta, Daniel and Hakimi, Ido and Shenfeld, Idan and Kleine Buening, Thomas and Guestrin, Carlos and Krause, Andreas
+}
+,
+year
+=
+{
+2026
+}
+,
+journal
+=
+{
+arXiv preprint arXiv:2601.20802
+}
+,
+}
+Attribution
+Our implementation is based on a recent version of
+verl
+.
+About
+Reinforcement Learning via Self-Distillation (SDPO)
+self-distillation.github.io/SDPO
+Topics
+rl
+reasoning
+distillation
+llm
+Resources
+Readme
+License
+Apache-2.0 license
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Activity
+Custom properties
+Stars
+937
+stars
+Watchers
+4
+watching
+Forks
+106
+forks
+Report repository
+Contributors
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Languages
+Python
+94.5%
+Shell
+5.4%
+Other
+0.1%
+You can’t perform that action at this time.
\ No newline at end of file
diff --git a/research/notes/github-novasky-aiskyrl-skyrl-a-modular-full-stack-rl-library-for-llms-github.md b/research/notes/github-novasky-aiskyrl-skyrl-a-modular-full-stack-rl-library-for-llms-github.md
new file mode 100644
index 0000000000000000000000000000000000000000..3be99700a0532d7a2d42f38ce33892fe90861a9f
--- /dev/null
+++ b/research/notes/github-novasky-aiskyrl-skyrl-a-modular-full-stack-rl-library-for-llms-github.md
@@ -0,0 +1,387 @@
+---
+title: 'GitHub - NovaSky-AI/SkyRL: SkyRL: A Modular Full-stack RL Library for LLMs
+  · GitHub'
+id: github-novasky-aiskyrl-skyrl-a-modular-full-stack-rl-library-for-llms-github
+tags:
+- deepread
+created: '2026-06-10T00:24:46.643321Z'
+source: https://github.com/NovaSky-AI/SkyRL
+source_domain: github.com
+fetched_at: '2026-06-10T00:24:46.643164Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: ground_truth
+content_type: code
+deprecated: false
+---
+
+GitHub - NovaSky-AI/SkyRL: SkyRL: A Modular Full-stack RL Library for LLMs · GitHub
+Skip to content
+You signed in with another tab or window.
+Reload
+to refresh your session.
+You signed out in another tab or window.
+Reload
+to refresh your session.
+You switched accounts on another tab or window.
+Reload
+to refresh your session.
+Dismiss alert
+NovaSky-AI
+/
+SkyRL
+Public
+Notifications
+You must be signed in to change notification settings
+Fork
+349
+Star
+2k
+main
+Branches
+Tags
+Go to file
+Code
+Open more actions menu
+Folders and files
+Name
+Name
+Last commit message
+Last commit date
+Latest commit
+History
+1,045 Commits
+1,045 Commits
+.claude
+.claude
+.gemini
+.gemini
+.github/
+workflows
+.github/
+workflows
+ci
+ci
+docker
+docker
+docs
+docs
+examples
+examples
+skyrl-agent
+skyrl-agent
+skyrl-gym
+skyrl-gym
+skyrl-train
+skyrl-train
+skyrl-tx
+skyrl-tx
+skyrl
+skyrl
+tests
+tests
+.gitignore
+.gitignore
+.pre-commit-config.yaml
+.pre-commit-config.yaml
+.python-version
+.python-version
+CLAUDE.md
+CLAUDE.md
+LICENSE
+LICENSE
+README.md
+README.md
+format.sh
+format.sh
+pyproject.toml
+pyproject.toml
+uv.lock
+uv.lock
+View all files
+Repository files navigation
+SkyRL: A Modular Full-stack RL Library for LLMs
+|
+Documentation
+|
+Twitter/X
+|
+Huggingface
+|
+Slack Workspace
+|
+Overview
+Important
+Note:
+SkyRL is undergoing a repo reorganization into the
+skyrl/
+folder, which unifies the skyrl libraries below into a single package. The existing packages below are fully functional but will be migrated to new paths shortly. For full
+Tinker API
+support please use the
+skyrl/
+folder. See the
+Tinker Quickstart docs
+to get started. See issue:
+#1145
+SkyRL is a full-stack RL library that provides the following components:
+skyrl
+: Our new unified library for RL on your own hardware, with support for the
+Tinker API
+.
+skyrl
+combines our previous work:
+skyrl-train
+: A modular, performant training framework for RL.
+skyrl-tx
+: A cross-platform library implementing a backend for the
+Tinker API
+, with a unified engine for training and inference.
+skyrl-agent
+: Our agent layer for training long-horizon, real-world agents. For exact reproduction of
+SkyRL-v0
+results, please check out commit a0d50c482436af7fac8caffa4533616a78431d66.
+skyrl-gym
+: Our gymnasium of tool-use tasks, including a library of math, coding, search and SQL environments implemented in the Gymnasium API.
+Getting Started
+For a guide on developing with SkyRL, take a look at our
+Development Guide
+docs.
+For model training, check out
+skyrl
+to start using, modifying, or building on top of the SkyRL training stack. See our
+quickstart docs
+to ramp up!
+For building environments, check out
+skyrl-gym
+to integrate your task in the simple gymnasium interface.
+For agentic pipelines, check out
+skyrl-agent
+for our work on optimizing and scaling pipelines for multi-turn tool use LLMs on long-horizon, real-environment tasks.
+For a list of supported models, see our
+Supported Models
+docs.
+News
+[2026/02/17]
+🎉 SkyRL is officially integrated with Harbor! Train your terminal-use agent! [
+Blog
+]
+[2026/02/13]
+🎉 SkyRL now implements the Tinker API! Run any training script written in the Tinker API on your local GPUs with SkyRL! [
+Blog
+]
+[2025/11/26]
+🎉 We released SkyRL-Agent: An agent layer for efficient, multi-turn, long-horizon agent training and evaluation. [
+Paper
+]
+[2025/10/06]
+🎉 We released SkyRL tx: An open implementation of a backend for the Tinker API that lets you run a Tinker-like service on your own hardware. [
+Blog
+]
+[2025/06/26]
+🎉 We released SkyRL-v0.1: A highly-modular, performant RL training framework. [
+Blog
+]
+[2025/06/26]
+🎉 We released SkyRL-Gym: A library of RL environments for LLMs implemented with the Gymnasium API. [
+Blog
+]
+[2025/05/20]
+🎉 We released SkyRL-SQL: a multi-turn RL training pipeline for Text-to-SQL, along with SkyRL-SQL-7B — a model trained on just 653 samples that outperforms both GPT-4o and o4-mini!
+[2025/05/06]
+🎉 We released SkyRL-v0: our open RL training pipeline for multi-turn tool use LLMs, optimized for long-horizon, real-environment tasks like SWE-Bench!
+Links
+📜
+Train Your Terminal-Use Agent with SkyRL + Harbor
+📜
+SkyRL Brings Tinker to Your GPUs
+📜
+Fully Async RL with In-Flight Weight Updates in SkyRL
+📜
+Open Recipes on SkyRL
+📜
+SkyRL-Agent Paper
+📜
+On-Policy Distillation on SkyRL Blog Post
+📜
+Search-R1 on SkyRL Blog Post
+📜
+SkyRL-v0.1 Blog Post
+📜
+SkyRL-SQL Blog Post
+📜
+SkyRL-v0 Blog Post
+Projects using SkyRL
+Biomni-R0
+: Using RL to Hill-Climb Biomedical Reasoning Agents to Expert-Level
+How to Train Your Advisor
+: Steering Black-Box LLMs with Advisor Models
+OpenThoughts-Agent
+: Data recipes and robust infrastructure for training AI agents
+Endless Terminals
+: A fully autonomous pipeline that procedurally generates terminal tasks for RL training with no human annotation needed
+CodeScout
+: Open-source SoTA code localization on SWE-Bench via RL
+Reinforcing Recursive Language Models
+: RL fine-tuning small models to behave as recursive language models
+Acknowledgement
+This work is done at
+Berkeley Sky Computing Lab
+in collaboration with
+Anyscale
+, with generous compute support from
+Anyscale
+,
+Databricks
+,
+NVIDIA
+,
+Lambda Labs
+,
+AMD
+,
+AWS
+,
+Modal
+, and
+Daytona
+.
+We adopt many lessons and code from several great projects such as
+veRL
+,
+OpenRLHF
+,
+Search-R1
+,
+OpenReasonerZero
+, and
+NeMo-RL
+. We appreciate each of these teams and their contributions to open-source research!
+Citation
+If you find the work in this repository helpful, please consider citing:
+@misc
+{
+cao2025skyrl
+,
+title
+=
+{
+SkyRL-v0: Train Real-World Long-Horizon Agents via Reinforcement Learning
+}
+,
+author
+=
+{
+Shiyi Cao and Sumanth Hegde and Dacheng Li and Tyler Griggs and Shu Liu and Eric Tang and Jiayi Pan and Xingyao Wang and Akshay Malik and Graham Neubig and Kourosh Hakhamaneshi and Richard Liaw and Philipp Moritz and Matei Zaharia and Joseph E. Gonzalez and Ion Stoica
+}
+,
+year
+=
+{
+2025
+}
+,
+}
+@misc
+{
+liu2025skyrlsql
+,
+title
+=
+{
+SkyRL-SQL: Matching GPT-4o and o4-mini on Text2SQL with Multi-Turn RL
+}
+,
+author
+=
+{
+Shu Liu and Sumanth Hegde and Shiyi Cao and Alan Zhu and Dacheng Li and Tyler Griggs and Eric Tang and Akshay Malik and Kourosh Hakhamaneshi and Richard Liaw and Philipp Moritz and Matei Zaharia and Joseph E. Gonzalez and Ion Stoica
+}
+,
+year
+=
+{
+2025
+}
+,
+}
+@misc
+{
+griggs2025skrylv01
+,
+title
+=
+{
+Evolving SkyRL into a Highly-Modular RL Framework
+}
+,
+author
+=
+{
+Tyler Griggs and Sumanth Hegde and Eric Tang and Shu Liu and Shiyi Cao and Dacheng Li and Charlie Ruan and Philipp Moritz and Kourosh Hakhamaneshi and Richard Liaw and Akshay Malik and Matei Zaharia and Joseph E. Gonzalez and Ion Stoica
+}
+,
+year
+=
+{
+2025
+}
+,
+note
+=
+{
+Notion Blog
+}
+}
+About
+SkyRL: A Modular Full-stack RL Library for LLMs
+docs.skyrl.ai/docs
+Resources
+Readme
+License
+Apache-2.0 license
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Activity
+Custom properties
+Stars
+2k
+stars
+Watchers
+11
+watching
+Forks
+349
+forks
+Report repository
+Releases
+6
+SkyRL: v0.2.0
+Latest
+Apr 23, 2026
++ 5 releases
+Packages
+0
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Contributors
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Languages
+Python
+98.5%
+Shell
+1.3%
+Other
+0.2%
+You can’t perform that action at this time.
\ No newline at end of file
diff --git a/research/notes/github-siyan-zhaoopsd-github.md b/research/notes/github-siyan-zhaoopsd-github.md
new file mode 100644
index 0000000000000000000000000000000000000000..8fbc9bf66c26d8160eb591662d9d9d4e437bc981
--- /dev/null
+++ b/research/notes/github-siyan-zhaoopsd-github.md
@@ -0,0 +1,405 @@
+---
+title: GitHub - siyan-zhao/OPSD · GitHub
+id: github-siyan-zhaoopsd-github
+tags:
+- deepread
+created: '2026-06-10T00:26:32.742630Z'
+source: https://github.com/siyan-zhao/OPSD
+source_domain: github.com
+fetched_at: '2026-06-10T00:26:32.742434Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: ground_truth
+content_type: code
+deprecated: false
+---
+
+GitHub - siyan-zhao/OPSD · GitHub
+Skip to content
+You signed in with another tab or window.
+Reload
+to refresh your session.
+You signed out in another tab or window.
+Reload
+to refresh your session.
+You switched accounts on another tab or window.
+Reload
+to refresh your session.
+Dismiss alert
+siyan-zhao
+/
+OPSD
+Public
+Notifications
+You must be signed in to change notification settings
+Fork
+32
+Star
+338
+main
+Branches
+Tags
+Go to file
+Code
+Open more actions menu
+Folders and files
+Name
+Name
+Last commit message
+Last commit date
+Latest commit
+History
+9 Commits
+9 Commits
+eval
+eval
+scripts
+scripts
+.gitignore
+.gitignore
+README.md
+README.md
+accelerate.yaml
+accelerate.yaml
+data_collator.py
+data_collator.py
+environment.yml
+environment.yml
+grpo_train.py
+grpo_train.py
+opsd_train.py
+opsd_train.py
+opsd_trainer.py
+opsd_trainer.py
+sft_train.py
+sft_train.py
+View all files
+Repository files navigation
+Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models
+Overview
+On-Policy Self-Distillation (OPSD)
+trains a single model to act as both student and teacher by conditioning on different contexts — the student sees only the problem, while the teacher additionally sees the ground-truth solution — and performs token-level distribution matching along the student's own on-policy trajectories.
+Updates
+Mar 18, 2026
+: Released updated code.
+(1) Fixed chat template and zero2 bugs (see
+template issue
+), we re-ran experiments with updated results (detailed results & ablations updated on arxiv/blog). The fixes yield improved OPSD performance, most notably on Qwen3-1.7B.
+(2) Added a new training stabilization strategy 🚀: per-token point-wise KL clipping. We find that style tokens (such as 'wait', 'think') can exhibit 6–15× higher KL divergence than math-related tokens and dominate the training signal. Clipping stabilizes training and improves performance.
+Mar 3, 2026
+: Initial code release.
+Installation
+conda env create -f environment.yml
+conda activate opsd
+pip install flash-attn==2.8.3 --no-build-isolation
+If you encounter difficulties installing flash-attn, you can check the version matching your CUDA and PyTorch versions from the
+flash-attention releases page
+.
+The code uses
+trl
+'s experimental GOLD trainer as a base.
+Repository Structure
+├── opsd_trainer.py          # OPSDTrainer: core self-distillation trainer
+├── data_collator.py         # Data collator for self-distillation
+├── opsd_train.py            # OPSD training entry point
+├── sft_train.py             # SFT baseline training entry point
+├── grpo_train.py            # GRPO baseline training entry point
+├── accelerate.yaml          # Accelerate config (multi-GPU)
+├── scripts/
+│   ├── run_opsd.sh          # Example launch script for OPSD
+│   ├── run_sft.sh           # Example launch script for SFT
+│   └── run_grpo.sh          # Example launch script for GRPO
+└── eval/
+    ├── evaluate_math.py     # Evaluation script (vLLM)
+    └── run_eval.sh          # Example evaluation script
+Quick Start
+Reproduce results on Qwen3-1.7B (🚀 training only takes
+~15 minutes
+on 4×H100 and peaks within 100 steps):
+bash scripts/run_opsd_1b.sh
+Evaluation (takes ~30–50 minutes on 4×H100 for each checkpoint):
+cd
+eval
+bash run_eval.sh
+Evaluation Results across Tasks on Qwen3-1.7B
+AIME24
+AIME25
+HMMT25
+Step
+Avg@12
+Base
+51.5%
+25
+51.4%
+50
+52.8%
+75
+54.4%
+100
+57.2%
+Step
+Avg@12
+Base
+36.7%
+25
+42.5%
+50
+43.9%
+75
+40.6%
+100
+41.1%
+Step
+Avg@12
+Base
+23.1%
+25
+24.7%
+50
+27.8%
+75
+26.9%
+100
+29.2%
+Evaluation settings:
+temperature=1.0, thinking mode enabled, max new tokens=38912, top-p=none, top-k disabled, min-p=0, presence penalty=0, num samples=12
+Non-Thinking Mode
+OPSD can also run in a non-thinking setting, where both the Qwen student and teacher are run with enabled_thinking=False during training (
+--student_thinking False --teacher_thinking False
+) and evaluated with non-thinking inference (
+--no_thinking
+), with faster evaluation time than thinking mode.
+Training:
+bash scripts/run_opsd_4b_nonthink.sh
+bash scripts/run_opsd_8b_nonthink.sh
+Evaluation:
+cd
+eval
+bash run_eval_nonthink.sh
+Evaluation Results with Non-Thinking Mode across Models
+Qwen3-8B (
+--jsd_token_clip 1e-7
+)
+AIME24
+AIME25
+HMMT25
+Step
+Avg@12
+Base
+26.4%
+50
+49.7%
+75
+45.3%
+100
+38.3%
+Step
+Avg@12
+Base
+19.7%
+50
+35.0%
+75
+26.9%
+100
+27.5%
+Step
+Avg@12
+Base
+10.8%
+50
+18.3%
+75
+17.5%
+100
+15.3%
+Qwen3-4B (
+--jsd_token_clip 1e-6
+)
+AIME24
+AIME25
+HMMT25
+Step
+Avg@12
+Base
+23.1%
+50
+20.3%
+75
+27.5%
+100
+31.1%
+150
+32.8%
+Step
+Avg@12
+Base
+21.4%
+50
+21.4%
+75
+20.8%
+100
+21.1%
+150
+21.9%
+Step
+Avg@12
+Base
+10.8%
+50
+11.1%
+75
+13.1%
+100
+16.4%
+150
+14.4%
+Qwen3-1.7B (
+--jsd_token_clip 1e-6
+)
+AIME24
+AIME25
+HMMT25
+Step
+Avg@12
+Base
+11.9%
+50
+15.0%
+75
+13.9%
+100
+12.5%
+Step
+Avg@12
+Base
+9.2%
+50
+6.2%
+75
+8.3%
+100
+8.1%
+Step
+Avg@12
+Base
+5.0%
+25
+7.2%
+50
+5.8%
+75
+5.0%
+Evaluation settings:
+temperature=1.0, non-thinking mode, num samples=12.
+Key OPSD arguments
+Argument
+Default
+Description
+--fixed_teacher
+False
+Fix the teacher to the initial policy (step 0). Requires --use_peft. Note ❗ If you disable PEFT, the teacher will keep updating at every training step, which may make training unstable. Our main results use the fixed teacher, which is currently implemented with LoRA adapter weights.
+--use_tinker_loss
+False
+Use a sampled-token policy-gradient objective instead of full-vocabulary JSD. More memory efficient. Clipping is not currently implemented for this variant, so training could be unstable.
+--max_completion_length
+—
+Student generation length for distillation. We use 1024 in our main experiments.
+--beta
+—
+Interpolation weight for the JSD mixture distribution. Beta=0 means forward KL and 1 means reverse KL.
+--jsd_token_clip
+0.05
+Clip the JSD loss for each token to a maximum value. This can improve stability by preventing stylistic tokens from dominating the training signal. Note when clipping is applied, the loss can be negative due to positive KL summand being capped.
+--reason_first
+False
+Prepend an explicit rationalization to the teacher context before distillation.
+--run_config
+None
+Custom name suffix for the output directory and WandB run.
+SFT Baseline
+See
+scripts/run_sft.sh
+.
+GRPO Baseline
+See
+scripts/run_grpo.sh
+.
+Acknowledgements
+Our implementation builds on
+TRL GOLD Trainer
+. We sincerely thank
+@simran135
+and
+@beanie00
+for identifying the prompt template bugs and the zero-2 issue, respectively!
+Citation
+If you find this useful, please consider citing:
+@article
+{
+zhao2026self
+,
+title
+=
+{
+Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models
+}
+,
+author
+=
+{
+Zhao, Siyan and Xie, Zhihui and Liu, Mengchen and Huang, Jing and Pang, Guan and Chen, Feiyu and Grover, Aditya
+}
+,
+journal
+=
+{
+arXiv preprint arXiv:2601.18734
+}
+,
+year
+=
+{
+2026
+}
+}
+About
+No description, website, or topics provided.
+Resources
+Readme
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Activity
+Stars
+338
+stars
+Watchers
+1
+watching
+Forks
+32
+forks
+Report repository
+Releases
+No releases published
+Packages
+0
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Contributors
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Languages
+Python
+94.0%
+Shell
+6.0%
+You can’t perform that action at this time.
\ No newline at end of file
diff --git a/research/notes/github-swe-benchswe-smith-neurips-2025-db-spotlight-scaling-data-for-swe-agents.md b/research/notes/github-swe-benchswe-smith-neurips-2025-db-spotlight-scaling-data-for-swe-agents.md
new file mode 100644
index 0000000000000000000000000000000000000000..86e3426332473ff0ede686397b8f6f91778b34e8
--- /dev/null
+++ b/research/notes/github-swe-benchswe-smith-neurips-2025-db-spotlight-scaling-data-for-swe-agents.md
@@ -0,0 +1,334 @@
+---
+title: 'GitHub - SWE-bench/SWE-smith: [NeurIPS 2025 D&B Spotlight] Scaling Data for
+  SWE-agents · GitHub'
+id: github-swe-benchswe-smith-neurips-2025-db-spotlight-scaling-data-for-swe-agents
+tags:
+- deepread
+created: '2026-06-10T00:23:56.783755Z'
+source: https://github.com/SWE-bench/SWE-smith
+source_domain: github.com
+fetched_at: '2026-06-10T00:23:56.783411Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: ground_truth
+content_type: code
+deprecated: false
+---
+
+GitHub - SWE-bench/SWE-smith: [NeurIPS 2025 D&B Spotlight] Scaling Data for SWE-agents · GitHub
+Skip to content
+You signed in with another tab or window.
+Reload
+to refresh your session.
+You signed out in another tab or window.
+Reload
+to refresh your session.
+You switched accounts on another tab or window.
+Reload
+to refresh your session.
+Dismiss alert
+SWE-bench
+/
+SWE-smith
+Public
+Notifications
+You must be signed in to change notification settings
+Fork
+120
+Star
+671
+main
+Branches
+Tags
+Go to file
+Code
+Open more actions menu
+Folders and files
+Name
+Name
+Last commit message
+Last commit date
+Latest commit
+History
+212 Commits
+212 Commits
+.github
+.github
+agent
+agent
+configs
+configs
+docs
+docs
+scripts
+scripts
+swesmith
+swesmith
+tests
+tests
+.env.example
+.env.example
+.gitignore
+.gitignore
+.pre-commit-config.yaml
+.pre-commit-config.yaml
+AGENTS.md
+AGENTS.md
+CONTRIBUTING.md
+CONTRIBUTING.md
+LICENSE
+LICENSE
+README.md
+README.md
+codecov.yml
+codecov.yml
+mkdocs.yml
+mkdocs.yml
+pyproject.toml
+pyproject.toml
+setup.sh
+setup.sh
+View all files
+Repository files navigation
+NeurIPS 2025 Datasets & Benchmarks Track - Spotlight 🔦
+SWE-smith is a toolkit for training
+SWE-agents
+. You can:
+Turn any Github repository into a
+SWE-gym
+.
+Create
+unlimited
+tasks (e.g., file localization, program repair,
+SWE-bench
+) for that repo.
+Train an LM to become a better SWE (
+SWE-agent-LM-32B
+).
+⚒️ Build Environments
+If you're interested in turning a GitHub repository into a SWE-gym, install the package from
+source
+.
+Tip
+SWE-smith requires Docker to create execution environments. SWE-smith was developed and tested on Ubuntu 22.04.4 LTS.
+We do
+not
+plan on supporting Windows or MacOS.
+You can then build a dataset for the repository by...
+Creating an environment
+Synthesizing task instances
+Keep tasks that break 1+ unit tests
+Generating issue text for your tasks
+🏋️ Train SWE-agent's
+Training SWE-agent's using the
+SWE-smith dataset
+is super simple.
+```python
+from swesmith.profiles import registry
+from datasets import load_dataset
+ds = load_dataset("SWE-bench/SWE-smith", split="train")  # Loads all 52k task instances
+for task in ds:
+    rp = registry.get_from_inst(task)  # Get the RepoProfile for the task
+    container = rp.get_container(task)  # Returns pointer to a Docker container with the task initialized
+    """TODO: Train!"""
+```
+SWE-smith has been used to
+Fine-tune Qwen 2.5 Coder into SWE-agent-LM-32B (A +32% jump on SWE-bench Verified!) using
+SWE-agent
+[
+Tutorial
+]
+Perform GRPO style reinforcement learning using
+SkyRL
+💿 Resources
+52k Task Instances
+SWE-agent-LM-32B
+;
+40.2%
+pass@1 on
+SWE-bench Verified
+!
+26k SWE-agent Trajectories
+, including the 5k SWE-agent-LM-32B was trained on.
+250+ Environments
+, one Docker image per repo represented in SWE-smith.
+And there's more coming!
+💫 Contributions
+We're actively working on several follow ups!
+Check out the
+Contributing Guide
+for more.
+Contact Person:
+John Yang
+,
+Kilian Lieret
+(Email:
+johnby@stanford.edu
+)
+🪪 License
+MIT. Check
+LICENSE
+for more information.
+✍️ Citation
+```bibtex
+@inproceedings{yang2025swesmith,
+  title={SWE-smith: Scaling Data for Software Engineering Agents},
+  author={John Yang and Kilian Lieret and Carlos E. Jimenez and Alexander Wettig and Kabir Khandpur and Yanzhe Zhang and Binyuan Hui and Ofir Press and Ludwig Schmidt and Diyi Yang},
+  booktitle={Proceedings of the 39th Annual Conference on Neural Information Processing Systems (NeurIPS 2025 D&B Spotlight)},
+  year={2025},
+  eprint={2504.21798},
+  archivePrefix={arXiv},
+  primaryClass={cs.SE},
+  url={https://arxiv.org/abs/2504.21798},
+  note={arXiv:2504.21798, accepted at NeurIPS 2025 (Spotlight)}
+}
+```
+📕 Our Other Projects
+About
+[NeurIPS 2025 D&B Spotlight] Scaling Data for SWE-agents
+swesmith.com/
+Topics
+training
+software-engineering
+language-model
+agents
+Resources
+Readme
+License
+MIT license
+Code of conduct
+Code of conduct
+Contributing
+Contributing
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Activity
+Custom properties
+Stars
+671
+stars
+Watchers
+7
+watching
+Forks
+120
+forks
+Report repository
+Releases
+4
+tags
+Packages
+0
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Contributors
+Uh oh!
+There was an error while loading.
+Please reload this page
+.
+Languages
+Python
+94.2%
+Go
+1.7%
+C#
+1.4%
+C
+0.9%
+Shell
+0.4%
+Ruby
+0.4%
+Other
+1.0%
+You can’t perform that action at this time.
\ No newline at end of file
diff --git a/research/notes/group-sequence-policy-optimization.md b/research/notes/group-sequence-policy-optimization.md
new file mode 100644
index 0000000000000000000000000000000000000000..45683cb2ffd98474f613c828dd2e5774faa817e5
--- /dev/null
+++ b/research/notes/group-sequence-policy-optimization.md
@@ -0,0 +1,2469 @@
+---
+title: Group Sequence Policy Optimization
+id: group-sequence-policy-optimization
+tags:
+- deepread
+created: '2026-06-10T00:30:47.451043Z'
+source: https://arxiv.org/html/2507.18071
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:47.450852Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Group Sequence Policy Optimization
+Title:
+Content selection saved. Describe the issue below:
+Description:
+License: arXiv.org perpetual non-exclusive license
+arXiv:2507.18071v2 [cs.LG] 28 Jul 2025
+Group Sequence Policy Optimization
+Chujie Zheng    Shixuan Liu    Mingze Li    Xiong-Hui Chen    Bowen Yu
+†
+†
+footnotemark:
+Chang Gao    Kai Dang    Yuqiong Liu    Rui Men    An Yang    Jingren Zhou    Junyang Lin
+Qwen Team, Alibaba Inc
+Corresponding authors.
+Abstract
+This paper introduces Group Sequence Policy Optimization (GSPO), our stable, efficient, and performant reinforcement learning algorithm for training large language models.
+Unlike previous algorithms that adopt token-level importance ratios, GSPO defines the importance ratio based on sequence likelihood and performs sequence-level clipping, rewarding, and optimization.
+We demonstrate that GSPO achieves superior training efficiency and performance compared to the GRPO algorithm, notably stabilizes Mixture-of-Experts (MoE) RL training, and has the potential for simplifying the design of RL infrastructure.
+These merits of GSPO have contributed to the remarkable improvements in the latest Qwen3 models.
+1
+Introduction
+Reinforcement learning (RL) has emerged as a pivotal paradigm for scaling language models (OpenAI, 2024; DeepSeek-AI, 2025; Qwen, 2025b;a).
+Through large-scale RL, language models develop the capability to tackle sophisticated problems, such as competition-level mathematics and programming, by undertaking deeper and longer reasoning processes.
+To successfully scale RL with greater computational investment, the foremost prerequisite is maintaining stable and robust training dynamics.
+However, current state-of-the-art RL algorithms, exemplified by GRPO (Shao et al., 2024), exhibit severe stability issues when training gigantic language models, often resulting in catastrophic and irreversible model collapse (Qwen, 2025a; MiniMax, 2025).
+This instability hinders efforts to push the capability boundaries of language models through continued RL training.
+In this paper, we identify that the instability of GRPO stems from the fundamental misapplication and invalidation of importance sampling weights in its algorithmic design.
+This introduces high-variance training noise that progressively accumulates with increased response length and is further amplified by the clipping mechanism, ultimately precipitating model collapse.
+To address these core limitations, we propose **Group Sequence Policy Optimization (GSPO)**, a new RL algorithm for training large language models.
+The key innovation of GSPO lies in its theoretically grounded definition of importance ratio based on sequence likelihood (Zheng et al., 2023), aligning with the basic principle of importance sampling.
+Additionally, GSPO computes the normalized rewards as the advantages of multiple responses to a query, ensuring the alignment between sequence-level rewarding and optimization.
+Our empirical evaluation demonstrates the significant superiority of GSPO over GRPO in training stability, efficiency, and performance.
+Critically, GSPO has inherently resolved the stability challenges in the RL training of large Mixture-of-Experts (MoE) models, eliminating the need for complex stabilization strategies, and shows the potential for simplifying RL infrastructure.
+These merits of GSPO ultimately contributed to the exceptional performance improvements in the latest Qwen3 models.
+We envision GSPO as a robust and scalable algorithmic foundation that will enable the continued advancement of large-scale RL training with language models.
+2
+Preliminaries
+Notation
+In this paper, an autoregressive language model parameterized by $\theta$ is defined as a policy $\pi_{\theta}$.
+We use $x$ to denote a query and $\mathcal{D}$ as the query set.
+Given a response $y$ to a query $x$, its likelihood under the policy $\pi_{\theta}$ is denoted as $\pi_{\theta}(y|x)=\prod_{t=1}^{|y|}\pi_{\theta}(y_{t}|x,y_{<t})$, where $|y|$ denotes the number of tokens in $y$.
+A query-response pair $(x,y)$ can be scored by a verifier $r$, resulting in a reward $r(x,y)\in[0,1]$.
+Proximal Policy Optimization (PPO)
+Using samples generated from the old policy $\pi_{\theta_{\text{old}}}$, PPO (Schulman et al., 2017) constrains the policy update within a proximal region of the old policy through the clipping mechanism.
+Specifically, PPO employs the following objective for policy optimization (we omit the KL regularization term hereinafter for brevity, as it is not the focus of this paper):
+$$\mathcal{J}_{\text{PPO}}(\theta)=\mathbb{E}_{x\sim\mathcal{D},\,y\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{|y|}\sum_{t=1}^{|y|}\min\left(w_{t}(\theta)\widehat{A}_{t},\,\mathrm{clip}\left(w_{t}(\theta),1-\varepsilon,1+\varepsilon\right)\widehat{A}_{t}\right)\right], \tag{1}$$
+where the importance ratio of the token $y_{t}$ is defined as $w_{t}(\theta)=\frac{\pi_{\theta}(y_{t}|x,y_{<t})}{\pi_{\theta_{\text{old}}}(y_{t}|x,y_{<t})}$, the advantage $\widehat{A}_{t}$ of $y_{t}$ is estimated by another value model, and $\varepsilon$ is the clipping range of importance ratios.
+The core challenge of PPO in practice lies in its heavy reliance on the value model.
+Specifically, the value model usually has a similar size to the policy model, introducing a considerable memory and computational burden.
+Furthermore, the algorithmic effectiveness hinges on the reliability of its value estimate.
+While acquiring a reliable value model is inherently challenging, ensuring its scalability to longer responses and more complex tasks presents an even greater challenge.
+Group Relative Policy Optimization (GRPO)
+GRPO (Shao et al., 2024) bypasses the need for the value model by computing the relative advantage of each response within a group of responses to the same query.
+Specifically, GRPO optimizes the following objective:
+$$\mathcal{J}_{\text{GRPO}}(\theta)=\mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}\min\left(w_{i,t}(\theta)\widehat{A}_{i,t},\,\mathrm{clip}\left(w_{i,t}(\theta),1-\varepsilon,1+\varepsilon\right)\widehat{A}_{i,t}\right)\right], \tag{2}$$
+where $G$ is the number of generated responses to each query $x$ (i.e., the group size), and the importance ratio $w_{i,t}(\theta)$ and advantage $\widehat{A}_{i,t}$ of token $y_{i,t}$ are:
+$$w_{i,t}(\theta)=\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})},\quad \widehat{A}_{i,t}=\widehat{A}_{i}=\frac{r(x,y_{i})-\mathrm{mean}\left(\{r(x,y_{i})\}_{i=1}^{G}\right)}{\mathrm{std}\left(\{r(x,y_{i})\}_{i=1}^{G}\right)}, \tag{3}$$
+respectively, where all the tokens in $y_{i}$ share the same advantage as $\widehat{A}_{i}$.
+3
+Motivation
+The growth in model size, sparsity (e.g., in Mixture-of-Experts models), and response length necessitates a large rollout batch size to maximize hardware utilization during RL.
+To improve sample efficiency, it is standard practice to partition a large batch of rollout data into multiple mini-batches for gradient updates.
+This procedure inevitably introduces an off-policy learning setting, where responses $y$ are sampled from an old policy $\pi_{\theta_{\text{old}}}$ rather than the current policy $\pi_{\theta}$ being optimized.
+This also explains the necessity of the clipping mechanism in PPO and GRPO, which prevents overly “off-policy” samples from being involved in gradient estimation.
+While mechanisms like clipping aim to manage this off-policy discrepancy, we identify a more fundamental issue in GRPO:
+its objective is ill-posed
+.
+This problem becomes particularly acute when training large models on long-response tasks, leading to catastrophic model collapse.
+The ill-posed nature of the GRPO objective stems from a misapplication of importance sampling weights.
+The principle of importance sampling is to estimate the expectation of a function $f$ under a target distribution $\pi_{\text{tar}}$ by re-weighting samples drawn from a behavior distribution $\pi_{\text{beh}}$:
+$$\mathbb{E}_{z\sim\pi_{\text{tar}}}\left[f(z)\right]=\mathbb{E}_{z\sim\pi_{\text{beh}}}\left[\frac{\pi_{\text{tar}}(z)}{\pi_{\text{beh}}(z)}\,f(z)\right]. \tag{4}$$
+Crucially, this relies on averaging over multiple samples ($N\gg 1$) from the behavior distribution $\pi_{\text{beh}}$ for the importance weight $\frac{\pi_{\text{tar}}(z)}{\pi_{\text{beh}}(z)}$ to effectively correct for the distributional mismatch.
+In contrast, GRPO applies the importance weight $\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})}$ at each token position $t$.
+Since this weight is based on a single sample $y_{i,t}$ from each next-token distribution $\pi_{\theta_{\text{old}}}(\cdot|x,y_{i,<t})$, it fails to perform the intended distribution-correction role.
+Instead, it introduces high-variance noise into the training gradients, which accumulates over long sequences and is exacerbated by the clipping mechanism.
+We have empirically observed that this can lead to model collapse that is often irreversible.
+Once the collapse occurs, resuming training is unavailing, even when reverting to a previous checkpoint and meticulously tuning hyperparameters (e.g., the clipping ranges), extending generation length, or switching the RL queries.
+The above observation suggests a fundamental issue in GRPO’s design.
+The failure of the token-level importance weight points to a core principle: **the unit of optimization objective should match the unit of reward**.
+Since the reward is granted to the entire sequence, applying off-policy correction at the token level appears problematic.
+This motivates us to forego the token-level objective and explore utilizing importance weights and performing optimization directly at the **sequence level**.
+4
+Algorithm
+4.1
+GSPO: Group Sequence Policy Optimization
+While the token-level importance weight $\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})}$ is problematic in GRPO, we observe that in the context of language generation, the *sequence-level* importance weight $\frac{\pi_{\theta}(y|x)}{\pi_{\theta_{\text{old}}}(y|x)}$ has a clear theoretical meaning: it reflects how far the response $y$ sampled from $\pi_{\theta_{\text{old}}}(\cdot|x)$ deviates from $\pi_{\theta}(\cdot|x)$, which naturally aligns with the sequence-level reward and can also serve as a meaningful indicator of the clipping mechanism.
+Based on this straightforward observation, we propose the **Group Sequence Policy Optimization (GSPO)** algorithm.
+GSPO employs the following sequence-level optimization objective:
+$$\mathcal{J}_{\text{GSPO}}(\theta)=\mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}\min\left(s_{i}(\theta)\widehat{A}_{i},\,\mathrm{clip}\left(s_{i}(\theta),1-\varepsilon,1+\varepsilon\right)\widehat{A}_{i}\right)\right], \tag{5}$$
+where we adopt the group-based advantage estimation:
+$$\widehat{A}_{i}=\frac{r(x,y_{i})-\mathrm{mean}\left(\{r(x,y_{i})\}_{i=1}^{G}\right)}{\mathrm{std}\left(\{r(x,y_{i})\}_{i=1}^{G}\right)}, \tag{6}$$
+and define the importance ratio $s_{i}(\theta)$ based on sequence likelihood (Zheng et al., 2023):
+$$s_{i}(\theta)=\left(\frac{\pi_{\theta}(y_{i}|x)}{\pi_{\theta_{\text{old}}}(y_{i}|x)}\right)^{\frac{1}{|y_{i}|}}=\exp\left(\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}\log\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})}\right). \tag{7}$$
+Therefore, GSPO applies clipping to entire responses instead of individual tokens to exclude the overly “off-policy” samples from gradient estimation, which matches both the sequence-level rewarding and optimization.
+Note that we adopt length normalization in $s_{i}(\theta)$ to reduce the variance and to control $s_{i}(\theta)$ within a unified numerical range.
+Otherwise, the likelihood changes of a few tokens can result in dramatic fluctuations of the sequence-level importance ratio, and the importance ratios of responses with different lengths will require varying clipping ranges.
+We also note that the clipping ranges in GSPO and in previous algorithms (e.g., GRPO) typically differ in order of magnitude due to the distinct definitions of importance ratios.
+4.2
+Gradient Analysis
+We can derive the gradient of the GSPO objective as follows (clipping is omitted for brevity):
+\begin{align}
+\nabla_{\theta}\mathcal{J}_{\text{GSPO}}(\theta) &= \nabla_{\theta}\mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}s_{i}(\theta)\widehat{A}_{i}\right] \tag{8} \\
+&= \mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}s_{i}(\theta)\widehat{A}_{i}\cdot\nabla_{\theta}\log s_{i}(\theta)\right] \tag{9} \\
+&= \mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}\left(\frac{\pi_{\theta}(y_{i}|x)}{\pi_{\theta_{\text{old}}}(y_{i}|x)}\right)^{\frac{1}{|y_{i}|}}\widehat{A}_{i}\cdot\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}\nabla_{\theta}\log\pi_{\theta}(y_{i,t}|x,y_{i,<t})\right]. \tag{10}
+\end{align}
+For comparison, the gradient of the GRPO objective is as follows (note that $\widehat{A}_{i,t}=\widehat{A}_{i}$):
+\begin{align}
+\nabla_{\theta}\mathcal{J}_{\text{GRPO}}(\theta) &= \nabla_{\theta}\mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}w_{i,t}(\theta)\widehat{A}_{i,t}\right] \tag{11} \\
+&= \mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}\widehat{A}_{i}\cdot\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})}\nabla_{\theta}\log\pi_{\theta}(y_{i,t}|x,y_{i,<t})\right]. \tag{12}
+\end{align}
+Therefore, the fundamental distinction between GSPO and GRPO lies in **how they weight the gradients of the log likelihoods of tokens**.
+In GRPO, the tokens are weighted according to their respective “importance weight” $\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})}$.
+However, these unequal weights, which can vary among $(0,1+\varepsilon]$ (for $\widehat{A}_{i}>0$) or $[1-\varepsilon,+\infty)$ (for $\widehat{A}_{i}<0$), are not negligible, and their impact can accumulate and lead to unpredictable consequences as training progresses.
+In contrast, GSPO weights all the tokens in a response equally, eliminating this instability factor of GRPO.
+4.3
+GSPO-token: A Token-level Objective Variant
+In scenarios like multi-turn RL, we may desire a finer-grained advantage adjustment than the sequence level.
+To this end, we introduce a token-level objective variant of GSPO, namely **GSPO-token**, to allow token-wise advantage customization:
+$$\mathcal{J}_{\text{GSPO-token}}(\theta)=\mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}\min\left(s_{i,t}(\theta)\widehat{A}_{i,t},\,\mathrm{clip}\left(s_{i,t}(\theta),1-\varepsilon,1+\varepsilon\right)\widehat{A}_{i,t}\right)\right], \tag{13}$$
+where
+$$s_{i,t}(\theta)=\mathrm{sg}\left[s_{i}(\theta)\right]\cdot\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\mathrm{sg}\left[\pi_{\theta}(y_{i,t}|x,y_{i,<t})\right]}, \tag{14}$$
+and $\mathrm{sg}[\cdot]$ denotes only taking the numerical value but stopping the gradient, corresponding to the `detach` operation in PyTorch.
+The gradient of GSPO-token can be derived as:
+\begin{align}
+\nabla_{\theta}\mathcal{J}_{\text{GSPO-token}}(\theta) &= \nabla_{\theta}\mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}s_{i,t}(\theta)\widehat{A}_{i,t}\right] \tag{15} \\
+&= \mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}s_{i}(\theta)\cdot\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}\widehat{A}_{i,t}\frac{\nabla_{\theta}\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}\right] \tag{16} \\
+&= \mathbb{E}_{x\sim\mathcal{D},\,\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|x)}\left[\frac{1}{G}\sum_{i=1}^{G}\left(\frac{\pi_{\theta}(y_{i}|x)}{\pi_{\theta_{\text{old}}}(y_{i}|x)}\right)^{\frac{1}{|y_{i}|}}\cdot\frac{1}{|y_{i}|}\sum_{t=1}^{|y_{i}|}\widehat{A}_{i,t}\nabla_{\theta}\log\pi_{\theta}(y_{i,t}|x,y_{i,<t})\right]. \tag{17}
+\end{align}
+Note that the term $\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\mathrm{sg}\left[\pi_{\theta}(y_{i,t}|x,y_{i,<t})\right]}$ has a numerical value of 1, so $s_{i,t}(\theta)$ is numerically equal to $s_{i}(\theta)$.
+Comparing Equation (5) and (13), and Equation (10) and (17), GSPO-token and GSPO are numerically identical in the optimization objective, clipping condition, and theoretical gradient when we set the advantages of all the tokens in the response $y_{i}$ to the same value (i.e., $\widehat{A}_{i,t}=\widehat{A}_{i}$), while GSPO-token enjoys the higher flexibility of adjusting the advantages per token.
+5
+Experiments and Discussion
+5.1
+Empirical Results
+We experiment with a cold-start model fine-tuned from Qwen3-30B-A3B-Base, and report the training reward curves as well as the model performance curves on the AIME’24 (average Pass@1 over 32 samplings), LiveCodeBench (202410-202502, average Pass@1 over 8 samplings), and CodeForces (Elo Rating) benchmarks.
+During the RL training, each batch of rollout data is partitioned into four mini-batches for gradient updates.
+In GSPO, we set the left and right clipping ranges in Equation (5) to 3e-4 and 4e-4, respectively.
+We compare against GRPO as the baseline and set the left and right clipping ranges in Equation (2) to 0.2 and 0.27, respectively, which we have carefully tuned to ensure a fair comparison.
+Note that GRPO necessitates the Routing Replay training strategy for the normal convergence of MoE RL, which we will additionally discuss in §5.3, while **GSPO has obviated the need for this strategy**.
+Figure 1 shows that the training with GSPO proceeds stably throughout.
+We observe that **GSPO can deliver continuous performance improvement through increasing the training compute, regularly updating the query set, and extending the generation length**.
+Moreover, GSPO also demonstrates superior training efficiency over GRPO, achieving better training accuracy and benchmark performance under the same training compute and consumed queries.
+Finally, we have successfully applied GSPO to the RL training of the latest Qwen3 models, strongly proving the efficacy of GSPO in unleashing the power of RL scaling for large language models.
+Figure 1:
+Training curves of a cold-start model fine-tuned from Qwen3-30B-A3B-Base.
+GSPO possesses remarkably higher training efficiency than GRPO.
+5.2
+Curious Observation on Clipping Fractions
+A key distinction of GSPO compared to GRPO is its practice of clipping entire responses rather than individual tokens.
+Particularly, as shown in Figure 2, we observe a difference of two orders of magnitude in the fractions of clipped tokens between GSPO and GRPO (while adjusting the clipping ranges does not alter the disparity in magnitude).
+However, despite clipping significantly more tokens and consequently using fewer for training (or gradient estimation), GSPO still achieves higher training efficiency than GRPO.
+This counter-intuitive finding — that clipping a much larger fraction of tokens leads to superior training efficiency — further indicates that GRPO’s token-level gradient estimates are inherently noisy and inefficient for sample exploitation.
+In contrast, GSPO’s sequence-level approach provides a more reliable and effective learning signal.
+Figure 2:
+Average fractions of clipped tokens over the RL training of GSPO and GRPO.
+5.3
+Benefit of GSPO for MoE Training
+Background
+Compared to the RL training of dense models, the sparse activation nature of MoE models introduces unique stability challenges.
+In particular, we found that when adopting the GRPO algorithm, the expert-activation volatility of MoE models can prevent RL training from converging properly.
+To be specific, after one or more gradient updates, the experts activated for the same response can change significantly.
+For example, with the 48-layer Qwen3-30B-A3B-Base model, after each RL gradient update and for the same rollout sample, roughly 10% of the experts activated under the new policy $\pi_{\theta}$ are different from those activated under the old policy $\pi_{\theta_{\text{old}}}$.
+This phenomenon, which becomes more prominent in deeper MoE models, makes the token-level importance ratios $w_{i,t}(\theta)=\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})}$ fluctuate drastically and further invalidates them, as discussed in §3 and §4.2, consequently hindering the normal convergence of RL training.
+Our Previous Approach
+To tackle this challenge, we previously employed the Routing Replay training strategy.
+Specifically, we cache the activated experts in $\pi_{\theta_{\text{old}}}$ and “replay” these routing modes in $\pi_{\theta}$ when computing the importance ratios $w_{i,t}(\theta)=\frac{\pi_{\theta}(y_{i,t}|x,y_{i,<t})}{\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})}$.
+In this way, for each token $y_{i,t}$, $\pi_{\theta}(y_{i,t}|x,y_{i,<t})$ and $\pi_{\theta_{\text{old}}}(y_{i,t}|x,y_{i,<t})$ share the same activated network, so that we can restore the stability of the token-level importance ratios and ensure optimization of the consistent activated network across gradient updates.
+Figure 3 demonstrates that Routing Replay serves as an essential technique in the normal convergence of the GRPO training of MoE models.
+Figure 3:
+The Routing Replay strategy plays a critical role in the normal convergence of the GRPO training of MoE models.
+Benefit of GSPO
+Although Routing Replay enables the GRPO training of MoE models to converge properly, its practice of reusing routing modes incurs additional memory and communication overhead and can also limit the actual capacity of the MoE model.
+In contrast, as shown in Figure 1, GSPO eliminates the dependency on Routing Replay and is fully capable of computing the importance ratios $s_{i}(\theta)$ conventionally, converging normally, and optimizing stably.
+The key insight is that GSPO focuses only on the sequence likelihood (i.e., $\pi_{\theta}(y_{i}|x)$) and is not sensitive to the individual token likelihood (i.e., $\pi_{\theta}(y_{i,t}|x,y_{i,<t})$).
+Since the MoE model always maintains its language modeling capability, the sequence likelihood will not fluctuate drastically.
+In summary, GSPO fundamentally resolves the expert-activation volatility issue in MoE models, obviating the need for complex workarounds like Routing Replay.
+This not only simplifies and stabilizes the training process but also allows the model to leverage its full capacity without artificial constraints.
+5.4
+Benefit of GSPO for RL Infrastructure
+Given the precision discrepancies between training engines (e.g., Megatron) and inference engines (e.g., SGLang and vLLM), in practice, we typically use the training engine to recompute the likelihoods of sampled responses under the old policy $\pi_{\theta_{\text{old}}}$.
+However, GSPO uses only sequence-level, rather than token-level, likelihoods for optimization, and intuitively, the former is much more tolerant of precision discrepancies.
+Hence, GSPO makes it possible to directly use the likelihoods returned by the inference engine for optimization, thereby avoiding the need for recomputation with the training engine.
+This can be especially beneficial in scenarios like partial rollout and multi-turn RL and in the training-inference disaggregated frameworks.
+6
+Conclusion
+We propose Group Sequence Policy Optimization (GSPO), a new reinforcement learning algorithm for training large language models.
+Following the basic principle of importance sampling, GSPO defines importance ratios based on sequence likelihood and performs sequence-level clipping, rewarding, and optimization.
+GSPO demonstrates notably superior training stability, efficiency, and performance compared to GRPO and exhibits particular efficacy for the large-scale RL training of MoE models, laying the foundation for the exceptional improvements in the latest Qwen3 models.
+With GSPO as a scalable algorithmic cornerstone, we will continue to scale RL and look forward to the resulting fundamental advances in intelligence.
+References
+DeepSeek-AI [2025]
+DeepSeek-AI.
+Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning.
+arXiv preprint arXiv:2501.12948
+, 2025.
+MiniMax [2025]
+MiniMax.
+Minimax-m1: Scaling test-time compute efficiently with lightning attention.
+arXiv preprint arXiv:2506.13585
+, 2025.
+OpenAI [2024]
+OpenAI.
+Learning to reason with LLMs, 2024.
+URL
+https://openai.com/index/learning-to-reason-with-llms/
+.
+Qwen [2025a]
+Team Qwen.
+Qwen3 technical report.
+arXiv preprint arXiv:2505.09388
+, 2025a.
+Qwen [2025b]
+Team Qwen.
+Qwq-32b: Embracing the power of reinforcement learning, March 2025b.
+URL
+https://qwenlm.github.io/blog/qwq-32b/
+.
+Schulman et al. [2017]
+John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017.
+Shao et al. [2024]
+Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, Y. K. Li, Y. Wu, and Daya Guo.
+Deepseekmath: Pushing the limits of mathematical reasoning in open language models.
+arXiv preprint arXiv:2402.03300
+, 2024.
+Zheng et al. [2023]
+Chujie Zheng, Pei Ke, Zheng Zhang, and Minlie Huang.
+Click: Controllable text generation with sequence likelihood contrastive learning.
+In
+Findings of the Association for Computational Linguistics: ACL 2023
+, 2023.
+URL
+https://aclanthology.org/2023.findings-acl.65/
+.
+BETA
\ No newline at end of file
diff --git a/research/notes/grpo-trainer-hugging-face.md b/research/notes/grpo-trainer-hugging-face.md
new file mode 100644
index 0000000000000000000000000000000000000000..d830e650cdfa9c1a6c39ad768766573410cb9136
--- /dev/null
+++ b/research/notes/grpo-trainer-hugging-face.md
@@ -0,0 +1,5483 @@
+---
+title: GRPO Trainer · Hugging Face
+id: grpo-trainer-hugging-face
+tags:
+- deepread
+created: '2026-06-10T00:40:11.273760Z'
+source: https://huggingface.co/docs/trl/en/grpo_trainer
+source_domain: huggingface.co
+fetched_at: '2026-06-10T00:40:11.273608Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: ground_truth
+content_type: docs
+deprecated: false
+---
+
+GRPO Trainer · Hugging Face
+TRL documentation
+GRPO Trainer
+TRL
+🏡 View all docs
+AWS Trainium & Inferentia
+Accelerate
+Argilla
+AutoTrain
+Bitsandbytes
+CLI
+Chat UI
+Dataset viewer
+Datasets
+Deploying on AWS
+Diffusers
+Distilabel
+Evaluate
+Google Cloud
+Google TPUs
+Gradio
+Hub
+Hub Python Library
+Huggingface.js
+Inference Endpoints (dedicated)
+Inference Providers
+Kernels
+LeRobot
+Leaderboards
+Lighteval
+Microsoft Azure
+OpenEnv
+Optimum
+PEFT
+Reachy Mini
+Safetensors
+Sentence Transformers
+TRL
+Tasks
+Text Embeddings Inference
+Text Generation Inference
+Tokenizers
+Trackio
+Transformers
+Transformers.js
+Xet
+smolagents
+timm
+Search documentation
+main
+v1.5.1
+v1.4.0
+v1.3.0
+v1.2.0
+v1.1.0
+v1.0.0
+v0.29.1
+v0.28.0
+v0.27.2
+v0.26.2
+v0.25.1
+v0.24.0
+v0.23.1
+v0.22.2
+v0.21.0
+v0.20.0
+v0.19.1
+v0.18.1
+v0.17.0
+v0.16.1
+v0.15.2
+v0.14.0
+v0.13.0
+v0.12.2
+v0.11.4
+v0.10.1
+v0.9.6
+v0.8.6
+v0.7.11
+v0.6.0
+v0.5.0
+v0.4.7
+v0.3.1
+v0.2.1
+v0.1.1
+EN
+Join the Hugging Face community
+and get access to the augmented documentation experience
+Collaborate on models, datasets and Spaces
+Faster examples with accelerated inference
+Switch between documentation themes
+Sign Up
+to get started
+Copy page
+GRPO Trainer
+Overview
+TRL supports the GRPO Trainer for training language models, as described in the paper DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models by Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Mingchuan Zhang, Y. K. Li, Y. Wu, Daya Guo.
+The abstract from the paper is the following:
+Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline. Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO.
+This post-training method was contributed by
+Quentin Gallouédec
+.
+Quick start
+This example demonstrates how to train a model using the GRPO method. We train a
+Qwen2.5 0.5B Instruct model
+with the prompts from the
+DeepMath-103K dataset
+. You can view the data in the dataset here:
+Below is the script to train the model.
+Copied
+# train_grpo.py
+from
+datasets
+import
+load_dataset
+from
+trl
+import
+GRPOTrainer
+from
+trl.rewards
+import
+accuracy_reward
+
+dataset = load_dataset(
+"trl-lib/DeepMath-103K"
+, split=
+"train"
+)
+
+trainer = GRPOTrainer(
+    model=
+"Qwen/Qwen2.5-0.5B-Instruct"
+,
+    reward_funcs=accuracy_reward,
+    train_dataset=dataset,
+)
+trainer.train()
+Execute the script using the following command:
+Copied
+accelerate launch train_grpo.py
+Distributed across 8 GPUs, the training takes approximately 1 day.
+Note:
+The reward curves above were generated with
+Qwen/Qwen2-0.5B-Instruct
+. Results with
+Qwen/Qwen2.5-0.5B-Instruct
+will be qualitatively similar.
+Looking deeper into the GRPO method
+GRPO is an online learning algorithm, meaning it improves iteratively by using the data generated by the trained model itself during training. The intuition behind the GRPO objective is to maximize the advantage of the generated completions, while ensuring that the model remains close to the reference policy. To understand how GRPO works, it can be broken down into four main steps: generating completions, computing the advantage, estimating the KL divergence, and computing the loss.
+Generating completions
+At each training step, we sample a batch of prompts and generate a set of $G$ completions for each prompt (denoted as $o_i$).
+Computing the advantage
+For each of the $G$ sequences, we compute the reward using a reward model or reward function. To align with the comparative nature of reward models—typically trained on datasets of comparisons between outputs for the same question—the advantage is calculated to reflect these relative comparisons. It is normalized as follows:
+$$\hat{A}_{i,t} = \frac{r_i - \text{mean}(\mathbf{r})}{\text{std}(\mathbf{r})}$$
+This approach gives the method its name: Group Relative Policy Optimization (GRPO).
+It was shown in the paper Understanding R1-Zero-Like Training: A Critical Perspective that scaling by $\text{std}(\mathbf{r})$ may cause a question-level difficulty bias. You can disable this scaling by setting `scale_rewards=False` in `GRPOConfig`.
+Note that turning off std-based scaling also removes variance normalization, so update magnitudes depend directly on the raw reward scale and batch composition.
+As shown in Part I: Tricks or Traps? A Deep Dive into RL for LLM Reasoning (Lite PPO), calculating the mean at the local (group) level and the standard deviation at the global (batch) level enables more robust reward shaping. You can use this scaling strategy by setting `scale_rewards="batch"` in `GRPOConfig`.
+Estimating the KL divergence
+KL divergence is estimated using the approximator introduced by Schulman et al. (2020). The approximator is defined as follows:
+$$\mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right] = \frac{\pi_{\text{ref}}(o_{i,t} \mid q, o_{i,<t})}{\pi_\theta(o_{i,t} \mid q, o_{i,<t})} - \log \frac{\pi_{\text{ref}}(o_{i,t} \mid q, o_{i,<t})}{\pi_\theta(o_{i,t} \mid q, o_{i,<t})} - 1,$$
+Computing the loss
+The objective is to maximize the advantage while ensuring that the model remains close to the reference policy. Consequently, the loss is defined as follows:
+$$\mathcal{L}_{\text{GRPO}}(\theta) = -\frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \left[ \frac{\pi_\theta(o_{i,t} \mid q, o_{i,<t})}{\left[\pi_\theta(o_{i,t} \mid q, o_{i,<t})\right]_{\text{no grad}}} \hat{A}_{i,t} - \beta \mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right] \right],$$
+where the first term represents the scaled advantage and the second term penalizes deviations from the reference policy through KL divergence.
+Note that compared to the original formulation in DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models, we don’t scale by $\frac{1}{|o_i|}$ because it was shown in the paper Understanding R1-Zero-Like Training: A Critical Perspective that this introduces a response-level length bias. More details in the Loss Types section.
+Note that compared to the original formulation in DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models, we use $\beta = 0.0$ by default, meaning that the KL divergence term is not used. This choice is motivated by several recent studies (e.g., Open-Reasoner-Zero: An Open Source Approach to Scaling Up Reinforcement Learning on the Base Model) which have shown that the KL divergence term is not essential for training with GRPO. As a result, it has become common practice to exclude it (e.g., Understanding R1-Zero-Like Training: A Critical Perspective, DAPO: An Open-Source LLM Reinforcement Learning System at Scale). If you wish to include the KL divergence term, you can set `beta` in `GRPOConfig` to a non-zero value.
+In the original paper, this formulation is generalized to account for multiple updates after each generation (denoted $\mu$, can be set with `num_iterations` in `GRPOConfig`) by leveraging the clipped surrogate objective:
+$$\mathcal{L}_{\text{GRPO}}(\theta) = - \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \left[ \min \left( \frac{\pi_\theta(o_{i,t} \mid q, o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,<t})} \hat{A}_{i,t}, \, \text{clip}\left( \frac{\pi_\theta(o_{i,t} \mid q, o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,<t})}, 1 - \epsilon, 1 + \epsilon \right) \hat{A}_{i,t} \right) - \beta \mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right] \right],$$
+where $\text{clip}(\cdot, 1 - \epsilon, 1 + \epsilon)$ ensures that updates do not deviate excessively from the reference policy by bounding the policy ratio between $1 - \epsilon$ and $1 + \epsilon$.
+When $\mu = 1$ (default in TRL), the clipped surrogate objective simplifies to the original objective.
+Loss Types
+Several formulations of the objective have been proposed in the literature. Initially, the objective of GRPO was defined as follows:
+$$\mathcal{L}_{\text{GRPO}}(\theta) = - \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} l_{i,t},$$
+where
+$$l_{i,t} = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,<t})}{\left[\pi_\theta(o_{i,t} \mid q, o_{i,<t})\right]_{\text{no grad}}} \hat{A}_{i,t} - \beta \mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right].$$
+The DAPO paper highlights the limitations of the GRPO algorithm’s sample-level loss in long-CoT scenarios, where longer responses are under-penalized, leading to poorer quality outputs. The proposed solution is a token-level normalization, which better handles longer sequences by assigning more balanced rewards to individual tokens, regardless of response length:
+$$\mathcal{L}_{\text{DAPO}}(\theta) = - \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} l_{i,t},$$
+To use this formulation, set `loss_type="dapo"` in `GRPOConfig`.
+Furthermore, it was demonstrated in the paper Understanding R1-Zero-Like Training: A Critical Perspective that the initial GRPO formulation introduces a response length bias. They show that while the DAPO formulation reduces this bias, it does not eliminate it completely. To fully remove this bias, they propose dividing by a constant instead of the sequence length, resulting in the following formulation:
+$$\mathcal{L}_{\text{Dr. GRPO}}(\theta) = - \frac{1}{LG} \sum_{i=1}^G \sum_{t=1}^{|o_i|} l_{i,t},$$
+This constant $L$ is recommended to be the maximum completion length. To use this formulation, set `loss_type="dr_grpo"` in the `GRPOConfig`.
+Alternatively, in the SAPO paper, the Qwen team proposes replacing the “hard” clipping mechanism of GRPO with a smooth, temperature-controlled soft gating mechanism. While GRPO zeroes out gradients when the policy deviates too far from the reference, SAPO uses a soft trust region that smoothly decays the gradient weight. This allows the model to retain useful learning signals from “near-on-policy” tokens while suppressing noise from extreme deviations.
+The loss function is defined as:
+$$\mathcal{L}_{\text{SAPO}}(\theta) = - \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} f_{i,t} \left( \frac{\pi_\theta(o_{i,t} \mid q, o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,<t})} \right) \hat{A}_{i,t}$$
+The soft-gating function $f_{i,t}$ is defined using the sigmoid function $\sigma$ as:
+$$f_{i,t}(x) = \sigma \left( \tau_{i,t} (x - 1) \right) \cdot \frac{4}{\tau_{i,t}}$$
+The temperature $\tau_{i,t}$ is chosen based on the sign of the advantage $\hat{A}_{i,t}$:
+$$\tau_{i,t} = \begin{cases} \tau_{\text{pos}}, & \text{if } \hat{A}_{i,t} > 0 \\ \tau_{\text{neg}}, & \text{otherwise} \end{cases}$$
+They recommend using asymmetric temperatures, $\tau_{\text{neg}} > \tau_{\text{pos}}$ (defaults are $\tau_{\text{pos}}=1.0, \tau_{\text{neg}}=1.05$). This ensures that the model is penalized more strictly for “bad” actions to prevent instability, while being more permissive with “good” actions.
+To use this formulation, set `loss_type="sapo"` in the `GRPOConfig`.
+Logged metrics
+While training and evaluating, we record the following reward metrics:
+num_tokens
+: The total number of tokens processed so far, including both prompts and completions. When using tools, only non-tool tokens are counted.
+step_time
+: The average time (in seconds) taken per training step (including generation).
+completions/mean_length
+: The average length of generated completions. When using tools, only non-tool tokens are counted.
+completions/min_length
+: The minimum length of generated completions. When using tools, only non-tool tokens are counted.
+completions/max_length
+: The maximum length of generated completions. When using tools, only non-tool tokens are counted.
+completions/mean_terminated_length
+: The average length of generated completions that terminate with EOS. When using tools, only non-tool tokens are counted.
+completions/min_terminated_length
+: The minimum length of generated completions that terminate with EOS. When using tools, only non-tool tokens are counted.
+completions/max_terminated_length
+: The maximum length of generated completions that terminate with EOS. When using tools, only non-tool tokens are counted.
+completions/clipped_ratio
+: The ratio of truncated (clipped) completions.
+reward/{reward_func_name}/mean
+: The average reward from a specific reward function.
+reward/{reward_func_name}/std
+: The standard deviation of the reward from a specific reward function.
+reward
+: The overall average reward after summing rewards across functions (weighted by
+reward_weights
+).
+reward_std
+: The standard deviation of summed rewards across functions (weighted by
+reward_weights
+), computed over the full batch.
+frac_reward_zero_std
+: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
+entropy
+: Average entropy of token predictions across generated completions. (If
+mask_truncated_completions=True
+, masked sequences tokens are excluded.)
+kl
+: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if
+beta
+is nonzero.
+`clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region: $\text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)$, with $r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,<t})}$. A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
+`clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: $r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}$.
+`clip_ratio/low_min`: The minimum ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: $r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}$.
+`clip_ratio/high_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the upper bound of the trust region: $r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}$.
+`clip_ratio/high_max`: The maximum ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the upper bound of the trust region: $r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}$.
+Customization
+Speed up training with vLLM-powered generation
+Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use
+vLLM
+, a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
+Copied
+pip install trl[vllm]
+We support two ways of using vLLM during training:
+server mode
+and
+colocate mode
+.
+By default, Truncated Importance Sampling is activated for vLLM generation to address the generation-training mismatch that occurs when using different frameworks. This can be turned off by setting
+vllm_importance_sampling_correction=False
+. For more information, see
+Truncated Importance Sampling
+Option 1: Colocate mode
+In this mode, vLLM runs inside the trainer process and shares GPU memory with the training model. This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs. This is the default mode.
+Copied
+from
+trl
+import
+GRPOConfig
+
+training_args = GRPOConfig(
+    ...,
+    use_vllm=
+True
+,
+# vllm_mode="colocate" by default
+)
+Option 2: Server mode
+In this mode, vLLM runs in a separate process (and using separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.
+Start the vLLM server
+:
+Copied
+trl vllm-serve --model <model_name>
+Enable server mode in your training script
+:
+Copied
+from
+trl
+import
+GRPOConfig
+
+training_args = GRPOConfig(
+    ...,
+    use_vllm=
+True
+,
+    vllm_mode=
+"server"
+,
+)
+Make sure that the server is using different GPUs than the trainer, otherwise you may run into NCCL errors. You can specify the GPUs to use with the
+CUDA_VISIBLE_DEVICES
+environment variable.
+Depending on the model size and the overall GPU memory requirements for training, you may need to adjust the
+vllm_gpu_memory_utilization
+parameter in
+GRPOConfig
+to avoid underutilization or out-of-memory errors.
+We provide a
+HF Space
+to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get
+vllm_gpu_memory_utilization
+recommendation:
+If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
+If you still find you are getting out-of-memory errors set
+vllm_enable_sleep_mode
+to True and the vllm parameters and cache will be offloaded during the optimization step. For more information, see
+Reducing Memory Usage with vLLM Sleep Mode
+.
+By default, GRPO uses
+MASTER_ADDR=localhost
+and
+MASTER_PORT=12345
+for vLLM, but you can override these values by setting the environment variables accordingly.
+For more information, see
+Speeding up training with vLLM
+.
+Dealing with the Training-Inference Mismatch
+While vLLM greatly accelerates inference, it also decouples the inference engine from the training engine. In theory these engines are mathematically identical; in practice, however, they can produce different outputs due to precision effects and hardware-specific optimizations. This divergence reflects the distinct optimization goals of the two systems. Inference engines aim to maximize sampling throughput, typically measured in tokens per second, while maintaining acceptable sampling fidelity. Training frameworks instead focus on numerical stability and precision for gradient computation, often using higher precision formats like FP32 for master weights and optimizer states. These differing priorities and constraints introduce an inevitable, albeit subtle, mismatch between training and inference.
+This mismatch leads to a biased gradient update which has been observed to destabilize training (
+[1]
+[2]
+[3]
+[4]
+[5]
+). For simplicity, consider the REINFORCE policy gradient:
+$$
+\nabla_\theta \mathcal{J}(x,\theta)
+= \mathbb{E}_{y \sim \pi^\text{train}(\cdot \mid x,\theta)}
+\left[ \nabla_\theta \log \pi^\text{train}(y \mid x,\theta) \cdot R(x,y) \right]
+$$
+Here $x$ denotes prompts sampled from some data distribution, and $\pi^\text{train}$ is the policy implemented by the training engine. With vLLM in the loop we obtain a separate inference policy $\pi^\text{inference}$, so the effective policy gradient becomes
+$$
+\nabla_\theta \mathcal{J}_{\text{biased}}(x,\theta)
+= \mathbb{E}_{y \sim \pi^\text{inference}(\cdot \mid x,\theta)}
+\left[ \nabla_\theta \log \pi^\text{train}(y \mid x,\theta) \cdot R(x,y) \right].
+$$
+This turns an otherwise on-policy RL problem into an off-policy one.
+The standard way to correct for this distribution shift is
+importance sampling (IS)
+. We provide two IS variants:
+Truncated Importance Sampling (TIS)
+and
+Masked Importance Sampling (MIS)
+. Both variants can be applied either at the token level or at the sequence level. Let $\rho$ denote the importance weight, for example $\rho_t$ per token or $\rho_{\text{seq}}$ per sequence. Under TIS, ratios larger than
+vllm_importance_sampling_cap
+are clipped,
+$$
+\rho \leftarrow \min(\rho, C),
+$$
+where $C$ is the cap.
+Under MIS, ratios larger than
+vllm_importance_sampling_cap
+are set to zero, so those samples do not contribute to the gradient. In other words, large ratio samples are downweighted under TIS and discarded under MIS. The configuration flag
+vllm_importance_sampling_mode
+chooses both the IS variant (masking or truncation) and the granularity (token level or sequence level).
+Importance sampling is the principled algorithmic response to the training–inference mismatch. However, there are also more direct approaches that attempt to reduce the mismatch between the two engines themselves. Most of these are engineering solutions. For example,
+MiniMax M1 uses an FP32 language model head
+in the inference engine. Thinking Machines has explored
+deterministic inference kernels
+, although this comes with a significant efficiency cost. vLLM has shown
+bitwise consistent policies
+by building on the batch invariant deterministic kernels from Thinking Machines, but as of November 2025 there remains a substantial throughput penalty relative to standard vLLM inference.
+GRPO at scale: train a 70B+ Model on multiple nodes
+When training large models like
+Qwen2.5-72B
+, you need several key optimizations to make the training efficient and scalable across multiple GPUs and nodes. These include:
+DeepSpeed ZeRO Stage 3
+: ZeRO leverages data parallelism to distribute model states (weights, gradients, optimizer states) across multiple GPUs and CPUs, reducing memory and compute requirements on each device. Since large models cannot fit on a single GPU, using ZeRO Stage 3 is required for training such models. For more details, see
+DeepSpeed Integration
+.
+Accelerate
+: Accelerate is a library that simplifies distributed training across multiple GPUs and nodes. It provides a simple API to launch distributed training and handles the complexities of distributed training, such as data parallelism, gradient accumulation, and distributed data loading. For more details, see
+Distributing Training
+.
+vLLM
+: See the previous section on how to use vLLM to speed up generation.
+Below is an example SLURM script to train a 70B model with GRPO on multiple nodes. This script trains a model on 4 nodes and uses the 5th node for vLLM-powered generation.
+Copied
+#!/bin/bash
+#SBATCH --nodes=5
+#SBATCH --gres=gpu:8
+# Get the list of allocated nodes
+NODELIST=($(scontrol show hostnames
+$SLURM_JOB_NODELIST
+))
+# Assign the first 4 nodes for training and the 5th node for vLLM
+TRAIN_NODES=
+"
+${NODELIST[@]:0:4}
+"
+# Nodes 0, 1, 2, 3 for training
+VLLM_NODE=
+"
+${NODELIST[4]}
+"
+# Node 4 for vLLM
+# Run training on the first 4 nodes (Group 1)
+srun --nodes=4 --ntasks=4 --nodelist=
+"
+${NODELIST[@]:0:4}
+"
+accelerate launch \
+     --config_file examples/accelerate_configs/deepspeed_zero3.yaml \
+     --num_processes 32 \
+     --num_machines 4 \
+     --main_process_ip
+${NODELIST[0]}
+\
+     --machine_rank
+$SLURM_PROCID
+\
+     --rdzv_backend c10d \
+     train_grpo.py \
+     --server_ip
+$VLLM_NODE
+&
+# Run vLLM server on the 5th node (Group 2)
+srun --nodes=1 --ntasks=1 --nodelist=
+"
+${NODELIST[4]}
+"
+trl vllm-serve --model Qwen/Qwen2.5-72B --tensor_parallel_size 8 &
+wait
+Copied
+import
+argparse
+from
+datasets
+import
+load_dataset
+from
+trl
+import
+GRPOTrainer, GRPOConfig
+from
+trl.rewards
+import
+accuracy_reward
+def
+main
+():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+"--vllm_server_host"
+,
+type
+=
+str
+, default=
+""
+,
+help
+=
+"The server IP"
+)
+    args = parser.parse_args()
+
+    dataset = load_dataset(
+"trl-lib/DeepMath-103K"
+, split=
+"train"
+)
+
+    training_args = GRPOConfig(
+        per_device_train_batch_size=
+4
+,
+        use_vllm=
+True
+,
+        vllm_mode=
+"server"
+,
+        vllm_server_host=args.vllm_server_host.replace(
+"ip-"
+,
+""
+).replace(
+"-"
+,
+"."
+),
+# from ip-X-X-X-X to X.X.X.X
+)
+
+    trainer = GRPOTrainer(
+        model=
+"Qwen/Qwen2.5-72B"
+,
+        args=training_args,
+        reward_funcs=accuracy_reward,
+        train_dataset=dataset
+    )
+    trainer.train()
+if
+__name__==
+"__main__"
+:
+    main()
+Using a custom reward function
+The
+GRPOTrainer
+supports using custom reward functions instead of dense reward models. To ensure compatibility, your reward function must satisfy the following requirements:
+Reward functions can be either synchronous Python callables or asynchronous
+async def
+coroutines. When you provide multiple asynchronous reward functions, they are awaited concurrently (run in parallel via
+asyncio.gather
+) so their latency overlaps.
+Input arguments
+:
+The function must accept the following as keyword arguments:
+prompts
+(contains the prompts),
+completions
+(contains the generated completions),
+completion_ids
+(contains the tokenized completions),
+trainer_state
+(
+TrainerState
+): The current state of the trainer. This can be used to implement dynamic reward functions, such as curriculum learning, where the reward is adjusted based on the training progress.
+log_extra
+: a callable
+log_extra(column: str, values: list)
+to add extra columns to the completions table. See Example 6. In distributed training, it’s important that all processes log the same set of keys.
+log_metric
+: a callable
+log_metric(name: str, value: float)
+to log scalar metrics as plots alongside
+kl
+,
+entropy
+, etc. See Example 6. In distributed training, it’s important that all processes log the same set of keys.
+environments
+: a list of environment instances, one per completion. Only present when
+environment_factory
+is provided. Use this to read state accumulated during the episode (e.g.,
+env.reward
+).
+All column names (except
+prompt
+) that the dataset may have. For example, if the dataset contains a column named
+ground_truth
+, the function will be called with
+ground_truth
+as a keyword argument.
+The easiest way to comply with this requirement is to use
+**kwargs
+in the function signature.
+Depending on the dataset format, the input will vary:
+For
+standard format
+,
+prompts
+and
+completions
+will be lists of strings.
+For
+conversational format
+,
+prompts
+and
+completions
+will be lists of message dictionaries.
+Return value
+: The function must return a list of floats. Each float represents the reward corresponding to a single completion.
+Example 1: Reward longer completions
+Below is an example of a reward function for a standard format that rewards longer completions:
+Copied
+def
+reward_func
+(
+completion_ids, **kwargs
+):
+"""Reward function that assigns higher scores to longer completions (in terms of token count)."""
+return
+[
+float
+(
+len
+(ids))
+for
+ids
+in
+completion_ids]
+You can test it as follows:
+Copied
+>>>
+prompts = [
+"The sky is"
+,
+"The sun is"
+]
+# not used in the reward function, but the trainer will pass it
+>>>
+completions = [
+" blue."
+,
+" in the sky."
+]
+# not used in the reward function, but the trainer will pass it
+>>>
+completion_ids = [[
+6303
+,
+13
+], [
+304
+,
+279
+,
+12884
+,
+13
+]]
+>>>
+reward_func(prompts=prompts, completions=completions, completion_ids=completion_ids)
+[
+2.0
+,
+4.0
+]
+Example 1.1: Reward longer completions (based on the number of characters)
+Same as the previous example, but this time the reward function is based on the number of characters instead of tokens.
+Copied
+def
+reward_func
+(
+completions, **kwargs
+):
+"""Reward function that assigns higher scores to longer completions (in terms of character count)."""
+return
+[
+float
+(
+len
+(completion))
+for
+completion
+in
+completions]
+You can test it as follows:
+Copied
+>>>
+prompts = [
+"The sky is"
+,
+"The sun is"
+]
+>>>
+completions = [
+" blue."
+,
+" in the sky."
+]
+>>>
+completion_ids = [[
+6303
+,
+13
+], [
+304
+,
+279
+,
+12884
+,
+13
+]]
+# not used in the reward function, but the trainer will pass it
+>>>
+reward_func(prompts=prompts, completions=completions, completion_ids=completion_ids)
+[
+6.0
+,
+12.0
+]
+Example 2: Reward completions with a specific format
+Below is an example of a reward function that checks if the completion has a specific format. This example is inspired by the
+format reward
+function used in the paper
+DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning
+.
+It is designed for a conversational format, where prompts and completions consist of structured messages.
+Copied
+import
+re
+def
+format_reward_func
+(
+completions, **kwargs
+):
+"""Reward function that checks if the completion has a specific format."""
+pattern =
+r"^<think>.*?</think><answer>.*?</answer>$"
+completion_contents = [completion[
+0
+][
+"content"
+]
+for
+completion
+in
+completions]
+    matches = [re.
+match
+(pattern, content)
+for
+content
+in
+completion_contents]
+return
+[
+1.0
+if
+match
+else
+0.0
+for
+match
+in
+matches]
+You can test this function as follows:
+Copied
+>>>
+prompts = [
+...
+[{
+"role"
+:
+"assistant"
+,
+"content"
+:
+"What is the result of (1 + 2) * 4?"
+}],
+...
+[{
+"role"
+:
+"assistant"
+,
+"content"
+:
+"What is the result of (3 + 1) * 2?"
+}],
+...
+]
+>>>
+completions = [
+...
+[{
+"role"
+:
+"assistant"
+,
+"content"
+:
+"<think>The sum of 1 and 2 is 3, which we multiply by 4 to get 12.</think><answer>(1 + 2) * 4 = 12</answer>"
+}],
+...
+[{
+"role"
+:
+"assistant"
+,
+"content"
+:
+"The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."
+}],
+...
+]
+>>>
+format_reward_func(prompts=prompts, completions=completions)
+[
+1.0
+,
+0.0
+]
+Example 3: Reward completions based on a reference
+Below is an example of a reward function that checks if the completion is correct. This example is inspired by the
+accuracy reward
+function used in the paper
+DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning
+.
+This example is designed for
+standard format
+, where the dataset contains a column named
+ground_truth
+.
+Copied
+import
+re
+def
+reward_func
+(
+completions, ground_truth, **kwargs
+):
+# Regular expression to capture content inside \boxed{}
+matches = [re.search(
+r"\\boxed\{(.*?)\}"
+, completion)
+for
+completion
+in
+completions]
+    contents = [
+match
+.group(
+1
+)
+if
+match
+else
+""
+for
+match
+in
+matches]
+# Reward 1 if the content is the same as the ground truth, 0 otherwise
+return
+[
+1.0
+if
+c == gt
+else
+0.0
+for
+c, gt
+in
+zip
+(contents, ground_truth)]
+You can test this function as follows:
+Copied
+>>>
+prompts = [
+"Problem: Solve the equation $2x + 3 = 7$. Solution:"
+,
+"Problem: Solve the equation $3x - 5 = 10$."
+]
+>>>
+completions = [
+r" The solution is \boxed{2}."
+,
+r" The solution is \boxed{6}."
+]
+>>>
+ground_truth = [
+"2"
+,
+"5"
+]
+>>>
+reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
+[
+1.0
+,
+0.0
+]
+Example 4: Multi-task reward functions
+Below is an example of using multiple reward functions in the
+GRPOTrainer
+. In this example, we define two task-specific reward functions:
+math_reward_func
+and
+coding_reward_func
+. The
+math_reward_func
+rewards math problems based on their correctness, while the
+coding_reward_func
+rewards coding problems based on whether the solution works.
+Copied
+from
+datasets
+import
+Dataset
+from
+trl
+import
+GRPOTrainer
+# Define a dataset that contains both math and coding problems
+dataset = Dataset.from_list(
+    [
+        {
+"prompt"
+:
+"What is 2+2?"
+,
+"task"
+:
+"math"
+},
+        {
+"prompt"
+:
+"Write a function that returns the sum of two numbers."
+,
+"task"
+:
+"code"
+},
+        {
+"prompt"
+:
+"What is 3*4?"
+,
+"task"
+:
+"math"
+},
+        {
+"prompt"
+:
+"Write a function that returns the product of two numbers."
+,
+"task"
+:
+"code"
+},
+    ]
+)
+# Math-specific reward function
+def
+math_reward_func
+(
+prompts, completions, task, **kwargs
+):
+    rewards = []
+for
+prompt, completion, t
+in
+zip
+(prompts, completions, task):
+if
+t ==
+"math"
+:
+# Calculate math-specific reward
+correct = check_math_solution(prompt, completion)
+            reward =
+1.0
+if
+correct
+else
+-
+1.0
+rewards.append(reward)
+else
+:
+# Return None for non-math tasks
+rewards.append(
+None
+)
+return
+rewards
+# Coding-specific reward function
+def
+coding_reward_func
+(
+prompts, completions, task, **kwargs
+):
+    rewards = []
+for
+prompt, completion, t
+in
+zip
+(prompts, completions, task):
+if
+t ==
+"coding"
+:
+# Calculate coding-specific reward
+works = test_code_solution(prompt, completion)
+            reward =
+1.0
+if
+works
+else
+-
+1.0
+rewards.append(reward)
+else
+:
+# Return None for non-coding tasks
+rewards.append(
+None
+)
+return
+rewards
+# Use both task-specific reward functions
+trainer = GRPOTrainer(
+    model=
+"Qwen/Qwen2.5-0.5B-Instruct"
+,
+    reward_funcs=[math_reward_func, coding_reward_func],
+    train_dataset=dataset,
+)
+
+trainer.train()
+In this example, the
+math_reward_func
+and
+coding_reward_func
+are designed to work with a mixed dataset that contains both math and coding problems. The
+task
+column in the dataset is used to determine which reward function to apply to each problem. If there is no relevant reward function for a sample in the dataset, the reward function will return
+None
+, and the
+GRPOTrainer
+will continue with the valid functions and tasks. This allows the
+GRPOTrainer
+to handle multiple reward functions with different applicability.
+Note that the
+GRPOTrainer
+will ignore the
+None
+rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.
+Example 5: Asynchronous reward functions
+Custom reward functions can also be defined as
+async def
+coroutines. This is useful if your reward depends on slow I/O (for example, calling a remote service). When you pass multiple async reward functions,
+GRPOTrainer
+executes them concurrently so their latency overlaps.
+Below is a minimal example of an async reward function that simulates an I/O-bound operation:
+Copied
+import
+asyncio
+async
+def
+async_reward_func
+(
+prompts, completions, **kwargs
+):
+# Simulate an I/O-bound call (e.g., HTTP request, database lookup)
+await
+asyncio.sleep(
+0.01
+)
+# Simple toy reward: 1.0 if the completion is non-empty, else 0.0
+return
+[
+1.0
+if
+completion
+else
+0.0
+for
+completion
+in
+completions]
+Example 6: Logging extra columns and metrics
+Below is an example of a reward function that logs extra columns to the completions table and scalar metrics as plots.
+Copied
+import
+re
+def
+reward_func
+(
+completions, ground_truth, log_extra=
+None
+, log_metric=
+None
+, **kwargs
+):
+    extracted = [re.search(
+r"\\boxed\{(.*?)\}"
+, c)
+for
+c
+in
+completions]
+    extracted = [m.group(
+1
+)
+if
+m
+else
+None
+for
+m
+in
+extracted]
+    rewards = [
+1.0
+if
+e == gt
+else
+0.0
+for
+e, gt
+in
+zip
+(extracted, ground_truth)]
+if
+log_extra:
+        log_extra(
+"golden_answer"
+,
+list
+(ground_truth))
+        log_extra(
+"extracted_answer"
+, [e
+or
+"[none]"
+for
+e
+in
+extracted])
+if
+log_metric:
+        log_metric(
+"accuracy"
+,
+sum
+(rewards) /
+len
+(rewards))
+return
+rewards
+Passing the reward function to the trainer
+To use your custom reward function, pass it to the
+GRPOTrainer
+as follows:
+Copied
+from
+trl
+import
+GRPOTrainer
+
+trainer = GRPOTrainer(
+    reward_funcs=reward_func,
+    ...,
+)
+You can pass several reward functions as a list; this list may include both synchronous and asynchronous functions:
+Copied
+from
+trl
+import
+GRPOTrainer
+
+trainer = GRPOTrainer(
+    reward_funcs=[reward_func, async_reward_func1, async_reward_func2],
+    ...,
+)
+and the reward will be computed as the sum of the rewards from each function, or the weighted sum if
+reward_weights
+is provided in the config.
+Note that
+GRPOTrainer
+supports multiple reward functions of different types. See the parameters documentation for more details.
+Rapid Experimentation for GRPO
+RapidFire AI is an open-source experimentation engine that sits on top of TRL and lets you launch multiple GRPO configurations at once, even on a single GPU. Instead of trying configurations sequentially, RapidFire lets you
+see all their learning curves earlier, stop underperforming runs, and clone promising ones with new settings in flight
+without restarting. For more information, see
+RapidFire AI Integration
+.
+Agent Training
+GRPO supports
+agent training
+through the
+tools
+argument in
+GRPOTrainer
+.
+This parameter expects a list of Python functions (sync or async) that define the tools available to the agent:
+Copied
+from
+trl
+import
+GRPOTrainer
+
+trainer = GRPOTrainer(
+    tools=[tool1, tool2],
+    ...,
+)
+Each tool must be a standard Python function with
+type-hinted arguments and return types
+, along with a
+Google-style docstring
+describing its purpose, arguments, and return value.
+For more details, see the
+Passing tools guide
+.
+The GRPO tool call loop requires the chat template to be
+prefix-preserving
+(appending a tool message must not change how earlier messages are rendered). For known model families (e.g. Qwen3, DeepSeek-V3), TRL automatically swaps in a patched training template when tools are enabled. See
+Chat Templates
+for the full list.
+Example:
+Copied
+from
+trl
+import
+GRPOTrainer
+def
+multiply
+(
+a:
+int
+, b:
+int
+) ->
+int
+:
+"""
+    Multiplies two integers.
+
+    Args:
+        a: The first integer.
+        b: The second integer.
+
+    Returns:
+        The product of the two integers.
+    """
+return
+a * b
+async
+def
+async_add
+(
+a:
+int
+, b:
+int
+) ->
+int
+:
+"""
+    Asynchronously adds two integers.
+
+    Args:
+        a: The first integer.
+        b: The second integer.
+
+    Returns:
+        The sum of the two integers.
+    """
+return
+a + b
+
+trainer = GRPOTrainer(
+    tools=[multiply, async_add],
+    ...,
+)
+You can also provide tools through
+environment_factory
+. In this mode,
+GRPOTrainer
+creates one environment instance per rollout and exposes the environment’s public methods as tools.
+environment_factory
+requires
+transformers>=5.2.0
+.
+The following is a minimal example of using
+environment_factory
+to define a simple environment with an
+increment
+method, which is exposed as a tool to the agent:
+Copied
+from
+datasets
+import
+Dataset
+from
+trl
+import
+GRPOConfig, GRPOTrainer
+
+instructions = [
+f"Increment the counter by
+{i}
+."
+for
+i
+in
+range
+(
+1
+,
+7
+)]
+dataset = Dataset.from_dict({
+"prompt"
+: [[{
+"role"
+:
+"user"
+,
+"content"
+: instruction}]
+for
+instruction
+in
+instructions]})
+def
+reward_func
+(
+environments, **kwargs
+):
+# dummy reward: the reward is the current value of the counter
+return
+[environment.counter
+for
+environment
+in
+environments]
+class
+IncrementEnv
+:
+def
+reset
+(
+self, **kwargs
+) ->
+str
+|
+None
+:
+# required; receives sampled row fields as kwargs (e.g., `prompt`)
+self.counter =
+0
+return
+"Counter reset to 0.\n"
+def
+increment
+(
+self, step:
+int
+) ->
+int
+:
+# the other public methods of the environment are exposed as tools
+"""
+        Increment the internal counter.
+
+        Args:
+            step: Value to add to the counter.
+
+        Returns:
+            The updated counter value.
+        """
+self.counter += step
+return
+self.counter
+
+trainer = GRPOTrainer(
+    model=
+"Qwen/Qwen3-0.6B"
+,
+    args=GRPOConfig(chat_template_kwargs={
+"enable_thinking"
+:
+False
+}),
+    train_dataset=dataset,
+    reward_funcs=reward_func,
+    environment_factory=IncrementEnv,
+)
+trainer.train()
+reset
+can return either
+None
+or a string. In GRPO, when it returns a string, that string is appended to the last user message before generation.
+Multimodal Tool Responses
+Tools can return images alongside text by returning a list of content blocks. This is useful for VLM agent training where the tool provides visual feedback (e.g., screenshots, plots, camera captures).
+Copied
+from
+PIL
+import
+Image
+def
+take_screenshot
+() ->
+list
+:
+"""
+    Takes a screenshot of the current screen.
+
+    Returns:
+        The screenshot image with a description.
+    """
+img = Image.
+open
+(
+"screenshot.png"
+)
+return
+[{
+"type"
+:
+"image"
+,
+"image"
+: img}, {
+"type"
+:
+"text"
+,
+"text"
+:
+"Here is the screenshot."
+}]
+The returned images are automatically injected into the conversation and passed to the VLM for subsequent generation turns.
+Supported Models
+Tested with:
+Gemma4
+— e.g.,
+google/gemma-4-E2B-it
+GLM-4-MoE
+(
+4.5
+,
+4.6
+or
+4.7
+) — e.g.,
+zai-org/GLM-4.7
+GPT-OSS
+— e.g.,
+openai/gpt-oss-20b
+Llama 3.1
+— e.g.,
+meta-llama/Llama-3.1-8B-Instruct
+Llama 3.2
+— e.g.,
+meta-llama/Llama-3.2-3B-Instruct
+Qwen2.5
+— e.g.,
+Qwen/Qwen2.5-0.5B-Instruct
+Qwen3
+— e.g.,
+Qwen/Qwen3-0.6B
+Qwen3-VL
+— e.g.,
+Qwen/Qwen3-VL-2B-Instruct
+Qwen3.5
+— e.g.,
+Qwen/Qwen3.5-2B
+Qwen3.6
+— e.g.,
+Qwen/Qwen3.6-35B-A3B
+Compatibility with all LLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes.
+Quick Start
+Use
+grpo_agent.py
+to fine-tune an LLM for agentic workflows.
+Copied
+accelerate launch \
+  --config_file=examples/accelerate_configs/deepspeed_zero3.yaml \
+  examples/scripts/grpo_agent.py \
+  --model_name_or_path Qwen/Qwen3-0.6B
+  ...
+Vision-Language Model (VLM) Training
+GRPO supports training Vision-Language Models (VLMs) on multimodal datasets containing both text and images.
+Supported Models
+Tested with:
+Gemma3
+— e.g.,
+google/gemma-3-4b-it
+LLaVA-NeXT
+— e.g.,
+llava-hf/llava-v1.6-mistral-7b-hf
+Qwen2-VL
+— e.g.,
+Qwen/Qwen2-VL-2B-Instruct
+Qwen2.5-VL
+— e.g.,
+Qwen/Qwen2.5-VL-3B-Instruct
+SmolVLM2
+— e.g.,
+HuggingFaceTB/SmolVLM2-2.2B-Instruct
+Compatibility with all VLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes.
+Quick Start
+Use
+grpo_vlm.py
+to fine-tune a VLM. Example command for training on
+lmms-lab/multimodal-open-r1-8k-verified
+:
+Copied
+accelerate launch \
+  --config_file=examples/accelerate_configs/deepspeed_zero3.yaml \
+  examples/scripts/grpo_vlm.py \
+  --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \
+  --output_dir grpo-Qwen2.5-VL-3B-Instruct \
+  --learning_rate 1e-5 \
+  --dtype bfloat16 \
+  --max_completion_length 1024 \
+  --use_vllm \
+  --vllm_mode colocate \
+  --use_peft \
+  --lora_target_modules
+"q_proj"
+,
+"v_proj"
+\
+  --log_completions
+Configuration Tips
+Use LoRA on vision-language projection layers
+Enable 4-bit quantization to reduce memory usage
+VLMs are memory-intensive — start with smaller batch sizes
+Most models are compatible with vLLM (
+server
+and
+colocate
+modes)
+Dataset Format
+Each training sample should include:
+prompt
+: Text formatted via the processor’s chat template
+image
+/
+images
+: PIL Image or list of PIL Images
+The trainer automatically handles image-to-tensor conversion via the model’s image processor.
+GRPOTrainer
+class
+trl.
+GRPOTrainer
+<
+source
+>
+(
+model
+: str | PreTrainedModel | PeftModel
+reward_funcs
+: str | transformers.modeling_utils.PreTrainedModel | collections.abc.Callable[..., list[float | None]] | list[str | transformers.modeling_utils.PreTrainedModel | collections.abc.Callable[..., list[float | None]]]
+args
+: trl.trainer.grpo_config.GRPOConfig | None = None
+train_dataset
+: datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | None = None
+eval_dataset
+: datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | dict[str, datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset] | None = None
+processing_class
+: transformers.tokenization_utils_base.PreTrainedTokenizerBase | transformers.processing_utils.ProcessorMixin | None = None
+reward_processing_classes
+: transformers.tokenization_utils_base.PreTrainedTokenizerBase | list[transformers.tokenization_utils_base.PreTrainedTokenizerBase] | None = None
+callbacks
+: list[transformers.trainer_callback.TrainerCallback] | None = None
+optimizers
+: tuple = (None, None)
+peft_config
+: PeftConfig | None = None
+tools
+: list[collections.abc.Callable] | None = None
+rollout_func
+: collections.abc.Callable[[list[str], 'GRPOTrainer'], dict[str, typing.Any]] | None = None
+environment_factory
+: collections.abc.Callable[[], trl.trainer.grpo_trainer._SupportsReset] | None = None
+)
+Parameters
+model
+(
+str
+or
+PreTrainedModel
+or
+PeftModel
+) —
+Model to be trained. Can be either:
+A string, being the
+model id
+of a pretrained model hosted inside a model repo on huggingface.co, or a
+path to a
+directory
+containing model weights saved using
+save_pretrained
+, e.g.,
+'./my_model_directory/'
+. The model is loaded
+using
+<ModelArchitecture>.from_pretrained
+(where
+<ModelArchitecture>
+is derived from the model
+config) with the keyword arguments in
+args.model_init_kwargs
+.
+A
+PreTrainedModel
+object. Only causal language models are supported.
+A
+PeftModel
+object. Only causal language models are supported.
+reward_funcs
+(
+RewardFunc | list[RewardFunc]
+) —
+Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
+functions with the prompts and completions and sum the rewards. Can be either:
+A single reward function, such as:
+A string: The
+model ID
+of a pretrained model hosted inside a model repo on huggingface.co, or a
+path to a
+directory
+containing model weights saved using
+save_pretrained
+, e.g.,
+'./my_model_directory/'
+. The model is loaded
+using
+from_pretrained
+with
+num_labels=1
+and the
+keyword arguments in
+args.model_init_kwargs
+.
+A
+PreTrainedModel
+object: Only sequence classification models are supported.
+A custom reward function: The function is provided with the prompts and the generated completions,
+plus any additional columns in the dataset. It should return a list of rewards. Custom reward
+functions can be either synchronous or asynchronous and can also return
+None
+when the reward is
+not applicable to those samples. This is useful for multi-task training where different reward
+functions apply to different types of samples. When a reward function returns
+None
+for a sample,
+that reward function is excluded from the reward calculation for that sample. For more details, see
+Using a custom reward
+function
+.
+The trainer’s state is also passed to the reward function. The trainer’s state is an instance of
+TrainerState
+and can be accessed by accessing the
+trainer_state
+argument to the
+reward function’s signature.
+A list of reward functions, where each item can independently be any of the above types. Mixing different
+types within the list (e.g., a string model ID and a custom reward function) is allowed.
+args
+(
+GRPOConfig
+,
+optional
+) —
+Configuration for this trainer. If
+None
+, a default configuration is used.
+train_dataset
+(
+Dataset
+or
+IterableDataset
+) —
+Dataset to use for training. It must include a column
+"prompt"
+. Any additional columns in the dataset are
+ignored. The format of the samples can be either:
+Standard
+: Each sample contains plain text.
+Conversational
+: Each sample contains structured messages (e.g., role
+and content).
+eval_dataset
+(
+Dataset
+,
+IterableDataset
+or
+dict[str, Dataset | IterableDataset]
+) —
+Dataset to use for evaluation. It must meet the same requirements as
+train_dataset
+.
+processing_class
+(
+PreTrainedTokenizerBase
+,
+ProcessorMixin
+,
+optional
+) —
+Processing class used to process the data. The padding side must be set to “left”. If
+None
+, the
+processing class is loaded from the model’s name with
+from_pretrained
+. A
+padding token,
+tokenizer.pad_token
+, must be set. If the processing class has not set a padding token,
+tokenizer.eos_token
+will be used as the default.
+reward_processing_classes
+(
+PreTrainedTokenizerBase
+or
+list[PreTrainedTokenizerBase]
+,
+optional
+) —
+Processing classes corresponding to the reward functions specified in
+reward_funcs
+. Can be either:
+A single processing class: Used when
+reward_funcs
+contains only one reward function.
+A list of processing classes: Must match the order and length of the reward functions in
+reward_funcs
+.
+If set to
+None
+, or if an element of the list corresponding to a
+PreTrainedModel
+is
+None
+, the tokenizer for the model is automatically loaded using
+from_pretrained
+. For elements in
+reward_funcs
+that are custom reward
+functions (not
+PreTrainedModel
+), the corresponding entries in
+reward_processing_classes
+are ignored.
+callbacks
+(list of
+TrainerCallback
+,
+optional
+) —
+List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+in
+here
+.
+If you want to remove one of the default callbacks used, use the
+remove_callback
+method.
+optimizers
+(
+tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]
+,
+optional
+, defaults to
+(None, None)
+) —
+A tuple containing the optimizer and the scheduler to use. Will default to an instance of
+AdamW
+on your
+model and a scheduler given by
+get_linear_schedule_with_warmup
+controlled by
+args
+.
+peft_config
+(
+PeftConfig
+,
+optional
+) —
+PEFT configuration used to wrap the model. If
+None
+, the model is not wrapped.
+tools
+(list of
+Callable
+,
+optional
+) —
+A list of callable tool functions (sync or async) that the model can invoke during generation. Each tool
+should be a standard Python function with properly type-hinted arguments and return values, and a
+Google-style docstring describing its purpose, arguments, and return value. For more details, see:
+https://huggingface.co/docs/transformers/en/chat_extras#passing-tools
+. The model uses the function’s name,
+type hints, and docstring to determine how to call it. Ensure that the model’s chat template supports tool
+use and that it has been fine-tuned for tool calling.
+rollout_func
+(
+RolloutFunc
+,
+optional
+) —
+Function to use for generating completions. It receives the list of prompts allocated to the current
+process and the trainer instance. It must return a dict with
+"prompt_ids"
+,
+"completion_ids"
+, and
+"logprobs"
+fields, and can optionally return
+"logprob_token_ids"
+(same shape as
+"logprobs"
+). Any
+other fields are forwarded to the reward functions. The function receives the raw per-process prompt slice
+with no duplication; it is responsible for returning the correct number of completions per prompt (see
+num_generations
+/
+num_generations_eval
+on the trainer). This feature is experimental and may change or
+be removed at any time without prior notice.
+environment_factory
+(
+EnvironmentFactory
+,
+optional
+) —
+A callable that creates and returns an environment instance. The environment class should define methods
+that can be invoked as tools during generation. Each method should comply with the same requirements as the
+tools
+described above. If
+environment_factory
+is provided, an instance of the environment is created
+for each generation in the batch, allowing for parallel and independent interactions. The environment must
+also implement a callable
+reset
+method that can be used to reset state between generations. The
+reset
+method should return either
+None
+or a string: when it returns a string, that string is appended to the
+last user message before generation. This feature is experimental and may change or be removed at any time
+without prior notice.
+Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
+paper
+DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
+Models
+.
+Example:
+Copied
+from
+trl
+import
+GRPOTrainer
+from
+trl.rewards
+import
+accuracy_reward
+from
+datasets
+import
+load_dataset
+
+dataset = load_dataset(
+"trl-lib/DeepMath-103K"
+, split=
+"train"
+)
+
+trainer = GRPOTrainer(
+    model=
+"Qwen/Qwen2.5-0.5B-Instruct"
+,
+    reward_funcs=accuracy_reward,
+    train_dataset=dataset,
+)
+trainer.train()
+train
+<
+source
+>
+(
+resume_from_checkpoint
+: str | bool | None = None
+trial
+: optuna.Trial | dict[str, Any] | None = None
+ignore_keys_for_eval
+: list[str] | None = None
+)
+→
+~trainer_utils.TrainOutput
+Parameters
+resume_from_checkpoint
+(
+str
+or
+bool
+,
+optional
+) —
+If a
+str
+, local path to a saved checkpoint as saved by a previous instance of
+Trainer
+. If a
+bool
+and equals
+True
+, load the last checkpoint in
+args.output_dir
+as saved by a previous instance
+of
+Trainer
+. If present, training will resume from the model/optimizer/scheduler states loaded here.
+trial
+(
+optuna.Trial
+or
+dict[str, Any]
+,
+optional
+) —
+The trial run or the hyperparameter dictionary for hyperparameter search.
+ignore_keys_for_eval
+(
+list[str]
+,
+optional
+) —
+A list of keys in the output of your model (if it is a dictionary) that should be ignored when
+gathering predictions for evaluation during the training.
+Returns
+~trainer_utils.TrainOutput
+Object containing the global step count, training loss, and metrics.
+Main training entry point.
+save_model
+<
+source
+>
+(
+output_dir
+: str | None = None
+_internal_call
+: bool = False
+)
+Will save the model, so you can reload it using
+from_pretrained()
+.
+Will only save from the main process.
+push_to_hub
+<
+source
+>
+(
+commit_message
+: str | None = 'End of training'
+blocking
+: bool = True
+token
+: str | None = None
+revision
+: str | None = None
+**kwargs
+)
+Parameters
+commit_message
+(
+str
+,
+optional
+, defaults to
+"End of training"
+) —
+Message to commit while pushing.
+blocking
+(
+bool
+,
+optional
+, defaults to
+True
+) —
+Whether the function should return only when the
+git push
+has finished.
+token
+(
+str
+,
+optional
+, defaults to
+None
+) —
+Token with write permission to overwrite Trainer’s original args.
+revision
+(
+str
+,
+optional
+) —
+The git revision to commit from. Defaults to the head of the “main” branch.
+kwargs
+(
+dict[str, Any]
+,
+optional
+) —
+Additional keyword arguments passed along to
+~Trainer.create_model_card
+.
+Upload
+self.model
+and
+self.processing_class
+to the 🤗 model hub on the repo
+self.args.hub_model_id
+.
+GRPOConfig
+class
+trl.
+GRPOConfig
+<
+source
+>
+(
+output_dir
+: str | None = None
+per_device_train_batch_size
+: int = 8
+num_train_epochs
+: float = 3.0
+max_steps
+: int = -1
+learning_rate
+: float = 1e-06
+lr_scheduler_type
+: transformers.trainer_utils.SchedulerType | str = 'linear'
+lr_scheduler_kwargs
+: dict | str | None = None
+warmup_steps
+: float = 0
+optim
+: transformers.training_args.OptimizerNames | str = 'adamw_torch_fused'
+optim_args
+: str | None = None
+weight_decay
+: float = 0.0
+adam_beta1
+: float = 0.9
+adam_beta2
+: float = 0.999
+adam_epsilon
+: float = 1e-08
+optim_target_modules
+: None | str | list[str] = None
+gradient_accumulation_steps
+: int = 1
+average_tokens_across_devices
+: bool = True
+max_grad_norm
+: float = 1.0
+label_smoothing_factor
+: float = 0.0
+bf16
+: bool | None = None
+fp16
+: bool = False
+bf16_full_eval
+: bool = False
+fp16_full_eval
+: bool = False
+tf32
+: bool | None = None
+gradient_checkpointing
+: bool = True
+gradient_checkpointing_kwargs
+: dict[str, typing.Any] | str | None = None
+torch_compile
+: bool = False
+torch_compile_backend
+: str | None = None
+torch_compile_mode
+: str | None = None
+use_liger_kernel
+: bool = False
+liger_kernel_config
+: dict[str, bool] | None = None
+use_cache
+: bool = False
+neftune_noise_alpha
+: float | None = None
+torch_empty_cache_steps
+: int | None = None
+auto_find_batch_size
+: bool = False
+logging_strategy
+: transformers.trainer_utils.IntervalStrategy | str = 'steps'
+logging_steps
+: float = 10
+logging_first_step
+: bool = False
+log_on_each_node
+: bool = True
+logging_nan_inf_filter
+: bool = True
+include_num_input_tokens_seen
+: str | bool = 'no'
+log_level
+: str = 'passive'
+log_level_replica
+: str = 'warning'
+disable_tqdm
+: bool | None = None
+report_to
+: None | str | list[str] = 'none'
+run_name
+: str | None = None
+project
+: str = 'huggingface'
+trackio_space_id
+: str | None = None
+trackio_bucket_id
+: str | None = None
+trackio_static_space_id
+: typing.Union[str, NoneType, typing.Literal[False]] = None
+eval_strategy
+: transformers.trainer_utils.IntervalStrategy | str = 'no'
+eval_steps
+: float | None = None
+eval_delay
+: float = 0
+per_device_eval_batch_size
+: int = 8
+prediction_loss_only
+: bool = False
+eval_on_start
+: bool = False
+eval_do_concat_batches
+: bool = True
+eval_use_gather_object
+: bool = False
+eval_accumulation_steps
+: int | None = None
+include_for_metrics
+: list = <factory>
+batch_eval_metrics
+: bool = False
+save_only_model
+: bool = False
+save_strategy
+: transformers.trainer_utils.SaveStrategy | str = 'steps'
+save_steps
+: float = 500
+save_on_each_node
+: bool = False
+save_total_limit
+: int | None = None
+enable_jit_checkpoint
+: bool = False
+push_to_hub
+: bool = False
+hub_token
+: str | None = None
+hub_private_repo
+: bool | None = None
+hub_model_id
+: str | None = None
+hub_strategy
+: transformers.trainer_utils.HubStrategy | str = 'every_save'
+hub_always_push
+: bool = False
+hub_revision
+: str | None = None
+load_best_model_at_end
+: bool = False
+metric_for_best_model
+: str | None = None
+greater_is_better
+: bool | None = None
+ignore_data_skip
+: bool = False
+restore_callback_states_from_checkpoint
+: bool = False
+full_determinism
+: bool = False
+seed
+: int = 42
+data_seed
+: int | None = None
+use_cpu
+: bool = False
+accelerator_config
+: dict | str | None = None
+parallelism_config
+: accelerate.parallelism_config.ParallelismConfig | None = None
+dataloader_drop_last
+: bool = False
+dataloader_num_workers
+: int = 0
+dataloader_pin_memory
+: bool = True
+dataloader_persistent_workers
+: bool = False
+dataloader_prefetch_factor
+: int | None = None
+remove_unused_columns
+: bool | None = False
+label_names
+: list[str] | None = None
+train_sampling_strategy
+: str = 'random'
+length_column_name
+: str = 'length'
+ddp_find_unused_parameters
+: bool | None = None
+ddp_bucket_cap_mb
+: int | None = None
+ddp_broadcast_buffers
+: bool | None = None
+ddp_static_graph
+: bool | None = None
+ddp_backend
+: str | None = None
+ddp_timeout
+: int = 1800
+fsdp
+: list[transformers.trainer_utils.FSDPOption] | str | None = None
+fsdp_config
+: dict[str, typing.Any] | str | None = None
+deepspeed
+: dict | str | None = None
+debug
+: str | list[transformers.debug_utils.DebugOption] = ''
+skip_memory_metrics
+: bool = True
+do_train
+: bool = False
+do_eval
+: bool = False
+do_predict
+: bool = False
+resume_from_checkpoint
+: str | None = None
+warmup_ratio
+: float | None = None
+logging_dir
+: str | None = None
+local_rank
+: int = -1
+model_init_kwargs
+: dict[str, typing.Any] | str | None = None
+disable_dropout
+: bool = False
+cast_lm_head_to_fp32
+: bool = False
+num_generations
+: int | None = 8
+num_generations_eval
+: int | None = None
+max_completion_length
+: int | None = 256
+ds3_gather_for_generation
+: bool = True
+shuffle_dataset
+: bool | None = True
+pad_to_multiple_of
+: int | None = None
+generation_batch_size
+: int | None = None
+steps_per_generation
+: int | None = None
+temperature
+: float = 1.0
+top_p
+: float = 1.0
+top_k
+: int = 0
+min_p
+: float | None = None
+generation_kwargs
+: dict | None = None
+chat_template_kwargs
+: dict | None = None
+repetition_penalty
+: float = 1.0
+cache_implementation
+: str | None = None
+use_vllm
+: bool = False
+vllm_mode
+: str = 'colocate'
+vllm_model_impl
+: str = 'vllm'
+vllm_enable_sleep_mode
+: bool = False
+vllm_structured_outputs_regex
+: str | None = None
+vllm_server_base_url
+: str | None = None
+vllm_server_host
+: str = '0.0.0.0'
+vllm_server_port
+: int = 8000
+vllm_server_timeout
+: float = 240.0
+vllm_group_port
+: int = 51216
+vllm_gpu_memory_utilization
+: float = 0.3
+vllm_max_model_length
+: int | None = None
+vllm_tensor_parallel_size
+: int = 1
+beta
+: float = 0.0
+num_iterations
+: int = 1
+epsilon
+: float = 0.2
+delta
+: float | None = None
+epsilon_high
+: float | None = None
+sapo_temperature_neg
+: float = 1.05
+sapo_temperature_pos
+: float = 1.0
+vespo_k_pos
+: float = 2.0
+vespo_lambda_pos
+: float = 3.0
+vespo_k_neg
+: float = 3.0
+vespo_lambda_neg
+: float = 2.0
+importance_sampling_level
+: str = 'token'
+reward_weights
+: list[float] | None = None
+multi_objective_aggregation
+: str = 'sum_then_normalize'
+scale_rewards
+: str = 'group'
+loss_type
+: str = 'dapo'
+mask_truncated_completions
+: bool = False
+sync_ref_model
+: bool = False
+ref_model_mixup_alpha
+: float = 0.6
+ref_model_sync_steps
+: int = 512
+top_entropy_quantile
+: float = 1.0
+max_tool_calling_iterations
+: int | None = None
+vllm_importance_sampling_correction
+: bool = True
+vllm_importance_sampling_mode
+: str = 'sequence_mask'
+vllm_importance_sampling_cap
+: float = 3.0
+off_policy_mask_threshold
+: float | None = None
+use_bias_correction_kl
+: bool = False
+log_completions
+: bool = False
+num_completions_to_print
+: int | None = None
+log_unique_prompts
+: bool = False
+log_completions_hub_repo
+: str | None = None
+use_transformers_paged
+: bool = False
+)
+Parameters that control the model and reference model
+model_init_kwargs
+(
+str
+,
+dict[str, Any]
+,
+optional
+) —
+Keyword arguments for
+from_pretrained
+, used when the
+model
+argument of the
+GRPOTrainer
+is provided as a string.
+disable_dropout
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents
+the model from generating different logprobs for the same input.
+cast_lm_head_to_fp32
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Whether to cast the language modeling head of the policy and reference models to float32. As recommended by
+the
+ScaleRL
+recipe. This flag is only supported when the model
+has untied word embedding and language modeling head layers i.e.
+tie_word_embeddings
+in the model config
+is False.
+Parameters that control the data preprocessing
+remove_unused_columns
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Whether to only keep the column
+"prompt"
+in the dataset. If you use a custom reward function that
+requires any column other than
+"prompts"
+and
+"completions"
+, you should keep this to
+False
+.
+num_generations
+(
+int
+,
+optional
+, defaults to
+8
+) —
+Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size
+* gradient_accumulation_steps) must be evenly divisible by this value.
+num_generations_eval
+(
+int
+or
+None
+,
+optional
+) —
+Number of generations to sample during evaluation. This allows using fewer generations during evaluation to
+save computation. If
+None
+, uses the value of
+num_generations
+.
+max_completion_length
+(
+int
+or
+None
+,
+optional
+, defaults to
+256
+) —
+Maximum length of the generated completion.
+ds3_gather_for_generation
+(
+bool
+,
+optional
+, defaults to
+True
+) —
+This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
+improving generation speed. However, disabling this option allows training models that exceed the VRAM
+capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
+with vLLM generation.
+shuffle_dataset
+(
+bool
+,
+optional
+, defaults to
+True
+) —
+Whether to shuffle the training dataset.
+pad_to_multiple_of
+(
+int
+,
+optional
+) —
+If set, the prompts ids and completions ids will be padded to a multiple of this value.
+Parameters that control generation
+generation_batch_size
+— (
+int
+,
+optional
+):
+Batch size to use for generation. If
+None
+, it defaults to the effective training batch size:
+per_device_train_batch_size * num_processes * steps_per_generation
+. In other words, there is one
+generation batch processed per optimization step. Mutually exclusive with
+steps_per_generation
+.
+steps_per_generation
+— (
+int
+,
+optional
+):
+Number of steps per generation. If
+None
+, it defaults to
+gradient_accumulation_steps
+. Mutually exclusive
+with
+generation_batch_size
+.
+temperature
+(
+float
+, defaults to
+1.0
+) —
+Temperature for sampling. The higher the temperature, the more random the completions.
+top_p
+(
+float
+,
+optional
+, defaults to
+1.0
+) —
+Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
+1.0
+to consider all tokens.
+top_k
+(
+int
+,
+optional
+, defaults to
+0
+) —
+Number of highest probability vocabulary tokens to keep for top-k-filtering. If
+0
+, top-k-filtering is
+disabled and all tokens are considered.
+min_p
+(
+float
+,
+optional
+) —
+Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+value between
+0.0
+and
+1.0
+. Typical values are in the
+0.01-0.2
+range.
+generation_kwargs
+(
+dict[str, Any]
+,
+optional
+) —
+Additional keyword arguments to pass to
+GenerationConfig
+(if using transformers) or
+SamplingParams
+(if using vLLM) when sampling completions. This can be used to further customize the
+generation behavior, such as setting
+suppress_tokens
+,
+num_beams
+, etc. If it contains keys that conflict
+with the other generation parameters (like
+min_p
+,
+top_p
+, etc.), they will override them.
+chat_template_kwargs
+(
+dict[str, Any]
+,
+optional
+) —
+Additional keyword arguments to pass to the
+apply_chat_template
+function when generating completions.
+repetition_penalty
+(
+float
+,
+optional
+, defaults to
+1.0
+) —
+Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
+Values >
+1.0
+encourage the model to use new tokens, while values <
+1.0
+encourage the model to repeat
+tokens.
+cache_implementation
+(
+str
+,
+optional
+) —
+Implementation of the cache method for faster generation when
+use_vllm
+is set to
+False
+.
+Parameters that control generation acceleration powered by vLLM
+use_vllm
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Whether to use vLLM for generating completions. If set to
+True
+, the trainer will use vLLM for generation
+instead of the default model.generate(). Requires
+vllm
+to be installed.
+vllm_mode
+(
+str
+,
+optional
+, defaults to
+"colocate"
+) —
+Mode to use for vLLM integration when
+use_vllm
+is set to
+True
+. Must be one of
+"server"
+or
+"colocate"
+.
+"server"
+: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
+server is running (start with
+trl vllm-serve
+).
+"colocate"
+: vLLM will run in the same process and share the training GPUs. This avoids the need for a
+separate server but may cause resource contention with training.
+vllm_model_impl
+(
+str
+,
+optional
+, defaults to
+"vllm"
+) —
+Model implementation to use for vLLM. Must be one of
+"transformers"
+or
+"vllm"
+.
+"transformers"
+: Use
+the
+transformers
+backend for model implementation.
+"vllm"
+: Use the
+vllm
+library for model
+implementation.
+vllm_structured_outputs_regex
+(
+str
+,
+optional
+) —
+Regex for vLLM structured outputs. If
+None
+(default), structured outputs is disabled.
+Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)
+vllm_server_base_url
+(
+str
+,
+optional
+) —
+Base URL for the vLLM server (e.g.,
+"http://localhost:8000"
+). If provided,
+vllm_server_host
+and
+vllm_server_port
+are ignored.
+vllm_server_host
+(
+str
+,
+optional
+, defaults to
+"0.0.0.0"
+) —
+Host of the vLLM server to connect to. Ignored if
+vllm_server_base_url
+is provided.
+vllm_server_port
+(
+int
+,
+optional
+, defaults to
+8000
+) —
+Port of the vLLM server to connect to. Ignored if
+vllm_server_base_url
+is provided.
+vllm_server_timeout
+(
+float
+,
+optional
+, defaults to
+240.0
+) —
+Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
+timeout, a
+ConnectionError
+is raised.
+vllm_group_port
+(
+int
+,
+optional
+, defaults to
+51216
+) —
+Port number for the weight update group. This is used to communicate with the vLLM server. Unless the port
+is occupied, there is no need to change it.
+Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)
+vllm_gpu_memory_utilization
+(
+float
+,
+optional
+, defaults to
+0.3
+) —
+Control the GPU memory utilization for vLLM. This setting only applies when
+vllm_mode
+is set to
+"colocate"
+. If you are using
+vllm_mode="server"
+, this parameter must be passed separately when
+launching the vLLM server via the
+--vllm_gpu_memory_utilization
+flag.
+vllm_max_model_length
+(
+int
+,
+optional
+) —
+Context window for vLLM. Set it to at least the maximum prompt length in the dataset plus
+max_completion_length
+; if omitted, it is inferred from the model config.
+vllm_tensor_parallel_size
+(
+int
+,
+optional
+, defaults to
+1
+) —
+Control the tensor parallel size for vLLM. This setting only applies when
+vllm_mode
+is set to
+"colocate"
+. If you are using
+vllm_mode="server"
+, this parameter must be passed separately when
+launching the vLLM server via the
+--vllm_tensor_parallel_size
+flag.
+vllm_enable_sleep_mode
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Enable vLLM sleep mode to offload weights/cache during the optimizer step. Keeps GPU memory usage low, but
+waking the engine adds host–device transfer latency.
+Parameters that control the training
+beta
+(
+float
+,
+optional
+, defaults to
+0.0
+) —
+KL coefficient. If
+0.0
+(default), the reference model is not loaded, reducing memory usage and improving
+training speed.
+DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement
+learning
+uses a value of
+0.001
+.
+num_iterations
+(
+int
+,
+optional
+, defaults to
+1
+) —
+Number of iterations per batch (denoted as μ in the algorithm).
+epsilon
+(
+float
+,
+optional
+, defaults to
+0.2
+) —
+Epsilon value for clipping.
+delta
+(
+float
+,
+optional
+) —
+Enables the upper clipping bound in two-sided GRPO loss when set to a float. If
+None
+(default), standard
+GRPO clipping is used. Recommended to be greater than
+1 + ε
+when enabled. This method is introduced in
+the
+INTELLECT-2 tech report
+.
+epsilon_high
+(
+float
+,
+optional
+) —
+Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound
+specified in argument
+epsilon
+. Paper
+DAPO
+recommends
+0.28
+.
+When used with
+loss_type='cispo'
+, this corresponds to the ε_max param specified in the
+ScaleRL
+paper
+and the recommended value is
+5.0
+.
+sapo_temperature_neg
+(
+float
+,
+optional
+, defaults to
+1.05
+) —
+Temperature for tokens with non-positive advantage scores used in the
+sapo
+loss function. This parameter
+is introduced in the
+Soft Adaptive Policy Optimization paper
+.
+sapo_temperature_pos
+(
+float
+,
+optional
+, defaults to
+1.0
+) —
+Temperature for tokens with positive advantage scores used in the
+sapo
+loss function. This parameter is
+introduced in the
+Soft Adaptive Policy Optimization paper
+.
+vespo_k_pos
+(
+float
+,
+optional
+, defaults to
+2.0
+) —
+k parameter for positive advantages, it is the power exponent in the VESPO loss. Controls how aggressively
+we down-weight samples with low importance weights (when the importance sampling ratio < 1).
+vespo_lambda_pos
+(
+float
+,
+optional
+, defaults to
+3.0
+) —
+lambda parameter for positive advantages, it is the decay factor in the VESPO loss. Controls how
+aggressively we down-weight samples with high importance weights (when the importance sampling ratio > 1).
+vespo_k_neg
+(
+float
+,
+optional
+, defaults to
+3.0
+) —
+k parameter for negative advantages, it is the power exponent in the VESPO loss. Controls how aggressively
+we down-weight samples with low importance weights (when the importance sampling ratio < 1).
+vespo_lambda_neg
+(
+float
+,
+optional
+, defaults to
+2.0
+) —
+lambda parameter for negative advantages, it is the exponential decay factor in the VESPO loss. Controls
+how aggressively we down-weight samples with high importance weights (when the importance sampling ratio >
+1).
+importance_sampling_level
+(
+str
+,
+optional
+, defaults to
+"token"
+) —
+Controls whether importance sampling ratios are computed at the
+"token"
+or
+"sequence"
+level.
+"token"
+keeps the raw per-token log-probability ratios (one weight per token).
+"sequence"
+averages the
+log-probability ratios across valid tokens to produce a single ratio per sequence. The
+GSPO
+paper
+shows that sequence-level sampling often yields more
+stable training and better alignment with sequence-level rewards.
+reward_weights
+(
+list[float]
+,
+optional
+) —
+Weights for each reward function. Must match the number of reward functions. If
+None
+, all rewards are
+weighted equally with weight
+1.0
+.
+multi_objective_aggregation
+(
+str
+,
+optional
+, defaults to
+"sum_then_normalize"
+) —
+Method to aggregate multiple reward functions. Supported values are:
+"sum_then_normalize"
+(default): First sums the weighted rewards from each reward function, then applies
+reward scaling/normalization as specified by
+scale_rewards
+(see
+scale_rewards
+for details).
+"normalize_then_sum"
+: First normalizes/scales each reward function across generations (within each
+group), then sums the normalized rewards using the specified weights. The aggregated reward is then
+normalized at the batch level when forming advantages. This is the suggested approach from the paper
+GDPO: Group reward-Decoupled Normalization Policy Optimization for Multi-reward RL
+Optimization
+.
+scale_rewards
+(
+str
+or
+bool
+,
+optional
+, defaults to
+"group"
+) —
+Specifies the scaling strategy for rewards. Supported values are:
+True
+or
+"group"
+(default): rewards are scaled by the standard deviation within each group, ensuring
+unit variance within a group.
+"batch"
+: rewards are scaled by the standard deviation across the entire batch, as recommended in the
+PPO Lite paper
+.
+False
+or
+"none"
+: no scaling is applied. The
+Dr. GRPO
+paper
+recommends not scaling rewards, as scaling by the
+standard deviation introduces a question-level difficulty bias.
+loss_type
+(
+str
+,
+optional
+, defaults to
+"dapo"
+) —
+Specifies the loss formulation to use. Supported values are:
+"grpo"
+: Aggregates token-level losses by normalizing over sequence length. Not recommended due to
+length bias—this approach tends to prefer shorter completions with positive advantages and longer ones
+with negative advantages.
+"dr_grpo"
+: Aggregates token-level losses by normalizing with a global constant. This method was
+introduced in the
+Dr. GRPO paper
+to eliminate length bias.
+The value of the constant corresponds to
+max_completion_length
+.
+"dapo"
+(default): Aggregates token-level losses by normalizing with the number of active tokens in the
+global accumulated batch. This method was introduced in the
+DAPO
+paper
+to eliminate length bias.
+"bnpo"
+: Aggregates token-level losses by normalizing with the number of active tokens in the local
+batch. Note that normalization is performed over the local batch only, so results may slightly vary
+depending on the local batch size, despite a constant effective batch size. When using
+per_device_train_batch_size==1
+, the loss is equivalent to the GRPO loss.
+"cispo"
+: Clips the importance sampling weights instead of the advantage scaled importance weights. The
+clipped weights are then multiplied with the advantages and policy model’s log probs. Individual token
+losses are aggregated by normalizing with the number of active tokens in the global accumulated batch.
+This method was introduced in the
+MiniMax-M1 paper
+.
+"sapo"
+: Soft Adaptive Policy Optimization loss, as introduced in the
+Soft Adaptive Policy Optimization
+paper
+. Replaces hard clipping with a smooth,
+temperature-controlled gate that adaptively attenuates off-policy updates while preserving useful
+learning signals.
+"luspo"
+: Length-Unbiased Sequence Policy Optimization loss. A sequence-level loss that scales each
+sequence’s loss by its length. This is a modification of GSPO and requires
+importance_sampling_level="sequence"
+. Introduced in the
+LUSPO
+paper
+.
+"vespo"
+: Variational Sequence-Level Soft Policy Optimization. Replaces hard clipping with a smooth,
+asymmetric Gamma weighting function applied directly to sequence-level importance weights. Introduced in
+the
+VESPO paper
+.
+mask_truncated_completions
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+When enabled, truncated completions are excluded from the loss calculation, preventing them from being
+incorrectly penalized and introducing noise during training. According to the
+DAPO
+paper, this is a good practice for training stability.
+sync_ref_model
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Whether to synchronize the reference model with the active model every
+ref_model_sync_steps
+steps, using
+the
+ref_model_mixup_alpha
+parameter. This synchronization originates from the
+TR-DPO
+paper.
+ref_model_mixup_alpha
+(
+float
+,
+optional
+, defaults to
+0.6
+) —
+α parameter from the
+TR-DPO
+paper, which controls the mix
+between the current policy and the previous reference policy during updates. The reference policy is
+updated according to the equation:
+π_ref = α * π_θ + (1 - α) * π_ref_prev
+. To use this parameter, you
+must set
+sync_ref_model=True
+.
+ref_model_sync_steps
+(
+int
+,
+optional
+, defaults to
+512
+) —
+τ parameter from the
+TR-DPO
+paper, which determines how
+frequently the current policy is synchronized with the reference policy. To use this parameter, you must
+set
+sync_ref_model=True
+.
+top_entropy_quantile
+(
+float
+,
+optional
+, defaults to
+1.0
+) —
+ρ parameter from
+Beyond the 80/20 Rule
+. Keeps in the policy
+loss term only the top-ρ quantile of tokens by entropy of the probability distribution at each sequence
+position, improving results. Range:
+[0.0-1.0]
+. A value of
+0.0
+masks all but the highest entropy token;
+1.0
+keeps all tokens. The paper recommends a value of
+0.2
+. If used with
+mask_truncated_completions=True
+, only tokens from non-truncated completions are considered.
+max_tool_calling_iterations
+(
+int
+,
+optional
+) —
+Maximum number of tool-calling turns when training an agent. If
+None
+, there is no limit and generation
+stops when the model generates a response turn with no tool calls or when the total response length reaches
+max_model_length
+.
+vllm_importance_sampling_correction
+(
+bool
+,
+optional
+, defaults to
+True
+) —
+Whether to apply Importance Sampling (IS) to correct for the mismatch between vLLM completion logprobs and
+recomputed training logprobs. If set to
+False
+, no IS is applied regardless of
+vllm_importance_sampling_mode
+. When
+True
+, the selected mode determines how the IS ratios are computed
+and constrained.
+vllm_importance_sampling_mode
+(
+str
+,
+optional
+, defaults to
+"sequence_mask"
+) —
+Specifies how Importance Sampling is performed when
+vllm_importance_sampling_correction=True
+. Possible
+values are:
+"token_truncate"
+: Token-level truncated IS. Per-token ratios are clipped from above at C.
+"token_mask"
+: Token-level masked IS. Per-token ratios above C are set to zero.
+"sequence_truncate"
+: Sequence-level truncated IS. A single sequence ratio is clipped from above at
+C and applied to all tokens in the sequence.
+"sequence_mask"
+: Sequence-level masked IS (default). Sequences with ratios above C are masked out.
+vllm_importance_sampling_cap
+(
+float
+,
+optional
+, defaults to
+3.0
+) —
+Importance sampling cap C used by
+vllm_importance_sampling_mode
+. For
+*_truncate
+modes, importance
+ratios are clipped from above at C. For
+*_mask
+modes, ratios larger than C are set to zero.
+off_policy_mask_threshold
+(
+float
+,
+optional
+) —
+Threshold for off-policy sequence masking. If
+None
+, off-policy sequence masking is disabled. When set,
+sequences with negative advantages and high KL divergence are masked out to stabilize training. This
+parameter corresponds to the
+delta
+threshold in Equation 9 of the
+DeepSeek-V3.2
+paper
+. It expects a positive value (e.g., 0.5).
+use_bias_correction_kl
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Whether to use the unbiased KL divergence estimator with importance sampling correction. This corrects the
+KL divergence estimate by multiplying it with the importance sampling ratio. This is described in the
+DeepSeek-V3.2 paper
+.
+Parameters that control the logging
+log_completions
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Whether to log a sample of (prompt, completion) pairs every
+logging_steps
+steps. If
+rich
+is installed,
+it prints the sample. If
+wandb
+and/or
+trackio
+logging is enabled, it logs it to
+wandb
+and/or
+trackio
+.
+num_completions_to_print
+(
+int
+,
+optional
+) —
+Number of completions to print with
+rich
+. If
+None
+, all completions are logged.
+log_unique_prompts
+(
+bool
+,
+optional
+, defaults to
+False
+) —
+Whether to log unique prompts. If
+True
+, only unique prompts are logged. If
+False
+, all prompts are
+logged.
+log_completions_hub_repo
+(
+str
+,
+optional
+) —
+Hugging Face Hub repository to save the completions. Should be a complete repository name like
+'username/reponame'
+or
+'orgname/reponame'
+, or just
+'reponame'
+in which case the repository will be
+created in the currently-logged-in Hugging Face user’s namespace. Note that this repository will be public
+unless you set
+hub_private_repo=True
+or your organization’s default is to create private repositories.
+Deprecated parameters
+use_transformers_paged
+—
+Parameter
+use_transformers_paged
+is deprecated and will be removed in version v2.0.0. It will be
+replaced by
+transformers
+continuous batching support in an upcoming release.
+Configuration class for the
+GRPOTrainer
+.
+This class includes only the parameters that are specific to GRPO training. For a full list of training arguments,
+please refer to the
+TrainingArguments
+documentation. Note that default values in this class may
+differ from those in
+TrainingArguments
+.
+Using
+HfArgumentParser
+we can turn this class into
+argparse
+arguments that can be specified on the
+command line.
+These parameters have default values different from
+TrainingArguments
+:
+logging_steps
+: Defaults to
+10
+instead of
+500
+.
+gradient_checkpointing
+: Defaults to
+True
+instead of
+False
+.
+bf16
+: Defaults to
+True
+if
+fp16
+is not set, instead of
+False
+.
+learning_rate
+: Defaults to
+1e-6
+instead of
+5e-5
+.
+Update
+on GitHub
+←
+DPO
+Reward
+→
\ No newline at end of file
diff --git a/research/notes/local_sgdpy.md b/research/notes/local_sgdpy.md
new file mode 100644
index 0000000000000000000000000000000000000000..11be7f72a27a72dddfa3ba7b858b6e50ab0813b7
--- /dev/null
+++ b/research/notes/local_sgdpy.md
@@ -0,0 +1,802 @@
+---
+title: local_sgd.py
+id: local_sgdpy
+tags:
+- deepread
+created: '2026-06-10T00:34:03.299828Z'
+source: https://raw.githubusercontent.com/pytorch/torchft/main/torchft/local_sgd.py
+source_domain: raw.githubusercontent.com
+fetched_at: '2026-06-10T00:34:03.299674Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: unknown
+content_type: unknown
+deprecated: false
+---
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+LocalSGD
+=========
+This module implements a fault tolerant version of LocalSGD and related methods.
+"""
+
+import logging
+import math
+import os
+from types import TracebackType
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+import torch
+from torch import nn, optim
+from torch.distributed.distributed_c10d import Work
+from torch.distributed.tensor import DTensor
+from torch.utils.hooks import RemovableHandle
+from torchft.manager import Manager
+from torchft.utils import get_stream_context
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+USE_BUCKETIZATION_ENV: str = "TORCHFT_USE_BUCKETIZATION"
+
+def extract_local_tensor(t: torch.Tensor) -> torch.Tensor:
+    """
+    Returns a cloned version of the input tensor. If the input tensor is a DTensor,
+    it extracts and clones its local representation.
+    """
+    new_tensor = None
+    if isinstance(t, DTensor):
+        new_tensor = t.to_local().clone()
+    else:
+        new_tensor = t.clone()
+    new_tensor.grad = None
+    return new_tensor
+
+class LocalSGD:
+    """
+    LocalSGD is a context manager that
+    implements the algorithm described in https://arxiv.org/pdf/1805.09767
+
+    This will synchronize the model parameters periodically in a fault tolerant
+    way using a torchft Manager. The allreduce on the parameters will happen
+    every sync_every steps after the optimizer.step call.
+
+    The torchft quorum is computed at the beginning of ``sync_every`` steps. If
+    any error occurs, or a worker fails between syncs, ``sync_every`` steps will be
+    discarded and a new quorum will be computed on the next step.
+
+    If running in async mode, on a joining worker the first ``sync_every`` steps
+    will discarded as the model will be recovering during that period. When
+    using sync mode, the checkpoint will be restored prior to the first step.
+    """
+
+    def __init__(
+        self,
+        manager: Manager,
+        model: nn.Module,
+        optimizer: optim.Optimizer,
+        sync_every: int,
+    ) -> None:
+        """
+        Args:
+            manager: The manager to use.
+            model: The model to wrap.
+            optimizer: The optimizer used by the model.
+            sync_every: How often to sync the model weights.
+        """
+        super().__init__()
+        self._manager = manager
+        self._model = model
+        self._local_optimizer = optimizer
+        self._local_step = 0
+        self._sync_every = sync_every
+        assert sync_every >= 1, "sync_every must be greater than or equal to 1"
+
+        self._hooks: List[RemovableHandle] = []
+
+    def __enter__(self) -> "LocalSGD":
+        self._hooks.append(
+            self._local_optimizer.register_step_pre_hook(self._step_pre_hook)
+        )
+        # Add optimizer hook which increments the local step counter and syncs if necessary
+        self._hooks.append(
+            self._local_optimizer.register_step_post_hook(self._step_post_hook)
+        )
+        return self
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> bool:
+        # Handle any cleanup or error handling here
+        # Clean up hooks
+        for hook in self._hooks:
+            hook.remove()
+        self._hooks.clear()
+
+        return False  # Propagate exceptions
+
+    def _step_pre_hook(
+        self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]
+    ) -> None:
+        # The checkpoint may transfer model parameters, so we need to make access to it thread safe
+        self._manager.disallow_state_dict_read()
+
+    def _step_post_hook(
+        self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]
+    ) -> None:
+        """
+        This hook is registered on the optimizer and is called after the optimizer step.
+        """
+        self._manager.allow_state_dict_read()
+
+        self._local_step += 1
+        if self._local_step >= self._sync_every:
+            self.sync()
+
+    def sync(self) -> None:
+        """
+        Synchronizes and averages the model weights across the manager.
+        """
+        self._manager.start_quorum()
+        self._perform_sync()
+        self._local_step = 0
+
+    def _perform_sync(self) -> None:
+        """
+        Performs the synchronization of the model weights across the manager.
+        """
+        averaged_parameters = self._average()
+        if self._manager.should_commit():
+            # Update the model parameters with the averaged values
+            for param, avg_param in zip(self._model.parameters(), averaged_parameters):
+                if isinstance(param, DTensor):
+                    # we averaged the local version of the tensor so need to copy it back as a DTensor
+                    param.data.copy_(
+                        DTensor.from_local(
+                            avg_param,
+                            param.device_mesh,
+                            param.placements,
+                            shape=param.shape,
+                            stride=param.stride(),
+                        )
+                    )
+                else:
+                    param.data.copy_(avg_param)
+
+    def _average(self) -> list[torch.Tensor]:
+        """
+        Averages the model parameters across the manager and returns the averaged parameters.
+        """
+        works = []
+        averaged_parameters = []
+        for p in self._model.parameters():
+            # Create a new tensor to store the averaged parameter
+            avg_param = extract_local_tensor(p)
+            works.append(self._manager.allreduce(avg_param))
+            averaged_parameters.append(avg_param)
+        for work in works:
+            work.wait()
+        return averaged_parameters
+
+class _StreamingDiLoCoFragment:
+    bucket_cap_mb: int = 1 * 1024 * 1024 * 1024
+    use_bucketization: bool = False
+
+    def __init__(
+        self,
+        manager: Manager,
+        model_fragment: nn.Module,
+        fragment_id: int,
+        fragment_sync_offset: int,
+        inner_optimizer: optim.Optimizer,
+        outer_optimizer: optim.Optimizer,
+        sync_every: int,
+        backup_device: Optional[torch.device] = None,
+        pin_memory: bool = True,
+        use_bucketization: bool = False,
+        bucket_cap_mb: Optional[int] = None,
+        should_quantize: bool = False,
+        fragment_sync_delay: int = 0,
+        fragment_update_alpha: float = 0.0,
+    ) -> None:
+        if fragment_sync_offset > sync_every:
+            raise ValueError("Fragment must be synced once before `sync_every` steps")
+
+        self._fragment_id = fragment_id
+        self._manager = manager
+        self._model_fragment = model_fragment
+        self._fragment_sync_offset = fragment_sync_offset
+        self._local_optimizer = inner_optimizer
+        self._sync_every = sync_every
+        assert sync_every >= 1, "sync_every must be greater than or equal to 1"
+        self._backup_device = backup_device
+        self._pin_memory = pin_memory
+        self._fragment_sync_delay = fragment_sync_delay
+        self._fragment_update_alpha = fragment_update_alpha
+
+        self._outer_optimizer = outer_optimizer
+
+        # Stores pending all reduce
+        self._allreduce_work: list[Work] = []
+        self._stream: Optional[torch.Stream] = (
+            torch.Stream(torch.accelerator.current_accelerator())
+            if torch.accelerator.is_available()
+            else None
+        )
+
+        # Recorded on `_stream` to wait for allreduce to finish
+        self._stop_event: Optional[torch.Event] = None
+
+        if bucket_cap_mb is not None:
+            self.bucket_cap_mb = int(bucket_cap_mb * 1024 * 1024)
+
+        if os.getenv(USE_BUCKETIZATION_ENV, "False") == "True":
+            self.use_bucketization = True
+        else:
+            self.use_bucketization = use_bucketization
+
+        self.should_quantize = should_quantize
+
+        self._grads: Dict[str, torch.Tensor] = {}
+
+        # Used to save global parameters so that they can be restored in case
+        # commit fails
+        self.original_parameters: Dict[str, torch.Tensor] = {}
+
+        # Used to mix the local and global parameters
+        self._local_parameters: Dict[str, torch.Tensor] = {}
+
+        for name, p in self._model_fragment.named_parameters():
+            if isinstance(p, DTensor):
+                p = extract_local_tensor(p.data)
+
+            backup_device = self._backup_device or torch.device("cpu")
+            t = torch.empty(*tuple(p.shape), dtype=p.dtype, device=backup_device)
+            if (
+                self._pin_memory
+                and t.device == torch.device("cpu")
+                and torch.accelerator.is_available()
+            ):
+                t = t.pin_memory()
+            self.original_parameters[name] = t
+
+    def register_state_dict_fn(self) -> None:
+        """
+        Register state dict functions for this fragment with the manager.
+        This allows for saving and loading the original_parameters during checkpointing and recovery.
+
+        Args:
+            manager: The manager to register with
+            fragment_id: Optional identifier for this fragment, used in the key
+        """
+        # Generate a unique key for this fragment based on the model fragment's name or provided ID
+        fragment_key = f"StreamingDiLoCoFragment_{self._fragment_id}"
+
+        # Define load function for this fragment
+        def load_fn(state_dict: Dict[str, Dict[str, torch.Tensor]]) -> None:
+            for name, param in state_dict["original_parameters"].items():
+                if name in self.original_parameters:
+                    self.original_parameters[name].copy_(param)
+
+            self._outer_optimizer.load_state_dict(state_dict["outer_optimizer"])
+
+        # Define save function for this fragment
+        def save_fn() -> Dict[str, Dict[str, torch.Tensor]]:
+            return {
+                "outer_optimizer": self._outer_optimizer.state_dict(),
+                "original_parameters": {
+                    name: extract_local_tensor(param)
+                    for name, param in self.original_parameters.items()
+                },
+            }
+
+        # Register the functions with the manager
+        self._manager.register_state_dict_fn(fragment_key, load_fn, save_fn)
+
+    @torch.profiler.record_function("torchft::local_sgd::save_parameters")
+    def save_parameters(self) -> None:
+        with torch.no_grad():
+            # TODO: consider running copy on a separate stream
+            for name, p in self._model_fragment.named_parameters():
+                param_to_local = extract_local_tensor(p.data)
+                self.original_parameters[name].copy_(param_to_local, non_blocking=True)
+
+    def _save_local_parameters(self) -> None:
+        """
+        Saves a copy of the model's parameters.
+        """
+        with torch.no_grad():
+            for name, p in self._model_fragment.named_parameters():
+                self._local_parameters[name] = extract_local_tensor(p.data)
+
+    @torch.profiler.record_function("torchft::local_sgd::restore_parameters")
+    def restore_parameters(self) -> None:
+        with torch.no_grad():
+            # TODO: consider running copy on a separate stream
+            for name, p in self._model_fragment.named_parameters():
+                if isinstance(p, DTensor):
+                    # we averaged the local version of the tensor so need to copy it back as a DTensor
+                    p.data.copy_(
+                        DTensor.from_local(
+                            self.original_parameters[name],
+                            p.device_mesh,
+                            p.placements,
+                            shape=p.shape,
+                            stride=p.stride(),
+                        ),
+                        non_blocking=False,
+                    )
+                else:
+                    p.data.copy_(self.original_parameters[name], non_blocking=False)
+
+    def _save_grads(self) -> None:
+        """
+        Saves pseudo-gradients of the parameters
+        """
+        with torch.no_grad():
+            for name, p in self._model_fragment.named_parameters():
+                if isinstance(p, DTensor):
+                    local_param = p.to_local()
+                else:
+                    local_param = p
+                pseudogradient = (
+                    self.original_parameters[name].to(p.device) - local_param
+                )
+                self._grads[name] = pseudogradient
+
+    def _set_grads(self) -> None:
+        """
+        Sets the gradients of the model fragment from the allreduce result
+        """
+        with torch.no_grad():
+            for name, p in self._model_fragment.named_parameters():
+                # avoid copying the gradient, it should be on the same device
+                if isinstance(p, DTensor):
+                    p.grad = DTensor.from_local(
+                        self._grads[name],
+                        p.device_mesh,
+                        p.placements,
+                        shape=p.shape,
+                        stride=p.stride(),
+                    )
+                else:
+                    p.grad = self._grads[name]
+
+                # No longer needed
+                del self._grads[name]
+
+    def _clear_local_parameters(self) -> None:
+        """
+        Clears the saved copy of the model's parameters
+        """
+        self._local_parameters = {}
+
+    def _merge_parameters(self) -> None:
+        """
+        Merges the local and global parameters.
+        """
+        for name, p in self._model_fragment.named_parameters():
+            # we averaged the local version of the tensor so need to copy it back as a DTensor
+            if isinstance(p, DTensor):
+                p.data.lerp_(
+                    DTensor.from_local(
+                        self._local_parameters[name],
+                        p.device_mesh,
+                        p.placements,
+                        shape=p.shape,
+                        stride=p.stride(),
+                    ),
+                    self._fragment_update_alpha,
+                )
+            else:
+                p.data.lerp_(self._local_parameters[name], self._fragment_update_alpha)
+
+    @torch.profiler.record_function("torchft::local_sgd::wait")
+    def wait(self) -> None:
+        """
+        Waits for the previously scheduled allreduce to finish
+        """
+        if len(self._allreduce_work) == 0:
+            return
+
+        if self._stream is not None:
+            assert self._stop_event is not None
+            self._stop_event.synchronize()
+            self._stop_event = None
+
+        self._allreduce_work = []
+
+    @torch.profiler.record_function("torchft::local_sgd::prepare_sync")
+    def prepare_sync(self) -> None:
+        """
+        Calculate the pseugradient, average them across the manager group and starts
+        allreduce on the pseudo-gradients but doesn't wait for it to finish.
+        """
+        self._save_grads()
+
+        assert len(self._allreduce_work) == 0
+
+        # Make sure tensors are available to `_stream`
+        if self._stream is not None:
+            self._stream.wait_stream(torch.accelerator.current_stream())
+
+        with get_stream_context(self._stream):
+            self._average_grads()
+
+    @torch.profiler.record_function("torchft::local_sgd::perform_sync")
+    def perform_sync(self) -> bool:
+        """
+        Overrides the sync method to wait for the scheduled allreduce to finish and
+        steps using the outer optimizer.
+        """
+        # Waiting for an allreduce before it has been sent is currently not supported.
+        assert len(self._allreduce_work) > 0
+
+        with get_stream_context(self._stream):
+            for work in self._allreduce_work:
+                work.wait()
+
+            if self._stream is not None:
+                self._stop_event = torch.Event()
+                self._stop_event.record()
+
+        self.wait()
+
+        # save the parameters so they can be used for merging
+        self._save_local_parameters()
+        # Restore the parameters back to the previous state
+        self.restore_parameters()
+
+        # For large values of `fragment_sync_delay`, this call can be
+        # a problem.
+        #
+        # This can return success even if the allreduce failed. Because
+        # the process group could have been reconfigured while the
+        # allreduce was inflight. The inflight allreduce may or may
+        # not have been aborted.
+        #
+        # We can track errors per allreduce to
+        # let the commit fail here. But this has the downside of
+        # reconfiguring the pg too many times resulting in
+        # more aborts and more commit failures.
+        should_commit = self._manager.should_commit()
+
+        if should_commit:
+            # Use the outer optimizer to update the model parameters
+            self._set_grads()
+            self._outer_optimizer.step()
+            self.save_parameters()
+            self._merge_parameters()
+        self._outer_optimizer.zero_grad()
+
+        # free up memory
+        self._clear_local_parameters()
+
+        return should_commit
+
+    def _average_grads(self) -> None:
+        """
+        Efficiently averages gradients across the group using either:
+        - Per-parameter allreduce (old behavior)
+        - Bucketized allreduce (new behavior)
+        """
+        if self.use_bucketization:
+            self._allreduce_bucketized()
+        else:
+            self._allreduce_per_param()
+
+    def _allreduce_per_param(self) -> None:
+        """Performs allreduce on each gradient tensor separately (original method)."""
+        for name, p in self._model_fragment.named_parameters():
+            # Perform allreduce on the pseudogradients
+            work = self._manager.allreduce(
+                self._grads[name], should_quantize=self.should_quantize
+            )
+
+            self._allreduce_work.append(work)
+
+    def _bucketize_and_allreduce(
+        self,
+        tensors: List[torch.Tensor],
+        bucket_size_bytes: int,
+    ) -> None:
+        """
+        Applies allreduce on a list of tensors using bucketization.
+
+        Args:
+            tensors: List of torch tensors (e.g., gradients).
+            bucket_size_bytes: Max size of each bucket in bytes.
+        """
+        if not tensors:
+            return
+
+        total_size = sum(t.numel() for t in tensors)
+        dtype, device = tensors[0].dtype, tensors[0].device
+
+        offset = 0
+        flat_index = 0
+        while offset < total_size:
+            chunk_size = min(
+                bucket_size_bytes // tensors[0].element_size(), total_size - offset
+            )
+            flat_buffer: torch.Tensor = torch.zeros(
+                chunk_size, dtype=dtype, device=device
+            )
+
+            pack_offset: int = 0
+            bucket_tensors: list[Tuple[torch.Tensor, int, int]] = []
+            for t in tensors[flat_index:]:
+                numel = t.numel()
+                if pack_offset + numel > chunk_size:
+                    break
+                flat_buffer[pack_offset : pack_offset + numel].copy_(t.view(-1))
+                bucket_tensors.append((t, pack_offset, numel))
+                pack_offset += numel
+                flat_index += 1
+
+            work = self._manager.allreduce(
+                flat_buffer, should_quantize=self.should_quantize
+            )
+
+            def callback(
+                fut: torch.futures.Future[list[torch.Tensor]],
+            ) -> list[torch.Tensor]:
+                nonlocal bucket_tensors, flat_buffer
+                for t, pack_offset, numel in bucket_tensors:
+                    t.copy_(flat_buffer[pack_offset : pack_offset + numel].view_as(t))
+
+                return []
+
+            fut = work.get_future()
+            fut = fut.then(callback)
+
+            self._allreduce_work.append(work)
+
+            offset += chunk_size
+
+    def _allreduce_bucketized(self) -> None:
+        """
+        Averages gradients using bucketized allreduce with a fixed buffer.
+        """
+        grads = list(self._grads.values())
+        assert len(grads) > 0, "No gradients to allreduce"
+        self._bucketize_and_allreduce(
+            grads,
+            bucket_size_bytes=self.bucket_cap_mb,
+        )
+
+class DiLoCo:
+    """
+    DiLoCo implements distributed optimization by averaging and synchronizing
+    pseudogradients (delta of the previous global weight and current local weights).
+
+    The class implements a more general version of DiLoco, Streaming DiLoCo,
+    which synchronizes fragments of pseudogradients at different steps.
+
+    This algorithm requires a backup copy of the
+    weights. By default these are stored in CPU memory. If any error occurs
+    during the DiLoCo step, the step will be discarded and the model
+    parameters will reset back to the last time DiLoCo synchronized.
+
+    DiLoCo paper: https://arxiv.org/pdf/2311.08105
+    Streaming DiLoCo paper: https://arxiv.org/pdf/2501.18512
+    """
+
+    def __init__(
+        self,
+        manager: Manager,
+        model_fragments: List[nn.Module],
+        inner_optimizer: optim.Optimizer,
+        # TODO: this is for backward compatibility
+        outer_optimizer: optim.Optimizer | list[optim.Optimizer],
+        sync_every: int,
+        backup_device: Optional[torch.device] = None,
+        pin_memory: bool = True,
+        use_bucketization: bool = False,
+        bucket_cap_mb: Optional[int] = None,
+        should_quantize: bool = False,
+        fragment_sync_delay: int = 0,
+        fragment_update_alpha: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            manager: The manager to use.
+            model_fragments: The fragments of the model to wrap.
+            inner_optimizer: The optimizer used for the local parameters every step.
+            outer_optimizer: The optimizer used for the global parameters updated every "sync_every" steps.
+            sync_every: How often to update the model weights.
+            backup_device: The device to store the backup weights on. If None, the backup weights will be on CPU.
+            pin_memory: Whether to pin the memory for the backup weights (only for CPU device).
+            should_quantize: Whether to quantize the gradients before allreduce.
+            fragment_sync_delay: Controls the number of inner steps to wait before blocking on a fragment's
+                                 synchronization. This is the "tao" parameter in the Streaming DiLoCo paper.
+            fragment_update_alpha: Determines how to mix the local and global optimized parameters
+        """
+
+        if isinstance(outer_optimizer, list):
+            assert len(outer_optimizer) == len(model_fragments), (
+                "The number of outer optimizers must match the number of model fragments"
+            )
+
+        if manager._use_async_quorum:
+            raise ValueError(
+                "Using DiLoCo require synchronous quorum to be enabled. "
+                "Ensure that the manager is initialized with use_async_quorum=False"
+            )
+
+        if sync_every < len(model_fragments):
+            raise ValueError("Only 1 fragment can be syncrhonized at a time")
+
+        if sync_every % len(model_fragments) != 0:
+            raise ValueError("sync_every must divide the number of fragments")
+
+        self._sync_every: int = sync_every // len(model_fragments)
+        if fragment_sync_delay >= self._sync_every:
+            raise ValueError(
+                "Fragment must be synced before it is reduced another time"
+            )
+
+        if fragment_update_alpha < 0 or fragment_update_alpha > 1:
+            raise ValueError("fragment_update_alpha must be between 0 and 1")
+
+        super().__init__()
+        self._manager = manager
+
+        # The number of training iterations performed.
+        # Used to synchronize which fragment to send across all
+        # replicas
+        self._local_step = 0
+
+        self._fragment_sync_delay = fragment_sync_delay
+
+        self._hooks: List[RemovableHandle] = []
+
+        self._local_optimizer = inner_optimizer
+
+        self._fragments: List[_StreamingDiLoCoFragment] = [
+            _StreamingDiLoCoFragment(
+                manager,
+                model_fragment,
+                i,
+                math.floor((sync_every / len(model_fragments)) * (i + 1)),
+                inner_optimizer,
+                (
+                    outer_optimizer[i]
+                    if isinstance(outer_optimizer, list)
+                    else outer_optimizer
+                ),
+                sync_every,
+                backup_device,
+                pin_memory,
+                use_bucketization,
+                bucket_cap_mb,
+                should_quantize,
+                fragment_sync_delay,
+                fragment_update_alpha,
+            )
+            for i, model_fragment in enumerate(model_fragments)
+        ]
+
+        # This is to make sure we adhere to the assumptions made by the
+        # `_StreamingDiLoCoFragment` about the fragment sync schedule.
+        assert fragment_sync_delay < sync_every // len(model_fragments)
+
+        # Need to copy the parameters to the host to be safe if we are on the first step.
+        self._save_parameters()
+        self._register_state_dict_fn()
+
+    def _register_state_dict_fn(self) -> None:
+        for fragment in self._fragments:
+            fragment.register_state_dict_fn()
+
+    def _save_parameters(self) -> None:
+        for fragment in self._fragments:
+            fragment.save_parameters()
+
+    def _restore_parameters(self) -> None:
+        for fragment in self._fragments:
+            fragment.restore_parameters()
+
+    def __enter__(self) -> "DiLoCo":
+        self._hooks.append(
+            self._local_optimizer.register_step_pre_hook(self._step_pre_hook)
+        )
+        # Add optimizer hook which increments the local step counter and syncs if necessary
+        self._hooks.append(
+            self._local_optimizer.register_step_post_hook(self._step_post_hook)
+        )
+        return self
+
+    def _step_pre_hook(
+        self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]
+    ) -> None:
+        # The checkpoint may transfer model parameters, so we need to make access to it thread safe
+        self._manager.disallow_state_dict_read()
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> bool:
+        # Handle any cleanup or error handling here
+        # Clean up hooks
+        for hook in self._hooks:
+            hook.remove()
+        self._hooks.clear()
+
+        return False  # Propagate exceptions
+
+    def _wait(self) -> None:
+        """
+        Waits for allreduce to finish on all fragments
+        """
+        for fragment in self._fragments:
+            fragment.wait()
+
+    def _current_fragment(self) -> int:
+        """
+        Determines which fragment to prepare/sync based on the current step.
+        """
+        step = self._manager.current_step()
+        return step % len(self._fragments)
+
+    def _step_post_hook(
+        self, _optim: optim.Optimizer, _args: Tuple[Any, ...], _kwargs: Dict[str, Any]
+    ) -> None:
+        """
+        This hook is registered on the optimizer and is called after the optimizer step.
+        """
+        self._manager.allow_state_dict_read()
+
+        # We need to make sure all nodes send the same fragments in order.
+        # This is to avoid deadlocking e.g.
+        #
+        # 1. Step 1 - Node A sends fragment 1
+        # 2. Step 1 - Node B sends fragment 2
+        # 3. Step 2 - Node A waits for fragment 1
+        # 4. Step 2 - Node B waits for fragment 2
+        #
+        # Both of them will fail because Node A didn't send fragment 2
+        # and Node B didn't send fragment 1.
+        self._local_step += 1
+
+        if self._local_step == self._sync_every - self._fragment_sync_delay:
+            # Time to prepare a fragment
+            #
+            # Some replicas will get the same copy of the model, implying batches
+            # can be overrepresented.
+            self._manager.start_quorum()
+            fragment = self._current_fragment()
+            logger.info(f"Preparing fragment={fragment} step={self._local_step}")
+            self._fragments[fragment].prepare_sync()
+
+        if self._local_step < self._sync_every:
+            return
+
+        if self._local_step == self._sync_every:
+            # Time to sync a fragment
+            fragment = self._current_fragment()
+            logger.info(
+                f"Syncing fragment={fragment} step={self._local_step} manager_step={self._manager.current_step()}"
+            )
+            self._fragments[fragment].perform_sync()
+
+            # If the allreduce truly failed, we'll keep retrying this fragment.
+            # We reset the parameters upon failure. We'll skip over some data
+            # but we won't over train before syncing.
+
+            self._local_step = 0
+            return
+
+        assert False, (
+            f"{self._local_step=} should never be greater than {self._sync_every=}"
+        )
\ No newline at end of file
diff --git a/research/notes/mastering-atari-go-chess-and-shogi-by-planning-with-a-learned-model-nature.md b/research/notes/mastering-atari-go-chess-and-shogi-by-planning-with-a-learned-model-nature.md
new file mode 100644
index 0000000000000000000000000000000000000000..007cea0910dbb81c456a56a9344fde53aaa6fed5
--- /dev/null
+++ b/research/notes/mastering-atari-go-chess-and-shogi-by-planning-with-a-learned-model-nature.md
@@ -0,0 +1,513 @@
+---
+title: Mastering Atari, Go, chess and shogi by planning with a learned model | Nature
+id: mastering-atari-go-chess-and-shogi-by-planning-with-a-learned-model-nature
+tags:
+- deepread
+created: '2026-06-10T00:33:52.703702Z'
+source: https://www.nature.com/articles/s41586-020-03051-4
+source_domain: www.nature.com
+fetched_at: '2026-06-10T00:33:52.703487Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: unknown
+content_type: unknown
+deprecated: false
+---
+
+Mastering Atari, Go, chess and shogi by planning with a learned model | Nature
+Skip to main content
+Thank you for visiting nature.com. You are using a browser version with limited support for CSS. To obtain
+            the best experience, we recommend you use a more up to date browser (or turn off compatibility mode in
+            Internet Explorer). In the meantime, to ensure continued support, we are displaying the site without styles
+            and JavaScript.
+Subjects
+Computational science
+Computer science
+Abstract
+Constructing agents with planning capabilities has long been one of the main challenges in the pursuit of artificial intelligence. Tree-based planning methods have enjoyed huge success in challenging domains, such as chess [1] and Go [2], where a perfect simulator is available. However, in real-world problems, the dynamics governing the environment are often complex and unknown. Here we present the MuZero algorithm, which, by combining a tree-based search with a learned model, achieves superhuman performance in a range of challenging and visually complex domains, without any knowledge of their underlying dynamics. The MuZero algorithm learns an iterable model that produces predictions relevant to planning: the action-selection policy, the value function and the reward. When evaluated on 57 different Atari games [3]—the canonical video game environment for testing artificial intelligence techniques, in which model-based planning approaches have historically struggled [4]—the MuZero algorithm achieved state-of-the-art performance. When evaluated on Go, chess and shogi—canonical environments for high-performance planning—the MuZero algorithm matched, without any knowledge of the game dynamics, the superhuman performance of the AlphaZero algorithm [5] that was supplied with the rules of the game.
+Access through your institution
+Buy or subscribe
+This is a preview of subscription content,
+access via your institution
+Access options
+Access through your institution
+Access Nature and 54 other Nature Portfolio journals
+Get Nature+, our best-value online-access subscription
+$32.99
+/ 30 days
+cancel any time
+Learn more
+Subscribe to this journal
+Receive 52 print issues and online access
+$199.00 per year
+only $3.83 per issue
+Learn more
+Buy this article
+Purchase on SpringerLink
+Instant access to the full article PDF.
+USD 39.95
+Prices may be subject to local taxes which are calculated during checkout
+Fig. 1: Planning, acting and training with a learned model.
+The alternative text for this image may have been generated using AI.
+Fig. 2: Evaluation of MuZero throughout training in chess, shogi, Go and Atari.
+The alternative text for this image may have been generated using AI.
+Fig. 3: Evaluations of MuZero on Go, all 57 Atari games and Ms. Pac-Man.
+The alternative text for this image may have been generated using AI.
+Similar content being viewed by others
+Information based explanation methods for deep learning agents—with applications on large open-source chess models
+Article
+Open access
+30 August 2024
+Expertise increases planning depth in human gameplay
+Article
+31 May 2023
+Beyond human agency: exploring the social and legal subjectivity of artificial intelligence
+Article
+Open access
+16 May 2026
+Data availability
+MuZero is trained only on data generated by MuZero itself; no external data were used to produce the results presented in the article. Data for all figures and tables presented are available in JSON format in the Supplementary Information.
+Code availability
+The Arcade Learning Environment [3] is available open source at https://github.com/mgbellemare/Arcade-Learning-Environment. The Go and chess environments are available open source in OpenSpiel [52] at https://github.com/deepmind/open_spiel. The pseudocode for the MuZero algorithm can be found in the file pseudocode.py in the Supplementary Information. All the neural architecture details and hyperparameters are described in Methods.
+References
+Campbell, M., Hoane, A. J. Jr & Hsu, F.-h. Deep Blue.
+Artif. Intell
+.
+134
+, 57–83 (2002).
+Article
+Google Scholar
+Silver, D. et al. Mastering the game of Go with deep neural networks and tree search.
+Nature
+529
+, 484–489 (2016).
+Article
+ADS
+CAS
+Google Scholar
+Bellemare, M. G., Naddaf, Y., Veness, J. & Bowling, M. The arcade learning environment: an evaluation platform for general agents.
+J. Artif. Intell. Res
+.
+47
+, 253–279 (2013).
+Article
+Google Scholar
+Machado, M. et al. Revisiting the arcade learning environment: evaluation protocols and open problems for general agents.
+J. Artif. Intell. Res
+.
+61
+, 523–562 (2018).
+Article
+MathSciNet
+Google Scholar
+Silver, D. et al. A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play.
+Science
+362
+, 1140–1144 (2018).
+Article
+ADS
+MathSciNet
+CAS
+Google Scholar
+Schaeffer, J. et al. A world championship caliber checkers program.
+Artif. Intell
+.
+53
+, 273–289 (1992).
+Article
+Google Scholar
+Brown, N. & Sandholm, T. Superhuman AI for heads-up no-limit poker: Libratus beats top professionals.
+Science
+359
+, 418–424 (2018).
+Article
+ADS
+MathSciNet
+CAS
+Google Scholar
+Moravčík, M. et al. Deepstack: expert-level artificial intelligence in heads-up no-limit poker.
+Science
+356
+, 508–513 (2017).
+Article
+ADS
+MathSciNet
+Google Scholar
+Vlahavas, I. & Refanidis, I.
+Planning and Scheduling
+Technical Report (EETN, 2013).
+Segler, M. H., Preuss, M. & Waller, M. P. Planning chemical syntheses with deep neural networks and symbolic AI.
+Nature
+555
+, 604–610 (2018).
+Article
+ADS
+CAS
+Google Scholar
+Sutton, R. S. & Barto, A. G.
+Reinforcement Learning: An Introduction
+2nd edn (MIT Press, 2018).
+Deisenroth, M. & Rasmussen, C. PILCO: a model-based and data-efficient approach to policy search. In
+Proc. 28th International Conference on Machine Learning, ICML 2011
+465–472 (Omnipress, 2011).
+Heess, N. et al. Learning continuous control policies by stochastic value gradients. In
+NIPS’15: Proc. 28th International Conference on Neural Information Processing Systems
+Vol. 2 (eds Cortes, C. et al.) 2944–2952 (MIT Press, 2015).
+Levine, S. & Abbeel, P. Learning neural network policies with guided policy search under unknown dynamics.
+Adv. Neural Inf. Process. Syst
+.
+27
+, 1071–1079 (2014).
+Google Scholar
+Hafner, D. et al. Learning latent dynamics for planning from pixels. Preprint at
+https://arxiv.org/abs/1811.04551
+(2018).
+Kaiser, L. et al. Model-based reinforcement learning for atari. Preprint at
+https://arxiv.org/abs/1903.00374
+(2019).
+Buesing, L. et al. Learning and querying fast generative models for reinforcement learning. Preprint at
+https://arxiv.org/abs/1802.03006
+(2018).
+Espeholt, L. et al. IMPALA: scalable distributed deep-RL with importance weighted actor-learner architectures. In
+Proc. International Conference on Machine Learning, ICML
+Vol. 80 (eds Dy, J. & Krause, A.) 1407–1416 (2018).
+Kapturowski, S., Ostrovski, G., Dabney, W., Quan, J. & Munos, R. Recurrent experience replay in distributed reinforcement learning. In
+International Conference on Learning Representations
+(2019).
+Horgan, D. et al. Distributed prioritized experience replay. In
+International Conference on Learning Representations
+(2018).
+Puterman, M. L.
+Markov Decision Processes: Discrete Stochastic Dynamic Programming
+1st edn (John Wiley & Sons, 1994).
+Coulom, R. Efficient selectivity and backup operators in Monte-Carlo tree search. In
+International Conference on Computers and Games
+72–83 (Springer, 2006).
+Wahlström, N., Schön, T. B. & Deisenroth, M. P. From pixels to torques: policy learning with deep dynamical models. Preprint at
+http://arxiv.org/abs/1502.02251
+(2015).
+Watter, M., Springenberg, J. T., Boedecker, J. & Riedmiller, M. Embed to control: a locally linear latent dynamics model for control from raw images. In
+NIPS’15: Proc. 28th International Conference on Neural Information Processing Systems
+Vol. 2 (eds Cortes, C. et al.) 2746–2754 (MIT Press, 2015).
+Ha, D. & Schmidhuber, J. Recurrent world models facilitate policy evolution. In
+NIPS’18: Proc. 32nd International Conference on Neural Information Processing Systems
+(eds Bengio, S. et al.) 2455–2467 (Curran Associates, 2018).
+Gelada, C., Kumar, S., Buckman, J., Nachum, O. & Bellemare, M. G. DeepMDP: learning continuous latent space models for representation learning.
+Proc. 36th International Conference on Machine Learning: Volume 97 of Proc. Machine Learning Research
+(eds Chaudhuri, K. & Salakhutdinov, R.) 2170–2179 (PMLR, 2019).
+van Hasselt, H., Hessel, M. & Aslanides, J. When to use parametric models in reinforcement learning? Preprint at
+https://arxiv.org/abs/1906.05243
+(2019).
+Tamar, A., Wu, Y., Thomas, G., Levine, S. & Abbeel, P. Value iteration networks.
+Adv. Neural Inf. Process. Syst
+.
+29
+, 2154–2162 (2016).
+Google Scholar
+Silver, D. et al. The predictron: end-to-end learning and planning. In
+Proc. 34th International Conference on Machine Learning
+Vol. 70 (eds Precup, D. & Teh, Y. W.) 3191–3199 (JMLR, 2017).
+Farahmand, A. M., Barreto, A. & Nikovski, D. Value-aware loss function for model-based reinforcement learning. In
+Proc. 20th International Conference on Artificial Intelligence and Statistics: Volume 54 of Proc. Machine Learning Research
+(eds Singh, A. & Zhu, J) 1486–1494 (PMLR, 2017).
+Farahmand, A. Iterative value-aware model learning.
+Adv. Neural Inf. Process. Syst
+.
+31
+, 9090–9101 (2018).
+Google Scholar
+Farquhar, G., Rocktaeschel, T., Igl, M. & Whiteson, S. TreeQN and ATreeC: differentiable tree planning for deep reinforcement learning. In
+International Conference on Learning Representations
+(2018).
+Oh, J., Singh, S. & Lee, H. Value prediction network.
+Adv. Neural Inf. Process. Syst
+.
+30
+, 6118–6128 (2017).
+Google Scholar
+Krizhevsky, A., Sutskever, I. & Hinton, G. E. Imagenet classification with deep convolutional neural networks.
+Adv. Neural Inf. Process. Syst
+.
+25
+, 1097–1105 (2012).
+Google Scholar
+He, K., Zhang, X., Ren, S. & Sun, J. Identity mappings in deep residual networks. In
+14th European Conference on Computer Vision
+630–645 (2016).
+Hessel, M. et al. Rainbow: combining improvements in deep reinforcement learning. In
+Thirty-Second AAAI Conference on Artificial Intelligence
+(2018).
+Schmitt, S., Hessel, M. & Simonyan, K. Off-policy actor-critic with shared experience replay. Preprint at
+https://arxiv.org/abs/1909.11583
+(2019).
+Azizzadenesheli, K. et al. Surprising negative results for generative adversarial tree search. Preprint at
+http://arxiv.org/abs/1806.05780
+(2018).
+Mnih, V. et al. Human-level control through deep reinforcement learning.
+Nature
+518
+, 529–533 (2015).
+Article
+ADS
+CAS
+Google Scholar
+Open, A. I. OpenAI five.
+OpenAI
+https://blog.openai.com/openai-five/
+(2018).
+Vinyals, O. et al. Grandmaster level in StarCraft II using multi-agent reinforcement learning.
+Nature
+575
+, 350–354 (2019).
+Article
+ADS
+CAS
+Google Scholar
+Jaderberg, M. et al. Reinforcement learning with unsupervised auxiliary tasks. Preprint at
+https://arxiv.org/abs/1611.05397
+(2016).
+Silver, D. et al. Mastering the game of Go without human knowledge.
+Nature
+550
+, 354–359 (2017).
+Article
+ADS
+CAS
+Google Scholar
+Kocsis, L. & Szepesvári, C. Bandit based Monte-Carlo planning. In
+European Conference on Machine Learning
+282–293 (Springer, 2006).
+Rosin, C. D. Multi-armed bandits with episode context.
+Ann. Math. Artif. Intell
+.
+61
+, 203–230 (2011).
+Article
+MathSciNet
+Google Scholar
+Schadd, M. P., Winands, M. H., Van Den Herik, H. J., Chaslot, G. M.-B. & Uiterwijk, J. W. Single-player Monte-Carlo tree search. In
+International Conference on Computers and Games
+1–12 (Springer, 2008).
+Pohlen, T. et al. Observe and look further: achieving consistent performance on Atari. Preprint at
+https://arxiv.org/abs/1805.11593
+(2018).
+Schaul, T., Quan, J., Antonoglou, I. & Silver, D. Prioritized experience replay. In
+International Conference on Learning Representations
+(2016).
+Cloud TPU.
+Google Cloud
+https://cloud.google.com/tpu/
+(2019).
+Coulom, R. Whole-history rating: a Bayesian rating system for players of time-varying strength. In
+International Conference on Computers and Games
+113–124 (2008).
+Nair, A. et al. Massively parallel methods for deep reinforcement learning. Preprint at
+https://arxiv.org/abs/1507.04296
+(2015).
+Lanctot, M. et al. OpenSpiel: a framework for reinforcement learning in games. Preprint at
+http://arxiv.org/abs/1908.09453
+(2019).
+Download references
+Acknowledgements
+We thank L. Bennett, O. Smith and C. Apps for organizational assistance; K. Kavukcuoglu for reviewing the paper; T. Anthony, M. Lai, N. Tomasev, U. Paquet, S. Ghaisas for many discussions; and the rest of the DeepMind team for their support.
+Author information
+Author notes
+These authors contributed equally: Julian Schrittwieser, Ioannis Antonoglou, Thomas Hubert, David Silver
+Authors and Affiliations
+DeepMind, London, UK
+Julian Schrittwieser, Ioannis Antonoglou, Thomas Hubert, Karen Simonyan, Laurent Sifre, Simon Schmitt, Arthur Guez, Edward Lockhart, Demis Hassabis, Thore Graepel, Timothy Lillicrap & David Silver
+University College London, London, UK
+Ioannis Antonoglou, Thore Graepel & David Silver
+Authors
+Julian Schrittwieser
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Ioannis Antonoglou
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Thomas Hubert
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Karen Simonyan
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Laurent Sifre
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Simon Schmitt
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Arthur Guez
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Edward Lockhart
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Demis Hassabis
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Thore Graepel
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Timothy Lillicrap
+View author publications
+Search author on:
+PubMed
+Google Scholar
+David Silver
+View author publications
+Search author on:
+PubMed
+Google Scholar
+Contributions
+J.S., I.A., T.H. and D.S. designed the MuZero algorithm with advice from A.G., K.S., L.S., E.L., T.L. and T.G.; J.S., I.A., T.H. and S.S. implemented the MuZero program, ran experiments and analysed data. D.S., J.S., I.A. and T.H. wrote the paper with contributions from A.G., K.S., L.S., E.L., T.L., T.G. and D.H.
+Corresponding author
+Correspondence to
+David Silver
+.
+Ethics declarations
+Competing interests
+DeepMind filed Greek patent GR20200100037 on 28 January 2020, covering the MuZero algorithm described in this paper, listing the authors J.S., I.A. and T.H. as inventors. The other authors declare no competing interests.
+Additional information
+Peer review information
+Nature
+thanks Jaap van den Herik and the other, anonymous, reviewer(s) for their contribution to the peer review of this work.
+Publisher’s note
+Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations.
+Supplementary information
+Supplementary Information (download PDF
+)
+This file contains Supplementary Figures S1-S5 and Supplementary Tables S1-S2.
+Supplementary Data (download ZIP
+)
+The ZIP file contains Supplementary Data.
+Rights and permissions
+Reprints and permissions
+About this article
+Cite this article
+Schrittwieser, J., Antonoglou, I., Hubert, T.
+et al.
+Mastering Atari, Go, chess and shogi by planning with a learned model.
+Nature
+588
+, 604–609 (2020). https://doi.org/10.1038/s41586-020-03051-4
+Download citation
+Received
+:
+03 April 2020
+Accepted
+:
+07 October 2020
+Published
+:
+23 December 2020
+Version of record
+:
+23 December 2020
+Issue date
+:
+24 December 2020
+DOI
+:
+https://doi.org/10.1038/s41586-020-03051-4
+Share this article
+Anyone you share the following link with will be able to read this content:
+Get shareable link
+Sorry, a shareable link is not currently available for this article.
+Copy shareable link to clipboard
+Provided by the Springer Nature SharedIt content-sharing initiative
+This article is cited by
+Integrating reinforcement learning with visual generative models: foundations and advances
+Yuanzhi Liang
+Yijie Fang
+Chi Zhang
+Vicinagearth
+(2026)
+Reward shaping of deep reinforcement learning algorithm for autonomous navigation in a structured environment
+Anu Priya
+Ritu Tiwari
+Sushant Kumar
+Intelligent Service Robotics
+(2026)
+Comments
+Commenting on this article is now closed.
+Ari Margo
+24 December 2020, 07:58
+"Tree-based planning methods have enjoyed huge success in challenging domains, such as chess1 and Go2, where a perfect simulator is available." Chess has been solved, but Go has not, so this claim is slightly inaccurate. The Go simulator is not perfect, despite its impressively high win rate. It's definitely the best Go playing entity on Earth, but that is not the same as saying it is perfect. And the best part of Go, which is played on a 19x19 board, is that if it ever is mathematically solved, we'll just increase the board size to 21x21 and let the computers work on that for a few decades.
+Jeremy Hummel
+Replied to
+Ari Margo
+24 December 2020, 11:50
+They are referring to a perfect simulator of the environment, where given the state, the simulator can return the available actions and the transition function (state, action) => (next state, reward).
+The environments for chess and Go can be simulated perfectly, all that's  required is codifying the rules. They are not saying that these games are solved.
+Search
+Search articles by subject, keyword or author
+Show results from
+All journals
+This journal
+Search
+Advanced search
+Quick links
+Explore articles by subject
+Find a job
+Guide to authors
+Editorial policies
+Close banner
+Close
+Sign up for the
+Nature Briefing: AI and Robotics
+newsletter — what matters in AI and robotics research, free to your inbox weekly.
+Email address
+Sign up
+I agree my information will be processed in accordance with the
+Nature
+and Springer Nature Limited
+Privacy Policy
+.
+Close banner
+Close
+Get the most important science stories of the day, free in your inbox.
+Sign up for Nature Briefing: AI and Robotics
\ No newline at end of file
diff --git a/research/notes/mastering-diverse-domains-through-world-models.md b/research/notes/mastering-diverse-domains-through-world-models.md
new file mode 100644
index 0000000000000000000000000000000000000000..6c14936dde7e73f31fcde8601ac960014e777230
--- /dev/null
+++ b/research/notes/mastering-diverse-domains-through-world-models.md
@@ -0,0 +1,2649 @@
+---
+title: Mastering Diverse Domains through World Models
+id: mastering-diverse-domains-through-world-models
+tags:
+- deepread
+created: '2026-06-10T00:30:45.749285Z'
+source: https://arxiv.org/html/2301.04104
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:45.749153Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Mastering Diverse Domains through World Models
+HTML conversions sometimes display errors due to content that did not convert correctly from the source. This paper uses the following packages that are not yet supported by the HTML conversion tool. Feedback on these issues is not necessary; they are known and are being worked on.
+failed: stackengine
+failed: xpatch
+failed: cuted
+failed: xstring
+failed: eqparbox
+failed: datetime2
+failed: environ
+failed: titletoc
+failed: trimclip
+Authors: achieve the best HTML results from your LaTeX submissions by following these
+best practices
+.
+License: CC BY 4.0
+arXiv:2301.04104v2 [cs.AI] 17 Apr 2024
+\xapptocmd
+\NewColumnType
+L[1]Q[l,#1]
+\NewColumnType
+C[1]Q[c,#1]
+\NewColumnType
+R[1]Q[r,#1]
+\NewEnviron
+mytabular[1]
+\BODY
+Mastering Diverse Domains through World Models
+Danijar Hafner$^{1,2}$, Jurgis Pasukonis$^{1}$, Jimmy Ba$^{2}$, Timothy Lillicrap$^{1}$
+$^{1}$Google DeepMind. $^{2}$University of Toronto. Correspondence: mail@danijar.com
+Abstract
+Developing a general algorithm that learns to solve tasks across a wide range of applications has been a fundamental challenge in artificial intelligence.
+Although current reinforcement learning algorithms can be readily applied to tasks similar to what they have been developed for, configuring them for new application domains requires significant human expertise and experimentation.
+We present DreamerV3, a general algorithm that outperforms specialized methods across over 150 diverse tasks, with a single configuration.
+Dreamer learns a model of the environment and improves its behavior by imagining future scenarios.
+Robustness techniques based on normalization, balancing, and transformations enable stable learning across domains.
+Applied out of the box, Dreamer is the first algorithm to collect diamonds in Minecraft from scratch without human data or curricula.
+This achievement has been posed as a significant challenge in artificial intelligence that requires exploring farsighted strategies from pixels and sparse rewards in an open world.
+Our work allows solving challenging control problems without extensive experimentation, making reinforcement learning broadly applicable.
+Figure 1
+:
+Benchmark summary.
+a
+, Using fixed hyperparameters across all domains, Dreamer outperforms tuned expert algorithms across a wide range of benchmarks and data budgets. Dreamer also substantially outperforms a high-quality implementation of the widely applicable PPO algorithm.
+b
+, Applied out of the box, Dreamer learns to obtain diamonds in the popular video game Minecraft from scratch given sparse rewards, a long-standing challenge in artificial intelligence for which previous approaches required human data or domain-specific heuristics.
+(a)
+Control Suite
+(b)
+Atari
+(c)
+ProcGen
+(d)
+DMLab
+(e)
+Minecraft
+Figure 2
+:
+Diverse visual domains used in the experiments. Dreamer succeeds across these domains, ranging from robot locomotion and manipulation tasks over Atari games, procedurally generated ProcGen levels, and DMLab tasks, that require spatial and temporal reasoning, to the complex and infinite world of Minecraft. We also evaluate Dreamer on non-visual domains.
+Introduction
+Reinforcement learning has enabled computers to solve tasks through interaction, such as surpassing humans in the games of Go and Dota [1, 2].
+It is also a key component for improving large language models beyond what is demonstrated in their pretraining data [3, 4].
+While PPO [5] has become a standard algorithm in the field of reinforcement learning, more specialized algorithms are often employed to achieve higher performance.
+These specialized algorithms target the unique challenges posed by different application domains, such as continuous control [6], discrete actions [7, 8], sparse rewards [9], image inputs [10], spatial environments [11], and board games [12].
+However, applying reinforcement learning algorithms to sufficiently new tasks—such as moving from video games to robotics tasks—requires substantial effort, expertise, and computational resources for tweaking the hyperparameters of the algorithm [13].
+This brittleness poses a bottleneck in applying reinforcement learning to new problems and also limits the applicability of reinforcement learning to computationally expensive models or tasks where tuning is prohibitive.
+Creating a general algorithm that learns to master new domains without having to be reconfigured has been a central challenge in artificial intelligence and would open up reinforcement learning to a wide range of practical applications.
+We present Dreamer, a general algorithm that outperforms specialized expert algorithms across a wide range of domains while using fixed hyperparameters, making reinforcement learning readily applicable to new problems.
+The algorithm is based on the idea of learning a world model that equips the agent with rich perception and the ability to imagine the future
+14
+,
+15
+,
+16
+.
+The world model predicts the outcomes of potential actions, a critic neural network judges the value of each outcome, and an actor neural network chooses actions to reach the best outcomes.
+Although intuitively appealing, robustly learning and leveraging world models to achieve strong task performance has been an open problem
+17
+.
+Dreamer overcomes this challenge through a range of robustness techniques based on normalization, balancing, and transformations.
+We observe robust learning not only across over 150 tasks from the domains summarized in
+Figure
+2
+, but also across model sizes and training budgets, offering a predictable way to increase performance.
+Notably, larger model sizes not only achieve higher scores but also require less interaction to solve a task.
+To push the boundaries of reinforcement learning, we consider the popular video game Minecraft that has become a focal point of research in recent years
+18
+,
+19
+,
+20
+, with international competitions held for developing algorithms that autonomously learn to collect diamonds in Minecraft
+[footnote 1: The MineRL Diamond Competitions were held in 2019, 2020, and 2021 and provided a dataset of human expert trajectories: https://minerl.io/diamond. Competitions in the following years focused on a wide range of tasks.]
+.
+Solving this problem without human data has been widely recognized as a substantial challenge for artificial intelligence because of the sparse rewards, exploration difficulty, long time horizons, and the procedural diversity of this open world game
+18
+.
+Due to these obstacles, previous approaches resorted to using human expert data and domain-specific curricula
+19
+,
+20
+.
+Applied out of the box, Dreamer is the first algorithm to collect diamonds in Minecraft from scratch.
+(a)
+World Model Learning
+(b)
+Actor Critic Learning
+Figure 3
+:
+Training process of Dreamer. The world model encodes sensory inputs into discrete representations $z_{t}$ that are predicted by a sequence model with recurrent state $h_{t}$ given actions $a_{t}$. The inputs are reconstructed to shape the representations. The actor and critic predict actions $a_{t}$ and values $v_{t}$ and learn from trajectories of abstract representations predicted by the world model.
+Learning algorithm
+We present the third generation of the Dreamer algorithm
+21
+,
+22
+.
+The algorithm consists of three neural networks: the world model predicts the outcomes of potential actions, the critic judges the value of each outcome, and the actor chooses actions to reach the most valuable outcomes.
+The components are trained concurrently from replayed experience while the agent interacts with the environment.
+To succeed across domains, all three components need to accommodate different signal magnitudes and robustly balance terms in their objectives.
+This is challenging as we are not only targeting similar tasks within the same domain but aim to learn across diverse domains with fixed hyperparameters.
+This section introduces the world model, critic, and actor along with their robust loss functions, as well as tools for robustly predicting quantities of unknown orders of magnitude.
+World model learning
+The world model learns compact representations of sensory inputs through autoencoding
+23
+and enables planning by predicting future representations and rewards for potential actions.
+We implement the world model as a Recurrent State-Space Model (RSSM)
+24
+, shown in
+Figure
+3
+.
+First, an encoder maps sensory inputs $x_{t}$ to stochastic representations $z_{t}$.
+Then, a sequence model with recurrent state $h_{t}$ predicts the sequence of these representations given past actions $a_{t-1}$.
+The concatenation of $h_{t}$ and $z_{t}$ forms the model state from which we predict rewards $r_{t}$ and episode continuation flags $c_{t}\in\{0,1\}$ and reconstruct the inputs to ensure informative representations:
+$$
+\begin{aligned}
+&\text{Sequence model:} && h_{t} &&= f_{\phi}(h_{t-1}, z_{t-1}, a_{t-1}) \\
+&\text{Encoder:} && z_{t} &&\sim q_{\phi}(z_{t} \mid h_{t}, x_{t}) \\
+&\text{Dynamics predictor:} && \hat{z}_{t} &&\sim p_{\phi}(\hat{z}_{t} \mid h_{t}) \\
+&\text{Reward predictor:} && \hat{r}_{t} &&\sim p_{\phi}(\hat{r}_{t} \mid h_{t}, z_{t}) \\
+&\text{Continue predictor:} && \hat{c}_{t} &&\sim p_{\phi}(\hat{c}_{t} \mid h_{t}, z_{t}) \\
+&\text{Decoder:} && \hat{x}_{t} &&\sim p_{\phi}(\hat{x}_{t} \mid h_{t}, z_{t})
+\end{aligned}
+$$
+(In the original layout a brace labelled "RSSM" groups the first three rows: sequence model, encoder, and dynamics predictor.)
+start_ROW start_CELL start_ROW start_CELL end_CELL start_CELL Sequence model: end_CELL start_CELL end_CELL start_CELL italic_h start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT end_CELL start_CELL = end_CELL start_CELL italic_f start_POSTSUBSCRIPT italic_ϕ end_POSTSUBSCRIPT ( italic_h start_POSTSUBSCRIPT italic_t - 1 end_POSTSUBSCRIPT , italic_z start_POSTSUBSCRIPT italic_t - 1 end_POSTSUBSCRIPT , italic_a start_POSTSUBSCRIPT italic_t - 1 end_POSTSUBSCRIPT ) end_CELL end_ROW start_ROW start_CELL end_CELL start_CELL Encoder: end_CELL start_CELL end_CELL start_CELL italic_z start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT end_CELL start_CELL ∼ end_CELL start_CELL italic_q start_POSTSUBSCRIPT italic_ϕ end_POSTSUBSCRIPT ( italic_z start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT | italic_h start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT , italic_x start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT ) end_CELL end_ROW start_ROW start_CELL end_CELL start_CELL Dynamics predictor: end_CELL start_CELL end_CELL start_CELL over^ start_ARG italic_z end_ARG start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT end_CELL start_CELL ∼ end_CELL start_CELL italic_p start_POSTSUBSCRIPT italic_ϕ end_POSTSUBSCRIPT ( over^ start_ARG italic_z end_ARG start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT | italic_h start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT ) end_CELL end_ROW start_ROW start_CELL end_CELL start_CELL Reward predictor: end_CELL start_CELL end_CELL start_CELL over^ start_ARG italic_r end_ARG start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT end_CELL start_CELL ∼ end_CELL start_CELL italic_p start_POSTSUBSCRIPT italic_ϕ end_POSTSUBSCRIPT ( over^ start_ARG italic_r end_ARG start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT | italic_h start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT , italic_z start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT ) end_CELL end_ROW start_ROW start_CELL end_CELL start_CELL Continue predictor: end_CELL start_CELL end_CELL start_CELL over^ start_ARG italic_c end_ARG 
start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT end_CELL start_CELL ∼ end_CELL start_CELL italic_p start_POSTSUBSCRIPT italic_ϕ end_POSTSUBSCRIPT ( over^ start_ARG italic_c end_ARG start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT | italic_h start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT , italic_z start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT ) end_CELL end_ROW start_ROW start_CELL end_CELL start_CELL Decoder: end_CELL start_CELL end_CELL start_CELL over^ start_ARG italic_x end_ARG start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT end_CELL start_CELL ∼ end_CELL start_CELL italic_p start_POSTSUBSCRIPT italic_ϕ end_POSTSUBSCRIPT ( over^ start_ARG italic_x end_ARG start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT | italic_h start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT , italic_z start_POSTSUBSCRIPT italic_t end_POSTSUBSCRIPT ) end_CELL end_ROW end_CELL end_ROW
+(1)
+Figure
+4
+visualizes long-term video predictions of the world model.
+The encoder and decoder use convolutional neural networks (CNN) for image inputs and multi-layer perceptrons (MLPs) for vector inputs.
+The dynamics, reward, and continue predictors are also MLPs.
+The representations are sampled from a vector of softmax distributions and we take straight-through gradients through the sampling step
+25
+,
+22
+.
+Given a sequence batch of inputs $x_{1:T}$, actions $a_{1:T}$, rewards $r_{1:T}$, and continuation flags $c_{1:T}$, the world model parameters $\phi$ are optimized end-to-end to minimize the prediction loss $\mathcal{L}_{\mathrm{pred}}$, the dynamics loss $\mathcal{L}_{\mathrm{dyn}}$, and the representation loss $\mathcal{L}_{\mathrm{rep}}$ with corresponding loss weights $\beta_{\mathrm{pred}}=1$, $\beta_{\mathrm{dyn}}=1$, and $\beta_{\mathrm{rep}}=0.1$:
+$$\mathcal{L}(\phi) \doteq \operatorname{E}_{q_{\phi}}\Big[\textstyle\sum_{t=1}^{T}\big(\beta_{\mathrm{pred}}\mathcal{L}_{\mathrm{pred}}(\phi)+\beta_{\mathrm{dyn}}\mathcal{L}_{\mathrm{dyn}}(\phi)+\beta_{\mathrm{rep}}\mathcal{L}_{\mathrm{rep}}(\phi)\big)\Big].$$
+(2)
+Figure 4
+:
+Multi-step video predictions of a DMLab maze (top) and a quadrupedal robot (bottom). Given 5 context images and the full action sequence, the model predicts 45 frames into the future without access to intermediate images. The world model learns an understanding of the underlying structure of each environment.
+The prediction loss trains the decoder and reward predictor via the symlog squared loss described later, and the continue predictor via logistic regression.
+The dynamics loss trains the sequence model to predict the next representation by minimizing the KL divergence between the predictor $p_{\phi}(z_{t} \mid h_{t})$ and the next stochastic representation $q_{\phi}(z_{t} \mid h_{t}, x_{t})$.
+The representation loss, in turn, trains the representations to become more predictable allowing us to use a factorized dynamics predictor for fast sampling during imagination training.
+The two losses differ in the stop-gradient operator $\operatorname{sg}(\cdot)$ and their loss scale.
+To avoid a degenerate solution where the dynamics are trivial to predict but fail to contain enough information about the inputs, we employ free bits
+26
+by clipping the dynamics and representation losses below the value of 1 nat $\approx$ 1.44 bits.
+This disables them while they are already minimized well to focus learning on the prediction loss:
+$$\begin{aligned}
+\mathcal{L}_{\mathrm{pred}}(\phi) &\doteq -\ln p_{\phi}(x_{t} \mid z_{t}, h_{t}) - \ln p_{\phi}(r_{t} \mid z_{t}, h_{t}) - \ln p_{\phi}(c_{t} \mid z_{t}, h_{t}) \\
+\mathcal{L}_{\mathrm{dyn}}(\phi) &\doteq \max\bigl(1, \operatorname{KL}\bigl[\operatorname{sg}(q_{\phi}(z_{t} \mid h_{t}, x_{t})) \,\big\|\, p_{\phi}(z_{t} \mid h_{t})\bigr]\bigr) \\
+\mathcal{L}_{\mathrm{rep}}(\phi) &\doteq \max\bigl(1, \operatorname{KL}\bigl[q_{\phi}(z_{t} \mid h_{t}, x_{t}) \,\big\|\, \operatorname{sg}(p_{\phi}(z_{t} \mid h_{t}))\bigr]\bigr)
+\end{aligned}$$
+(3)
+Previous world models require scaling the representation loss differently based on the visual complexity of the environment
+21
+.
+Complex 3D environments contain details unnecessary for control and thus prompt a stronger regularizer to simplify the representations and make them more predictable.
+In games with static backgrounds and where individual pixels may matter for the task, a weak regularizer is required to extract fine details.
+We find that combining free bits with a small representation loss resolves this dilemma, allowing for fixed hyperparameters across domains.
+Moreover, we transform vector observations using the symlog function described later, to prevent large inputs and large reconstruction gradients, further stabilizing the trade-off with the representation loss.
+We occasionally observed spikes in the KL losses in earlier experiments, consistent with reports for deep variational autoencoders
+27
+.
+To prevent this, we parameterize the categorical distributions of the encoder and dynamics predictor as mixtures of 1% uniform and 99% neural network output, making it impossible for them to become deterministic and thus ensuring well-behaved KL losses.
+Further model details and hyperparameters are included in the supplementary material.
+Critic learning
+The actor and critic neural networks learn behaviors purely from abstract trajectories of representations predicted by the world model
+14
+.
+For environment interaction, we select actions by sampling from the actor network without lookahead planning.
+The actor and critic operate on model states $s_{t} \doteq \{h_{t}, z_{t}\}$ and thus benefit from the Markovian representations learned by the recurrent world model.
+The actor aims to maximize the return $R_{t} \doteq \sum_{\tau=0}^{\infty} \gamma^{\tau} r_{t+\tau}$ with a discount factor $\gamma = 0.997$ for each model state.
+To consider rewards beyond the prediction horizon $T = 16$, the critic learns to approximate the distribution of returns
+28
+for each state under the current actor behavior:
+$$\text{Actor:}\quad a_{t} \sim \pi_{\theta}(a_{t} \mid s_{t}) \qquad \text{Critic:}\quad v_{\psi}(R_{t} \mid s_{t})$$
+(4)
+Starting from representations of replayed inputs, the world model and actor generate a trajectory of imagined model states $s_{1:T}$, actions $a_{1:T}$, rewards $r_{1:T}$, and continuation flags $c_{1:T}$.
+Because the critic predicts a distribution, we read out its predicted values $v_{t} \doteq \operatorname{E}[v_{\psi}(\,\cdot \mid s_{t})]$ as the expectation of the distribution.
+To estimate returns that consider rewards beyond the prediction horizon, we compute bootstrapped $\lambda$-returns
+29
+that integrate the predicted rewards and the values.
+The critic learns to predict the distribution of the return estimates $R^{\lambda}_{t}$ using the maximum likelihood loss:
+$$\mathcal{L}(\psi) \doteq -\textstyle\sum_{t=1}^{T} \ln p_{\psi}(R^{\lambda}_{t} \mid s_{t}) \qquad R^{\lambda}_{t} \doteq r_{t} + \gamma c_{t} \Big((1-\lambda) v_{t} + \lambda R^{\lambda}_{t+1}\Big) \qquad R^{\lambda}_{T} \doteq v_{T}$$
+(5)
+While a simple choice would be to parameterize the critic as a Normal distribution, the return distribution can have multiple modes and vary by orders of magnitude across environments. To stabilize and accelerate learning under these conditions, we parameterize the critic as a categorical distribution with exponentially spaced bins, decoupling the scale of gradients from the prediction targets as described later.
+To improve value prediction in environments where rewards are challenging to predict, we apply the critic loss both to imagined trajectories with loss scale $\beta_{\mathrm{val}}=1$ and to trajectories sampled from the replay buffer with loss scale $\beta_{\mathrm{repval}}=0.3$. The critic replay loss uses the imagination returns $R^{\lambda}_{t}$ at the start states of the imagination rollouts as on-policy value annotations for the replay trajectory to then compute $\lambda$-returns over the replay rewards.
+Because the critic regresses targets that depend on its own predictions, we stabilize learning by regularizing the critic towards predicting the outputs of an exponentially moving average of its own parameters.
+This is similar to target networks used previously in reinforcement learning
+7
+but allows us to compute returns using the current critic network.
+We further noticed that the randomly initialized reward predictor and critic networks at the start of training can result in large predicted rewards that can delay the onset of learning.
+We thus initialize the output weight matrix of the reward predictor and critic to zeros, which alleviates the problem and accelerates early learning.
+Actor learning
+The actor learns to choose actions that maximize return while exploring through an entropy regularizer
+30
+.
+However, the correct scale for this regularizer depends both on the scale and frequency of rewards in the environment.
+Ideally, we would like the agent to explore more if rewards are sparse and exploit more if rewards are dense or nearby. At the same time, the exploration amount should not be influenced by arbitrary scaling of rewards in the environment.
+This requires normalizing the return scale while preserving information about reward frequency.
+To use a fixed entropy scale of $\eta = 3 \times 10^{-4}$ across domains, we normalize returns to be approximately contained in the interval $[0, 1]$. In practice, subtracting an offset from the returns does not change the actor gradient and thus dividing by the range $S$ is sufficient. Moreover, to avoid amplifying noise from function approximation under sparse rewards, we only scale down large return magnitudes but leave small returns below the threshold of $L = 1$ untouched. We use the Reinforce estimator
+31
+for both discrete and continuous actions, resulting in the surrogate loss function:
+$$\mathcal{L}(\theta) \doteq -\textstyle\sum_{t=1}^{T} \operatorname{sg}\!\Big(\big(R^{\lambda}_{t} - v_{\psi}(s_{t})\big) / \max(1, S)\Big) \log \pi_{\theta}(a_{t} \mid s_{t}) + \eta \operatorname{H}\big[\pi_{\theta}(a_{t} \mid s_{t})\big]$$
+(6)
+The return distribution can be multi-modal and include outliers, especially for randomized environments where some episodes have higher achievable returns than others. Normalizing by the smallest and largest observed returns would then scale returns down too much and may cause suboptimal convergence. To be robust to these outliers, we compute the range from the 5th to the 95th return percentile over the return batch and smooth out the estimate using an exponential moving average:
+$$S \doteq \operatorname{EMA}\big(\operatorname{Per}(R^{\lambda}_{t}, 95) - \operatorname{Per}(R^{\lambda}_{t}, 5),\, 0.99\big)$$
+(7)
+Previous work typically normalizes advantages
+5
+rather than returns, which puts a fixed amount of emphasis on maximizing returns over entropy regardless of whether rewards are within reach.
+Scaling up advantages when rewards are sparse can amplify noise that outweighs the entropy regularizer and stagnates exploration.
+Normalizing rewards or returns by standard deviation can fail under sparse rewards where their standard deviation is near zero, drastically amplifying rewards regardless of their size.
+Constrained optimization targets a fixed entropy on average across states
+32
+,
+33
+regardless of achievable returns, which is robust but explores slowly under sparse rewards and converges lower under dense rewards.
+We did not find stable hyperparameters across domains for these approaches.
+Return normalization with a denominator limit overcomes these challenges, exploring rapidly under sparse rewards and converging to high performance across diverse domains.
+Robust predictions
+Reconstructing inputs and predicting rewards and returns can be challenging because the scale of these quantities can vary across domains.
+Predicting large targets using a squared loss can lead to divergence whereas absolute and Huber losses
+7
+stagnate learning.
+On the other hand, normalizing targets based on running statistics
+5
+introduces non-stationarity into the optimization.
+We suggest the symlog squared error as a simple solution to this dilemma.
+For this, a neural network $f(x,\theta)$ with inputs $x$ and parameters $\theta$ learns to predict a transformed version of its targets $y$.
+To read out predictions $\hat{y}$ of the network, we apply the inverse transformation:
+$$\mathcal{L}(\theta) \doteq \tfrac{1}{2}\big(f(x,\theta) - \operatorname{symlog}(y)\big)^{2} \qquad \hat{y} \doteq \operatorname{symexp}\big(f(x,\theta)\big)$$
+(8)
+Using the logarithm as transformation would not allow us to predict targets that take on negative values.
+Therefore, we choose a function from the bi-symmetric logarithmic family
+34
+that we name symlog as the transformation with the symexp function as its inverse:
+$$\operatorname{symlog}(x) \doteq \operatorname{sign}(x)\ln\big(|x|+1\big) \qquad \operatorname{symexp}(x) \doteq \operatorname{sign}(x)\big(\exp(|x|)-1\big)$$
+(9)
+The symlog function compresses the magnitudes of both large positive and negative values.
+Unlike the logarithm, it is symmetric around the origin while preserving the input sign.
+This allows the optimization process to quickly move the network predictions to large values when needed.
+The symlog function approximates the identity around the origin so that it does not affect learning of targets that are already small enough.
+For potentially stochastic targets, such as rewards or returns, we introduce the symexp twohot loss. Here, the network outputs the logits for a softmax distribution over exponentially spaced bins $b_{i} \in B$.
+Predictions are read out as the weighted average of the bin positions weighted by their predicted probabilities. Importantly, the network can output any continuous value in the interval because the weighted average can fall between the buckets:
+$$\hat{y} \doteq \operatorname{softmax}(f(x))^{T} B \qquad B \doteq \operatorname{symexp}\bigl(\begin{bmatrix} -20 & \dots & +20 \end{bmatrix}\bigr)$$
+(10)
+The network is trained on twohot encoded targets
+8
+,
+28
+, a generalization of onehot encoding to continuous values. The twohot encoding of a scalar is a vector with $|B|$ entries that are all $0$ except at the indices $k$ and $k+1$ of the two bins closest to the encoded scalar. The two entries sum up to $1$, with linearly higher weight given to the bin that is closer to the encoded continuous number.
+The network is then trained to minimize the categorical cross entropy loss for classification with soft targets. Note that the loss only depends on the probabilities assigned to the bins but not on the continuous values associated with the bin locations, decoupling the size of the gradients from the size of the targets:
+$$\mathcal{L}(\theta) \doteq -\operatorname{twohot}(y)^{T} \log \operatorname{softmax}(f(x,\theta))$$
+(11)
+Applying these principles, Dreamer transforms vector observations using the symlog functions, both for the encoder inputs and the decoder targets and employs the symexp twohot loss for the reward predictor and critic.
+We find that these techniques enable robust and fast learning across many diverse domains.
+For critic learning, an alternative asymmetric transformation has previously been proposed
+35
+, which we found less effective on average across domains.
+Unlike alternatives, symlog transformations avoid truncating large targets
+7
+, introducing non-stationarity from normalization
+5
+, or adjusting network weights when new extreme values are detected
+36
+.
+Figure 5
+:
+Fraction of trained agents that discover each of the three latest items in the Minecraft Diamond task. Although previous algorithms progress up to the iron pickaxe, Dreamer is the only compared algorithm that manages to discover a diamond, and does so reliably.
+Figure 6
+:
+Ablations and robust scaling of Dreamer.
+a
+, All individual robustness techniques contribute to the performance of Dreamer on average, although each individual technique may only affect some tasks. Training curves of individual tasks are included in the supplementary material.
+b
+, The performance of Dreamer predominantly rests on the unsupervised reconstruction loss of its world model, unlike most prior algorithms that rely predominantly on reward and value prediction gradients
+7
+,
+5
+,
+8
+.
+c
+, The performance of Dreamer increases monotonically with larger model sizes, ranging from 12M to 400M parameters. Notably, larger models not only increase task performance but also require less environment interaction.
+d
+, Higher replay ratios predictably increase the performance of Dreamer. Together with model size, this allows practitioners to improve task performance and data-efficiency by employing more computational resources.
+Results
+We evaluate the generality of Dreamer across 8 domains—with over 150 tasks—under fixed hyperparameters.
+We designed the experiments to compare Dreamer to the best methods in the literature, which are often specifically designed and tuned for the benchmark at hand.
+We further compare to a high-quality implementation of PPO
+5
+, a standard reinforcement learning algorithm that is known for its robustness. We run PPO with fixed hyperparameters chosen to maximize performance across domains and that reproduce strong published results of PPO on ProcGen
+37
+.
+To push the boundaries of reinforcement learning, we apply Dreamer to the challenging video game Minecraft, comparing it to strong previous algorithms.
+Finally, we analyze the importance of individual components of Dreamer and its robustness to different model sizes and computational budgets.
+All Dreamer agents are trained on a single Nvidia A100 GPU each, making it reproducible for many research labs.
+A public implementation of Dreamer that reproduces all results is available on the project website.
+Benchmarks
+We perform an extensive empirical study across 8 domains that include continuous and discrete actions, visual and low-dimensional inputs, dense and sparse rewards, different reward scales, 2D and 3D worlds, and procedural generation.
+Figure
+1
+summarizes the benchmark results, showing that Dreamer outperforms a wide range of previous expert algorithms across diverse domains. Crucially, Dreamer substantially outperforms PPO across all domains.
+•
+Atari
+This established benchmark contains 57 Atari 2600 games with a budget of 200M frames, posing a diverse range of challenges
+38
+.
+We use the sticky action simulator setting
+39
+.
+Dreamer outperforms the powerful MuZero algorithm
+8
+while using only a fraction of the computational resources.
+Dreamer also outperforms the widely-used expert algorithms Rainbow
+40
+and IQN
+41
+.
+•
+ProcGen
+This benchmark of 16 games features randomized levels and visual distractions to test the robustness and generalization of agents
+42
+. Within the budget of 50M frames, Dreamer matches the tuned expert algorithm PPG
+37
+and outperforms Rainbow
+42
+,
+40
+. Our PPO agent with fixed hyperparameters matches the published score of the highly tuned official PPO implementation
+37
+.
+•
+DMLab
+This suite of 30 tasks features 3D environments that test spatial and temporal reasoning
+43
+. In 100M frames, Dreamer exceeds the performance of the scalable IMPALA and R2D2+ agents
+35
+at 1B environment steps, amounting to a data-efficiency gain of over 1000%.
+We note that these baselines were not designed for data-efficiency but serve as a valuable comparison point for the performance previously achievable at scale.
+•
+Atari100k
+This data-efficiency benchmark contains 26 Atari games and a budget of only 400K frames, amounting to 2 hours of game time
+17
+. EfficientZero
+44
+holds the state-of-the-art by combining online tree search, prioritized replay, and hyperparameter scheduling, but also resets levels early to increase data diversity, making a comparison difficult.
+Without this complexity, Dreamer outperforms the best remaining methods, including the transformer-based IRIS and TWM agents, the model-free SPR, and SimPLe
+45
+.
+•
+Proprio Control
+This benchmark contains 18 control tasks with continuous actions, proprioceptive vector inputs, and a budget of 500K environment steps
+46
+. The tasks range from classical control over locomotion to robot manipulation tasks, featuring dense and sparse rewards. Dreamer sets a new state-of-the-art on this benchmark,
+outperforming D4PG, DMPO, and MPO
+33
+.
+•
+Visual Control
+This benchmark consists of 20 continuous control tasks where the agent receives only high-dimensional images as input and has a budget of 1M environment steps
+46
+.
+Dreamer establishes a new state-of-the-art on this benchmark, outperforming DrQ-v2 and CURL
+47
+, which are specialized to visual environments and leverage data augmentation.
+•
+BSuite
+This benchmark includes 23 environments with a total of 468 configurations that are specifically designed to test credit assignment, robustness to reward scale and stochasticity, memory, generalization, and exploration
+48
+. Dreamer establishes a new state-of-the-art on this benchmark, outperforming Boot DQN and other methods
+49
+.
+Dreamer improves over previous algorithms especially in the scale robustness category.
+Minecraft
+Collecting diamonds in the popular game Minecraft has been a long-standing challenge in artificial intelligence
+18
+,
+19
+,
+20
+.
+Every episode in this game is set in a unique randomly generated and infinite 3D world. Episodes last until the player dies or up to 36000 steps equaling 30 minutes, during which the player needs to discover a sequence of 12 items from sparse rewards by foraging for resources and crafting tools. It takes about 20 minutes for experienced human players to obtain diamonds
+20
+.
+We follow the block breaking setting of prior work
+19
+because the provided action space would make it challenging for stochastic policies to keep a key pressed for a prolonged time.
+Because of the training time in this complex domain, extensive tuning would be difficult for Minecraft.
+Instead, we apply Dreamer out of the box with its default hyperparameters.
+As shown in
+Figures
+5
+and
+1
+, Dreamer is the first algorithm to collect diamonds in Minecraft from scratch without using human data as was required by VPT
+20
+or adaptive curricula
+19
+.
+All the Dreamer agents we trained on Minecraft discover diamonds in 100M environment steps.
+While several strong baselines progress to advanced items such as the iron pickaxe, none of them discovers a diamond.
+Ablations
+In
+Figure
+6
+, we ablate the robustness techniques and learning signals on a diverse set of 14 tasks to understand their importance.
+The training curves of individual tasks are included in the supplementary material.
+We observe that all robustness techniques contribute to performance, most notably the KL objective of the world model, followed by return normalization and symexp twohot regression for reward and value prediction. In general, we find that each individual technique is critical on a subset of tasks but may not affect performance on other tasks.
+To investigate the effect of the world model, we ablate the learning signals of Dreamer by stopping either the task-specific reward and value prediction gradients or the task-agnostic reconstruction gradients from shaping its representations. Unlike previous reinforcement learning algorithms that often rely only on task-specific learning signals
+7
+,
+8
+, Dreamer rests predominantly on the unsupervised objective of its world model. This finding could allow for future algorithm variants that leverage pretraining on unsupervised data.
+Scaling properties
+To investigate whether Dreamer can scale robustly, we train 6 model sizes ranging from 12M to 400M parameters, as well as different replay ratios on Crafter
+50
+and a DMLab task
+43
+. The replay ratio affects the number of gradient updates performed by the agent.
+Figure
+6
+shows robust learning with fixed hyperparameters across the compared model sizes and replay ratios.
+Moreover, increasing the model size directly translates to both higher task performance and a lower data requirement.
+Increasing the number of gradient steps further reduces the interactions needed to learn successful behaviors.
+The results show that Dreamer learns robustly across model sizes and replay ratios and provides a predictable way of increasing performance given computational resources.
+Previous work
+Developing general-purpose algorithms has long been a goal of reinforcement learning research.
+PPO
+5
+is one of the most widely used algorithms and is relatively robust but requires large amounts of experience and often yields lower performance than specialized alternatives.
+SAC
+32
+is a popular choice for continuous control and leverages experience replay for data-efficiency, but in practice requires tuning, especially for its entropy scale, and struggles under high-dimensional inputs
+51
+.
+MuZero
+8
+plans using a value prediction model and has been applied to board games and Atari, but the authors did not release an implementation and the algorithm contains several complex components, making it challenging to reproduce.
+Gato
+52
+fits one large model to expert demonstrations of multiple tasks, but is only applicable when expert data is available.
+In comparison, Dreamer masters a diverse range of environments with fixed hyperparameters, does not require expert data, and its implementation is open source.
+Minecraft has been a focus of recent research. With MALMO
+53
+, Microsoft released a free version of the successful game for research purposes.
+MineRL
+18
+offers several competition environments, which we rely on as the basis for our experiments.
+The MineRL competition supports agents in exploring and learning meaningful skills through a diverse human dataset
+18
+.
+Voyager obtains items at a similar depth in the technology tree as Dreamer using API calls to a language model but operates on top of the MineFlayer bot scripting layer that was specifically engineered to the game and exposes high-level actions
+54
+.
+VPT
+20
+trained an agent to play Minecraft through behavioral cloning based on expert data of keyboard and mouse actions collected by contractors and finetuning using reinforcement learning to obtain diamonds using 720 GPUs for 9 days.
+In comparison, Dreamer uses the MineRL competition action space to autonomously learn to collect diamonds from sparse rewards using 1 GPU for 9 days, without human data.
+Conclusion
+We present the third generation of the Dreamer algorithm, a general reinforcement learning algorithm that masters a wide range of domains with fixed hyperparameters.
+Dreamer excels not only across over 150 tasks but also learns robustly across varying data and compute budgets, moving reinforcement learning toward a wide range of practical applications.
+Applied out of the box, Dreamer is the first algorithm to collect diamonds in Minecraft from scratch, achieving a significant milestone in the field of artificial intelligence.
+As a high-performing algorithm that is based on a learned world model, Dreamer paves the way for future research directions, including teaching agents world knowledge from internet videos and learning a single world model across domains to allow artificial agents to build up increasingly general knowledge and competency.
+Acknowledgements
+We thank Mohammad Norouzi, Jessy Lin, Abbas Abdolmaleki, John Schulman, Adam Kosiorek, and Oleh Rybkin for insightful discussions.
+We thank Bobak Shahriari, Denis Yarats, Karl Cobbe,
+and Hubert Soyer for sharing training curves of baseline algorithms. We thank Daniel Furrer, Andrew Chen, and Dakshesh Garambha for help with Google Cloud infrastructure.
+References
+Silver et al. 2016
+David Silver, Aja Huang, Chris J Maddison, Arthur Guez, Laurent Sifre, George
+Van Den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda
+Panneershelvam, Marc Lanctot, et al.
+Mastering the game of go with deep neural networks and tree search.
+nature
+, 529(7587):484, 2016.
+OpenAI 2018
+OpenAI.
+OpenAI Five.
+https://blog.openai.com/openai-five/
+, 2018.
+Ouyang et al. 2022
+Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela
+Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al.
+Training language models to follow instructions with human feedback.
+Advances in neural information processing systems
+,
+35:27730–27744, 2022.
+Le et al. 2022
+Hung Le, Yue Wang, Akhilesh Deepak Gotmare, Silvio Savarese, and Steven
+Chu Hong Hoi.
+Coderl: Mastering code generation through pretrained models and deep
+reinforcement learning.
+Advances in Neural Information Processing Systems
+,
+35:21314–21328, 2022.
+Schulman et al. 2017
+John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017.
+Lillicrap et al. 2015
+Timothy P Lillicrap, Jonathan J Hunt, Alexander Pritzel, Nicolas Heess, Tom
+Erez, Yuval Tassa, David Silver, and Daan Wierstra.
+Continuous control with deep reinforcement learning.
+arXiv preprint arXiv:1509.02971
+, 2015.
+Mnih et al. 2015
+Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Andrei A Rusu, Joel Veness,
+Marc G Bellemare, Alex Graves, Martin Riedmiller, Andreas K Fidjeland, Georg
+Ostrovski, et al.
+Human-level control through deep reinforcement learning.
+Nature
+, 518(7540):529, 2015.
+Schrittwieser et al. 2019
+Julian Schrittwieser, Ioannis Antonoglou, Thomas Hubert, Karen Simonyan,
+Laurent Sifre, Simon Schmitt, Arthur Guez, Edward Lockhart, Demis Hassabis,
+Thore Graepel, et al.
+Mastering atari, go, chess and shogi by planning with a learned
+model.
+arXiv preprint arXiv:1911.08265
+, 2019.
+Jaderberg et al. 2016
+Max Jaderberg, Volodymyr Mnih, Wojciech Marian Czarnecki, Tom Schaul, Joel Z
+Leibo, David Silver, and Koray Kavukcuoglu.
+Reinforcement learning with unsupervised auxiliary tasks.
+arXiv preprint arXiv:1611.05397
+, 2016.
+Anand et al. 2019
+Ankesh Anand, Evan Racah, Sherjil Ozair, Yoshua Bengio, Marc-Alexandre
+Côté, and R Devon Hjelm.
+Unsupervised state representation learning in atari.
+Advances in neural information processing systems
+, 32, 2019.
+Driess et al. 2022
+Danny Driess, Ingmar Schubert, Pete Florence, Yunzhu Li, and Marc Toussaint.
+Reinforcement learning with neural radiance fields.
+arXiv preprint arXiv:2206.01634
+, 2022.
+Silver et al. 2017
+David Silver, Julian Schrittwieser, Karen Simonyan, Ioannis Antonoglou, Aja
+Huang, Arthur Guez, Thomas Hubert, Lucas Baker, Matthew Lai, Adrian Bolton,
+et al.
+Mastering the game of go without human knowledge.
+Nature
+, 550(7676):354, 2017.
+Andrychowicz et al. 2020
+Marcin Andrychowicz, Anton Raichuk, Piotr Stańczyk, Manu Orsini, Sertan
+Girgin, Raphael Marinier, Léonard Hussenot, Matthieu Geist, Olivier
+Pietquin, Marcin Michalski, et al.
+What matters in on-policy reinforcement learning? a large-scale
+empirical study.
+arXiv preprint arXiv:2006.05990
+, 2020.
+Sutton 1991
+Richard S Sutton.
+Dyna, an integrated architecture for learning, planning, and
+reacting.
+ACM SIGART Bulletin
+, 2(4):160–163, 1991.
+Finn and Levine 2017
+Chelsea Finn and Sergey Levine.
+Deep visual foresight for planning robot motion.
+In
+2017 IEEE International Conference on Robotics and
+Automation (ICRA)
+, pages 2786–2793. IEEE, 2017.
+Ha and Schmidhuber 2018
+David Ha and Jürgen Schmidhuber.
+World models.
+arXiv preprint arXiv:1803.10122
+, 2018.
+Kaiser et al. 2019
+Lukasz Kaiser, Mohammad Babaeizadeh, Piotr Milos, Blazej Osinski, Roy H
+Campbell, Konrad Czechowski, Dumitru Erhan, Chelsea Finn, Piotr Kozakowski,
+Sergey Levine, et al.
+Model-based reinforcement learning for atari.
+arXiv preprint arXiv:1903.00374
+, 2019.
+Guss et al. 2019
+William H Guss, Cayden Codel, Katja Hofmann, Brandon Houghton, Noboru Kuno,
+Stephanie Milani, Sharada Mohanty, Diego Perez Liebana, Ruslan Salakhutdinov,
+Nicholay Topin, et al.
+The minerl competition on sample efficient reinforcement learning
+using human priors.
+arXiv e-prints
+, pages arXiv–1904, 2019.
+Kanitscheider et al. 2021
+Ingmar Kanitscheider, Joost Huizinga, David Farhi, William Hebgen Guss, Brandon
+Houghton, Raul Sampedro, Peter Zhokhov, Bowen Baker, Adrien Ecoffet, Jie
+Tang, et al.
+Multi-task curriculum learning in a complex, visual, hard-exploration
+domain: Minecraft.
+arXiv preprint arXiv:2106.14876
+, 2021.
+Baker et al. 2022
+Bowen Baker, Ilge Akkaya, Peter Zhokhov, Joost Huizinga, Jie Tang, Adrien
+Ecoffet, Brandon Houghton, Raul Sampedro, and Jeff Clune.
+Video pretraining (vpt): Learning to act by watching unlabeled online
+videos.
+arXiv preprint arXiv:2206.11795
+, 2022.
+Hafner et al. 2019
+Danijar Hafner, Timothy Lillicrap, Jimmy Ba, and Mohammad Norouzi.
+Dream to control: Learning behaviors by latent imagination.
+arXiv preprint arXiv:1912.01603
+, 2019.
+Hafner et al. 2020
+Danijar Hafner, Timothy Lillicrap, Mohammad Norouzi, and Jimmy Ba.
+Mastering atari with discrete world models.
+arXiv preprint arXiv:2010.02193
+, 2020.
+Kingma and Welling 2013
+Diederik P Kingma and Max Welling.
+Auto-encoding variational bayes.
+arXiv preprint arXiv:1312.6114
+, 2013.
+Hafner et al. 2018
+Danijar Hafner, Timothy Lillicrap, Ian Fischer, Ruben Villegas, David Ha,
+Honglak Lee, and James Davidson.
+Learning latent dynamics for planning from pixels.
+arXiv preprint arXiv:1811.04551
+, 2018.
+Bengio et al. 2013
+Yoshua Bengio, Nicholas Léonard, and Aaron Courville.
+Estimating or propagating gradients through stochastic neurons for
+conditional computation.
+arXiv preprint arXiv:1308.3432
+, 2013.
+Kingma et al. 2016
+Durk P Kingma, Tim Salimans, Rafal Jozefowicz, Xi Chen, Ilya Sutskever, and Max
+Welling.
+Improved variational inference with inverse autoregressive flow.
+Advances in neural information processing systems
+, 29, 2016.
+Child 2020
+Rewon Child.
+Very deep vaes generalize autoregressive models and can outperform
+them on images.
+arXiv preprint arXiv:2011.10650
+, 2020.
+Bellemare et al. 2017
+Marc G Bellemare, Will Dabney, and Rémi Munos.
+A distributional perspective on reinforcement learning.
+In
+International Conference on Machine Learning
+, pages
+449–458. PMLR, 2017.
+Sutton and Barto 2018
+Richard S Sutton and Andrew G Barto.
+Reinforcement learning: An introduction
+.
+MIT press, 2018.
+Williams and Peng 1991
+Ronald J Williams and Jing Peng.
+Function optimization using connectionist reinforcement learning
+algorithms.
+Connection Science
+, 3(3):241–268, 1991.
+Williams 1992
+Ronald J Williams.
+Simple statistical gradient-following algorithms for connectionist
+reinforcement learning.
+Machine learning
+, 8(3-4):229–256, 1992.
+Haarnoja et al. 2018
+Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine.
+Soft actor-critic: Off-policy maximum entropy deep reinforcement
+learning with a stochastic actor.
+arXiv preprint arXiv:1801.01290
+, 2018.
+Abdolmaleki et al. 2018
+Abbas Abdolmaleki, Jost Tobias Springenberg, Yuval Tassa, Remi Munos, Nicolas
+Heess, and Martin Riedmiller.
+Maximum a posteriori policy optimisation.
+arXiv preprint arXiv:1806.06920
+, 2018.
+Webber 2012
+J Beau W Webber.
+A bi-symmetric log transformation for wide-range data.
+Measurement Science and Technology
+, 24(2):027001, 2012.
+Kapturowski et al. 2018
+Steven Kapturowski, Georg Ostrovski, John Quan, Remi Munos, and Will Dabney.
+Recurrent experience replay in distributed reinforcement learning.
+In
+International conference on learning representations
+, 2018.
+Hessel et al. 2019
+Matteo Hessel, Hubert Soyer, Lasse Espeholt, Wojciech Czarnecki, Simon Schmitt,
+and Hado van Hasselt.
+Multi-task deep reinforcement learning with popart.
+In
+Proceedings of the AAAI Conference on Artificial
+Intelligence
+, volume 33, pages 3796–3803, 2019.
+Cobbe et al. 2021
+Karl W Cobbe, Jacob Hilton, Oleg Klimov, and John Schulman.
+Phasic policy gradient.
+In
+International Conference on Machine Learning
+, pages
+2020–2027. PMLR, 2021.
+Bellemare et al. 2013
+Marc G Bellemare, Yavar Naddaf, Joel Veness, and Michael Bowling.
+The arcade learning environment: An evaluation platform for general
+agents.
+Journal of Artificial Intelligence Research
+, 47:253–279, 2013.
+Machado et al. 2018a
+Marlos C Machado, Marc G Bellemare, Erik Talvitie, Joel Veness, Matthew
+Hausknecht, and Michael Bowling.
+Revisiting the arcade learning environment: Evaluation protocols and
+open problems for general agents.
+Journal of Artificial Intelligence Research
+, 61:523–562, 2018a.
+Hessel et al. 2018
+Matteo Hessel, Joseph Modayil, Hado Van Hasselt, Tom Schaul, Georg Ostrovski,
+Will Dabney, Dan Horgan, Bilal Piot, Mohammad Azar, and David Silver.
+Rainbow: Combining improvements in deep reinforcement learning.
+In
+Thirty-Second AAAI Conference on Artificial Intelligence
+,
+2018.
+Dabney et al. 2018
+Will Dabney, Georg Ostrovski, David Silver, and Rémi Munos.
+Implicit quantile networks for distributional reinforcement learning.
+In
+International conference on machine learning
+, pages
+1096–1105. PMLR, 2018.
+Cobbe et al. 2020
+Karl Cobbe, Chris Hesse, Jacob Hilton, and John Schulman.
+Leveraging procedural generation to benchmark reinforcement learning.
+In
+International conference on machine learning
+, pages
+2048–2056. PMLR, 2020.
+Beattie et al. 2016
+Charles Beattie, Joel Z Leibo, Denis Teplyashin, Tom Ward, Marcus Wainwright,
+Heinrich Küttler, Andrew Lefrancq, Simon Green, Víctor Valdés,
+Amir Sadik, et al.
+Deepmind lab.
+arXiv preprint arXiv:1612.03801
+, 2016.
+Ye et al. 2021
+Weirui Ye, Shaohuai Liu, Thanard Kurutach, Pieter Abbeel, and Yang Gao.
+Mastering atari games with limited data.
+Advances in Neural Information Processing Systems
+,
+34:25476–25488, 2021.
+Micheli et al. 2022
+Vincent Micheli, Eloi Alonso, and François Fleuret.
+Transformers are sample efficient world models.
+arXiv preprint arXiv:2209.00588
+, 2022.
+Tassa et al. 2018
+Yuval Tassa, Yotam Doron, Alistair Muldal, Tom Erez, Yazhe Li, Diego de Las
+Casas, David Budden, Abbas Abdolmaleki, Josh Merel, Andrew Lefrancq, et al.
+Deepmind control suite.
+arXiv preprint arXiv:1801.00690
+, 2018.
+Yarats et al. 2021
+Denis Yarats, Rob Fergus, Alessandro Lazaric, and Lerrel Pinto.
+Mastering visual continuous control: Improved data-augmented
+reinforcement learning.
+arXiv preprint arXiv:2107.09645
+, 2021.
+Osband et al. 2019
+Ian Osband, Yotam Doron, Matteo Hessel, John Aslanides, Eren Sezener, Andre
+Saraiva, Katrina McKinney, Tor Lattimore, Csaba Szepesvari, Satinder Singh,
+et al.
+Behaviour suite for reinforcement learning.
+arXiv preprint arXiv:1908.03568
+, 2019.
+Dizon-Paradis et al. 2023
+Olivia Dizon-Paradis, Stephen Wormald, Daniel Capecci, Avanti Bhandarkar, and
+Damon Woodard.
+Investigating the practicality of existing reinforcement learning
+algorithms: A performance comparison.
+Authorea Preprints
+, 2023.
+Hafner 2021
+Danijar Hafner.
+Benchmarking the spectrum of agent capabilities.
+arXiv preprint arXiv:2109.06780
+, 2021.
+Yarats et al. 2019
+Denis Yarats, Amy Zhang, Ilya Kostrikov, Brandon Amos, Joelle Pineau, and Rob
+Fergus.
+Improving sample efficiency in model-free reinforcement learning from
+images.
+arXiv preprint arXiv:1910.01741
+, 2019.
+Reed et al. 2022
+Scott Reed, Konrad Zolna, Emilio Parisotto, Sergio Gomez Colmenarejo, Alexander
+Novikov, Gabriel Barth-Maron, Mai Gimenez, Yury Sulsky, Jackie Kay,
+Jost Tobias Springenberg, et al.
+A generalist agent.
+arXiv preprint arXiv:2205.06175
+, 2022.
+Johnson et al. 2016
+Matthew Johnson, Katja Hofmann, Tim Hutton, and David Bignell.
+The malmo platform for artificial intelligence experimentation.
+In
+IJCAI
+, pages 4246–4247. Citeseer, 2016.
+Wang et al. 2023
+Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu,
+Linxi Fan, and Anima Anandkumar.
+Voyager: An open-ended embodied agent with large language models.
+arXiv preprint arXiv:2305.16291
+, 2023.
+Huang et al. 2022
+Shengyi Huang, Rousslan Fernand Julien Dossa, Antonin Raffin, Anssi Kanervisto,
+and Weixun Wang.
+The 37 implementation details of proximal policy optimization.
+The ICLR Blog Track 2023
+, 2022.
+Hoffman et al. 2020
+Matt Hoffman, Bobak Shahriari, John Aslanides, Gabriel Barth-Maron, Feryal
+Behbahani, Tamara Norman, Abbas Abdolmaleki, Albin Cassirer, Fan Yang, Kate
+Baumli, et al.
+Acme: A research framework for distributed reinforcement learning.
+arXiv preprint arXiv:2006.00979
+, 2020.
+Schmitt et al. 2020
+Simon Schmitt, Matteo Hessel, and Karen Simonyan.
+Off-policy actor-critic with shared experience replay.
+In
+International Conference on Machine Learning
+, pages
+8545–8554. PMLR, 2020.
+Schaul et al. 2015
+Tom Schaul, John Quan, Ioannis Antonoglou, and David Silver.
+Prioritized experience replay.
+arXiv preprint arXiv:1511.05952
+, 2015.
+Brock et al. 2021
+Andy Brock, Soham De, Samuel L Smith, and Karen Simonyan.
+High-performance large-scale image recognition without normalization.
+In
+International Conference on Machine Learning
+, pages
+1059–1071. PMLR, 2021.
+Ziyin et al. 2020
+Liu Ziyin, Zhikang T Wang, and Masahito Ueda.
+Laprop: Separating momentum and adaptivity in adam.
+arXiv preprint arXiv:2002.04839
+, 2020.
+Kingma and Ba 2014
+Diederik P Kingma and Jimmy Ba.
+Adam: A method for stochastic optimization.
+arXiv preprint arXiv:1412.6980
+, 2014.
+Gruslys et al. 2017
+Audrunas Gruslys, Will Dabney, Mohammad Gheshlaghi Azar, Bilal Piot, Marc
+Bellemare, and Remi Munos.
+The reactor: A fast and sample-efficient actor-critic agent for
+reinforcement learning.
+arXiv preprint arXiv:1704.04651
+, 2017.
+Cho et al. 2014
+Kyunghyun Cho, Bart Van Merriënboer, Caglar Gulcehre, Dzmitry Bahdanau,
+Fethi Bougares, Holger Schwenk, and Yoshua Bengio.
+Learning phrase representations using rnn encoder-decoder for
+statistical machine translation.
+arXiv preprint arXiv:1406.1078
+, 2014.
+Van Keirsbilck et al. 2019
+Matthijs Van Keirsbilck, Alexander Keller, and Xiaodong Yang.
+Rethinking full connectivity in recurrent neural networks.
+arXiv preprint arXiv:1905.12340
+, 2019.
+Machado et al. 2018b
+Marlos C Machado, Marc G Bellemare, Erik Talvitie, Joel Veness, Matthew
+Hausknecht, and Michael Bowling.
+Revisiting the arcade learning environment: Evaluation protocols and
+open problems for general agents.
+Journal of Artificial Intelligence Research
+, 61:523–562, 2018b.
+Espeholt et al. 2018
+Lasse Espeholt, Hubert Soyer, Remi Munos, Karen Simonyan, Volodymir Mnih, Tom
+Ward, Yotam Doron, Vlad Firoiu, Tim Harley, Iain Dunning, et al.
+Impala: Scalable distributed deep-rl with importance weighted
+actor-learner architectures.
+arXiv preprint arXiv:1802.01561
+, 2018.
+Methods
+Baselines
+We employ the Proximal Policy Optimization (PPO) algorithm
+5
+, which has become a standard choice in the field, to compare Dreamer under fixed hyperparameters across all benchmarks. There are a large number of PPO implementations available publicly and they are known to substantially vary in task performance
+55
+. To ensure a comparison that is representative of the highest performance PPO can achieve under fixed hyperparameters across domains, we choose the high-quality PPO implementation available in the Acme framework
+56
+and select its hyperparameters in Table 1 following recommendations
+55
+,
+13
+and additionally tune its epoch batch size to be large enough for complex environments
+42
+, its learning rate, and its entropy scale. We match the discount factor to Dreamer because it works well across domains and is a common choice in the literature
+35
+,
+8
+. We choose the IMPALA network architecture that we have found performed better than alternatives
+42
+and set the minibatch size to the largest possible for one A100 GPU. We verify the performance of our PPO implementation and hyperparameters on the ProcGen benchmark, where a highly tuned PPO implementation has been reported by the PPO authors
+37
+. We find that our implementation matches or slightly outperforms this performance reference.
+Table 1
+:
+PPO hyperparameters used across all benchmarks.
+| Parameter | Value |
+| --- | --- |
+| Observation normalization | Yes |
+| Reward normalization | Yes |
+| Reward clipping (stddev.) | 10 |
+| Epoch batch | $64 \times 256$ |
+| Number of epochs | 3 |
+| Minibatch size | 8 |
+| Minibatch length | 256 |
+| Policy trust region | 0.2 |
+| Value trust region | No |
+| Advantage normalization | Yes |
+| Entropy penalty scale | 0.01 |
+| Discount factor | 0.997 |
+| GAE lambda | 0.95 |
+| Learning rate | $3 \times 10^{-4}$ |
+| Gradient clipping | 0.5 |
+| Adam epsilon | $10^{-5}$ |
+For Minecraft, we additionally tune and run the IMPALA and Rainbow algorithms because no successful end-to-end learning from scratch has been reported in the literature
+18
+. We use the Acme implementations
+56
+of these algorithms, use the same IMPALA network we used for PPO, and tuned the learning rate and entropy regularizers.
+For all other benchmarks, we compare to tuned expert algorithms reported in the literature as referenced in the results section.
+Table 1
+:
+PPO hyperparameters used across all benchmarks.
\ No newline at end of file
diff --git a/research/notes/minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attention.md b/research/notes/minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attention.md
new file mode 100644
index 0000000000000000000000000000000000000000..0a1bc42c2df445fb98d98468a010ecbf23bbdc9f
--- /dev/null
+++ b/research/notes/minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attention.md
@@ -0,0 +1,2651 @@
+---
+title: 'MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention'
+id: minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attention
+tags:
+- deepread
+created: '2026-06-10T00:30:49.495067Z'
+source: https://arxiv.org/html/2506.13585
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:49.494926Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention
+\reportnumber
+MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention
+MiniMax
+1
+1
+1
+Please send correspondence to model@minimax.io.
+Abstract
+We introduce MiniMax-M1, the world’s first open-weight, large-scale hybrid-attention reasoning model. MiniMax-M1 is powered by a hybrid Mixture-of-Experts (MoE) architecture combined with a lightning attention mechanism. The model is developed based on our previous MiniMax-Text-01 model
+(MiniMax et al.,
+2025
+)
+, which contains a total of 456 billion parameters with 45.9 billion parameters activated per token. The M1 model natively supports a context length of 1 million tokens, 8x the context size of DeepSeek R1. Furthermore, the lightning attention mechanism in MiniMax-M1 enables efficient scaling of test-time compute – For example, compared to DeepSeek R1, M1 consumes 25% of the FLOPs at a generation length of 100K tokens. These properties make M1 particularly suitable for complex tasks that require processing long inputs and thinking extensively.
+MiniMax-M1 is trained using large-scale reinforcement learning (RL) on diverse problems ranging from traditional mathematical reasoning to sandbox-based, real-world software engineering environments.
+In addition to the inherent efficiency advantage of lightning attention for RL training, we propose CISPO, a novel RL algorithm to further enhance RL efficiency. CISPO clips importance sampling weights rather than token updates, outperforming other competitive RL variants.
+Combining hybrid-attention and CISPO enables MiniMax-M1’s full RL training on 512 H800 GPUs to complete in only three weeks, with a rental cost of just $534,700.
+We release two versions of MiniMax-M1 models with 40K and 80K thinking budgets respectively, where the 40K model represents an intermediate phase of the 80K training.
+Experiments on standard benchmarks show that our models are comparable or superior to strong open-weight models such as the original DeepSeek-R1 and Qwen3-235B, with particular strengths in complex software engineering, tool utilization, and long-context tasks.
+Through efficient scaling of test-time compute, MiniMax-M1 serves as a strong foundation for next-generation language model agents to reason and tackle real-world challenges. We publicly release MiniMax-M1 at
+https://github.com/MiniMax-AI/MiniMax-M1
+.
+Figure 1
+:
+Left
+: Benchmark performance comparison of leading commercial and open-weight models across competition-level mathematics, coding, software engineering, agentic tool use, and long-context understanding tasks. We use the MiniMax-M1-80k model here for MiniMax-M1.
+Right
+: Theoretical inference FLOPs scaling with generation length (# tokens).
+1
+Introduction
+Large reasoning models (LRMs), such as OpenAI o1
+(OpenAI,
+2024a
+)
+and DeepSeek-R1
+(DeepSeek-AI et al.,
+2025
+)
+, have demonstrated remarkable success by extending the length of reasoning through large-scale reinforcement learning (RL). In recent months, both the open-source community and commercial organizations have followed this trend, achieving significant advances on complex tasks such as Olympiad mathematics competitions and competitive programming
+(Kimi Team,
+2025
+; Anthropic,
+2025
+; Google DeepMind,
+2025
+; Seed et al.,
+2025
+; Zeng et al.,
+2025
+; Yu et al.,
+2025
+; Hu et al.,
+2025
+)
+.
+The success of LRMs has been primarily attributed to a new scaling dimension of test-time compute—As more FLOPs are dedicated to extended reasoning processes during generation, model performance shows consistent improvement, particularly for complex real-world applications
+(OpenAI,
+2025
+; Jimenez et al.,
+2024
+)
+.
+However, continuously extending the reasoning process is challenging within the traditional transformer architecture
+(Vaswani et al.,
+2017
+)
+, due to the inherent quadratic computational complexity of the softmax attention mechanism. While previous works have proposed various techniques to mitigate this issue—such as sparse attention
+(Beltagy et al.,
+2020
+; Zaheer et al.,
+2020
+; Lu et al.,
+2025
+; Yuan et al.,
+2025
+)
+, linear attention
+(Katharopoulos et al.,
+2020
+; Qin et al.,
+2021
+; Choromanski et al.,
+2021
+; Peng et al.,
+2021
+; Sun et al.,
+2023
+; Qin et al.,
+2022a
+,
+b
+,
+2024a
+,
+2024c
+; Peng et al.,
+2024b
+; Sun et al.,
+2025
+; Shen et al.,
+2024
+; Arora et al.,
+2024
+; Zhang et al.,
+2024
+; Du et al.,
+2025
+; He et al.,
+2024
+)
+, linear attention with delta decay
+(Yang et al.,
+2024b
+,
+a
+; Peng et al.,
+2025
+)
+, state space models
+(Gu et al.,
+2020
+,
+2023
+,
+2022
+; Gu and Dao,
+2024
+; Dao and Gu,
+2024
+; Glorioso et al.,
+2024
+; Ren et al.,
+2024
+; Jamba Team,
+2024
+; Gupta et al.,
+2022
+)
+, and linear RNNs
+(Hochreiter and Schmidhuber,
+1997
+; Martin and Cundy,
+2018
+; Chung et al.,
+2014
+; Qin et al.,
+2023
+; Peng et al.,
+2023
+,
+2024a
+; Qin et al.,
+2024d
+; Chou et al.,
+2024
+; Siems et al.,
+2025
+; Sun et al.,
+2024
+; von Oswald et al.,
+2025
+; Behrouz et al.,
+2024
+)
+—these approaches have not been fully validated in large-scale reasoning models, and nearly all competitive LRMs to date still rely on traditional attention designs. An exception is the Hunyuan-T1 model
+(Tencent AI Lab,
+2025
+)
+that employs the Mamba architecture
+(Gu and Dao,
+2024
+; Dao and Gu,
+2024
+)
+. However, this model is not open-sourced and few details are disclosed.
+In this work, we aim to build and open-source a large reasoning model that can efficiently scale up test-time compute and compete with the state-of-the-art reasoning models.
+We introduce MiniMax-M1, a reasoning model with a hybrid Mixture-of-Experts (MoE) architecture and Lightning Attention
+(Qin et al.,
+2024b
+)
+, an I/O-aware implementation of a linear attention variant
+(Qin et al.,
+2022a
+)
+. MiniMax-M1 is developed based on our previous MiniMax-Text-01
+(MiniMax et al.,
+2025
+)
+model, and comprises 456 billion parameters in total, with 45.9 billion activations and 32 experts. In our attention design, a transformer block with softmax attention follows every seven transnormer blocks
+(Qin et al.,
+2022a
+)
+with lightning attention. This design theoretically enables efficient scaling of reasoning lengths to hundreds of thousands of tokens, as illustrated in Figure
+1
+(Right). For example, compared to DeepSeek R1, M1 consumes less than 50% of the FLOPs at a generation length of 64K tokens, and approximately 25% of the FLOPs at a length of 100K tokens. This substantial reduction in computational cost makes M1 significantly more efficient during both inference and large-scale RL training. Furthermore, owing to its lightning attention mechanism and in line with MiniMax-Text-01, our M1 model natively supports a context length of up to 1 million tokens – eight times the context size of DeepSeek R1 and an order of magnitude greater than all open-weight LRMs available to date. These features make M1 particularly well-suited for addressing complex, real-world tasks that require processing long inputs and generating extended thinking. A comparison of the maximum input and output lengths of M1 and other leading models is demonstrated in Table
+1
+.
+Table 1
+:
+The maximum supported input length and output length (# tokens) of different reasoning models. For Claude-4 we refer to the Claude-4-Opus model. “DS-R1” represents the latest
+DeepSeek-R1-0528
+model.
+o3
+Gemini 2.5 Pro
+Claude 4
+DS-R1
+Qwen3-235B
+MiniMax-M1-80k
+Max Input
+200K
+1M
+200K
+128K
+128K
+1M
+Max Output
+100K
+64K
+32K
+64K
+32K
+80K
+To develop our M1 model, we first continue pretraining MiniMax-Text-01 on 7.5T tokens from a carefully curated, reasoning-intensive corpus. Subsequently, we perform supervised fine-tuning (SFT) to inject certain chain-of-thought (CoT)
+(Wei et al.,
+2022
+)
+patterns, establishing a strong foundation for reinforcement learning, the core stage of M1 development.
+Notably, our RL scaling with M1 is made efficient through innovations from two key perspectives: (1) We propose a novel RL algorithm, CISPO, which abandons the trust region constraint and instead clips the importance sampling weights to stabilize training. This approach always leverages all tokens for gradient computations, achieving enhanced efficiency compared to GRPO
+(Shao et al.,
+2024
+)
+and DAPO
+(Yu et al.,
+2025
+)
+empirically – For example, on a controlled study based on Qwen2.5-32B models
+(Qwen et al.,
+2025
+)
+, CISPO achieves a 2x speedup compared to DAPO; (2) Although the hybrid-attention design in M1 naturally allows for efficient RL scaling, unique challenges arise when scaling RL with this architecture. For instance, we find a precision mismatch between the training and inference kernels of our architecture, which prevents reward growth during RL training. We develop targeted solutions to address these challenges and successfully scale up RL with this hybrid architecture.
+In the end, our efficient RL framework enables us to complete a full RL run of MiniMax-M1 within 3 weeks using 512 H800 GPUs—equivalent to a rental cost of approximately $0.53M USD.
+In addition to methodological innovations, we curate a diverse set of problems and environments for RL training. Our data encompasses both verifiable and non-verifiable problems. For verifiable problems that are typically considered critical for reasoning learning, we not only include mathematical reasoning and competitive programming problems as commonly used in related works, but also leverage our previous data synthesis framework SynLogic
+(Liu et al.,
+2025a
+)
+to generate diverse logical reasoning problems spanning 41 distinct tasks. Furthermore, we construct sandboxes for complex software engineering (SE) environments derived from SWE-bench
+(Jimenez et al.,
+2024
+)
+, and conduct RL on real-world SE problems with execution-based rewards to improve M1’s performance in challenging SE scenarios. Our unverifiable problems span a broad range of domains such as question answering and creative writing, where we use generative reward models to provide the feedback.
+We train two versions of MiniMax-M1 models with 40K and 80K tokens of maximum generation length respectively, which leads to two models MiniMax-M1-40k and MiniMax-M1-80k.
+MiniMax-M1-80k outperforms MiniMax-M1-40k on complex mathematical and coding tasks, further demonstrating the benefits of scaling test-time compute. As shown in Figure
+1
+(Left), MiniMax-M1 surpasses previous leading open-weight models such as the original DeepSeek-R1 and Qwen-235B overall, with particular advantages in complex software engineering, tool-using, and long-context tasks.
+Compared to the latest DeepSeek-R1-0528 model, MiniMax-M1 lags in mathematical and coding competitions but achieves comparable or superior performance in more realistic tool-using and long-context scenarios.
+Notably, MiniMax-M1 outperforms Gemini 2.5 Pro on the agentic tool use benchmark TAU-Bench
+(Yao et al.,
+2025
+)
+, and surpasses OpenAI o3 and Claude 4 Opus on long-context understanding benchmarks.
+With efficient test-time scaling, we contend that MiniMax-M1 establishes a strong foundation for next-generation language model agents to address real-world challenges.
+To facilitate collaboration and advancement in the field, we have made our models publicly available at GitHub and Hugging Face. They are now supported by both the
+vLLM
+and
+Transformers
+frameworks, with detailed deployment guides available at
+vLLM
+and
+Transformers
+respectively. This enables easy integration of MiniMax-M1 into modern inference pipelines. We also provide commercial standard API at
+minimax.io
+.
+2
+Preparation for Scalable RL: Continual Pretraining and SFT
+In this work, we focus on scaling up reinforcement learning to enhance reasoning capabilities of MiniMax-Text-01. To facilitate scalable RL training, we first carry out continual pretraining of our base model to strengthen its intrinsic reasoning abilities. Subsequently, we perform a cold-start supervised fine-tuning (SFT) stage to inject specific reasoning patterns into the model, thereby providing a stronger foundation for the subsequent RL phase.
+2.1
+Continual Pre-Training: Foundation for RL Scaling
+To enhance the reasoning and long context capabilities of the foundation model while ensuring diversity, we continue training the MiniMax-Text-01 model with additional 7.5T tokens with optimized data quality and mixture.
+Training Data.
+We refine our pretraining Web and PDF parsing mechanisms and enhance our heuristic cleaning rules to ensure a high recall rate for mathematical and code-related data. We prioritize the extraction of natural Question-Answer (QA) pairs from a diverse range of sources, including webpages, forums, and textbooks, while strictly avoiding the use of synthetic data. Additionally, we conduct semantic deduplication on the QA data to maintain its diversity and uniqueness. Furthermore, we increase the proportion of STEM (Science, Technology, Engineering, and Mathematics), code, book, and reasoning-related data to 70%. This significantly enhances the foundation model’s ability to handle complex tasks without compromising its other general capabilities.
+Training Recipe.
+We decrease the coefficient of the MoE auxiliary loss and adjust the parallel training strategy to support a larger training micro batch size, which mitigates the detrimental effects of the auxiliary loss on overall model performance. Based on MiniMax-Text-01, we continue training with a constant learning rate of 8e-5 for 2.5T tokens, followed by a decay schedule over 5T tokens down to 8e-6.
+Long Context Extension.
+For a hybrid-lightning architecture model with higher convergence complexity, we have observed that excessively aggressive extensions of the training length can lead to a sudden gradient explosion that may occur during the training process. This makes the optimization process extremely challenging. We attribute this to the parameter optimization of the earlier layers not keeping up with the changes in the later layers – For lightning attention, the earlier and later layers have different decay rates, which makes the earlier layers focus more on local information. We alleviate this issue by adapting a smoother extension of context length across four stages, starting from a 32K context window length and ultimately extending the training context to 1M tokens.
+2.2
+Supervised Fine-Tuning: Focused Alignment for Efficient RL
+After continual pretraining, we conduct Supervised Fine-Tuning (SFT) to instill desired behaviors like reflection-based Chain-of-Thought (CoT) reasoning using high-quality examples, creating a strong starting point for more efficient and stable RL in the next stage. Specifically, we curate data samples with long CoT responses. These data samples cover diverse domains such as math, coding, STEM, writing, QA, and multi-turn chat. Math and coding samples account for around 60% of all the data.
+3
+Efficient RL Scaling: Algorithms and Lightning Attention
+As shown in Figure
+1
+(Right), the M1 architecture demonstrates a clear efficiency advantage during inference. This naturally facilitates efficient RL scaling where increasingly longer responses are generated. However, as pioneers in scaling up RL with this hybrid architecture, we encounter unique challenges during the process, and the RL procedure can become unstable or even fail due to various issues.
+To address these difficulties, we develop targeted solutions that enable us to successfully scale up RL training for M1. In addition, we propose a new RL algorithm that achieves greater RL efficiency compared to existing methods.
+These dual contributions yield an efficient and scalable RL framework for training M1, where the complete training cycle requires 3 weeks on 512 H800 GPUs—equivalent to a rental cost of approximately $0.53M USD.
+In this section, we first provide general context on RL and present our novel RL algorithm, and then describe the specific challenges we face with the hybrid architecture, along with the solutions we devise to overcome them.
+3.1
+Efficient RL Scaling with CISPO
+Background.
+For questions $q$ from a dataset $\mathcal{D}$, we denote $\pi$ as the policy model parameterized by $\theta$, and $o$ as the response generated by the policy.
+PPO (Schulman et al., 2017) adopts the following objective to optimize the policy to maximize the expected return, and a clipping operation is applied to stabilize training:
+$$
+\mathcal{J}_{\text{PPO}}(\theta) = \mathbb{E}_{q\sim\mathcal{D},\,o_{i}\sim\pi_{\theta_{\text{old}}}(\cdot|q)}\left[\frac{1}{|o_{i}|}\sum_{t=1}^{|o_{i}|}\min\left(r_{i,t}(\theta)\hat{A}_{i,t},\ \text{clip}\big(r_{i,t}(\theta),1-\epsilon,1+\epsilon\big)\hat{A}_{i,t}\right)-\beta D_{KL}(\pi_{\theta}||\pi_{\text{ref}})\right], \tag{1}
+$$
+where $r_{i,t}(\theta)=\frac{\pi_{\theta}(o_{i,t}\mid q,o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t}\mid q,o_{i,<t})}$ is the importance sampling (IS) weight, which is used to correct the distribution during off-policy updates, because we use $\pi_{\theta_{\text{old}}}$ to collect trajectories to update the policy via multiple steps in a minibatch manner. While PPO requires a separate value model to compute the advantage $\hat{A}_{i,t}$, GRPO (Shao et al., 2024) eliminates the value model and defines the advantage as the output reward relative to other responses in the group:
+$$
+\hat{A}_{i,t}=\frac{R_{i}-\text{mean}(\{R_{j}\}_{j=1}^{G})}{\text{std}(\{R_{j}\}_{j=1}^{G})}, \tag{2}
+$$
+where $R_{i}$ is the reward of the response, and $G$ responses $\{o_{i}\}_{i=1}^{G}$ are sampled for each question. The reward is either from rule-based verifiers such as in mathematical problem solving, or from a reward model.
+Issues of Token Clipping.
+In our initial experiments with the hybrid architecture under the zero-RL setting, we observed that the GRPO algorithm adversely affected training performance and failed to effectively promote the emergence of long CoT reasoning behaviors. Through a series of controlled ablation studies, we ultimately identified the undesirable clipping operation in the original PPO/GRPO loss as the primary factor contributing to degraded learning performance.
+Specifically, we found that tokens associated with reflective behaviors (e.g., *However*, *Recheck*, *Wait*, *Aha*), which often serve as “forks” in reasoning paths, were typically rare and assigned low probabilities by our base model. During policy updates, these tokens were likely to exhibit high $r_{i,t}$ values. As a result, these tokens were clipped out after the first on-policy update, preventing them from contributing to subsequent off-policy gradient updates. This issue was particularly pronounced in our hybrid-architecture model and further hindered the scalability of reinforcement learning.
+These low-probability tokens, however, are often crucial for stabilizing entropy (Cui et al., 2025) and facilitating scalable RL (Wang et al., 2025). Although DAPO attempts to mitigate this issue by increasing the upper clipping bound (Yu et al., 2025), we found this approach to be less effective in our setup, which involved 16 rounds of off-policy updates per generation batch.
+Figure 2
+:
+Comparison of GRPO, DAPO, and our proposed CISPO on AIME 2024, based on Qwen2.5-32B-base. CISPO outperforms both GRPO and DAPO in terms of performance at the same number of training steps, and achieves comparable performance to DAPO using 50% of the training steps.
+The CISPO Algorithm.
+In response, we propose a new algorithm that explicitly avoids dropping tokens, even those associated with large updates, while inherently maintaining entropy within a reasonable range to ensure stable exploration. First, recall that the vanilla REINFORCE objective with corrected distribution for offline updates is:
+$$
+\mathcal{J}_{\text{REINFORCE}}(\theta) = \mathbb{E}_{(q,a)\sim\mathcal{D},\,o_{i}\sim\pi_{\theta_{\text{old}}}(\cdot|q)}\left[\frac{1}{|o_{i}|}\sum_{t=1}^{|o_{i}|}\texttt{sg}(r_{i,t}(\theta))\,\hat{A}_{i,t}\log\pi_{\theta}(o_{i,t}\mid q,o_{i,<t})\right], \tag{3}
+$$
+where $\texttt{sg}(\cdot)$ denotes the stop-gradient operation.
+Rather than clipping the token updates as in PPO/GRPO, we instead clip the importance sampling weight in Eq. 3 to stabilize training.
+We term our approach CISPO (**C**lipped **IS**-weight **P**olicy **O**ptimization). Adopting the group relative advantage from GRPO and the token-level loss (Yu et al., 2025; Liu et al., 2025b), CISPO optimizes the following objective:
+$$
+\mathcal{J}_{\text{CISPO}}(\theta) = \mathbb{E}_{(q,a)\sim\mathcal{D},\,\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|q)}\left[\frac{1}{\sum_{i=1}^{G}|o_{i}|}\sum_{i=1}^{G}\sum_{t=1}^{|o_{i}|}\texttt{sg}(\hat{r}_{i,t}(\theta))\,\hat{A}_{i,t}\log\pi_{\theta}(o_{i,t}\mid q,o_{i,<t})\right], \tag{4}
+$$
+where $\hat{r}_{i,t}(\theta)$ is the clipped IS weight:
+$$
+\hat{r}_{i,t}(\theta)=\text{clip}\left(r_{i,t}(\theta),\,1-\epsilon^{IS}_{low},\,1+\epsilon^{IS}_{high}\right). \tag{5}
+$$
+We note that without weight clipping, $\mathcal{J}_{\text{CISPO}}$ reduces to the standard policy gradient objective. In our experiments, we did not impose a lower bound on the IS weight by setting $\epsilon^{IS}_{low}$ to a large value; instead, we only tuned $\epsilon^{IS}_{high}$.
+Although the gradient of Eq. 4 is slightly biased due to weight clipping, this approach preserves gradient contributions from all tokens, especially in long responses.
+CISPO proves effective in our experiments, helping reduce variance and stabilizing RL training.
+In addition, we utilize the dynamic sampling and length penalty techniques from Yu et al. (2025). There is no KL penalty term in CISPO similar to other recent works (Yu et al., 2025; Hu et al., 2025).
+A General Formulation.
+While we adopt CISPO in our experiments, here we further present a unified formulation by introducing a token-wise mask into the CISPO objective. This allows for hyperparameter tuning to control whether, and under what conditions, gradients from specific tokens should be dropped:
+$$
+\begin{aligned}
+\mathcal{J}_{\text{unify}}(\theta) &= \mathbb{E}_{(q,a)\sim\mathcal{D},\,\{o_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|q)} \\
+&\quad \left[\frac{1}{\sum_{i=1}^{G}|o_{i}|}\sum_{i=1}^{G}\sum_{t=1}^{|o_{i}|}\texttt{sg}(\hat{r}_{i,t}(\theta))\,\hat{A}_{i,t}\,\log\pi_{\theta}(o_{i,t}\mid q,o_{i,<t})\,M_{i,t}\right].
+\end{aligned}
+\tag{6}
+$$
+The mask $M_{i,t}$ is equivalent to the mask implicitly defined in the PPO trust region:
+$$
+M_{i,t}=\begin{cases}
+0 & \text{if } \hat{A}_{i,t}>0 \text{ and } r_{i,t}(\theta)>1+\epsilon_{\text{high}},\\
+0 & \text{if } \hat{A}_{i,t}<0 \text{ and } r_{i,t}(\theta)<1-\epsilon_{\text{low}},\\
+1 & \text{otherwise}.
+\end{cases}
+\tag{7}
+$$
+This unified loss formulation can flexibly represent different clipping strategies under a common framework.
+Empirical Validation of CISPO.
+To validate the effectiveness of CISPO, we empirically compare it with DAPO and GRPO in a zero-RL training setting. Specifically, we apply different RL algorithms to train the Qwen2.5-32B-base model on the mathematical reasoning dataset from Yu et al. (2025), and report performance on the AIME 2024 benchmark. As shown in Figure 2, CISPO significantly outperforms both DAPO and GRPO with the same number of training steps. Notably, CISPO demonstrates superior training efficiency compared to other approaches; for example, it matches DAPO’s performance with only 50% of the training steps.
+3.2
+Efficient RL Scaling with Lightning Attention – Challenges and Recipes
+Figure 3
+:
+Probability of tokens in training-mode code vs. probability of tokens in inference-mode code. Each point in the figures represents an individual token. The Pearson correlation coefficient is indicated in the figures. Theoretically, the two probabilities should be identical, and all the tokens should be exactly on the diagonal line.
+Left:
+Correlation of the M1 model before our fix;
+Right:
+Correlation of the M1 model after applying our fix of using FP32 precision for the LM output head.
+As shown in Figure 1 (Right), we emphasize that our hybrid attention inherently enables more efficient RL scaling compared to traditional attention designs, since rollout computation and latency are often the primary bottlenecks in RL training. However, as pioneers in conducting large-scale RL experiments with this novel architecture, we encountered unique challenges and developed targeted solutions, as we describe below.
+Computational Precision Mismatch in Generation and Training.
+RL training is highly sensitive to computational precision.
+During our RL training, we observed a significant discrepancy in the probabilities of rolled-out tokens between training-mode and inference-mode, as shown in Figure 3 (Left). This discrepancy arose from a precision mismatch between the training and inference kernels. The issue was detrimental and prevented reward growth in our experiments.
+Interestingly, this issue did not appear in smaller, dense models with softmax attention.
+Through layer-by-layer analysis, we identified high-magnitude activations in the LM head at the output layer as the primary source of error. To address this, we increased the precision of the LM output head to FP32, thereby realigning the two theoretically identical probabilities, as demonstrated in Figure 3 (Right). This adjustment improved the correlation between training and inference probabilities from approximately 0.9x to 0.99x. Notably, this correlation metric remained stable throughout training, enabling successful reward increase.
+Optimizer Hyperparameter Sensitivity.
+We employ the AdamW (Loshchilov and Hutter, 2019) optimizer, and inappropriate configurations of $\beta_{1}$, $\beta_{2}$, and $\epsilon$ can lead to non-convergence during training (Molybog et al., 2023). For instance, using the default configuration from VeRL (Sheng et al., 2024), where betas = (0.9, 0.999) and eps = 1e-8, can result in such issues.
+We have observed that the gradient magnitudes in MiniMax-M1 training span a wide range, from 1e-18 to 1e-5, with the majority of the gradients being smaller than 1e-14. Furthermore, the correlation between the gradients of adjacent iterations is weak. Based on this, we set $\beta_{1}=0.9$, $\beta_{2}=0.95$, and eps = 1e-15.
+Early Truncation via Repetition Detection.
+During RL training, we found that complex prompts could induce pathologically long and repetitive responses, whose large gradients threatened model stability. Our goal was to preemptively terminate these generation loops rather than penalize the already repetitive text. As simple string-matching is ineffective against varied repetition patterns, we developed a heuristic based on token probabilities. We observed that once a model enters a repetitive cycle, the probability for each token soars. Consequently, we implemented an early truncation rule: generation is halted if 3,000 consecutive tokens each have a probability above 0.99. This method successfully prevents model instability and improves generation throughput by eliminating these pathological, long-tail cases.
+4
+Scaling Reinforcement Learning with Diverse Data
+In this section, we describe the data and reward we adopted for our RL stage. We incorporate a diverse set of environments in our RL training pipeline, including tasks that can be verified by rules and general tasks that need to be verified through reward models.
+All these environments are integrated into the RL stage using a carefully designed curriculum.
+4.1
+Reasoning-Intensive Tasks with Rule-based Verification
+Below, we introduce our data that can be verified by deterministic rules. For all the following tasks, we employ rule-based final correctness as the correctness reward, complemented by a format reward.
+Mathematical Reasoning.
+Our initial mathematical dataset comprises hundreds of thousands of high-quality, competition-level problems, meticulously curated and organized from public sources and official mathematics competitions. These problems span a wide range of difficulty levels, each paired with a standard reference solution.
+Our data cleaning pipeline begins with the removal of incomplete samples and those exhibiting formatting or typographical errors. We subsequently apply embedding-based deduplication across the RL data sources and enforce a strict separation from the SFT dataset to avoid any overlap, as leakage from the SFT phase into the RL stage hinders exploration and undermines training effectiveness. Additionally, we employ both n-gram and embedding-based methods to eliminate potential contamination from commonly used mathematical benchmark test sets, thereby ensuring the integrity and fairness of our evaluations.
+We filter out samples containing multiple sub-problems, proof-based questions, and binary questions (e.g., true/false) that are susceptible to random guessing. Multiple-choice questions are reformulated into open-ended formats to better align with our reinforcement learning framework.
+Next, we employ our internal model to extract the final answers from the reference solution, retaining only those samples whose extracted answers can be correctly parsed by our rule-based answer checker. Finally, we use a strong reasoning model to compute the pass@10 for each question and retain only those samples with a pass rate strictly between 0 and 0.9, resulting in a curated dataset of nearly 50K high-quality mathematical samples for our RL training.
+Logical Reasoning.
+For logical reasoning data, we carefully select 41 logical reasoning tasks requiring non-trivial reasoning ability, such as cipher and Sudoku, and then implement a data synthesis framework to synthesize all the data. Concretely, we utilize our SynLogic framework (Liu et al., 2025a) to implement the data synthesis pipeline featuring task-specific data generators and rule-based task-specific verifiers, enabling automatic logical data generation. We meticulously configure the difficulty parameters during generation, ensuring the appropriate learning challenge of the generated data. Specifically, to prevent inclusion of overly difficult instances, we establish an upper difficulty bound based on the solvability limits of current strong reasoning models, requiring their pass@10 rates to be greater than zero. Similarly, we set a lower difficulty bound using the lowest difficulty parameters for which the MiniMax-Text-01 model achieves pass rates between 0 and 0.5. This approach ensures the data maintains a balance between difficulty and learnability. In addition, as the model capabilities improve during training, we increase the difficulty of the data in the later stages. Using this framework, we synthesize approximately 53K logical reasoning samples for RL training.
+Competitive Programming.
+For the competitive programming problems, we collect publicly available problems from online judge platforms and popular coding websites. For problems lacking test cases, we develop an LLM-based workflow and use the MiniMax-Text-01 model to generate comprehensive test suites. Similar to our approach with mathematical reasoning datasets, we filter problems based on quality and difficulty using pass rates from model sampling, retaining moderately challenging and high-quality algorithmic problems. Through this process, we generate 30K competitive programming data samples for RL training.
+Software Engineering.
+For the software engineering domain, inspired by SWE-bench (Jimenez et al., 2024), we construct verifiable reinforcement learning environments by leveraging real-world data from public GitHub repositories. Our dataset primarily comprises issues and pull requests (PRs) that encapsulate common software development challenges, including bug localization, code repair, and test case synthesis.
+To facilitate effective reinforcement learning, we develop a sophisticated containerized sandbox environment that simulates a realistic software development workflow. This environment enables the actual execution of code, providing direct and verifiable feedback on the correctness and efficacy of an agent’s proposed interventions. The pass/fail status of pre-defined or newly generated test cases serves as the primary reward signal for our RL framework. A successful execution that passes all relevant test cases yields a positive reward, while compilation errors, runtime failures, or test case regressions result in a zero or negative reward, thus providing a clear signal for policy optimization.
+Through this process, we curate several thousand high-quality data samples. Each sample includes a problem description (e.g., bug report from an issue), the initial faulty code, and a set of associated test cases. This setup allows our RL agent to learn to accurately pinpoint bugs, propose correct code fixes, and even synthesize new, effective test cases, with performance directly verifiable through the execution within our sandboxed environment.
+4.2
+General Domain Tasks with Model-based Feedbacks
+In this section, we further extend the RL scope to a wider array of general domain tasks. As these tasks cannot be easily verified by rules, we utilize reward models to provide the feedback.
+4.2.1
+Data and Reward Models
+Our general RL dataset consists of a total of 25K complex samples. These can be broadly categorized into two types: samples with ground-truth answers that are verifiable but difficult to validate using rules, and samples without ground-truth answers.
+Tasks with Ground Truth.
+This category primarily includes STEM and other factual problems where answers are objective but may have multiple valid expressions. Such diversity often renders rule-based answer checkers inaccurate. Our data cleaning process is similar to that used in mathematical reasoning, while we use our Generative Reward Model (GenRM) as a verifier, instead of relying on rule-based checkers.
+To evaluate consistency between ground-truth answers and model responses, we adopt a five-grade reward scale to evaluate the two components. First, we construct a human-annotated reward model benchmark, which covers a range of objective tasks across diverse knowledge and task domains, especially the pairs of model response–ground truth that rule-based checkers fail to judge accurately. Second, we evaluate the GenRM’s effectiveness by comparing the Best-of-N (BoN) responses selected by GenRM against the pass@N metrics across several benchmarks. GenRM performance is assessed using its accuracy on the human-annotated benchmark and the performance gap between BoN and pass@N. These metrics guide experiments to optimize both the data distribution and the prompt design used during the GenRM training.
+Tasks without Ground Truth.
+This category encompasses a wider range of tasks, including instruction-following, creative writing, etc.
+Prompts are sampled from a large pool based on our internal tagging system, ensuring a balanced training distribution across fine-grained domains.
+Even though these queries are typically open-ended and do not have a ground-truth answer, we seek to pair a reference answer for each query, which serves as a reference for reward model judgment. To this end, we first generate responses by various internal and external models, and then these reference answers will undergo our internal quality evaluation.
+During RL training, we adopt a pairwise comparison framework to evaluate model responses. Each comparison yields a score of -1, 0, or 1, indicating whether the model’s output is worse than, similar to, or better than a reference answer. For instruction-following tasks with constraints in particular, we utilize both a rule-based reward to assess whether the response satisfies the constraint and a model-based reward to evaluate the response’s quality. As with the ground-truth setting, we first build a human-annotated benchmark, incorporating multiple blind preference judgments from reliable annotators. We then refine our scoring criteria and preference prompt to optimize accuracy and reduce potential biases, which are discussed in §4.2.2 below. To minimize the potential biases, training data are also optimized by several methods, such as multiple-blind consistent judgment, position-switched consistent judgment, etc. Once an optimal GenRM is trained, a Swiss Round scoring system is performed across the training dataset to determine the most suitable reference answer for RL training.
+4.2.2
+Addressing Bias of Generative Reward Models for Long CoT
+Effective general RL for complex CoT reasoning tasks is critically dependent on accurate and unbiased reward models. Assessing such CoT responses turns out to be challenging, and we found that GenRMs preferred longer outputs over potentially superior concise alternatives, irrespective of actual reasoning quality. This length bias is a significant issue as it may substantially misguide RL policy optimization, incentivizing verbosity without substance and inducing reward hacking.
+Our initial efforts to improve GenRM fidelity include standard offline strategies: (1) Diversifying training data with a wide range of response lengths, sources, and quality tiers; (2) Incorporating adversarial examples to expose vulnerabilities; and (3) Refining model architectures. However, empirical analysis revealed that purely offline evaluation and preemptive mitigation of length bias in GenRMs frequently failed to prevent length bias during RL training.
+Consequently, our core strategy incorporates continuous online monitoring of length bias during RL training. Specific metrics are established to detect whether the RL policy disproportionately extends output lengths to maximize GenRM rewards without gains in task success or reasoning depth. Upon detecting such detrimental length-seeking behavior, indicative of exploiting the GenRM’s length bias, immediate GenRM recalibration is triggered. This iterative adjustment is vital to preempt reward hacking related to output length, ensuring the policy prioritizes substantive capability enhancement over superficial text inflation.
+Complementing this adaptive approach, RL-side techniques including reward shaping, value clipping, and normalization are systematically employed.
+These mechanisms desensitize reward signals to extreme values from superficial characteristics (e.g., length), thereby directing policy optimization toward substantive quality and correctness of its long CoT reasoning.
+4.3
+Curriculum of Incorporating Diverse Data
+Given that our RL data spans a wide spectrum of categories, a core challenge is training a single policy capable of excelling on both reasoning-intensive tasks and general domain tasks.
+To address this, our approach entails a carefully managed curriculum and dynamic weighting strategy for reasoning and general-domain tasks during the RL training process with CISPO: we start with only the reasoning-intensive tasks with rule-based reward, and then gradually mix in the general domain tasks. This ensures that the model continues to refine its verifiable skills (e.g., in math and code) while progressively enhancing its performance on a diverse spectrum of general tasks, from complex instruction following to open-ended CoT reasoning.
+This mixed RL training encourages the model to learn context-dependent application of its reasoning abilities—applying rigorous, step-by-step deduction for verifiable problems and more flexible, adaptive generation for general queries—all within a unified policy framework. It prevents catastrophic forgetting of specialized skills while fostering broader generalization.
+5
+Extending RL Scaling to Longer Thinking
+Our first RL training is performed with an output length limit of 40K tokens. Given that the hybrid architecture of M1 natively supports near-linear scaling for longer sequences, as demonstrated in Figure 1 (Right), we further extend the generation length during RL training to 80K tokens. This results in a new model, which we refer to as MiniMax-M1-80k.
+Data.
+To efficiently train our RL model for an 80K output length, we utilize our previously trained 40K model to guide the data filtering process. First, we evaluate the pass rates on the curated dataset described in §4 and remove samples that are easily solved. We then adjust the data distribution to favor more challenging examples, such as difficult mathematical and coding problems. Additionally, we downsample synthetic reasoning data after observing that it destabilizes long-context RL training. Specifically, outputs generated from this data type often become repetitive and homogeneous, and continued exposure to these patterns proves detrimental to the model’s overall performance.
+Length Scaling Strategy.
+To gradually increase the output length, we employ a staged window expansion RL strategy. We begin with an output length of 40K and incrementally expand it to 48K, 56K, 64K, 72K, and ultimately 80K. This staged approach ensures training stability at each step. The transition to a subsequent length is determined by a set of empirical indicators. These include the convergence of perplexity on the generated sequences and whether the 99th percentile of the output lengths is approaching the current context window limit. These signals offer valuable insights into the model’s readiness for scaling, which allows us to maintain robust training throughout the process.
+Addressing Training Instability During Scaling.
+During the scaling process, we encountered a critical issue in the later stages of training at each length window. Specifically, the model exhibited susceptibility to pattern collapse, where the latter portions of generated sequences degraded into incoherent or garbled text. This phenomenon consistently coincided with increased perplexity, indicating compromised generation quality and stability. We identify the root cause: during output length extension, negative samples increase in length substantially faster than positive samples, frequently reaching the context window limit earlier. Consequently, disproportionately large negative gradients accumulate in the latter segments of generation sequences. This imbalance originates from the inherently unequal nature of GRPO’s advantage normalization and the token-level loss we adopt.
+To address this, we implement three key solutions: (1) Detecting repetitive patterns (consecutive high-probability tokens) with early stopping to prevent excessive context window consumption by repetitive responses; (2) Adopting combined sample-level loss and token-level normalization to alleviate negative-positive sample imbalance and mitigate adverse effects; (3) Decreasing both the gradient clipping threshold and $\epsilon^{IS}_{high}$ to further stabilize generation.
+6
+Evaluations
+Table 2: Performance of MiniMax-M1 on core benchmarks.
+
+| Tasks | OpenAI-o3 | Gemini 2.5 Pro (06-05) | Claude 4 Opus | Seed-Thinking-v1.5 | DeepSeek-R1 | DeepSeek-R1-0528 | Qwen3-235B-A22B | MiniMax-M1-40k | MiniMax-M1-80k |
+|---|---|---|---|---|---|---|---|---|---|
+| Model group | Leading Close-Weights Models | Leading Close-Weights Models | Leading Close-Weights Models | Leading Close-Weights Models | Open-Weights Models | Open-Weights Models | Open-Weights Models | Our Models | Our Models |
+| Extended Thinking | 100K | 64K | 64K | 32K | 32K | 64K | 32K | 40K | 80K |
+| **Mathematics** | | | | | | | | | |
+| AIME 2024 | 91.6 | 92.0 | 76.0 | 86.7 | 79.8 | 91.4 | 85.7 | 83.3 | 86.0 |
+| AIME 2025 | 88.9 | 88.0 | 75.5 | 74.0 | 70.0 | 87.5 | 81.5 | 74.6 | 76.9 |
+| MATH-500 | 98.1 | 98.8 | 98.2 | 96.7 | 97.3 | 98.0 | 96.2 | 96.0 | 96.8 |
+| **General Coding** | | | | | | | | | |
+| LiveCodeBench (24/8 $\sim$ 25/5) | 75.8 | 77.1 | 56.6 | 67.5 | 55.9 | 73.1 | 65.9 | 62.3 | 65.0 |
+| FullStackBench | 69.3 | – | 70.3 | 69.9 | 70.1 | 69.4 | 62.9 | 67.6 | 68.3 |
+| **Reasoning & Knowledge** | | | | | | | | | |
+| GPQA Diamond | 83.3 | 86.4 | 79.6 | 77.3 | 71.5 | 81.0 | 71.1 | 69.2 | 70.0 |
+| HLE (no tools) | 20.3 | 21.6 | 10.7 | 8.2 | 8.6* | 17.7* | 7.6* | 7.2* | 8.4* |
+| ZebraLogic | 95.8 | 91.6 | 95.1 | 84.4 | 78.7 | 95.1 | 80.3 | 80.1 | 86.8 |
+| MMLU-Pro | 85.0 | 86.0 | 85.0 | 87.0 | 84.0 | 85.0 | 83.0 | 80.6 | 81.1 |
+| **Software Engineering** | | | | | | | | | |
+| SWE-bench Verified | 69.1 | 67.2 | 72.5 | 47.0 | 49.2 | 57.6 | 34.4 | 55.6 | 56.0 |
+| **Long Context** | | | | | | | | | |
+| OpenAI-MRCR (128k) | 56.5 | 76.8 | 48.9 | 54.3 | 35.8 | 51.5 | 27.7 | 76.1 | 73.4 |
+| OpenAI-MRCR (1M) | – | 58.8 | – | – | – | – | – | 58.6 | 56.2 |
+| LongBench-v2 | 58.8 | 65.0 | 55.6 | 52.5 | 58.3 | 52.1 | 50.1 | 61.0 | 61.5 |
+| **Agentic Tool Use** | | | | | | | | | |
+| TAU-bench (airline) | 52.0 | 50.0 | 59.6 | 44.0 | – | 53.5 | 34.7 | 60.0 | 62.0 |
+| TAU-bench (retail) | 73.9 | 67.0 | 81.4 | 55.7 | – | 63.9 | 58.6 | 67.8 | 63.5 |
+| **Factuality** | | | | | | | | | |
+| SimpleQA | 49.4 | 54.0 | – | 12.9 | 30.1 | 27.8 | 11.0 | 17.9 | 18.5 |
+| **General Assistant** | | | | | | | | | |
+| MultiChallenge | 56.5 | 51.8 | 45.8 | 43.0 | 40.7 | 45.0 | 40.0 | 44.7 | 44.7 |
+
+\* conducted on the text-only HLE subset.
+6.1
+Core Benchmarks
+We conduct a comprehensive evaluation of MiniMax-M1 across several key domains: mathematics, general coding, software engineering, reasoning & knowledge, long context, agentic tool use, factuality, and general assistant ability. We evaluate all tasks using temperature 1.0 and top-p 0.95 sampling.
+• Mathematics: To evaluate mathematical reasoning capabilities, we utilize several competition-level math benchmarks, including MATH-500 (Hendrycks et al., 2021), AIME 2024, and AIME 2025. For AIME evaluation, we sample 32 times and compute the average pass rate as the final score.
+• General Coding: We assess general programming proficiency using LiveCodeBench (Jain et al., 2025) and FullStackBench (Liu et al., 2024), which evaluate code generation across diverse programming tasks. For both benchmarks, we report scores as the average pass rate of 16 samples.
+• Reasoning & Knowledge: We assess domain knowledge and reasoning capabilities through GPQA-Diamond (Rein et al., 2024), MMLU-Pro (Wang et al., 2024), and the challenging HLE benchmark (Phan et al., 2025). For GPQA-Diamond, we sample 32 times and report the average pass rate. For HLE evaluation, we assess the model without external tools. Additionally, we measure logical reasoning ability using ZebraLogic (Lin et al., 2025).
+• Software Engineering: We evaluate software engineering capabilities using SWE-bench Verified (Jimenez et al., 2024), which measures the ability to resolve real-world GitHub issues. We report results derived from the Agentless scaffold (Xia et al., 2024). Departing from the original pipeline, our methodology employs a two-stage localization process (without any embedding-based retrieval mechanisms): initial coarse-grained file localization followed by fine-grained localization to specific files and code elements.
+• Long Context: We evaluate long context understanding using OpenAI-MRCR (OpenAI, 2024b), which tests retrieval and disambiguation of multiple similar items within extended contexts, and LongBench-v2 (Bai et al., 2024), a challenging benchmark with 503 multiple-choice questions across contexts ranging from 8k to 2M words.
+• Agentic Tool Use: We assess tool use capabilities through TAU-bench (Yao et al., 2025), which emulates dynamic conversations where agents must utilize API tools while adhering to domain-specific policy guidelines. We evaluate TAU-bench with GPT-4.1 as the user model, a general system prompt (footnote 2: “In each round, you need to carefully examine the tools provided to you to determine if any can be used. You must adhere to all of the policies. Pay attention to the details in the terms. Solutions for most situations can be found within these policies.”), and without any custom tools. The maximum number of interaction steps is 40.
+• Factuality: To measure factuality of LLMs, we utilize SimpleQA (Wei et al., 2024), an adversarially-collected benchmark of fact-seeking questions with single, indisputable answers.
+• General Assistant: We evaluate general assistant capabilities using MultiChallenge (Sirdeshmukh et al., 2025), which assesses LLMs on conducting realistic multi-turn conversations with human users. We report our scores judged by GPT-4o.
+Results on Math, Coding, and other General Tasks.
+Table 2 presents our model’s performance compared to state-of-the-art large reasoning models. In mathematical reasoning, the MiniMax-M1 models demonstrate strong performance across multiple benchmarks, achieving results comparable to the close-weight model Seed-Thinking-v1.5 (Seed et al., 2025). Notably, MiniMax-M1-80k achieves 86.0% on AIME 2024, placing it second among open-weight models and trailing only the latest DeepSeek-R1-0528 model. For general coding, MiniMax-M1-80k matches Qwen3-235B on LiveCodeBench while outperforming it on FullStackBench, demonstrating robust capabilities among leading open-weight models.
+On reasoning & knowledge benchmarks, MiniMax-M1-80k similarly trails DeepSeek-R1-0528 but achieves competitive performance against other top open-weight models.
+On the factuality benchmark SimpleQA, MiniMax-M1 models underperform DeepSeek-R1 while outperforming all other open-weight models and Seed-Thinking-v1.5.
+On MultiChallenge, both MiniMax models perform comparably to DeepSeek-R1-0528 and Claude 4 Opus, with inferior results only to o3 and Gemini-2.5-Pro.
+Highlights in Complex Scenarios: Software Engineering, Long Context, and Tool use.
+Benefiting from our execution-based software engineering environments during RL, MiniMax-M1-40k and MiniMax-M1-80k achieve strong scores of 55.6% and 56.0% on SWE-bench Verified, respectively. These results are slightly inferior to DeepSeek-R1-0528’s 57.6% and significantly surpass other open-weight models.
+Leveraging its 1M context window, the M1 models significantly outperform all other open-weight models in long-context understanding. They even surpass OpenAI o3 and Claude 4 Opus, ranking second globally and trailing only Gemini 2.5 Pro by a small margin.
+In agentic tool-use scenarios (TAU-bench), MiniMax-M1-40k surpasses all open-weight models and even Gemini-2.5 Pro.
+Moreover, MiniMax-M1-80k consistently outperforms MiniMax-M1-40k across most benchmarks, confirming the benefits of scaling test-time compute.
+Figure 4
+:
+Accuracy and generation length versus RL training steps for MiniMax-M1.
+6.2
+Effect of RL Scaling
+To investigate the effect of RL scaling, we track performance and response length throughout training. Figure 4 presents three representative examples from AIME 2024, AIME 2025, and LiveCodeBench v5, respectively.
+We observe consistent improvements in both model performance and response length during training. Notably, average response lengths on AIME and LiveCodeBench exceed 20,000 tokens, with AIME 2024 accuracy showing substantial gains from 68% to 80%. Crucially, the strong correlation between accuracy gains and increased response length in these visualizations underscores the importance of extending RL scaling to facilitate more extensive reasoning processes.
+7
+Conclusion and Future work
+In this work, we introduce and release MiniMax-M1, the world’s first open-weight, large-scale reasoning model featuring a lightning attention mechanism. This efficient attention design enables MiniMax-M1 to natively support inputs of up to 1M tokens and generation lengths of 80K tokens—both significantly exceeding capabilities of other open-weight models. These capabilities render MiniMax-M1 uniquely suited for complex, realistic scenarios requiring long context and extended reasoning, properties empirically validated by its strong performance on software engineering, agentic tool use, and long-context understanding benchmarks.
+Beyond the inherent efficiency advantages of lightning attention for RL training, this work contributes a novel RL algorithm, CISPO, to accelerate training. Combining architectural advantages with CISPO, we efficiently trained MiniMax-M1, with the full RL training run completed in three weeks using 512 H800 GPUs. Across comprehensive evaluations, MiniMax-M1 ranks among the world’s best open-weight models alongside DeepSeek-R1 and Qwen3-235B.
+Looking forward, as test-time compute continuously scales to power increasingly complex scenarios, we foresee significant potential for such efficient architectures in addressing real-world challenges. These include automating company workflows
+(Xu et al.,
+2025
+)
+and conducting scientific research
+(Si et al.,
+2024
+; OpenAI,
+2025
+)
+. Real-world applications particularly demand LRMs that function as agents interacting with environments, tools, computers, or other agents—requiring reasoning across dozens to hundreds of turns while integrating long-context information from diverse sources. We envision MiniMax-M1 serving as a strong foundation for such applications with unique advantages, and we are fully dedicated to further evolving MiniMax-M1 toward this goal.
+References
+Anthropic (2025)
+Anthropic.
+Claude 3.7 sonnet and claude code.
+https://www.anthropic.com/news/claude-3-7-sonnet
+, 2025.
+Blog post, February 24, 2025.
+Arora et al. (2024)
+Simran Arora, Sabri Eyuboglu, Michael Zhang, Aman Timalsina, Silas Alberti, Dylan Zinsley, James Zou, Atri Rudra, and Christopher Ré.
+Simple linear attention language models balance the recall-throughput tradeoff.
+arXiv preprint arXiv:2402.18668
+, 2024.
+Bai et al. (2024)
+Yushi Bai, Shangqing Tu, Jiajie Zhang, Hao Peng, Xiaozhi Wang, Xin Lv, Shulin Cao, Jiazheng Xu, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li.
+LongBench v2: Towards deeper understanding and reasoning on realistic long-context multitasks.
+arXiv preprint arXiv:2412.15204
+, 2024.
+Behrouz et al. (2024)
+Ali Behrouz, Peilin Zhong, and Vahab Mirrokni.
+Titans: Learning to memorize at test time.
+arXiv preprint arXiv:2501.00663
+, 2024.
+Beltagy et al. (2020)
+Iz Beltagy, Matthew E Peters, and Arman Cohan.
+Longformer: The long-document transformer.
+arXiv preprint arXiv:2004.05150
+, 2020.
+Choromanski et al. (2021)
+Krzysztof Marcin Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Quincy Davis, Afroz Mohiuddin, Lukasz Kaiser, David Benjamin Belanger, Lucy J Colwell, and Adrian Weller.
+Rethinking attention with Performers.
+In
+International Conference on Learning Representations
+, 2021.
+URL
+https://openreview.net/forum?id=Ua6zuk0WRH
+.
+Chou et al. (2024)
+Yuhong Chou, Man Yao, Kexin Wang, Yuqi Pan, Rui-Jie Zhu, Jibin Wu, Yiran Zhong, Yu Qiao, Bo Xu, and Guoqi Li.
+Metala: Unified optimal linear approximation to softmax attention map.
+Advances in Neural Information Processing Systems
+, 37:71034–71067, 2024.
+Chung et al. (2014)
+Junyoung Chung, Çağlar Gülçehre, KyungHyun Cho, and Yoshua Bengio.
+Empirical evaluation of gated recurrent neural networks on sequence modeling.
+arXiv preprint arXiv:1412.3555
+, 2014.
+Cui et al. (2025)
+Ganqu Cui, Yuchen Zhang, Jiacheng Chen, Lifan Yuan, Zhi Wang, Yuxin Zuo, Haozhan Li, Yuchen Fan, Huayu Chen, Weize Chen, Zhiyuan Liu, Hao Peng, Lei Bai, Wanli Ouyang, Yu Cheng, Bowen Zhou, and Ning Ding.
+The entropy mechanism of reinforcement learning for reasoning language models.
+arXiv preprint arXiv:2505.22617
+, 2025.
+Dao and Gu (2024)
+Tri Dao and Albert Gu.
+Transformers are ssms: Generalized models and efficient algorithms through structured state space duality.
+arXiv preprint arXiv:2405.21060
+, 2024.
+DeepSeek-AI et al. (2025)
+DeepSeek-AI, Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, et al.
+Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning.
+arXiv preprint arXiv:2501.12948
+, 2025.
+Du et al. (2025)
+Jusen Du, Weigao Sun, Disen Lan, Jiaxi Hu, and Yu Cheng.
+Mom: Linear sequence modeling with mixture-of-memories.
+arXiv preprint arXiv:2502.13685
+, 2025.
+Glorioso et al. (2024)
+Paolo Glorioso, Quentin Anthony, Yury Tokpanov, James Whittington, Jonathan Pilault, Adam Ibrahim, and Beren Millidge.
+Zamba: A compact 7b SSM hybrid model.
+arXiv preprint arXiv:2405.16712
+, 2024.
+Google DeepMind (2025)
+Google DeepMind.
+Gemini pro.
+https://deepmind.google/models/gemini/pro/
+, 2025.
+Web page, accessed 2025.
+Gu and Dao (2024)
+Albert Gu and Tri Dao.
+Mamba: Linear-time sequence modeling with selective state spaces.
+In
+First Conference on Language Modeling
+, 2024.
+URL
+https://openreview.net/forum?id=tEYskw1VY2
+.
+Gu et al. (2020)
+Albert Gu, Tri Dao, Stefano Ermon, Atri Rudra, and Christopher Ré.
+Hippo: Recurrent memory with optimal polynomial projections.
+Advances in neural information processing systems
+, 33:1474–1487, 2020.
+Gu et al. (2022)
+Albert Gu, Karan Goel, and Christopher Ré.
+Efficiently modeling long sequences with structured state spaces.
+In
+The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022
+. OpenReview.net, 2022.
+URL
+https://openreview.net/forum?id=uYLFoz1vlAC
+.
+Gu et al. (2023)
+Albert Gu, Isys Johnson, Aman Timalsina, Atri Rudra, and Christopher Ré.
+How to train your HIPPO: State space models with generalized orthogonal basis projections.
+In
+International Conference on Learning Representations
+, 2023.
+URL
+https://openreview.net/forum?id=klK17OQ3KB
+.
+Gupta et al. (2022)
+Ankit Gupta, Albert Gu, and Jonathan Berant.
+Diagonal state spaces are as effective as structured state spaces.
+In
+NeurIPS
+, 2022.
+URL
+http://papers.nips.cc/paper_files/paper/2022/hash/9156b0f6dfa9bbd18c79cc459ef5d61c-Abstract-Conference.html
+.
+He et al. (2024)
+Zhihao He, Hang Yu, Zi Gong, Shizhan Liu, Jianguo Li, and Weiyao Lin.
+Rodimus*: Breaking the accuracy-efficiency trade-off with efficient attentions.
+arXiv preprint arXiv:2410.06577
+, 2024.
+Hendrycks et al. (2021)
+Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt.
+Measuring mathematical problem solving with the math dataset.
+arXiv preprint arXiv:2103.03874
+, 2021.
+Hochreiter and Schmidhuber (1997)
+Sepp Hochreiter and Jürgen Schmidhuber.
+Long short-term memory.
+Neural computation
+, 9(8):1735–1780, 1997.
+Hu et al. (2025)
+Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, Xiangyu Zhang, and Heung-Yeung Shum.
+Open-reasoner-zero: An open source approach to scaling up reinforcement learning on the base model.
+arXiv preprint arXiv:2503.24290
+, 2025.
+Jain et al. (2025)
+Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica.
+Livecodebench: Holistic and contamination free evaluation of large language models for code.
+In
+The Thirteenth International Conference on Learning Representations
+, 2025.
+Jamba Team (2024)
+Jamba Team.
+Jamba-1.5: Hybrid transformer-mamba models at scale.
+arXiv preprint arXiv:2408.12570
+, 2024.
+Jimenez et al. (2024)
+Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan.
+SWE-bench: Can language models resolve real-world github issues?
+In
+International Conference on Learning Representations
+, 2024.
+URL
+https://openreview.net/forum?id=VTF8yNQM66
+.
+Katharopoulos et al. (2020)
+Angelos Katharopoulos, Apoorv Vyas, Nikolaos Pappas, and François Fleuret.
+Transformers are RNNs: Fast autoregressive transformers with linear attention.
+In
+International Conference on Machine Learning
+, pages 5156–5165. PMLR, 2020.
+Kimi Team (2025)
+Kimi Team.
+Kimi k1.5: Scaling reinforcement learning with llms.
+arXiv preprint arXiv:2501.12599
+, 2025.
+Lin et al. (2025)
+Bill Yuchen Lin, Ronan Le Bras, Kyle Richardson, Ashish Sabharwal, Radha Poovendran, Peter Clark, and Yejin Choi.
+Zebralogic: On the scaling limits of llms for logical reasoning.
+arXiv preprint arXiv:2502.01100
+, 2025.
+Liu et al. (2025a)
+Junteng Liu, Yuanxiang Fan, Zhuo Jiang, Han Ding, Yongyi Hu, Chi Zhang, Yiqi Shi, Shitong Weng, Aili Chen, Shiqi Chen, Yunan Huang, Mozhi Zhang, Pengyu Zhao, Junjie Yan, and Junxian He.
+Synlogic: Synthesizing verifiable reasoning data at scale for learning logical reasoning and beyond.
+arXiv preprint arXiv:2505.19641
+, 2025a.
+Liu et al. (2024)
+Siyao Liu, He Zhu, Jerry Liu, Shulin Xin, Aoyan Li, Rui Long, Li Chen, Jack Yang, Jinxiang Xia, Z. Y. Peng, Shukai Liu, Zhaoxiang Zhang, Ge Zhang, Wenhao Huang, Kai Shen, and Liang Xiang.
+Fullstack bench: Evaluating llms as full stack coders.
+arXiv preprint arXiv:2412.00535
+, 2024.
+Liu et al. (2025b)
+Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin.
+Understanding r1-zero-like training: A critical perspective.
+arXiv preprint arXiv:2503.20783
+, 2025b.
+Loshchilov and Hutter (2019)
+Ilya Loshchilov and Frank Hutter.
+Decoupled weight decay regularization.
+In
+International Conference on Learning Representations
+, 2019.
+Lu et al. (2025)
+Enzhe Lu, Zhejun Jiang, Jingyuan Liu, Yulun Du, Tao Jiang, Chao Hong, Shaowei Liu, Weiran He, Enming Yuan, Yuzhi Wang, et al.
+Moba: Mixture of block attention for long-context llms.
+arXiv preprint arXiv:2502.13189
+, 2025.
+Martin and Cundy (2018)
+Eric Martin and Chris Cundy.
+Parallelizing linear recurrent neural nets over sequence length.
+In
+6th International Conference on Learning Representations, ICLR 2018, Vancouver, BC, Canada, April 30 - May 3, 2018, Conference Track Proceedings
+. OpenReview.net, 2018.
+URL
+https://openreview.net/forum?id=HyUNwulC-
+.
+MiniMax et al. (2025)
+MiniMax, Aonian Li, Bangwei Gong, Bo Yang, Boji Shan, Chang Liu, Cheng Zhu, Chunhao Zhang, Congchao Guo, Da Chen, Dong Li, et al.
+Minimax-01: Scaling foundation models with lightning attention.
+arXiv preprint arXiv:2501.08313
+, 2025.
+Molybog et al. (2023)
+Igor Molybog, Peter Albert, Moya Chen, Zachary DeVito, David Esiobu, Naman Goyal, Punit Singh Koura, Sharan Narang, Andrew Poulton, Ruan Silva, Binh Tang, Diana Liskovich, Puxin Xu, Yuchen Zhang, Melanie Kambadur, Stephen Roller, and Susan Zhang.
+A theory on adam instability in large-scale machine learning.
+arXiv preprint arXiv:2304.09871
+, 2023.
+OpenAI (2024a)
+OpenAI.
+Introducing openai o1.
+https://openai.com/o1/
+, 2024a.
+Web page, accessed 2024.
+OpenAI (2024b)
+OpenAI.
+Openai mrcr dataset.
+https://huggingface.co/datasets/openai/mrcr
+, 2024b.
+Accessed: 2025-06-15.
+OpenAI (2025)
+OpenAI.
+Introducing deep research, 2025.
+URL
+https://openai.com/index/introducing-deep-research/
+.
+Peng et al. (2023)
+Bo Peng, Eric Alcaide, Quentin Gregory Anthony, Alon Albalak, Samuel Arcadinho, Stella Biderman, Huanqi Cao, Xin Cheng, Michael Nguyen Chung, Leon Derczynski, et al.
+Rwkv: Reinventing rnns for the transformer era.
+In
+Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)
+, 2023.
+Peng et al. (2024a)
+Bo Peng, Daniel Goldstein, Quentin Anthony, Alon Albalak, Eric Alcaide, Stella Biderman, Eugene Cheah, Teddy Ferdinan, Haowen Hou, and Przemysław Kazienko.
+Eagle and finch: Rwkv with matrix-valued states and dynamic recurrence.
+arXiv preprint arXiv:2404.05892
+, 2024a.
+Peng et al. (2024b)
+Bo Peng, Daniel Goldstein, Quentin Anthony, Alon Albalak, Eric Alcaide, Stella Biderman, Eugene Cheah, Teddy Ferdinan, Haowen Hou, and Przemysław Kazienko.
+Eagle and finch: Rwkv with matrix-valued states and dynamic recurrence.
+arXiv preprint arXiv:2404.05892
+, 2024b.
+Peng et al. (2025)
+Bo Peng, Ruichong Zhang, Daniel Goldstein, Eric Alcaide, Xingjian Du, Haowen Hou, Jiaju Lin, Jiaxing Liu, Janna Lu, William Merrill, et al.
+Rwkv-7 "Goose" with expressive dynamic state evolution.
+arXiv preprint arXiv:2503.14456
+, 2025.
+Peng et al. (2021)
+Hao Peng, Nikolaos Pappas, Dani Yogatama, Roy Schwartz, Noah Smith, and Lingpeng Kong.
+Random feature attention.
+In
+International Conference on Learning Representations
+, 2021.
+URL
+https://openreview.net/forum?id=QtTKTdVrFBB
+.
+Phan et al. (2025)
+Long Phan, Alice Gatti, Ziwen Han, Nathaniel Li, Josephina Hu, Hugh Zhang, Chen Bo Calvin Zhang, Mohamed Shaaban, John Ling, Sean Shi, et al.
+Humanity’s last exam.
+arXiv preprint arXiv:2501.14249
+, 2025.
+Qin et al. (2021)
+Zhen Qin, Weixuan Sun, Hui Deng, Dongxu Li, Yunshen Wei, Baohong Lv, Junjie Yan, Lingpeng Kong, and Yiran Zhong.
+cosformer: Rethinking softmax in attention.
+In
+Proceedings of the International Conference on Learning Representations (ICLR)
+, 2021.
+Qin et al. (2022a)
+Zhen Qin, Xiaodong Han, Weixuan Sun, Dongxu Li, Lingpeng Kong, Nick Barnes, and Yiran Zhong.
+The devil in linear transformer.
+In
+Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
+, pages 7025–7041, 2022a.
+Qin et al. (2022b)
+Zhen Qin, Weixuan Sun, Hui Deng, Dongxu Li, Yunshen Wei, Baohong Lv, Junjie Yan, Lingpeng Kong, and Yiran Zhong.
+cosFormer: Rethinking softmax in attention.
+In
+International Conference on Learning Representations
+, 2022b.
+URL
+https://openreview.net/forum?id=Bl8CQrx2Up4
+.
+Qin et al. (2023)
+Zhen Qin, Songlin Yang, and Yiran Zhong.
+Hierarchically gated recurrent neural network for sequence modeling.
+In
+Proceedings of the 37th International Conference on Neural Information Processing Systems
+, pages 33202–33221, 2023.
+Qin et al. (2024a)
+Zhen Qin, Yuxin Mao, Xuyang Shen, Dong Li, Jing Zhang, Yuchao Dai, and Yiran Zhong.
+You only scan once: Efficient multi-dimension sequential modeling with lightnet.
+arXiv preprint arXiv:2405.21022
+, 2024a.
+Qin et al. (2024b)
+Zhen Qin, Weigao Sun, Dong Li, Xuyang Shen, Weixuan Sun, and Yiran Zhong.
+Lightning attention-2: A free lunch for handling unlimited sequence lengths in large language models.
+arXiv preprint arXiv:2401.04658
+, 2024b.
+Qin et al. (2024c)
+Zhen Qin, Weigao Sun, Dong Li, Xuyang Shen, Weixuan Sun, and Yiran Zhong.
+Various lengths, constant speed: Efficient language modeling with lightning attention.
+In
+International conference on machine learning
+, pages 41517–41535. PMLR, 2024c.
+Qin et al. (2024d)
+Zhen Qin, Songlin Yang, Weixuan Sun, Xuyang Shen, Dong Li, Weigao Sun, and Yiran Zhong.
+HGRN2: Gated linear RNNs with state expansion.
+arXiv preprint arXiv:2404.07904
+, 2024d.
+Qwen et al. (2025)
+Qwen, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tianyi Tang, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu.
+Qwen2.5 technical report.
+arXiv preprint arXiv:2412.15115
+, 2025.
+Rein et al. (2024)
+David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R Bowman.
+Gpqa: A graduate-level google-proof q&a benchmark.
+In
+First Conference on Language Modeling
+, 2024.
+Ren et al. (2024)
+Liliang Ren, Yang Liu, Yadong Lu, Yelong Shen, Chen Liang, and Weizhu Chen.
+Samba: Simple hybrid state space models for efficient unlimited context language modeling.
+arXiv preprint arXiv:2406.07522
+, 2024.
+Schulman et al. (2017)
+John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017.
+Seed et al. (2025)
+ByteDance Seed, Jiaze Chen, Tiantian Fan, Xin Liu, Lingjun Liu, Zhiqi Lin, Mingxuan Wang, Chengyi Wang, Xiangpeng Wei, Wenyuan Xu, et al.
+Seed1.5-thinking: Advancing superb reasoning models with reinforcement learning.
+arXiv preprint arXiv:2504.13914
+, 2025.
+Shao et al. (2024)
+Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Y Wu, et al.
+DeepSeekMath: Pushing the limits of mathematical reasoning in open language models.
+arXiv preprint arXiv:2402.03300
+, 2024.
+Shen et al. (2024)
+Xuyang Shen, Dong Li, Ruitao Leng, Zhen Qin, Weigao Sun, and Yiran Zhong.
+Scaling laws for linear complexity language models.
+In
+Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
+, pages 16377–16426, 2024.
+Sheng et al. (2024)
+Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu.
+Hybridflow: A flexible and efficient rlhf framework.
+arXiv preprint arXiv:2409.19256
+, 2024.
+Si et al. (2024)
+Chenglei Si, Diyi Yang, and Tatsunori Hashimoto.
+Can llms generate novel research ideas? a large-scale human study with 100+ nlp researchers.
+arXiv preprint arXiv:2409.04109
+, 2024.
+Siems et al. (2025)
+Julien Siems, Timur Carstensen, Arber Zela, Frank Hutter, Massimiliano Pontil, and Riccardo Grazzi.
+Deltaproduct: Improving state-tracking in linear rnns via householder products.
+arXiv preprint arXiv:2502.10297
+, 2025.
+Sirdeshmukh et al. (2025)
+Ved Sirdeshmukh, Kaustubh Deshpande, Johannes Mols, Lifeng Jin, Ed-Yeremai Cardona, Dean Lee, Jeremy Kritz, Willow Primack, Summer Yue, and Chen Xing.
+Multichallenge: A realistic multi-turn conversation evaluation benchmark challenging to frontier llms.
+arXiv preprint arXiv:2501.17399
+, 2025.
+Sun et al. (2025)
+Weigao Sun, Disen Lan, Tong Zhu, Xiaoye Qu, and Yu Cheng.
+Linear-moe: Linear sequence modeling meets mixture-of-experts.
+arXiv preprint arXiv:2503.05447
+, 2025.
+Sun et al. (2024)
+Yu Sun, Xinhao Li, Karan Dalal, Jiarui Xu, Arjun Vikram, Genghan Zhang, Yann Dubois, Xinlei Chen, Xiaolong Wang, Sanmi Koyejo, et al.
+Learning to (learn at test time): Rnns with expressive hidden states.
+arXiv preprint arXiv:2407.04620
+, 2024.
+Sun et al. (2023)
+Yutao Sun, Li Dong, Shaohan Huang, Shuming Ma, Yuqing Xia, Jilong Xue, Jianyong Wang, and Furu Wei.
+Retentive network: A successor to transformer for large language models.
+arXiv preprint arXiv:2307.08621
+, 2023.
+Tencent AI Lab (2025)
+Tencent AI Lab.
+Hunyuan-t1: Reasoning efficiency redefined.
+https://llm.hunyuan.tencent.com/#/Blog/hy-t1/
+, 2025.
+Accessed: 2025-06-15.
+Vaswani et al. (2017)
+Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, and Illia Polosukhin.
+Attention is all you need.
+Advances in neural information processing systems
+, 30, 2017.
+von Oswald et al. (2025)
+Johannes von Oswald, Nino Scherrer, Seijin Kobayashi, Luca Versari, Songlin Yang, Maximilian Schlegel, Kaitlin Maile, Yanick Schimpf, Oliver Sieberling, Alexander Meulemans, et al.
+Mesanet: Sequence modeling by locally optimal test-time training.
+arXiv preprint arXiv:2506.05233
+, 2025.
+Wang et al. (2025)
+Shenzhi Wang, Le Yu, Chang Gao, Chujie Zheng, Shixuan Liu, Rui Lu, Kai Dang, Xionghui Chen, Jianxin Yang, Zhenru Zhang, Yuqiong Liu, An Yang, Andrew Zhao, Yang Yue, Shiji Song, Bowen Yu, Gao Huang, and Junyang Lin.
+Beyond the 80/20 rule: High-entropy minority tokens drive effective reinforcement learning for llm reasoning.
+arXiv preprint arXiv:2506.01939
+, 2025.
+Wang et al. (2024)
+Yubo Wang, Xueguang Ma, Ge Zhang, Yuansheng Ni, Abhranil Chandra, Shiguang Guo, Weiming Ren, Aaran Arulraj, Xuan He, Ziyan Jiang, et al.
+Mmlu-pro: A more robust and challenging multi-task language understanding benchmark.
+In
+The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track
+, 2024.
+Wei et al. (2022)
+Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al.
+Chain-of-thought prompting elicits reasoning in large language models.
+Advances in neural information processing systems
+, 35:24824–24837, 2022.
+Wei et al. (2024)
+Jason Wei, Nguyen Karina, Hyung Won Chung, Yunxin Joy Jiao, Spencer Papay, Amelia Glaese, John Schulman, and William Fedus.
+Measuring short-form factuality in large language models.
+arXiv preprint arXiv:2411.04368
+, 2024.
+Xia et al. (2024)
+Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang.
+Agentless: Demystifying llm-based software engineering agents.
+arXiv preprint arXiv:2407.01489
+, 2024.
+Xu et al. (2025)
+Frank F. Xu, Yufan Song, Boxuan Li, Yuxuan Tang, Kritanjali Jain, Mengxue Bao, Zora Z. Wang, Xuhui Zhou, Zhitong Guo, Murong Cao, Mingyang Yang, Hao Yang Lu, Amaad Martin, Zhe Su, Leander Maben, Raj Mehta, Wayne Chi, Lawrence Jang, Yiqing Xie, Shuyan Zhou, and Graham Neubig.
+Theagentcompany: Benchmarking llm agents on consequential real world tasks.
+arXiv preprint arXiv:2412.14161
+, 2025.
+Yang et al. (2024a)
+Songlin Yang, Bailin Wang, Yikang Shen, Rameswar Panda, and Yoon Kim.
+Gated linear attention transformers with hardware-efficient training.
+arXiv preprint arXiv:2312.06635
+, 2024a.
+Yang et al. (2024b)
+Songlin Yang, Bailin Wang, Yu Zhang, Yikang Shen, and Yoon Kim.
+Parallelizing linear transformers with the delta rule over sequence length.
+arXiv preprint arXiv:2406.06484
+, 2024b.
+Yao et al. (2025)
+Shunyu Yao, Noah Shinn, Pedram Razavi, and Karthik R Narasimhan.
+$\tau$-bench: A benchmark for tool-agent-user interaction in real-world domains.
+In
+The Thirteenth International Conference on Learning Representations
+, 2025.
+Yu et al. (2025)
+Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Weinan Dai, Tiantian Fan, Gaohong Liu, Lingjun Liu, Xin Liu, Haibin Lin, Zhiqi Lin, Bole Ma, Guangming Sheng, Yuxuan Tong, Chi Zhang, Mofan Zhang, Wang Zhang, Hang Zhu, Jinhua Zhu, Jiaze Chen, Jiangjie Chen, Chengyi Wang, Hongli Yu, Yuxuan Song, Xiangpeng Wei, Hao Zhou, Jingjing Liu, Wei-Ying Ma, Ya-Qin Zhang, Lin Yan, Mu Qiao, Yonghui Wu, and Mingxuan Wang.
+Dapo: An open-source llm reinforcement learning system at scale.
+arXiv preprint arXiv:2503.14476
+, 2025.
+Yuan et al. (2025)
+Jingyang Yuan, Huazuo Gao, Damai Dai, Junyu Luo, Liang Zhao, Zhengyan Zhang, Zhenda Xie, YX Wei, Lean Wang, Zhiping Xiao, et al.
+Native sparse attention: Hardware-aligned and natively trainable sparse attention.
+arXiv preprint arXiv:2502.11089
+, 2025.
+Zaheer et al. (2020)
+Manzil Zaheer, Guru Guruganesh, Kumar Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al.
+Big Bird: Transformers for longer sequences.
+Advances in neural information processing systems
+, 33:17283–17297, 2020.
+Zeng et al. (2025)
+Weihao Zeng, Yuzhen Huang, Qian Liu, Wei Liu, Keqing He, Zejun Ma, and Junxian He.
+Simplerl-zoo: Investigating and taming zero reinforcement learning for open base models in the wild.
+arXiv preprint arXiv:2503.18892
+, 2025.
+Zhang et al. (2024)
+Yu Zhang, Songlin Yang, Rui-Jie Zhu, Yue Zhang, Leyang Cui, Yiqiao Wang, Bolun Wang, Freda Shi, Bailin Wang, Wei Bi, et al.
+Gated slot attention for efficient linear-time sequence modeling.
+Advances in Neural Information Processing Systems
+, 37:116870–116898, 2024.
+Appendix A
+Contributors
+The contributors to the report are listed in alphabetical order as follows:
+Aili Chen,
+Aonian Li,
+Bangwei Gong,
+Binyang Jiang,
+Bo Fei,
+Bo Yang,
+Boji Shan,
+Changqing Yu,
+Chao Wang,
+Cheng Zhu,
+Chengjun Xiao,
+Chengyu Du,
+Chi Zhang,
+Chu Qiao,
+Chunhao Zhang,
+Chunhui Du,
+Congchao Guo,
+Da Chen,
+Deming Ding,
+Dianjun Sun,
+Dong Li,
+Enwei Jiao,
+Haigang Zhou,
+Haimo Zhang,
+Han Ding,
+Haohai Sun,
+Haoyu Feng,
+Huaiguang Cai,
+Haichao Zhu,
+Jian Sun,
+Jiaqi Zhuang,
+Jiaren Cai,
+Jiayuan Song,
+Jin Zhu,
+Jingyang Li,
+Jinhao Tian,
+Jinli Liu,
+Junhao Xu,
+Junjie Yan,
+Junteng Liu,
+Junxian He,
+Kaiyi Feng,
+Ke Yang,
+Kecheng Xiao,
+Le Han,
+Leyang Wang,
+Lianfei Yu,
+Liheng Feng,
+Lin Li,
+Lin Zheng,
+Linge Du,
+Lingyu Yang,
+Lunbin Zeng,
+Minghui Yu,
+Mingliang Tao,
+Mingyuan Chi,
+Mozhi Zhang,
+Mujie Lin,
+Nan Hu,
+Nongyu Di,
+Peng Gao,
+Pengfei Li,
+Pengyu Zhao,
+Qibing Ren,
+Qidi Xu,
+Qile Li,
+Qin Wang,
+Rong Tian,
+Ruitao Leng,
+Shaoxiang Chen,
+Shaoyu Chen,
+Shengmin Shi,
+Shitong Weng,
+Shuchang Guan,
+Shuqi Yu,
+Sichen Li,
+Songquan Zhu,
+Tengfei Li,
+Tianchi Cai,
+Tianrun Liang,
+Weiyu Cheng,
+Weize Kong,
+Wenkai Li,
+Xiancai Chen,
+Xiangjun Song,
+Xiao Luo,
+Xiao Su,
+Xiaobo Li,
+Xiaodong Han,
+Xinzhu Hou,
+Xuan Lu,
+Xun Zou,
+Xuyang Shen,
+Yan Gong,
+Yan Ma,
+Yang Wang,
+Yiqi Shi,
+Yiran Zhong,
+Yonghong Duan,
+Yongxiang Fu,
+Yongyi Hu,
+Yu Gao,
+Yuanxiang Fan,
+Yufeng Yang,
+Yuhao Li,
+Yulin Hu,
+Yunan Huang,
+Yunji Li,
+Yunzhi Xu,
+Yuxin Mao,
+Yuxuan Shi,
+Yuze Wenren,
+Zehan Li,
+Zelin Li,
+Zhanxu Tian,
+Zhengmao Zhu,
+Zhenhua Fan,
+Zhenzhen Wu,
+Zhichao Xu,
+Zhihang Yu,
+Zhiheng Lyu,
+Zhuo Jiang,
+Zibo Gao,
+Zijia Wu,
+Zijian Song,
+Zijun Sun
\ No newline at end of file
diff --git a/research/notes/no-gpu-left-behind-unlocking-efficiency-with-co-located-vllm-in-trl.md b/research/notes/no-gpu-left-behind-unlocking-efficiency-with-co-located-vllm-in-trl.md
new file mode 100644
index 0000000000000000000000000000000000000000..17840e4ffb434904bfc0c9be3b27a0236c8e17e3
--- /dev/null
+++ b/research/notes/no-gpu-left-behind-unlocking-efficiency-with-co-located-vllm-in-trl.md
@@ -0,0 +1,1117 @@
+---
+title: 'No GPU left behind: Unlocking Efficiency with Co-located vLLM in TRL'
+id: no-gpu-left-behind-unlocking-efficiency-with-co-located-vllm-in-trl
+tags:
+- deepread
+created: '2026-06-10T00:40:06.451840Z'
+source: https://huggingface.co/blog/vllm-colocate
+source_domain: huggingface.co
+fetched_at: '2026-06-10T00:40:06.451698Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: unknown
+content_type: unknown
+deprecated: false
+---
+
+No GPU left behind: Unlocking Efficiency with Co-located vLLM in TRL
+Back to Articles
+No GPU left behind: Unlocking Efficiency with Co-located vLLM in TRL
+Published
+				June 3, 2025
+Update on GitHub
+Upvote
+101
++95
+Mert Toslali
+toslali-ibm
+Follow
+ibm-ai-platform
+Yu Chin Fabian Lim
+mirinflim
+Follow
+ibm-ai-platform
+Quentin Gallouédec
+qgallouedec
+Follow
+Ed Snible
+esnible
+Follow
+ibm-ai-platform
+Raghu Ganti
+rganti
+Follow
+ibm-ai-platform
+Mudhakar Srivatsa
+mudhakar
+Follow
+ibm-ai-platform
+🚀 Introduction
+TRL supports training LLMs using GRPO, an online learning algorithm recently introduced in the DeepSeekMath paper. In GRPO, the model learns from its own outputs: it generates responses during training, receives feedback, and uses that feedback to improve itself over time.
+This makes generation a critical step in the training loop — and also a major bottleneck. To speed up generation, TRL integrates with vLLM. This combination lets you train powerful models more efficiently in GRPO setup. However, there’s a catch.
+🧨 The Problem
+Before TRL v0.18.0, vLLM was only supported in server mode, running as a separate process on different GPUs from the training job. It communicated with the training script over HTTP, which made the setup modular and easy to use — but also introduced GPU inefficiencies.
+Here’s what happens:
+During training, the model needs to generate completions frequently.
+The trainer sends a request to the vLLM server, which runs on its own GPUs.
+While vLLM generates, the training GPUs sit idle and wait.
+Once generation is done, vLLM GPUs become idle, and training resumes.
+This “ping-pong” between training and generation causes:
+Wasted GPU time on both sides
+Increased demand for
+extra GPUs
+just to run inference
+Reduced overall
+throughput and higher cost
+In online learning methods like GRPO — where generation happens constantly — this inefficiency becomes even more painful. You spend more on hardware, but don't get the performance you'd expect.
+So, the key question becomes:
+Can we share the same GPUs for both training and generation, instead of separating them?
+💡 The Opportunity
+The main issue was that training and inference ran on separate GPUs, leading to idle time and underutilization. The natural solution? Run both on the same GPUs. Instead of having vLLM operate as a standalone server in its own process and devices, what if vLLM could run alongside the training code, within the same distributed process group? This would let us launch a single distributed job where training and inference share the same devices, switching between tasks efficiently without wasting resources.
+This approach is what we refer to as colocation. Training and inference are co-located on the same GPUs and coordinated via the same process group, allowing them to take turns smoothly — no extra hardware needed.
+Previously, this wasn’t possible in TRL, which relied on vLLM as an external HTTP server. That changed with our PR #3394, which added support for vLLM’s external launcher and true integration into the training process.
+What It Enables
+Unified Execution
+: By embedding vLLM in the same process group, both training and inference tasks can share the same GPUs, taking turns instead of waiting on each other. This reduces idle time and boosts overall efficiency.
+Skip HTTP Communication
+: No need for REST API calls or networking — vLLM runs inline with the training loop, avoiding overhead and latency.
+Torchrun Compatibility
+: Works seamlessly with
+torchrun
+, so it's easy to scale across nodes with minimal config changes.
+TP and DP Support
+: Compatible with Tensor Parallelism and Data Parallelism, making it suitable for large-scale training runs.
+SPMD Execution Pattern
+: Uses a Single Program, Multiple Data (SPMD) model, where each GPU runs its own instance of the engine in sync. Ideal for distributed multi-GPU, multi-node setups.
+Simplified Deployment
+: You no longer need to maintain a separate server script — vLLM is launched and controlled directly inside your training job.
+Enhanced Throughput
+: By avoiding idle GPUs and eliminating inter-process communication, the system delivers faster training and generation, especially important in online learning setups like GRPO.
+Robust Inter-process Communication
+: This is more robust because it avoids the complexity of setting up distributed process groups between independent processes, as required in server mode.
+Thanks to this feature, co-located training and inference is no longer a hack — it’s now
+first-class, scalable, and production-ready
+.
+🧩 Design: From Separate Servers to Shared GPUs
+The shift from server TRL to co-located TRL is all about smarter GPU usage. The diagram below shows the difference:
+Server TRL Setup (Top Row)
+In the server TRL setup, training and inference run on separate GPUs. For example:
+GPUs 0 through 2 are used for training.
+GPU 3 is fully dedicated to running vLLM as a separate server.
+During training steps,
+GPU 3 sits idle
+.
+During generation steps (inference),
+GPUs 0–2 are idle
+while GPU 3 generates outputs.
+This leads to:
+Inefficient GPU usage, with devices frequently waiting on each other
+Extra GPUs provisioned solely for inference
+Increased cost and complexity
+Co-located TRL Setup (Bottom Row)
+In contrast, the co-located TRL setup runs both training and vLLM on the
+same GPUs
+. Each GPU:
+Runs the training loop
+Launches a vLLM engine within the
+same process
+Training and inference
+take turns
+using the GPU’s resources — no need for dedicated devices or separate processes.
+This design:
+Reduces idle time
+Minimizes inter-process and HTTP communication
+Fully utilizes available GPU memory and compute
+Delivers
+faster throughput
+without increasing hardware requirements
+🛠️ Implementation Notes
+Instead of launching vLLM as a server,
+the trainer now launches vLLM
+in-process
+using the external launcher, as shown below:
+```python
+self.llm = LLM(
+    model=model.name_or_path,
+    tensor_parallel_size=args.vllm_tensor_parallel_size,
+    gpu_memory_utilization=self.vllm_gpu_memory_utilization,
+    max_num_seqs=self.args.per_device_train_batch_size
+        * self.vllm_tensor_parallel_size
+        * self.args.gradient_accumulation_steps,
+    max_model_len=self.max_prompt_length + self.max_completion_length,
+    distributed_executor_backend="external_launcher",
+    # Feed identical seed for tp groups to ensure sampling results are the same across workers
+    seed=self.accelerator.process_index // self.vllm_tensor_parallel_size,
+)
+```
+Co-located vLLM respects the torch.distributed process group and rank structure. This allows vLLM to be initialized alongside training without conflict and makes TP/DP setups work seamlessly:
+```python
+if self.vllm_tensor_parallel_size > 1:
+    # Create subgroups of ranks for TP, each group with `vllm_tensor_parallel_size` ranks.
+    self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration(
+        [
+            list(range(i * self.vllm_tensor_parallel_size, (i + 1) * self.vllm_tensor_parallel_size))
+            for i in range(self.accelerator.num_processes // self.vllm_tensor_parallel_size)
+        ]
+    )
+```
+Co-located vLLM no longer relies on REST APIs — it runs directly in memory and communicates via native Python calls:
+```python
+if self.vllm_tensor_parallel_size > 1:
+    orig_size = len(prompts_text)
+    gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)]
+    torch.distributed.all_gather_object(gathered_prompts, prompts_text, group=self.tp_group)
+    all_prompts_text = [p for sublist in gathered_prompts for p in sublist]
+else:
+    all_prompts_text = prompts_text
+
+with profiling_context(self, "vLLM.generate"):
+    all_outputs = self.llm.generate(all_prompts_text, sampling_params=sampling_params, use_tqdm=False)
+
+completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs]
+
+if self.vllm_tensor_parallel_size > 1:
+    local_rank_in_group = torch.distributed.get_rank(group=self.tp_group)
+    tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size)
+    completion_ids = completion_ids[tp_slice]
+```
+To use this setup, simply set vllm_mode="colocate" in your GRPO configuration:
+```python
+training_args = GRPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="colocate",
+)
+```
+Note: Depending on the model size and the overall GPU memory requirements for training, you may need to adjust the vllm_gpu_memory_utilization parameter in GRPOConfig to avoid underutilization or out-of-memory errors.
+📊 Showcase: Co-located vs. Plain TRL Performance
+To measure the impact of colocation, we ran a series of experiments comparing the traditional
+server mode
+(where vLLM runs on a separate GPU as a standalone server) with the new
+co-locate mode
+(where training and inference share the same GPUs).
+In
+server mode
+, only 7 GPUs are used for training because 1 GPU is fully dedicated to the vLLM inference server.
+In
+co-locate mode
+, all 8 GPUs are used for training — increasing the effective batch size by default.
+To ensure a fair comparison, we
+normalized throughput in server mode by a factor of 8/7
+. This adjustment accounts for the greater training capacity in co-locate mode and allows us to compare the two setups under equal training conditions.
+Experiment 1: 1.5B Model — Varying Batch Sizes
+As the batch size increases, throughput improves in both setups.
+Co-located setup reaches up to 1.43× speedup
+at the largest batch size.
+Larger batches make better use of shared GPU memory in co-located mode.
+Experiment 2: 1.5B Model — Varying Tensor Parallelism (TP)
+In the co-located setup, increasing TP
+reduces performance
+.
+More sharding introduces more communication overhead — which is
+not ideal for smaller models
+.
+Takeaway
+: For small models, avoid over-sharding in co-located mode.
+Experiment 3: 7B Model — Varying Batch Sizes
+Again, co-located mode
+scales better with batch size
+.
+Gains reach
+1.35× speedup
+at the largest batch tested.
+Experiment 4: 7B Model — Varying Tensor Parallelism (TP)
+Opposite trend from the 1.5B model.
+With 7B,
+more TP improves throughput
+, reaching up to
+1.73× speedup
+.
+Larger models benefit from sharding
+in co-located setups.
+📊 Scaling to 72B Model
+When training large models like
+Qwen2.5-Math-72B
+, it's important to use the right strategies to make training efficient, scalable, and stable across many GPUs and nodes. In our setup, we combined
+co-located vLLM
+with several key optimizations to make this work efficiently.
+Sleep Mode in vLLM
+When using co-located training, managing GPU memory is crucial so that both training and inference can run smoothly on the same devices. To support this, we added vLLM’s
+sleep()
+API into the GRPO training loop.
+The
+sleep()
+function temporarily pauses the vLLM engine and frees up GPU memory. It supports two levels:
+Level 1
+: Unloads model weights from GPU (keeps them in CPU memory) and clears the KV cache.
+Useful when the same model will be reused soon.
+Level 2
+: Unloads both model weights and KV cache entirely.
+Best for scenarios where the model will change or won’t be reused right away.
+In GRPO, the model is updated after every step — so we use
+Level 2 sleep
+.
+Benefits of Level 2 sleep:
+Maximizes free GPU memory
+for training
+Avoids memory contention
+between training and generation
+Keeps colocation efficient, even for large models like Qwen2.5-72B
+This small addition makes a
+big difference
+in enabling smooth and scalable co-located training.
+DeepSpeed Optimizations
+To train large models like Qwen2.5-72B, we rely on
+DeepSpeed ZeRO Stage 3
+, the same setup used in plain TRL.
+ZeRO helps scale large models by distributing memory across GPUs. Stage 3 goes further by partitioning:
+Model weights
+Gradients
+Optimizer states
+This is essential for models that can’t fit on a single GPU. With ZeRO Stage 3, each GPU handles only a portion of the model.
+Additional options we enable:
+"offload_optimizer": {"device": "cpu"}
+Moves optimizer states to CPU to free GPU memory — critical in co-located setups.
+"overlap_comm": true
+Enables communication overlap with computation, speeding up training.
+"contiguous_gradients": true
+Allocates gradients in a single memory block, improving memory access and reducing fragmentation.
+These optimizations help
+train 72B models efficiently
+, and ensure colocation remains stable under tight memory constraints.
+Accelerate Integration
+As recommended in TRL, we use
+Accelerate
+, a lightweight library that simplifies distributed training. It handles:
+Multi-GPU and multi-node job launching
+Data parallelism
+Gradient accumulation
+Distributed data loading
+This makes the setup clean, scalable, and easy to maintain.
+Experiment 5: Qwen2.5-Math-72B — Throughput, Accuracy, and Benchmark Results
+Throughput
+Even with
+4 fewer GPUs
+, the
+co-locate setup is ~1.26× faster
+than plain TRL.
+This highlights the effectiveness of smarter GPU sharing and memory cleanup using
+sleep()
+.
+Reward Curve
+Training reward plots for co-locate and plain setups are
+nearly identical
+, demonstrating that:
+Co-located training preserves accuracy
+There’s
+no regression in model learning performance
+Math500 Benchmark
+We evaluated three models:
+Base model
+,
+Co-locate-trained model
+,
+Plain-trained model
+on the Math500 benchmark. Both trained models
+outperform the base
+, and the
+co-locate model performs on par
+with the plain-trained model — confirming that colocation does not compromise downstream performance.
+🎓 Challenges, Lessons Learned & Next Steps
+Through our work on scaling GRPO training with co-located vLLM, we've faced several critical challenges and learned important lessons about efficiency, flexibility, and system design when training large models.
+Challenges
+Tensor Parallelism Bug in vLLM ≥ 0.8.0.
+Tensor Parallelism (TP) with external_launcher stopped working in vLLM version 0.8.0 and above. This was tracked under Issue #15895. To identify the breaking point, we followed the approach described in this vLLM developer blog post, which provides wheels for every commit. After bisecting, we identified the breaking commit as cc10281. The root cause was determinism — the newer versions required explicitly setting the random seed. Once the seed was set, the issue went away.
+Level 2 Sleep Buffer Bug.
+Initially, level 2 sleep didn’t work correctly when we tried to reload weights using load_weights. This issue was tracked in Issue #16564. The problem was that model buffers (like running mean/var in BatchNorm) weren’t restored after waking up from sleep. The fix came with PR #16889, which added logic to explicitly restore buffers when waking up from level 2 sleep. We now keep a copy of the original buffers and manually reapply them after loading new weights.
+Segmentation Fault on Exit.
+There’s still an open issue with vLLM sleep causing a segmentation fault at the end of training when closing processes. This was reported in Issue #16993. This crash happens during shutdown but does not break training itself, so we were able to complete all demos and experiments shared in this blog. However, we’re waiting for an official fix before integrating sleep() fully into TRL upstream.
+These challenges were not blockers, but they required careful debugging, version control, and a deeper understanding of how vLLM manages memory and parallelism under the hood.
+Lessons Learned
+Co-located inference dramatically improves GPU utilization. By allowing training and generation to share the same GPUs, we eliminate idle time and reduce hardware requirements — achieving higher throughput even with fewer GPUs.
+vLLM's sleep() feature is essential for large-scale colocation. It enables fine-grained control over memory usage, allowing training to fully reclaim GPU memory between generation steps — a key enabler for models like Qwen2.5-72B.
+DeepSpeed ZeRO Stage 3 is essential for training large models. It allows extremely large networks to fit into memory by distributing model weights, gradients, and optimizer states across multiple GPUs. In our experience, enabling contiguous_gradients helped reduce memory fragmentation, while offloading the optimizer to the CPU freed up critical GPU memory — both of which were especially helpful in colocated setups.
+Colocation is powerful but comes with trade-offs. It works best when GPU memory is carefully managed, often requiring manual tuning of memory usage parameters like vllm_gpu_memory_utilization. While it offers clear throughput benefits and reduces idle GPU time, colocation may not be ideal for models with tight memory budgets or when memory fragmentation is not well controlled. When done right, though, it unlocks significant efficiency gains.
+TP/DP compatibility, Accelerate, and torchrun support make deployment seamless. Despite the complexity of the underlying architecture, the entire system can be launched and scaled with standard distributed tools.
+Co-located training maintains model quality. Across multiple benchmarks (Math500, AIME24), co-located and plain setups produced comparable results, validating that performance isn’t sacrificed for efficiency.
+✅ Conclusion
+This blog post explored how co-locating vLLM with GRPO training unlocks significant efficiency gains when training large language models — including models as large as Qwen2.5-72B.
+Traditionally, TRL only supported vLLM in server mode, which required separate processes and GPUs for inference, leading to wasted compute and idle time. With the introduction of vLLM’s external launcher and the colocation PR in TRL (PR #3394), we can now run training and inference within the same distributed process group, on the same GPUs, with full support for TP, DP, and Accelerate.
+While challenges remain — such as version-specific vLLM bugs and edge cases such as with sleep() — the overall results show that co-located GRPO is a practical, scalable solution for training large models efficiently. We’re excited to continue refining this setup, integrating features like FSDP, and pushing the limits of large model training — making it faster, cheaper, and more accessible for everyone building the next generation of LLMs.
+✅ Give It a Try!
+Below is an example to try out GRPO training with co-located vLLM.
+📄
+train_grpo_colocate.py
+```python
+from datasets import load_dataset
+from trl import GRPOConfig, GRPOTrainer
+
+# Load dataset
+dataset = load_dataset("trl-lib/tldr", split="train")
+
+# Define the reward function
+def reward_len(completions, **kwargs):
+    return [-abs(20 - len(completion)) for completion in completions]
+
+# Define training arguments
+training_args = GRPOConfig(
+    output_dir="Qwen2-0.5B-GRPO",
+    logging_steps=1,
+    use_vllm=True,
+    vllm_mode="colocate",
+    vllm_tensor_parallel_size=1,
+    vllm_gpu_memory_utilization=0.3,
+    max_prompt_length=512,
+    max_completion_length=1024,
+    max_steps=2,
+    num_generations=4,
+    num_train_epochs=1,
+    per_device_train_batch_size=4,
+    push_to_hub=False,
+    report_to=None
+)
+
+# Create and run the trainer
+trainer = GRPOTrainer(
+    model="Qwen/Qwen2-0.5B-Instruct",
+    reward_funcs=reward_len,
+    args=training_args,
+    train_dataset=dataset,
+)
+
+trainer.train()
+```
+Papers mentioned in this article
+1
+More Articles from our Blog
+liger
+grpo
+trl
+🐯 Liger GRPO meets TRL
++2
+54
+May 25, 2025
+llm
+rl
+trl
+Shipping a Trillion Parameters With a Hub Bucket: Delta Weight Sync in TRL
++4
+41
+May 27, 2026
+Community
+trbula92
+Jun 3, 2025
+Great work on this and thanks for the detailed write up. In our experience this approach has worked really well for larger-scale multi-node training. We've seen up to 3x improvement in training speed training 32b models.
+See translation
+1 reply
+·
+🚀
+2
+2
+❤️
+1
+1
++
+qgallouedec
+Article author
+Jun 3, 2025
+Impressive! Thanks for sharing!
+See translation
+👍
+1
+1
++
+daniel-dona
+Jun 8, 2025
+Do the example code
+train_grpo_colocate.py
+need to be launched using accelerate?  Running it just using
+python3 train_grpo_colocate.py
+ends with an exception about missing env vars ("RANK", "LOCAL_RANK"...).
+See translation
+3 replies
+·
+toslali-ibm
+Article author
+Jun 9, 2025
+Yes!
+👍
+1
+1
++
+Expand 2
+						replies
+lhkhiem28
+Jun 9, 2025
+Does vllm_mode="colocate" work with PEFT?
+See translation
+2 replies
+·
+mirinflim
+Article author
+Jun 9, 2025
+@
+lhkhiem28
+actually we didnt try this, however there is no reason that it wouldnt work since LoRA is relates to model training, whereas our change relates to generation. However, it seems that
+@
+ajinkya-tejankar
+below has tried it and it seems to work
+See translation
+🚀
+1
+1
++
+Expand 1
+						reply
+wetsoledrysoul
+Jun 9, 2025
+Great article! Is data parallel support planned for the colocate mode?
+See translation
+1 reply
+·
+toslali-ibm
+Article author
+Jun 9, 2025
+DP is supported.
+For example, if # of GPUS = 8 and vllm_tensor_parallel_size = 2 → groups: [0,1], [2,3], [4,5], [6,7] -> making DP=4
+See translation
+👍
+1
+1
++
+ajinkya-tejankar
+Jun 9, 2025
+•
+edited Jun 9, 2025
+Is DeepSpeed planned to be the main engine to support multi-gpu and multi-node setups for TRL in the future? I tried FSDP and it doesn't work with a bunch of configurations that work with DeepSpeed. For instance, I couldn't get GRPO + FSDP + LoRA + VLLM colocate to work, but swapping FSDP with DeepSpeed works. Is DeepSpeed more reliable than plain FSDP from PyTorch?
+PS: Great blog! Thanks a lot for your efforts :)
+See translation
+4 replies
+·
+mirinflim
+Article author
+Jun 9, 2025
+@
+ajinkya-tejankar
+in our private experimentation, we have tried to hack in FSDP2 into accelerate, and tested it with collocate. There are a few issues I believe that remain. 1. TRL's weight loading code only works with FSDP1 I believe. 2. FSDP1 has a NAN problem and I had filed a bug report awhile back
+https://github.com/vllm-project/vllm/issues/14443
+See the previous discussion here:
+https://github.com/huggingface/trl/pull/3317#issuecomment-2842576427
+See translation
+Expand 3
+						replies
+tryumanshow
+Jun 19, 2025
+•
+edited Jun 19, 2025
+Thank you so much for the great article.
+Your article was tremendously helpful for training GRPO in colocate mode.
+By the way, have you ever trained a model using LoRA?
+You mentioned training a 72B model, but I don’t have access to 32 GPUs, so full finetuning isn’t an option for me.
+When training a model with the combination of
+DeepSpeed ZeRO-3
++
+vLLM colocate
++
+LoRA
++
+GRPO
+, and configuring
+modules_to_save=["embed_tokens", "lm_head"]
+in the LoRA config (as shown below), I encounter the error at the bottom.
+I’d appreciate any solutions or tips you used to train the 72B model.
+The versions of the libraries I used are:
+trl==0.18.2
+peft==0.15.2
+transformers==4.52.4
+deepspeed==0.17.1
+LoRA Config
+lora_config = LoraConfig(
+    r=training_config["rank"],
+    lora_alpha=training_config["alpha"],
+    target_modules=[
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "gate_proj", 
+        "up_proj",
+        "down_proj",
+    ],
+    lora_dropout=training_config["dropout"],
+    bias="none",
+    task_type="CAUSAL_LM",
+    modules_to_save=["embed_tokens", "lm_head"],
+)
+Error
+AttributeError: 'Linear' object has no attribute 'ds_grads_remaining'
+The full error log is as follows:
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/workspace/LLMTrainFlow/./src/train/rl_gemma3.py", line 180, in <module>
+[rank0]:     trainer.train()
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2240, in train
+[rank0]:     return inner_training_loop(
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2555, in _inner_training_loop
+[rank0]:     tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+[rank0]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3745, in training_step
+[rank0]:     loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/extras/profiling.py", line 96, in wrapper
+[rank0]:     return func(self, *args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/trainer/grpo_trainer.py", line 1330, in compute_loss
+[rank0]:     return self._compute_loss(model, inputs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/trainer/grpo_trainer.py", line 1340, in _compute_loss
+[rank0]:     per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep)
+[rank0]:                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/extras/profiling.py", line 96, in wrapper
+[rank0]:     return func(self, *args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/trainer/grpo_trainer.py", line 852, in _get_per_token_logps
+[rank0]:     logits = model(
+[rank0]:              ^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+[rank0]:     ret_val = func(*args, **kwargs)
+[rank0]:               ^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/engine.py", line 2087, in forward
+[rank0]:     loss = self.module(*inputs, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:            ^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1793, in inner
+[rank0]:     result = forward_call(*args, **kwargs)
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/peft/peft_model.py", line 1757, in forward
+[rank0]:     return self.base_model(
+[rank0]:            ^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:            ^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1793, in inner
+[rank0]:     result = forward_call(*args, **kwargs)
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/peft/tuners/tuners_utils.py", line 193, in forward
+[rank0]:     return self.model.forward(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
+[rank0]:     output = func(self, *args, **kwargs)
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/models/gemma3/modeling_gemma3.py", line 880, in forward
+[rank0]:     logits = self.lm_head(hidden_states[:, slice_indices, :])
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:            ^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1782, in inner
+[rank0]:     args_result = hook(self, args)
+[rank0]:                   ^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py", line 745, in _fn
+[rank0]:     return fn(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/zero/parameter_offload.py", line 378, in _post_backward_module_hook
+[rank0]:     return apply_to_tensors_only(module.post_bwd_fn.apply,
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/zero/utils.py", line 133, in apply_to_tensors_only
+[rank0]:     touched_output = apply_to_tensors_only(function, elem)
+[rank0]:                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/zero/utils.py", line 149, in apply_to_tensors_only
+[rank0]:     touched_output = function(value)
+[rank0]:                      ^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/autograd/function.py", line 575, in apply
+[rank0]:     return super().apply(*args, **kwargs)  # type: ignore[misc]
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/zero/parameter_offload.py", line 446, in forward
+[rank0]:     module.ds_grads_remaining += 1
+[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1928, in __getattr__
+[rank0]:     raise AttributeError(
+[rank0]: AttributeError: 'Linear' object has no attribute 'ds_grads_remaining'
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/workspace/LLMTrainFlow/./src/train/rl_gemma3.py", line 180, in <module>
+[rank0]:     trainer.train()
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2240, in train
+[rank0]:     return inner_training_loop(
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2555, in _inner_training_loop
+[rank0]:     tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+[rank0]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3745, in training_step
+[rank0]:     loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/extras/profiling.py", line 96, in wrapper
+[rank0]:     return func(self, *args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/trainer/grpo_trainer.py", line 1330, in compute_loss
+[rank0]:     return self._compute_loss(model, inputs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/trainer/grpo_trainer.py", line 1340, in _compute_loss
+[rank0]:     per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep)
+[rank0]:                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/extras/profiling.py", line 96, in wrapper
+[rank0]:     return func(self, *args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/trl/trainer/grpo_trainer.py", line 852, in _get_per_token_logps
+[rank0]:     logits = model(
+[rank0]:              ^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
+[rank0]:     return forward_call(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
+[rank0]:     ret_val = func(*args, **kwargs)
+[rank0]:               ^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/engine.py", line 2087, in forward
+[rank0]:     loss = self.module(*inputs, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:            ^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1793, in inner
+[rank0]:     result = forward_call(*args, **kwargs)
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/peft/peft_model.py", line 1757, in forward
+[rank0]:     return self.base_model(
+[rank0]:            ^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:            ^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1793, in inner
+[rank0]:     result = forward_call(*args, **kwargs)
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/peft/tuners/tuners_utils.py", line 193, in forward
+[rank0]:     return self.model.forward(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
+[rank0]:     output = func(self, *args, **kwargs)
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/transformers/models/gemma3/modeling_gemma3.py", line 880, in forward
+[rank0]:     logits = self.lm_head(hidden_states[:, slice_indices, :])
+[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
+[rank0]:     return self._call_impl(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1845, in _call_impl
+[rank0]:     return inner()
+[rank0]:            ^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1782, in inner
+[rank0]:     args_result = hook(self, args)
+[rank0]:                   ^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py", line 745, in _fn
+[rank0]:     return fn(*args, **kwargs)
+[rank0]:            ^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/zero/parameter_offload.py", line 378, in _post_backward_module_hook
+[rank0]:     return apply_to_tensors_only(module.post_bwd_fn.apply,
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/zero/utils.py", line 133, in apply_to_tensors_only
+[rank0]:     touched_output = apply_to_tensors_only(function, elem)
+[rank0]:                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/zero/utils.py", line 149, in apply_to_tensors_only
+[rank0]:     touched_output = function(value)
+[rank0]:                      ^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/autograd/function.py", line 575, in apply
+[rank0]:     return super().apply(*args, **kwargs)  # type: ignore[misc]
+[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/deepspeed/runtime/zero/parameter_offload.py", line 446, in forward
+[rank0]:     module.ds_grads_remaining += 1
+[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1928, in __getattr__
+[rank0]:     raise AttributeError(
+[rank0]: AttributeError: 'Linear' object has no attribute 'ds_grads_remaining'
+See translation
+Reply
+JVP15
+Jul 2, 2025
+I noticed that having vLLM sleep didn't make it in to TRL, what's the reason for that?
+See translation
+2 replies
+·
+toslali-ibm
+Article author
+Jul 3, 2025
+The reason is noted in the "segmentation fault" discussion under
+https://huggingface.co/blog/vllm-colocate#challenges
+. Basically, we are waiting for a fix of the bug (
+https://github.com/vllm-project/vllm/issues/16993
+) before integrating sleep() fully into TRL upstream.
+See translation
+👍
+1
+1
++
+Expand 1
+						reply
+JVP15
+Jul 3, 2025
+How did you distribute the weights in your Qwen 72B experiment? Did you just have it running w/ TP=8 on a single node, or did each node have its own copy of Qwen 72B?
+1 reply
+·
+toslali-ibm
+Article author
+Jul 21, 2025
+Yes, we set TP=8, meaning each node has a copy of the shard of the 72B model.
+See translation
+JonasNasimzada
+Jul 11, 2025
+Great article!
+I am using VLLM Co-located in a Slurm cluster for GRPO and I get a TCP exception:
+TCP client failed to connect/validate to host 10.0.1.163:35345
+though I thought it runs inline with the training loop. Is this normal? :D
+See translation
+3 replies
+·
+mirinflim
+Article author
+Jul 11, 2025
+this is not normal. make sure you set
+vllm_mode="colocate"
+See translation
+Expand 2
+						replies
+dfalck
+Aug 8, 2025
+This is great;  should the
+docs
+be updated?
+See translation
+Reply
+ewhacc
+Aug 25, 2025
+•
+edited Aug 25, 2025
+It's working nicely.  But, I got an error when I tried qLoRA with
+load_in_4bit: true
+.
+[rank0]:   File "/home/xxx/.pyenv/versions/3.12.10/envs/trlgrpo/lib/python3.12/site-packages/peft/tuners/lora/bnb.py", line 373, in merge
+[rank0]:     self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device)
+[rank0]:                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]: TypeError: Params4bit.__new__() got an unexpected keyword argument 'ds_param_type'
+Does it support qLoRA, or not yet?   I found the following in the TRL/GRPO document, so I thought qLoRA is supported. Thanks a lot.
+Use LoRA on vision-language projection layers
+Enable 4-bit quantization to reduce memory usage
+See translation
+👀
+1
+1
++
+Reply
+AaronHuangWei
+Aug 31, 2025
+could you please release the training scripts and deepspeed configuration of 70B model?
+See translation
+3 replies
+·
+toslali-ibm
+Article author
+Sep 2, 2025
+•
+edited Sep 2, 2025
+Here are all the config used. But note that this was a while ago, so parameter names may have changed.
+deepspeed.yaml
+compute_environment: LOCAL_MACHINE
+deepspeed_config:
+ deepspeed_multinode_launcher: standard
+ deepspeed_config_file: ds_config.json
+ zero3_init_flag: true
+distributed_type: DEEPSPEED
+fsdp_config: {}
+machine_rank: 0
+main_process_ip: null
+main_process_port: null
+main_training_function: main
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+ds_config.json
+{
+    "bf16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "gradient_accumulation_steps": "auto",
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": 1e6,
+        "stage3_prefetch_bucket_size": 0.94e6,
+        "stage3_param_persistence_threshold": 1e4,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_fp16_weights_on_model_save": true
+    },
+    "train_batch_size": "auto",
+    "steps_per_print": 2000,
+    "wall_clock_breakdown": false
+}
+experiment_config.yaml
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-Math-72B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_name: DigitalLearningGmbH/MATH-lighteval
+dataset_config: default
+dataset_prompt_column: problem
+system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
+
+# GRPO trainer config
+bf16: true
+use_vllm: true
+vllm_mode: colocate
+vllm_tensor_parallel_size: 8
+vllm_gpu_memory_utilization: 0.5
+vllm_enable_prefix_caching: false
+vllm_max_model_len: 4096
+
+do_eval: false
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+learning_rate: 3.0e-06
+log_completions: false
+log_level: info
+logging_first_step: true
+logging_steps: 1
+logging_strategy: steps
+
+lr_scheduler_type: cosine
+
+max_prompt_length: 512
+max_completion_length: 3584
+max_steps: -1
+num_generations: 4
+num_train_epochs: 1
+overwrite_output_dir: true
+per_device_train_batch_size: 4 
+push_to_hub: false
+reward_funcs:
+- accuracy
+- format
+reward_weights:
+- 1.0
+- 1.0
+
+eval_strategy: "no"
+save_strategy: "steps"
+save_steps: 30
+save_total_limit: 3
+
+report_to: 
+- wandb
+
+seed: 42
+temperature: 0.7
+warmup_ratio: 0.1
+See translation
+Expand 2
+						replies
+cindy2000sh
+Nov 13, 2025
+Thanks for sharing the post! I wonder if you can provide the transformers, trl, vllm, deepspeed, accelerate versions to launch the code?
+See translation
+1 reply
+·
+toslali-ibm
+Article author
+Nov 13, 2025
+•
+edited Nov 13, 2025
+Thanks for sharing the post! I wonder if you can provide the transformers, trl, vllm, deepspeed, accelerate versions to launch the code?
+Here they are:
+accelerate==1.4.0
+deepspeed==0.16.8
+vllm==0.8.5
+trl (https://github.com/huggingface/trl/pull/3394)
+transformers==4.53
+See translation
+🚀
+1
+1
++
+Edit
+Preview
+Upload images, audio, and videos by dragging in the text input, pasting, or
+clicking here
+.
+Tap or paste here to upload images
+Comment
+·
+Sign up
+or
+log in
+to comment
+Upvote
+101
++89
+Papers mentioned in this article
+1
\ No newline at end of file
diff --git a/research/notes/r2e-gym-scaling-open-weights-software-engineering-agents-with-procedural-synthet.md b/research/notes/r2e-gym-scaling-open-weights-software-engineering-agents-with-procedural-synthet.md
new file mode 100644
index 0000000000000000000000000000000000000000..cbe4d40adc5e010c7ccc7e84f36cc4168338d5c4
--- /dev/null
+++ b/research/notes/r2e-gym-scaling-open-weights-software-engineering-agents-with-procedural-synthet.md
@@ -0,0 +1,6485 @@
+---
+title: 'R2E-Gym: Scaling Open-Weights Software Engineering Agents with Procedural
+  Synthetic Data and Agentic Tests'
+id: r2e-gym-scaling-open-weights-software-engineering-agents-with-procedural-synthet
+tags:
+- deepread
+created: '2026-06-10T00:23:13.436814Z'
+source: https://arxiv.org/html/2504.07164
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:13.436574Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+R2E-Gym: Scaling Open-Weights Software Engineering Agents with Procedural Synthetic Data and Agentic Tests
+R2E-Gym: Scaling Open-Weights Software Engineering Agents with Procedural Synthetic Data and Agentic Tests
+Naman Jain
+1
+⋆
+Jaskirat Singh
+2
+⋆
+Manish Shetty
+1
+Liang Zheng
+2
+Koushik Sen
+1
+Ion Stoica
+1
+1
+UC Berkeley
+2
+Australian National University
+{
+naman_jain@berkeley.edu
+jaskirat.singh@anu.edu.au
+}
+R2E-Gym: Synthetic Data and Tests are All You Need!
+Naman Jain
+1
+⋆
+Jaskirat Singh
+2
+⋆
+Manish Shetty
+1
+Liang Zheng
+2
+Koushik Sen
+1
+Ion Stoica
+1
+1
+UC Berkeley
+2
+Australian National University
+{
+naman_jain@berkeley.edu
+jaskirat.singh@anu.edu.au
+}
+R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents
+Naman Jain
+1
+⋆
+Jaskirat Singh
+2
+⋆
+Manish Shetty
+1
+Liang Zheng
+2
+Koushik Sen
+1
+Ion Stoica
+1
+1
+UC Berkeley
+2
+Australian National University
+{
+naman_jain@berkeley.edu
+jaskirat.singh@anu.edu.au
+}
+Abstract
+Improving open-source models on real-world
+Swe
+tasks (solving
+GitHub
+issues) faces two key challenges:
+1) scalable curation of execution environments to train these models, and
+2) optimal scaling of test-time compute.
+We introduce R2E-Gym, the largest procedurally-curated executable gym environment for training real-world
+Swe
+-agents, consisting of more than
+$8.1$K tasks.
+R2E-Gym is powered by two main contributions: 1)
+SweGen
+: a synthetic data curation recipe that enables scalable curation of executable environments using test-generation and back-translation directly from commits, thereby reducing reliance on human-written issues or unit tests.
+We show that this enables more scalable training leading to
+Pass@
+1 of 34.4% on
+SWEBench-Verified
+benchmark with our 32B model.
+2) Hybrid Test-time Scaling: we next provide an in-depth analysis of two test-time scaling axes;
+execution-based and execution-free verifiers, demonstrating that they exhibit complementary strengths and limitations.
+Test-based verifiers suffer from low distinguishability, while execution-free verifiers are biased and often rely on stylistic features.
+Surprisingly, we find that while each approach individually saturates around 42-43%, significantly higher gains can be obtained by leveraging their complementary strengths.
+Overall, our approach achieves
+51
+% on the
+SWEBench-Verified
+benchmark, reflecting a new state-of-the-art for open-weight
+Swe
+agents and for the first time being competitive with proprietary systems such as
+o1
+or
+sonnet w/ tools
+.
+†
+†
+footnotetext:
+⋆
+Equal Contribution.
+Project Page:
+https://r2e-gym.github.io
+1
+Introduction
+Autonomous software engineering (
+Swe
+), aiming to solve real-world software engineering problems such as
+GitHub
+issues, has made significant progress in recent times
+(Wang et al.,
+2024
+; Yang et al.,
+2024b
+)
+.
+While
+LLM
+-based
+Swe
+-Agents have demonstrated remarkable improvements, state-of-the-art performance is largely driven by proprietary models
+(Anthropic,
+2025
+; Jaech et al.,
+2024
+)
+—
+with open-models lagging behind
+(Xie et al.,
+2025
+)
+.
+(a)
+Synthetic Data
+(b)
+Hybrid Test-time Scaling
+(c)
+Open-weights SOTA Performance
+Figure 1:
+Overview.
+In this paper, we introduce R2E-Gym, the largest gym environment and training framework for training open-weight
+Swe
+agents. R2E-Gym is powered by two main contributions: (a)
+SweGen
+: a synthetic data curation recipe for curating executable training environments w/o relying on human tests and issues (§
+2
+). (b) Hybrid Inference Time Scaling: showing that while both execution-based and execution-free verifiers elicit inference-time gains; significantly better performance can be achieved by leveraging the strengths of both (§
+4
+). (c) Overall, the final approach reflects SOTA performance for open-weight
+Swe
+-agents, while also being competitive with some proprietary model baselines
+(footnote 2: Results with all open-weight models are reported with test-time scaling.)
+.
+Addressing this gap requires solving two fundamental challenges:
+First, scalable curation of high-quality execution environments to train these models;
+and second, developing efficient aggregation strategies to maximize test-time performance.
+While several benchmarks for evaluating
+Swe
+-agents on
+GitHub
+issues exist
+(Jimenez et al.,
+2023
+; Zhao et al.,
+2024
+)
+, scalable curation of high-quality training environments remains a challenging problem.
+For instance, while the training split from SWE-Bench
+(Jimenez et al.,
+2023
+)
+contains output patches, it lacks executable environments.
+Pan et al. (
+2024
+)
+collect executable test environments, but rely on human-written issues and test cases restricting sample-size.
+In this paper, we introduce
+R2E-Gym
+, the largest procedurally curated environment for training real-world
+Swe
+-agents — consisting of more than
+$8.1$K problems, with executable gym environments, unit tests, and natural-language task descriptions (§
+2
+).
+R2E-Gym
+addresses both key challenges through two primary contributions (Figures
+1(a)
+and
+1(b)
+):
+Synthetic Data Enables More Scalable Training.
+We propose
+SweGen
+— a novel synthetic data curation recipe that enables collection of a large number of executable training environments without reliance on human-written pull requests (PRs) or unit tests.
+We show that instead of using human-written PRs, good-quality execution environments can directly be curated from
+commits
+through backtranslation
+(Li et al.,
+2023
+; Wei et al.,
+2023
+)
+and test collection or generation (§
+2
+).
+Compared to PR-based data collection
+(Pan et al.,
+2024
+)
+, this approach enables more scalable data curation (Figure
+1(a)
+) and agent-training, resulting in a
+Pass@
+1 performance of 34.4% on the challenging
+SWEBench-Verified
+benchmark.
+Hybrid Inference Time Scaling.
+We next leverage
+R2E-Gym
+to investigate two complementary axes for scaling test-time compute (§
+4
+):
+1) Execution-based verifiers that evaluate patches through test cases
+(Xia et al.,
+2024b
+)
+, and
+2) Execution-free verifiers that assess trajectories through learned models
+(Pan et al.,
+2024
+)
+.
+While prior works have studied these approaches in isolation, they lack a comprehensive analysis of their relative strengths and weaknesses.
+We first present a unique and in-depth analysis of their working mechanisms, demonstrating that execution-free and execution-based methods actually exhibit complementary strengths and weaknesses.
+We find two key insights (studied in §
+4.2
+):
+a) Execution-based methods provide direct signals for patch correctness but struggle with discriminating between solutions , and
+b) Execution-free verifiers provide better discrimination but can be biased by other heuristics (
+e.g
+., agent thoughts) over the final patch.
+Based on the above insights, we propose a hybrid scaling approach leveraging the strengths of both methods.
+Surprisingly, while the performance of both execution-based and execution-free methods plateaus around 42-43%, the hybrid approach yields significantly higher gains, achieving a final performance of 51% on
+SWEBench-Verified
+(Figure
+1(b)
+and §
+4.3
+).
+The key contributions of this paper are:
+1) We introduce
+R2E-Gym
+, the largest procedurally curated environment for training real-world
+Swe
+-agents, increasing the number of executable environments by over
+$3$ times.
+2) We provide an in-depth analysis demonstrating that execution-based and execution-free axes for scaling test-time compute exhibit complementary strengths and weaknesses.
+3) Based on the above insights, we propose a
+hybrid scaling
+approach that leverages the strengths of both methods, significantly improving test-time performance.
+4) Finally, we release an open-weights 32B model that achieves 51% on
+SWEBench-Verified
+, reflecting a new state-of-the-art for open-weight
+Swe
+-agents, while also for the first time demonstrating competitive or better performance compared to commercial models (Fig.
+1(c)
+), e.g., o1
+(Jaech et al.,
+2024
+)
+and sonnet-3.5-v2
+(Anthropic,
+2024
+)
+.
+2
+R2E-Gym
+: Procedural Synthetic Data Generation
+Dataset (split)
+Repo?
+Executable?
+# Instances
+APPS
+(Hendrycks et al.,
+2021
+)
+✗
+✓
+10,000
+R2E
+(Jain et al.,
+2024b
+)
+✓
+✓
+246
+SWE-Bench(train)
+(Jimenez et al.,
+2023
+)
+✓
+✗
+19,008
+SWE-Gym Raw
+(Pan et al.,
+2024
+)
+✓
+✗
+66,894
+SWE-Bench (test)
+(Jimenez et al.,
+2023
+)
+✓
+✓
+2,294
+SWE-Gym
+(Pan et al.,
+2024
+)
+✓
+✓
+2,438
+R2E-Gym-Subset
+(Ours)
+✓
+✓
+4,578
+R2E-Gym
+(Ours)
+✓
+✓
+8,135
+Table 1:
+Dataset Statistics.
+Comparing statistics across different datasets curating executable training environments for
+Swe
+-agent training.
+R2E-Gym refers to our full dataset, and R2E-Gym-Subset refers to a filtered subset of tasks, with non-overlapping repositories with SWE-Bench.
+Table 2:
+Repo distribution
+for R2E-Gym subset (no overlap with SWE-Bench) used for training (refer §
+3
+).
+Overview.
+Swe
+task collection methods
+(Jimenez et al.,
+2023
+)
+rely on human-written issues and unit tests for problem statements and evaluation functions.
+However, this presents a challenge for scaling data curation as size is limited by human-written PRs.
+To overcome this limitation, we propose
+SweGen
+— a synthetic data curation recipe using backtranslation and test generation.
+We procedurally generate environments using only commits from
+GitHub
+repositories, reducing reliance on both human-written issues and test cases.
+Repository and Commit Curation.
+We use SEART
+GitHub
+search
+3
+3
+3
+https://seart-ghs.si.usi.ch/
+to identify
+Python
+repositories with a large number of commits.
+Next, we extract commit history and associated code changes for each repository.
+We filter relevant commits using a combination of rule-based and
+LLM
+-based heuristics, identifying
+interesting
+code changes.
+For each relevant commit, we next collect build scripts by semi-manually searching across dependency pins.
+We expand our set of heuristics and installation procedure further in the Appendix
+A
+.
+Test-Validation and Generation for Environment Collection.
+Following
+Jimenez et al. (
+2023
+)
+, we use the existing test cases in the curated commits to identify Fail $\rightarrow$ Pass (F2P) test cases, i.e. test cases that fail in the original buggy commit and pass in the fixed commit.
+In cases where the curated commits do not have associated tests, limiting the ability to use them for training environments, we supplement such commits with automatically generated Fail $\rightarrow$ Pass test-cases.
+Appendix
+A
+expands our test generation approach.
+Backtranslation: Non-reliance on
+GitHub
+Issues.
+Using the above steps, we collect a large number of commits, associated build environments and F2P (Fail $\rightarrow$ Pass) test cases.
+Now, we need to collect the problem statements associated with the commits.
+Prior works
+(Jimenez et al.,
+2023
+; Pan et al.,
+2024
+)
+use human-written
+GitHub
+issues as problem statements.
+This inevitably cannot use the entire commit history since human-written issues are not available for all commits.
+Here, following
+Li et al. (
+2023
+); Wei et al. (
+2023
+)
+we propose a backtranslation approach to collect the problem statements associated with the commits.
+However, naively back-translating code changes is quite noisy as models often generate generic problem statements that do not capture the essence of the code changes.
+Instead, we identify that human-written issues often contain failing tests and execution traces as part of bug reports.
+We use this observation to collect high-quality problem statements by using the F2P test-cases as part of the backtranslation prompt.
+Similar to existing works
+(Jain et al.,
+2024b
+; Zhuo et al.,
+2024
+)
+, we find that using test execution information allows generating precise and directed problem statements.
+Please find prompts and examples in Appendix.
+We collect over
+$8.1$K problem statements using this approach (referred to as R2E-Gym).
+We decontaminate this set by removing repositories overlapping with SWE-Bench test-set repositories, obtaining 4578 problems (referred to as R2E-Gym-Subset) and use that across all experiments unless specified otherwise.
+Table
+2
+shows the statistics of different datasets, and Figure
+2
+and Figure
+8
+show the distribution of the repositories in R2E-Gym-Subset and R2E-Gym respectively.
+Notably, using our
+SweGen
+approach, we can collect over
+$2.5$ times more problems than data collection relying on
+GitHub
+issues (Figure
+1(a)
+).
+3
+Training SWE-Agents using
+R2E-Gym
+Environments
+Table 3:
+Resolve Rate (%) Comparison on
+SWEBench-Verified
+and
+SWEBench-Lite
+benchmarks. We observe that synthetic data curation (
+SweGen
+): allows our approach to scale better across different model sizes. All experiments use the
+Qwen-2.5-Coder
+as base-models.
+| Model Size | SWEBench-Lite: Base-model | SWEBench-Lite: SWE-Gym | SWEBench-Lite: Ours | SWEBench-Lite: $\Delta$ | SWEBench-Verified: Base-model | SWEBench-Verified: SWE-Gym | SWEBench-Verified: Ours | SWEBench-Verified: $\Delta$ |
+|---|---|---|---|---|---|---|---|---|
+| 7B | 1.0 ($\pm 1.0$) | 10.0 ($\pm 2.4$) | 11.0 ($\pm 0.8$) | +1.0 | 1.8 ($\pm 1.3$) | 10.6 ($\pm 2.1$) | 19.0 ($\pm 1.0$) | +8.4 |
+| 14B | 2.7 ($\pm 1.9$) | 12.7 ($\pm 2.3$) | 20.67 ($\pm 0.7$) | +7.97 | 4.0 ($\pm 1.6$) | 16.4 ($\pm 2.0$) | 26.8 ($\pm 1.4$) | +10.4 |
+| 32B | 3.0 ($\pm 1.4$) | 15.3 ($\pm 2.5$) | 23.77 ($\pm 0.8$) | +8.47 | 7.0 ($\pm 1.3$) | 20.6 ($\pm 2.1$) | 34.4 ($\pm 1.2$) | +13.8 |
+Agent Scaffolding.
+We design a minimal scaffold on top of
+OpenHands
+(Wang et al.,
+2024
+)
+to experiment with agents for diverse
+Swe
+tasks.
+It uses a traditional
+ReAct
+framework
+(Yao et al.,
+2022
+)
+without any specialized workflow; equipping the
+LLM
+with only a bash terminal, file editor, and search tool.
+Figure
+15
+depicts an example code editing trajectory.
+Trajectory Collection and SFT Training
+.
+We next collect SFT trajectories from R2E-Gym environments. To avoid contamination, we only use a subset of R2E-Gym consisting of repos with no overlap with the SWE-Bench dataset. The resulting subset (R2E-Gym-Subset) consists of 4578 executable environments across 10 repositories (Figure
+2
+).
+For each task environment, we use
+Sonnet-3.5-v2
+with our agent scaffold and collect the successful agent trajectories.
+Through this process, we collect
+$3321$ trajectories from $2048$ unique task environments.
+We then use these trajectories to train our agent via supervised fine-tuning on agent thoughts and actions.
+For training, we use LLaMA-Factory
+(Zheng et al.,
+2024
+)
+and
+Qwen-2.5-Coder
+models (7B, 14B, 32B) as our base models.
+For detailed experiment configuration and hyperparameters, please refer to Appendix
+B
+.
+3.1
+Results and Analysis
+Comparison to open-weight SWE-Agents across Model Scales
+.
+We report
+Pass@
+1 of R2E-Gym trained models on the
+SWEBench-Verified
+and
+SWEBench-Lite
+benchmarks in Table
+3
+. We also report comparisons with recently proposed SWE-Gym
+(Pan et al.,
+2024
+)
+, which is closest to our work. As seen in Table
+3
+, we find that our approach enables better scaling for training
+Swe
+-agents across all model sizes. For instance, on
+SWEBench-Verified
+, for the same base-model type and scale, our 32B model significantly improves the
+Pass@
+1 performance by
+$\sim 14\%$; pushing the final performance from 20.6 (SWE-Gym) to 34.4%.
+Scaling with Number of Trajectories
+.
+We investigate the relationship between training sample size (number of trajectories) and agent performance in Figure
+3
+.
+We evaluate
+$14$B and $32$B models trained with trajectory counts ranging from $100$ to $3{,}200$.
+Our findings indicate that performance improves with increasing trajectory count, though with diminishing returns for both models.
+Notably, the
+$14$B model begins to saturate at approximately $800$ samples, while the $32$B model still shows improvements, likely due to its larger capacity.
+These results extend the findings of
+Pan et al. (
+2024
+)
+, who studied dataset scaling up to
+$\sim 500$ samples.
+Our analysis demonstrates that while performance does improve with increasing sample size, the rate of improvement diminishes or even plateaus for smaller models.
+Figure 2:
+Pass@
+1 scaling curve with increasing number of training samples.
+Performance improvement with more training samples, enabled by
+SweGen
+approach.
+Ablation
+Config
+Pass@
+1 (%)
+Adding Thoughts
+With
+34.4
+Without
+30.4
+Real vs. Synthetic
+Real
+28.0
+Synthetic
+27.8
+Figure 3:
+Top.
+Using thoughts in
+ReAct
+agent trajectories leads to significant performance improvements.
+Bottom.
+Using
+SweGen
+synthetic generated issues and test cases achieves similar performance as real-world issues (400 trajectories for both real & synthetic in above) while providing better scalability during data collection.
+Real vs Synthetic Problem Statements.
+The R2E-Gym approach enables us to generate problem statements without relying on human-written descriptions and test cases, offering greater scalability.
+We compare the performance of models trained on real GitHub issues versus our synthetic problem statements (collecting
+400
+400
+400
+400
+trajectories from both sets).
+Remarkably, models trained on synthetic data achieve nearly identical performance (27.8%
+Pass@
+1) to those trained on real data (28.0%).
+This finding validates the efficacy of our synthetic data generation methodology, demonstrating that procedurally generated environments can match the training value of real-world examples while providing scalability.
+Explicit Thought Traces are Important.
+During SFT we use both the agent’s thought processes and actions as training targets.
+Models trained with thought demonstrations achieve significantly better performance compared to those trained without (34.4% vs 30.4% in Table
+3
+).
+This suggests that exposing the model to step-by-step reasoning processes is necessary for reliable problem-solving in complex environments.
+4
+Efficient Inference Time Scaling With Hybrid Verifiers
+We utilize R2E-Gym (§
+2
+) for inference-time scaling experiments with coding agents.
+In §
+4.1
+, we explore different axes for scaling test-time compute, focusing on two distinct approaches: 1) Execution-based Verifiers and 2) Execution-free Verifiers.
+We analyze the relative strengths and weaknesses of each approach, demonstrating their complementary nature (§
+4.2
+).
+Based on this insight, we propose a hybrid approach that leverages the strengths of both, significantly improving test-time performance (§
+4.3
+).
+Finally, we provide detailed ablations and analysis, examining critical design choices for our approach (§
+4.4
+).
+4.1
+Exploring Different Axes for Training Verifiers
+Given an input task description $\mathcal{D}$, a set of agent trajectories $\{\mathcal{T}_{i}\}_{i=1}^{K}$ and candidate patch outputs $\{\mathcal{P}_{i}\}_{i=1}^{K}$, our objective is to build a verifier that assigns scores $\mathbf{S}=\{s_{i}\}_{i=1}^{K}$ to rank the outputs.
+To this end, we investigate two types of verifiers:
+Execution-Based Verifiers.
+We train a specialized
+testing-agent
+that generates reproduction test cases to determine whether a candidate patch resolves the issue (i.e., whether the patch passes the generated test suite).
+Additionally, following
+Xia et al. (
+2024b
+)
+, we leverage existing regression tests to filter out patches that fail to maintain backward compatibility.
+Our execution-based (EB) verifier thus comprises two components:
+1) a
+testing-agent
+that generates targeted tests to evaluate bug fixes, and
+2) a regression test filter that eliminates patches that compromise existing functionality.
+Specifically, we train the testing-agent (using Qwen-Coder-32B as base-model) to generate a comprehensive test script containing $M=10$ diverse tests that cover various inputs, corner cases, etc.
+See Appendix
+D
+for example generated tests.
+The execution-based score $s^{EB}_{k}$ for each patch $\mathcal{P}_{k}$ is then computed as,
+$$s^{EB}_{k}=\begin{cases}\mathrm{TestScore}_{k},&\text{if }RS_{k}=\max\limits_{j\in[1,K]}RS_{j},\\ 0,&\text{otherwise},\end{cases}\quad\text{where}\quad\mathrm{TestScore}_{k}=\sum_{i}\mathrm{Pass}(\mathcal{P}_{k},\mathrm{Test}_{i})\tag{1}$$
+where $RS_{k}$ refers to the regression test score for the $k^{th}$ patch and helps select the patches with the highest regression test scores (Xia et al., 2024b).
+$\mathrm{TestScore}_{k}$ is simply the sum of the number of passing tests for each patch $\mathcal{P}_{k}$.
+Please refer to Appendix §
+C
+for further details.
+Notably, unlike zero-shot test generation with Agentless
+(Xia et al.,
+2024b
+)
+, our testing agent interacts with the environment to examine existing test cases and generates new tests informed by these examples with execution feedback.
+We demonstrate that this environment-aware approach provides additional benefits over zero-shot methods in §
+4.4
+.
+Figure 4:
+Left.
+Best@
+K with increasing number of editing-agent rollouts.
+Inference-time scaling improves final performance for both execution-based and execution-free verifiers.
+Hybrid Verifier combining execution-based and execution-free verifiers provides significantly superior scaling.
+Right.
+Best@
+K with increasing number of testing-agent rollouts.
+Increasing test-agent rollouts also improves final performance and can provide more compute efficient scaling than naively increasing only editing-agent rollouts.
+Table 4:
+Performance of various models/methods on SWE-Bench Verified.
+Method
+Model
+Type
+Verified
+Proprietary Models
+Agentless-1.5
+(Xia et al.,
+2024b
+)
+GPT-4o
+Pipeline
+34.0
+Agentless
+(Xia et al.,
+2024b
+)
+O1
+Pipeline
+48.0
+Claude + Tools
+Claude-3.6-Sonnet
+Agent
+49.0
+Agentless-1.5
+(Xia et al.,
+2024b
+)
+Claude-3.6-Sonnet
+Pipeline
+50.8
+OpenHands
+(Wang et al.,
+2024
+)
+Claude-3.6-Sonnet
+Agent
+53.0
+Claude + Tools
+Claude-3.7-Sonnet
+Agent
+62.3
+Claude + Tools (Best@Any)
+Claude-3.7-Sonnet
+Agent
+70.3
+Open-source Models
+SWE-SynInfer
+(Ma et al.,
+2024
+)
+Lingma-SWE-GPT-72B
+Agent
+30.2
+SWE-Fixer
+(Xie et al.,
+2025
+)
+SWE-Fixer-72B
+Pipeline
+30.2
+SWE-Gym (
+Best@
+16 w/ Verifier)
+(Pan et al.,
+2024
+)
+SWE-Gym-32B
+Agent
+32.0
+SWE-RL (
+Best@
+500 w/ Tests)
+(Wei et al.,
+2025
+)
+SWE-RL-70B
+Pipeline
+41.0
+Agentless
+(Xia et al.,
+2024b
+)
+DeepSeek-R1
+Pipeline
+49.2
+R2E-Gym (Ours)
+(
+Pass@
+1)
+R2E-Gym-32B
+Agent
+34.4
+R2E-Gym (Ours)
+(
+Best@
+16 w / Hybrid)
+R2E-Gym-32B
+Agent
+49.4
+R2E-Gym (Ours) (
+Best@
+26 w / Hybrid
+)
+R2E-Gym-32B
+Agent
+51.0
+Execution-free Verifiers.
+We next train execution-free (EF) verifiers for selecting the best trajectory from a set of sampled trajectories from the code-editing agent (§
+3
+).
+In particular, following
+(Pan et al.,
+2024
+)
+, given task description $\mathcal{D}$, agent-trajectory $\mathcal{T}$ (sequence of thought, action, and observations) and output patch $\mathcal{P}$, we finetune a
+Qwen2.5-Coder-14B
+model to predict
+YES
+and
+NO
+tokens to determine correctness of a trajectory using SFT on correct and incorrect trajectories.
+The execution-free score is then computed by normalizing the relative probability of the YES token as $s^{EF}=\mathrm{P}(\texttt{YES})/(\mathrm{P}(\texttt{YES})+\mathrm{P}(\texttt{NO}))$, where P(YES) and P(NO) are estimated through log-probabilities of corresponding token predictions.
+4.2
+Comparative Analysis of Execution-Based and Execution-Free Verifiers
+Experimental Methodology.
+We evaluate verifier performance using the
+Best@
+K metric, which quantifies each verifier’s ability to identify correct patches from multiple candidates.
+Specifically, given $K$ trajectories, the Best@K metric represents the percentage of problems where the verifier successfully selects the correct patch using its scoring mechanism.
+For our experiments, we sample 1 trajectory at temperature $T=0$ and 25 trajectories at temperatures $T=0.8$ and $T=0.9$ from the R2E-Gym-32B model on SWEBench-Verified problems.
+These trajectories achieve Pass@26 = 64.4% (Figure 13).
+Next, we sample 7 tests using our testing agent at temperature $T=0.8$.
+When generating tests, the test agent is provided a fixed in-context example (from Django) showing sample starter code and format for writing test cases. We empirically find that the use of an in-context example is useful for improving output formatting and compensating for lacking domain knowledge in the base LM; improving test generation for $\sim$2% of problems. Please see Listing 10 for further details and the in-context starter code.
+Both verifiers elicit inference time gains
+.
+Figure
+4
+illustrates the
+Best@
+K performance of both verifier types on the
+SWEBench-Verified
+benchmark as a function of number of editing agent rollouts.
+Both execution-based and execution-free verifiers demonstrate substantial performance improvements with increased number of rollouts.
+However, the Best@K rate quickly plateaus for both methods, converging similarly to 43.7% and 42.8% respectively.
+Limited Distinguishability in Execution-Based Verifiers
+.
+Recall that these verifiers output scores based on test pass counts and thus cannot differentiate between patches with identical test pass-rates, limiting their discriminative capacity.
+We study this discriminative capability from tests generated by our 32B testing agent, a prompted Sonnet-3.5-v2 model, and Agentless-1.5 reproduction tests (Xia et al., 2024b) on a subset of SWEBench-Verified problems.
+[Footnote 4: We utilize test cases from the official artifacts repository (Xia et al., 2024a).]
+Figure
+5
+(left) presents the problem density distribution for distinguishability rate, i.e., the proportion of tests that successfully differentiate between top-ranked correct and incorrect patches.
+The results demonstrate that for the majority of problems, less than 20% of tests provide discriminative signal, constraining the re-ranking.
+Figure 6 additionally depicts that most generated tests either do not reproduce the bug (high Pass $\rightarrow$ Pass values in Figure 6-left) or do not pass ground truth patches (high Fail $\rightarrow$ Fail values in Figure 6-middle) primarily due to bugs or exceptions in the generated test cases.
+Vulnerability to Test Toxicity.
+Following
+(Chen et al.,
+2022
+)
+, we examine the prevalence of toxic tests, i.e., tests that pass incorrect patches but fail correct patches.
+Figure
+5
+(right) illustrates the distribution of toxic test rates across different test generation approaches.
+While toxic tests are generally rare, we find that for a small but significant subset of problems, testing agents generate toxic tests (up to 10% of total tests) that can erroneously rank incorrect patches above correct ones, undermining the reliability of execution-based verification.
+Figure 5:
+Analyzing limitations of execution-based verifiers.
+Left:
+Problem Probability Distributions for distinguishability rates depicting weak discrimination capabilities of tests. We observe that for the majority of problems, less than 20% of tests provide discriminative signal, constraining the re-ranking ability of test-based agent.
+Right:
+Distributions for toxicity rates showing (rare) generation of toxic tests. We find that execution-based verifiers are also vulnerable to (rare) generation of toxic tests (tests that pass incorrect patches but fail correct patches); which can undermine the reliability of execution-based verifiers.
+Figure 6:
+Problem Probability Distributions for Pass $\rightarrow$ Pass, Fail $\rightarrow$ Fail, and Fail $\rightarrow$ Pass generated test fractions for various approaches.
+We identify a large fraction of generated tests either do not reproduce the bug (left) or do not even pass the correct solution (middle).
+Execution-Free Verifiers can rely on heuristics
+.
+We next study the workings and limitations of execution-free verifiers.
+In particular, we first perform quantitative ablation studies, studying the impact of different trajectory components (e.g., output patch, agent thoughts) to verifier performance. To this end, we train multiple execution-free verifiers (§
+4.1
+) excluding different trajectory components while training the verifier.
+Results are shown in Figure
+LABEL:fig:execution-free-limitations-mini
+-a.
+We find that agent thoughts play a considerable role in determining the verifier performance.
+Surprisingly, the final Best@26 drops from 42.8% to 37.6% when we remove the trajectory from the verifier input (i.e., only use the final patches).
+This means that while patch alone is responsible for determining the correctness, execution-free verifiers heavily rely on trajectory features, such as agent thoughts, to make predictions.
+To further investigate this phenomenon, we also perform an attention analysis trying to visualize parts of the input trajectory which are most relevant while predicting the output success with execution-free verifiers. In particular, we perform a sliding window search over the input trajectory, and compute the mean attention score over the tokens in the window when predicting the final output token (
+YES
+: correct,
+NO
+: incorrect).
+Figure
+LABEL:fig:execution-free-limitations-mini
+(right) illustrates the top two windows receiving the highest attention scores, demonstrating that verifiers disproportionately attend to agent thoughts. This can be misleading since the verifier can use these sentiment signals in these thoughts as proxies for correctness rather than evaluating the technical merits of the solution (i.e. the output patch).
+Method
+Accuracy (%)
+Best@26 (%)
+Final Patch + Traj.
+71.82
+42.8
+Patch Only
+68.01
+37.6
+Traj. - Thoughts
+68.77
+41.4
+(a)
+Impact of Patch & Thoughts
+on execution-free verifier. Patch alone reduces performance, indicating that model relies on other heuristics (e.g., agent thoughts) for reranking; which can be misleading (see part-b: right).
+1. Successfully reproduced the issue
+2. Implemented a fix
+[…]
+4. Ensured edge cases are handled
+5. Maintained backward compatibility
+[…]
+<function=finish>
+submit
+</function>
+[…]
+Great! The fix works. Let's see what we did to fix the issue:
+1. We identified that the original code was failing because it was trying to use the `.inverse()` method directly on permutations, which […]
+(b)
+Top two attention windows while predicting YES for an incorrect trajectory. We find that focusing on heuristics (agent thoughts) can mislead the verifier.
+Figure (fig:execution-free-limitations-mini):
+Quantitative and qualitative analysis on limitations of execution-free verifiers.
+We perform two experiments:
+a) Quantitative ablations on the impact of output patch on verifier performance; showing that execution-free verifiers rely on other heuristics (e.g., agent thoughts) over the final patch.
+b) Qualitative visualization analyzing top $k=2$ sliding windows with highest mean attention score while predicting output token YES (§4.2) for an incorrect agent trajectory (sympy__sympy-24443: SWE-Bench (Yang et al., 2024)).
+Focusing on heuristics (e.g., agent thoughts) can be misleading, and the verifier predicts the trajectory as correct.
+Visualizations are condensed for space. Please refer to the Appendix for further visualizations and results.
+%
+␣
+\begin{figure}[t]
+%
+␣
+\vskip
+␣
+-0.15in
+%
+␣
+\centering
+%
+␣
+%
+␣
+Table
+%
+␣
+\begingroup
+%
+␣
+%
+␣
+\setlength{\tabcolsep}{5.0pt}
+%
+␣
+%
+␣
+\renewcommand{\arraystretch}{1.1}
+%
+␣
+%
+␣
+\small
+%
+␣
+%
+␣
+\footnotesize
+%
+␣
+\scriptsize
+%
+␣
+%
+␣
+\caption{Ablation
+␣
+study
+␣
+on
+␣
+verfier
+␣
+data
+␣
+design.
+␣
+We
+␣
+report
+␣
+the
+␣
+overall
+␣
+accuracy
+␣
+and
+␣
+reward
+␣
+model
+␣
+(RM)
+␣
+accuracy
+␣
+for
+␣
+each
+␣
+configuration.
+␣
+All
+␣
+experiments
+␣
+use
+␣
+the
+␣
+14B
+␣
+Qwen
+␣
+Coder
+␣
+as
+␣
+the
+␣
+verifier
+␣
+and
+␣
+32B
+␣
+Qwen
+␣
+Coder
+␣
+Rollouts.}
+%
+␣
+%
+␣
+\vskip
+␣
+-0.1in
+%
+␣
+\begin{tabular}{lccc}
+%
+␣
+\toprule
+%
+␣
+%
+␣
+\textbf{Configuration}
+␣
+&
+␣
+\textbf{Overall
+␣
+Accuracy
+␣
+(\%)}
+␣
+&
+␣
+\textbf{RM
+␣
+Accuracy
+␣
+(\%)}
+␣
+\\
+%
+␣
+\textbf{Metric
+␣
+(Execution-free
+␣
+scaling)}
+␣
+&
+␣
+[Final
+␣
+Patch
+␣
++
+␣
+Traj.]
+␣
+&
+␣
+[Final
+␣
+Patch
+␣
+Only]
+␣
+&
+␣␣␣
+[Final
+␣
+Patch
+␣
++
+␣
+Traj.
+␣
+-
+␣
+Thoughts]
+␣
+\\
+%
+␣
+\midrule
+%
+␣
+\rowcolor{gray!8}
+␣
+\textbf{Verifier
+␣
+Accuracy
+␣
+(\%)}
+␣
+&
+␣
+\textbf{71.82}\%
+␣␣
+&
+␣
+68.01\%
+␣␣
+&
+␣
+68.77\%
+␣
+\\
+%
+␣
+\textbf{Aggregation
+␣
+Performance
+␣
+(\textsc{Best@}$26$)}
+␣
+&
+␣
+\textbf{42.8}\%
+␣
+&
+␣
+37.6\%
+␣
+&
+␣
+41.4\%
+␣
+\\
+%
+␣
+%
+␣
+\rowcolor{gray!8}
+␣
+w/o
+␣
+THOUGHT
+␣
+&
+␣
+41.4
+␣
+&
+␣
+68.8
+␣
+\\
+%
+␣
+%
+␣
+w/o
+␣
+ACTION
+␣
+&
+␣
+43.2
+␣
+&
+␣
+72.3
+␣
+\\
+%
+␣
+%
+␣
+\rowcolor{gray!8}
+␣
+w/o
+␣
+ASSISTANT
+␣
+&
+␣
+42.2
+␣
+&
+␣
+69.6
+␣
+\\
+%
+␣
+%
+␣
+w/o
+␣
+USER
+␣
+&
+␣
+43.2
+␣
+&
+␣
+70.2
+␣
+\\
+%
+␣
+\bottomrule
+%
+␣
+\end{tabular}
+%
+␣
+\vskip
+␣
+-0.1in
+%
+␣
+\label{tab:verifier-ablations}
+%
+␣
+\endgroup
+%
+␣
+%
+␣
+\\
+␣
+%
+␣
+breakline
+%
+␣␣␣␣␣
+\centering
+%
+␣␣␣␣␣
+%
+␣
+First
+␣
+code
+␣
+block
+%
+␣␣␣␣␣
+\begin{minipage}[t]{0.495\textwidth}
+%
+␣␣␣␣␣␣␣␣␣
+\begin{minted}[frame=single,
+␣
+fontsize=\scriptsize,
+␣
+breaklines,
+␣
+bgcolor=nicebg]{sql}
+%
+␣␣
+1.
+␣
+Successfully
+␣
+reproduced
+␣
+the
+␣
+issue
+%
+␣␣
+2.
+␣
+Implemented
+␣
+a
+␣
+fix
+%
+␣␣
+3.
+␣
+Verified
+␣
+the
+␣
+fix
+␣
+works
+%
+␣␣
+4.
+␣
+Ensured
+␣
+edge
+␣
+cases
+␣
+are
+␣
+handled
+%
+␣␣
+5.
+␣
+Maintained
+␣
+backward
+␣
+compatibility
+%
+␣␣␣
+I
+␣
+believe
+␣
+we
+␣
+can
+␣
+now
+␣
+submit
+␣
+this
+␣
+as
+␣
+the
+␣
+solution:
+%
+␣␣
+<function=finish>
+%
+␣␣␣␣
+<parameter=command>submit</parameter>
+␣
+[…]
+%
+␣␣␣␣
+\end{minted}
+%
+␣␣␣␣␣
+\end{minipage}
+%
+␣␣␣␣␣
+\hfill
+%
+␣␣␣␣␣
+%
+␣
+Second
+␣
+code
+␣
+block
+%
+␣␣␣␣␣
+\begin{minipage}[t]{0.495\textwidth}
+%
+␣␣␣␣␣␣␣␣␣
+\begin{minted}[frame=single,
+␣
+fontsize=\scriptsize,
+␣
+breaklines,bgcolor=nicebg]{sql}
+%
+␣
+[ASSISTANT]
+␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣
+Great!
+␣
+The
+␣
+fix
+␣
+works.
+␣
+Let’
+s
+analyze
+what
+we
+did
+to
+fix
+the
+issue
+:
+%
+1.
+We
+identified
+that
+the
+original
+code
+was
+failing
+because
+it
+was
+trying
+to
+use
+the
+%
+‘.
+inverse
+()‘
+method
+directly
+on
+permutations
+,
+which
+doesn
+’t
+␣
+exist.
+%
+␣␣
+[…]
+%
+␣␣␣␣␣␣␣␣␣
+\end{minted}
+%
+␣␣␣␣␣
+\end{minipage}
+%
+␣␣␣␣␣
+\vskip
+␣
+-0.1in
+%
+␣␣␣␣␣
+%
+␣
+Third
+␣
+code
+␣
+block
+%
+␣
+%
+␣␣␣␣␣
+\begin{minipage}[t]{0.495\textwidth}
+%
+␣
+%
+␣␣␣␣␣␣␣␣␣
+\begin{minted}[frame=single,
+␣
+fontsize=\scriptsize,
+␣
+breaklines,
+␣
+bgcolor=nicebg]{sql}
+%
+␣
+%
+␣␣
+The
+␣
+changes
+␣
+look
+␣
+good.
+␣
+I’
+ve
+made
+the
+following
+improvements
+:
+%
+%
+1.
+Simplified
+the
+logic
+to
+explicitly
+check
+for
+both
+the
+generator
+and
+its
+inverse
+%
+%
+[…]
+%
+%
+Let
+’s
+␣
+run
+␣
+the
+␣
+reproduce
+␣
+script
+␣
+again:
+%
+␣
+%
+␣␣
+<function=execute_bash>
+%
+␣
+%
+␣␣␣␣
+<parameter=cmd>python
+␣
+reproduce_issue.py</parameter>
+%
+␣
+%
+␣␣
+</function>
+%
+␣
+%
+␣␣␣␣␣␣␣␣␣
+\end{minted}
+%
+␣
+%
+␣␣␣␣␣
+\end{minipage}
+%
+␣
+%
+␣␣␣␣␣
+\hfill
+%
+␣
+%
+␣␣␣␣␣
+%
+␣
+Fourth
+␣
+code
+␣
+block
+%
+␣
+%
+␣␣␣␣␣
+\begin{minipage}[t]{0.495\textwidth}
+%
+␣
+%
+␣␣␣␣␣␣␣␣␣
+\begin{minted}[frame=single,
+␣
+fontsize=\scriptsize,
+␣
+breaklines,
+␣
+bgcolor=nicebg]{sql}
+%
+␣
+%
+␣
+<function=execute_bash>
+%
+␣
+%
+␣␣␣␣
+<parameter=cmd>python
+␣
+reproduce_issue.py</parameter>
+%
+␣
+%
+␣␣
+</function>
+%
+␣
+%
+␣␣
+[/ASSISTANT]
+%
+␣
+%
+␣␣
+[USER]
+%
+␣
+%
+␣␣
+Exit
+␣
+code:
+␣
+0
+%
+␣
+%
+␣␣
+Execution
+␣
+output
+␣
+of
+␣
+[execute_bash]:
+%
+␣
+%
+␣␣
+[STDOUT]
+␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣␣
+Testing
+␣
+with
+␣
+DihedralGroup(3):
+%
+␣
+%
+␣␣
+SUCCESS:
+␣
+Homomorphism
+␣
+created
+␣
+successfully
+%
+␣
+%
+␣␣
+\end{minted}
+%
+␣
+%
+␣
+\end{minipage}
+%
+␣
+\vskip
+␣
+-0.05in
+%
+␣
+\caption{
+%
+␣␣␣␣␣
+\textbf{Top.}
+%
+␣␣␣␣␣
+\textbf{Top.}
+%
+␣␣␣␣␣
+%
+%
+␣␣␣␣␣
+Quantitative
+␣
+ablations
+␣
+on
+␣
+the
+␣
+impact
+␣
+of
+␣
+output
+␣
+patch
+␣
+on
+␣
+verifier
+␣
+performance
+␣
+(verifier
+␣
+accurace
+␣
+and
+␣
+re-ranking
+␣
+performance)
+␣
+depicting
+␣
+that
+␣
+patch-only
+␣
+verifiers
+␣
+are
+␣
+considerably
+␣
+less
+␣
+effective
+␣
+than
+␣
+those
+␣
+that
+␣
+also
+␣
+consider
+␣
+the
+␣
+trajectory.
+%
+␣␣␣␣␣
+%
+%
+␣␣␣␣␣
+\textbf{Bottom.}
+%
+␣␣␣␣␣
+%
+%
+␣␣␣␣␣
+Qualitative
+␣
+visualization
+␣
+analyzing
+␣
+top
+␣
+$k=2$
+␣
+sliding
+␣
+windows
+␣
+over
+␣
+an
+␣
+\emph{incorrect}
+␣
+agent
+␣
+trajectory
+␣
+(refer
+␣
+\sref{sec:strengths-weaknesses})
+␣
+with
+␣
+highest
+␣
+mean
+␣
+attention
+␣
+score
+␣
+(\texttt{sympy\_\_sympy-24443}:
+␣
+\swebench).
+%
+␣␣␣␣␣
+%
+%
+␣␣␣␣␣
+Verifier
+␣
+attends
+␣
+to
+␣
+\nj{emotionally
+␣
+charged}
+␣
+agent
+␣
+thoughts
+␣
+and
+␣
+trajectory
+␣
+relying
+␣
+on
+␣
+these
+␣
+sentiments
+␣
+as
+␣
+partial
+␣
+heuristic
+␣
+for
+␣
+determining
+␣
+correctness.
+␣
+\todo{fix
+␣
+coloring
+␣
+and
+␣
+table
+␣
+col
+␣
+names}
+␣
+}
+%
+␣
+\label{fig:execution-free-limitations-mini}
+%
+␣
+\end{figure}’
+4.3
+Hybrid Inference Time Scaling
+Combining the verifier strengths.
+Given the analysis from §
+4.2
+, we can summarize two key insights:
+1) Execution-based approach provides direct signal for patch correctness through execution but suffers from lack of distinguishing tests
+2) Execution-free approach offers better distinguishability between patches through a continuous reward score $s^{EF}$ but can be biased to pay more attention to heuristics (e.g., agent thoughts) over final output patch.
+Given the above insights, we thus propose a hybrid verifier that leverages the strengths of both approaches.
+Particularly, we define the hybrid verifier with score $s^{H}_{k}$ as,
+$$s_{k}^{H}=\mathrm{Top}_{n}(s_{k}^{EF})+s_{k}^{EB},\quad\text{where}\quad\mathrm{Top}_{n}(s_{k}^{EF})=\begin{cases}s_{k}^{EF},&\text{if }s_{k}^{EF}\text{ is among the top }n\text{ scores},\\ -\infty,&\text{otherwise}.\end{cases}\tag{2}$$
+where $s^{EB}_{k}$ provides execution-feedback, $s_{k}^{EF}$ provides distinguishability in case of a tie with execution-based test scores (as $s_{k}^{EF}$ provides a continuous score between $0$ and $1$), and $\mathrm{Top}_{n}$ restricts the hybrid verifier to only consider the top verifier-ranked patches. In practice, we perform regression filtering after the top-n filtering to ensure non-zero scores.
+Main Results.
+Results are shown in Tab.
+4
+and Fig.
+4
+.
+While both execution-based and execution-free methods rapidly reach performance plateaus with increasing agent rollouts (saturating at $\sim$43%), our hybrid approach demonstrates substantially superior scaling properties, yielding significant performance improvements (additional $\sim$7-8%); achieving a Best@26 performance of 51% on the challenging SWEBench-Verified benchmark.
+Comparison to Open Systems
+.
+The proposed approach significantly outperforms other open-weight alternatives; reflecting a new state-of-the-art in this domain.
+Among other generalist-agent methods, SWE-Gym (Pan et al., 2024) recently achieves a Best@16 performance of 32.0%.
+Similarly, concurrent work (Wei et al., 2025) recently achieved 41.0% using RL and Best@500 (using Agentless).
+In contrast, despite mainly relying on supervised fine-tuning for training, our proposed approach achieves a Pass@1 itself of 34.4% with Best@26 performance of 51.0% — achieving strong performance improvements through simply more scalable data curation (§2) and better test-time scaling (Figure 4).
+4.4
+Ablation Studies on Hybrid Verification Design
+Figure 7:
+Ablation Study on Hybrid Verifier.
+We find three key insights: 1) While both execution-based and execution-free verifiers saturate around 42-43%, the hybrid approach yields significantly higher test-time gains (51%). 2) Regression tests alone are insufficient for hybrid scaling — achieving only 47.4% aggregation performance. 3) Agentic vs Agentless: training a specialized testing agent is important, improving the performance from 48.8% to 51%.
+Variation with Test-Agent Rollouts.
+As discussed in §4.2, execution-based test generation can suffer from a lack of distinguishing tests. One approach to address this is to sample more test-agent rollouts. We quantify this effect in Figure
+4
+(right).
+We observe that increasing number of test-agent rollouts consistently helps improve performance with our hybrid approach.
+Compute-Efficient Rollouts.
+Figure
+4
+(right) illustrates the
+Best@
+K performance as a function of both test-agent and code-editing agent rollout counts.
+Interestingly, we find that sampling more test-agent rollouts can provide more compute optimized inference-scaling over naively sampling more editing-agent rollouts.
+For instance, increasing the number of editing-agent rollouts from 16 to 21 improves the
+Best@
+K performance from 47.6% to 48.4%.
+In contrast, simply sampling 5 more test-rollouts can yield better gains (Best@K 49.3%).[5]
+[5] Note that test-agent rollouts are also usually considerably cheaper than editing-agent rollouts.
+Regression Tests Alone are Insufficient.
+Our execution-based verification framework integrates both regression and generated reproduction tests.
+Figure
+5
+(right) isolates the impact of regression tests alone on the final performance.
+While regression tests alone improve performance from 42.9% to 47.4%, using generated tests further enhances performance to 51.0%, demonstrating that both test types provide essential and complementary signals.
+Agentic vs Agentless Tests.
+A distinguishing feature of our approach is to train a specialized agent for test-generation; instead of the zero-shot approach from
+Xia et al. (
+2024b
+)
+.
+To evaluate this design choice, we conducted a controlled comparison using official Agentless tests from their released artifact
+(Xia et al.,
+2024a
+)
+within our hybrid verification framework on the
+SWEBench-Verified
+benchmark.
+Figure
+5
+(right) demonstrates that while Agentless tests provide meaningful performance improvements, our agent-generated tests yield superior results (51.0% versus 48.8%), validating our agent-based approach to test generation.
+Role of $\mathbf{\mathrm{Top}_{n}}$.
+We evaluate the impact of the $\mathrm{Top}_{n}$ filtering mechanism introduced in Equation (2).
+Figure 5 (right) shows that this selective application strategy improves performance from 49.8% to 51.0%.
+This improvement likely stems from mitigating the impact of toxic tests (§4.2) by restricting their application to higher-quality patches (identified via execution-free reward scores $s^{EF}_{k}$), thereby enhancing the reliability of the verification process.
+5
+Related Work
+Programming Agents
+.
+Recent work on
+GitHub
+issue resolution includes SWE-agent
+(Yang et al.,
+2024b
+)
+, Autocoderover
+(Zhang et al.,
+2024b
+)
+, OpenHands
+(Wang et al.,
+2024
+)
+, AgentLess
+(Xia et al.,
+2024b
+)
+, Moatless
+Orwall (
+2024
+)
+.
+All of them rely on proprietary models due to a lack of datasets and open-weight models — a gap our work addresses.
+Agent Training Environments
+.
+Existing SWE agent environments have key limitations:
+SWE-Bench
+(Jimenez et al.,
+2023
+)
+lacks executable training environments,
+R2E
+(Jain et al.,
+2024b
+)
+offers only 246 instances with function completion.
+SWE-Gym
+(Pan et al.,
+2024
+)
+collects executable
+GitHub
+environments similar to ours but relies on human-written issues and test cases.
+Synthetic data generation has been studied in various domains but our work is the first to apply it for executable
+GitHub
+environment collection.
+We use back-translation
+(Li et al.,
+2024
+)
+and test-generation in our
+SweGen
+approach.
+Please see
+Long et al. (
+2024
+)
+for a comprehensive survey on synthetic data generation methods.
+SWE-Agent Training
+.
+Ma et al. (
+2024
+)
+and
+Xie et al. (
+2025
+)
+train on synthetic code editing tasks.
+Pan et al. (
+2024
+)
+study SFT on agent trajectories and inference scaling similar to our work.
+Wei et al. (
+2025
+)
+explore reinforcement learning on large-scale data collected from real-world
+GitHub
+issues without execution feedback.
+Verifiers for SWE-Coding Tasks
+.
+Various works have explored use of verifiers for SWE tasks.
+AgentLess
+(Xia et al.,
+2024b
+)
+used majority voting to select the best patch from multiple agents.
+Agentless-1.5 relied on reproduction and regression tests to verify the correctness of generated patches.
+Zhang et al. (
+2024a
+)
+proposed multi-agent committee-review (
+LLM
+judge) to select the best patch from multiple agents.
+Pan et al. (
+2024
+)
+proposed trajectory verifiers to re-rank the generated patches based on
+LLM
+score.
+Verifiers for General Coding Tasks
+.
+Various works have explored the use of verifiers for general coding tasks on isolated puzzles (HumanEval
+(Chen et al.,
+2021
+)
+), interviews
+(Jain et al.,
+2024a
+)
+, and competition or olympiad problems
+(Hendrycks et al.,
+2021
+; Li et al.,
+2022
+).
+Gu et al. (
+2024
+)
+showed that
+LLM
+judges perform poorly on checking correctness of generated code.
+Chen et al. (
+2022
+); Ridnik et al. (
+2024
+); Key et al. (
+2022
+); Zhang et al. (
+2023a
+)
+study how test generation can be used to re-rank the generated code samples.
+Inala et al. (
+2022
+); Zhang et al. (
+2023b
+); Ni et al. (
+2023
+)
+employ neural code re-ranker models.
+In this work, we extend these lines of work by first presenting
+novel insights on challenges and opportunities for both execution-based and execution-free approaches in SWE-Coding.
+Using these insights, we also propose a novel hybrid approach that effectively combines their strengths
+to achieve better performance (51.0% on SWEBench-Verified).
+6
+Conclusion
+In this paper, we introduce R2E-Gym, the largest gym environment and training framework for scaling open-weight
+Swe
+agents. We share two key insights: 1) Synthetic data curation can enable more scalable training on
+Swe
+tasks. 2) Hybrid test-time scaling: different axes for test-time scaling (execution-based testing agents and execution-free verifiers) exhibit complementary strengths, which can be leveraged to achieve significantly higher test-time gains. Overall, our final approach achieves 51% on SWE-Bench Verified, reflecting a new state-of-the-art for open-weight
+Swe
+agents, while also showing, for the first time, competitive performance with some proprietary models. We hope that our work can offer unique insights for scaling open-source
+Swe
+-agents on real-world applications.
+Acknowledgement
+N. Jain and M. Shetty are supported by NSF grants CCF:1900968, CCF:1908870, and by SKY Lab industrial sponsors and affiliates.
+This work is additionally supported by the R2E OpenPhilanthropy grant.
+References
+Aleithan et al. (2024)
+Reem Aleithan, Haoran Xue, Mohammad Mahdi Mohajer, Elijah Nnorom, Gias Uddin, and Song Wang.
+Swe-bench+: Enhanced coding benchmark for llms.
+arXiv preprint arXiv:2410.06992
+, 2024.
+Anthropic (2024)
+Anthropic.
+Raising the bar on SWE-bench Verified with Claude 3.5 Sonnet.
+https://www.anthropic.com/research/swe-bench-sonnet
+, 2024.
+Anthropic (2025)
+Anthropic.
+Claude 3.7 sonnet.
+https://www.anthropic.com/news/claude-3-7-sonnet
+, February 2025.
+Chen et al. (2022)
+Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen.
+Codet: Code generation with generated tests.
+arXiv preprint arXiv:2207.10397
+, 2022.
+Chen et al. (2021)
+Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde De Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.
+Evaluating large language models trained on code.
+arXiv preprint arXiv:2107.03374
+, 2021.
+Cobbe et al. (2021)
+Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al.
+Training verifiers to solve math word problems.
+arXiv preprint arXiv:2110.14168
+, 2021.
+Gu et al. (2024)
+Alex Gu, Wen-Ding Li, Naman Jain, Theo X Olausson, Celine Lee, Koushik Sen, and Armando Solar-Lezama.
+The counterfeit conundrum: Can code language models grasp the nuances of their incorrect generations?
+arXiv preprint arXiv:2402.19475
+, 2024.
+Hendrycks et al. (2021)
+Dan Hendrycks, Steven Basart, Saurav Kadavath, Mantas Mazeika, Akul Arora, Ethan Guo, Collin Burns, Samir Puranik, Horace He, Dawn Song, and Jacob Steinhardt.
+Measuring coding challenge competence with apps.
+NeurIPS
+, 2021.
+Inala et al. (2022)
+Jeevana Priya Inala, Chenglong Wang, Mei Yang, Andres Codas, Mark Encarnación, Shuvendu Lahiri, Madanlal Musuvathi, and Jianfeng Gao.
+Fault-aware neural code rankers.
+Advances in Neural Information Processing Systems
+, 35:13419–13432, 2022.
+Jaech et al. (2024)
+Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, et al.
+Openai o1 system card.
+arXiv preprint arXiv:2412.16720
+, 2024.
+Jain et al. (2024a)
+Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica.
+Livecodebench: Holistic and contamination free evaluation of large language models for code.
+arXiv preprint arXiv:2403.07974
+, 2024a.
+Jain et al. (2024b)
+Naman Jain, Manish Shetty, Tianjun Zhang, King Han, Koushik Sen, and Ion Stoica.
+R2e: Turning any github repository into a programming agent environment.
+In
+ICML 2024
+, 2024b.
+Jimenez et al. (2023)
+Carlos E Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, and Karthik Narasimhan.
+Swe-bench: Can language models resolve real-world github issues?
+arXiv preprint arXiv:2310.06770
+, 2023.
+Key et al. (2022)
+Darren Key, Wen-Ding Li, and Kevin Ellis.
+I speak, you verify: Toward trustworthy neural program synthesis.
+arXiv preprint arXiv:2210.00848
+, 2022.
+Li et al. (2023)
+Xian Li, Ping Yu, Chunting Zhou, Timo Schick, Omer Levy, Luke Zettlemoyer, Jason Weston, and Mike Lewis.
+Self-alignment with instruction backtranslation.
+arXiv preprint arXiv:2308.06259
+, 2023.
+Li et al. (2024)
+Xian Li, Ping Yu, Chunting Zhou, Timo Schick, Omer Levy, Luke Zettlemoyer, Jason E Weston, and Mike Lewis.
+Self-alignment with instruction backtranslation.
+In
+The Twelfth International Conference on Learning Representations
+, 2024.
+URL
+https://openreview.net/forum?id=1oijHJBRsT
+.
+Li et al. (2022)
+Yujia Li, David Choi, Junyoung Chung, Nate Kushman, Julian Schrittwieser, Rémi Leblond, Tom Eccles, James Keeling, Felix Gimeno, Agustin Dal Lago, et al.
+Competition-level code generation with alphacode.
+Science
+, 378(6624):1092–1097, 2022.
+Long et al. (2024)
+Lin Long, Rui Wang, Ruixuan Xiao, Junbo Zhao, Xiao Ding, Gang Chen, and Haobo Wang.
+On llms-driven synthetic data generation, curation, and evaluation: A survey.
+arXiv preprint arXiv:2406.15126
+, 2024.
+Ma et al. (2024)
+Yingwei Ma, Rongyu Cao, Yongchang Cao, Yue Zhang, Jue Chen, Yibo Liu, Yuchen Liu, Binhua Li, Fei Huang, and Yongbin Li.
+Lingma swe-gpt: An open development-process-centric language model for automated software improvement.
+arXiv preprint arXiv:2411.00622
+, 2024.
+Ni et al. (2023)
+Ansong Ni, Srini Iyer, Dragomir Radev, Veselin Stoyanov, Wen-tau Yih, Sida Wang, and Xi Victoria Lin.
+Lever: Learning to verify language-to-code generation with execution.
+In
+International Conference on Machine Learning
+, pp.  26106–26128. PMLR, 2023.
+Orwall (2024)
+A. Orwall.
+Moatless tool.
+https://github.com/aorwall/moatless-tools
+, 2024.
+Accessed: 2024-10-22.
+Pan et al. (2024)
+Jiayi Pan, Xingyao Wang, Graham Neubig, Navdeep Jaitly, Heng Ji, Alane Suhr, and Yizhe Zhang.
+Training software engineering agents and verifiers with swe-gym, 2024.
+URL
+https://arxiv.org/abs/2412.21139
+.
+Ridnik et al. (2024)
+Tal Ridnik, Dedy Kredo, and Itamar Friedman.
+Code generation with alphacodium: From prompt engineering to flow engineering.
+arXiv preprint arXiv:2401.08500
+, 2024.
+Wang et al. (2024)
+Xingyao Wang, Boxuan Li, Yufan Song, Frank F Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, et al.
+Openhands: An open platform for ai software developers as generalist agents.
+arXiv preprint arXiv:2407.16741
+, 2024.
+Wei et al. (2023)
+Yuxiang Wei, Zhe Wang, Jiawei Liu, Yifeng Ding, and Lingming Zhang.
+Magicoder: Source code is all you need.
+arXiv preprint arXiv:2312.02120
+, 2023.
+Wei et al. (2025)
+Yuxiang Wei, Olivier Duchenne, Jade Copet, Quentin Carbonneaux, Lingming Zhang, Daniel Fried, Gabriel Synnaeve, Rishabh Singh, and Sida I. Wang.
+Swe-rl: Advancing llm reasoning via reinforcement learning on open software evolution.
+arXiv preprint arXiv:2502.18449
+, 2025.
+Xia et al. (2024a)
+Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang.
+Agentless: Demystifying llm-based software engineering agents.
+https://github.com/OpenAutoCoder/Agentless
+, 2024a.
+Xia et al. (2024b)
+Chunqiu Steven Xia, Yinlin Deng, Soren Dunn, and Lingming Zhang.
+Agentless: Demystifying llm-based software engineering agents.
+arXiv preprint arXiv:2407.01489
+, 2024b.
+Xie et al. (2025)
+Chengxing Xie, Bowen Li, Chang Gao, He Du, Wai Lam, Difan Zou, and Kai Chen.
+Swe-fixer: Training open-source llms for effective and efficient github issue resolution.
+arXiv preprint arXiv:2501.05040
+, 2025.
+Yang et al. (2024a)
+An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, Guanting Dong, Haoran Wei, Huan Lin, Jialong Tang, Jialin Wang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Ma, Jin Xu, Jingren Zhou, Jinze Bai, Jinzheng He, Junyang Lin, Kai Dang, Keming Lu, Keqin Chen, Kexin Yang, Mei Li, Mingfeng Xue, Na Ni, Pei Zhang, Peng Wang, Ru Peng, Rui Men, Ruize Gao, Runji Lin, Shijie Wang, Shuai Bai, Sinan Tan, Tianhang Zhu, Tianhao Li, Tianyu Liu, Wenbin Ge, Xiaodong Deng, Xiaohuan Zhou, Xingzhang Ren, Xinyu Zhang, Xipin Wei, Xuancheng Ren, Yang Fan, Yang Yao, Yichang Zhang, Yu Wan, Yunfei Chu, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zhihao Fan.
+Qwen2 technical report.
+arXiv preprint arXiv:2407.10671
+, 2024a.
+Yang et al. (2024b)
+John Yang, Carlos E Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, Karthik Narasimhan, and Ofir Press.
+Swe-agent: Agent-computer interfaces enable automated software engineering.
+arXiv preprint arXiv:2405.15793
+, 2024b.
+Yao et al. (2022)
+Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao.
+React: Synergizing reasoning and acting in language models.
+arXiv preprint arXiv:2210.03629
+, 2022.
+Zhang et al. (2023a)
+Kexun Zhang, Danqing Wang, Jingtao Xia, William Yang Wang, and Lei Li.
+Algo: Synthesizing algorithmic programs with generated oracle verifiers.
+arXiv preprint arXiv:2305.14591
+, 2023a.
+Zhang et al. (2024a)
+Kexun Zhang, Weiran Yao, Zuxin Liu, Yihao Feng, Zhiwei Liu, Rithesh RN, Tian Lan, Lei Li, Renze Lou, Jiacheng Xu, et al.
+Diversity empowers intelligence: Integrating expertise of software engineering agents.
+In
+The Thirteenth International Conference on Learning Representations
+, 2024a.
+Zhang et al. (2023b)
+Tianyi Zhang, Tao Yu, Tatsunori Hashimoto, Mike Lewis, Wen-tau Yih, Daniel Fried, and Sida Wang.
+Coder reviewer reranking for code generation.
+In
+International Conference on Machine Learning
+, pp.  41832–41846. PMLR, 2023b.
+Zhang et al. (2024b)
+Yuntong Zhang, Haifeng Ruan, Zhiyu Fan, and Abhik Roychoudhury.
+Autocoderover: Autonomous program improvement.
+In
+Proceedings of the 33rd ACM SIGSOFT International Symposium on Software Testing and Analysis
+, pp.  1592–1604, 2024b.
+Zhao et al. (2024)
+Wenting Zhao, Nan Jiang, Celine Lee, Justin T Chiu, Claire Cardie, Matthias Gallé, and Alexander M Rush.
+Commit0: Library generation from scratch.
+arXiv preprint arXiv:2412.01769
+, 2024.
+Zheng et al. (2024)
+Yaowei Zheng, Richong Zhang, Junhao Zhang, Yanhan Ye, Zheyan Luo, Zhangchi Feng, and Yongqiang Ma.
+Llamafactory: Unified efficient fine-tuning of 100+ language models.
+In
+Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
+, Bangkok, Thailand, 2024. Association for Computational Linguistics.
+URL
+http://arxiv.org/abs/2403.13372
+.
+Zhuo et al. (2024)
+Terry Yue Zhuo, Minh Chien Vu, Jenny Chim, Han Hu, Wenhao Yu, Ratnadira Widyasari, Imam Nur Bani Yusuf, Haolan Zhan, Junda He, Indraneil Paul, et al.
+Bigcodebench: Benchmarking code generation with diverse function calls and complex instructions.
+arXiv preprint arXiv:2406.15877
+, 2024.
+Appendix A
+Dataset Details
+Figure 8:
+Repo distribution for our complete R2E-Gym dataset consisting of 8135 instances.
+Commit Filtering Heuristics.
+Our commit filtering approach employs multiple heuristics to identify high-quality bug fixes and improvements suitable for training data.
+We particularly filter for small scoped changes, prioritizing non-documentation updates, and correlated code and test matches.
+We perform this filter at both line and
+AST
+entity level.
+To ensure consistency and quality, we employ specific thresholds in our filtering process:
+•
+Maximum of 5 non-test files modified in a single commit
+•
+Maximum of 100 edited lines across all non-test files
+•
+Maximum patch length of 2000 characters to ensure focused changes
+•
+No more than 1 deleted entity in non-test files
+•
+Maximum of 3 added entities in non-test files
+•
+Maximum of 3 edited entities in non-test files
+•
+No more than 10 statement-level changes to maintain tractability
+Additionally, we use
+LLM
+as a judge filter to further refine our dataset.
+Repository Installation.
+Installing historical commits from GitHub repositories presents significant challenges due to evolving dependency requirements and API changes.
+We use a Docker-based approach with a search-based dependency resolution strategy to create reproducible environments for each commit.
+Our installation process follows these steps:
+1.
+Extract dependency information from
+requirements.txt, setup.py
+, etc
+2.
+Iteratively identify potential version conflicts and compatibility issues
+3.
+Generate multiple candidate dependency configurations
+4.
+Test each configuration until a working environment is found
+This process is semi-manual and challenging to scale and we aim to rely more on
+LLMs
+in the future.
+Example installation scripts test multiple dependency combinations sequentially, exiting on the first successful build:
+⬇
+build_and_check_pandas
+(){
+local
+python_version
+=
+$1
+;
+local
+numpy_version
+=
+$1
+;
+local
+setuptools_version
+=
+$3$
+...
+}
+#
+Attempt
+with
+first
+configuration
+if
+build_and_check_pandas
+"3.7"
+"1.17.*"
+"<0.30"
+"62.*"
+"0.23"
+;
+then
+echo
+"[INFO]
+␣
+First
+␣
+combo
+␣
+succeeded.
+␣
+Exiting."
+exit
+0
+fi
+#
+Attempt
+with
+second
+configuration
+if
+build_and_check_pandas
+"3.8"
+"1.20.*"
+"<0.30"
+"62.*"
+"0.23"
+;
+then
+echo
+"[INFO]
+␣
+Second
+␣
+combo
+␣
+succeeded.
+␣
+Exiting."
+exit
+0
+fi
+#
+Attempt
+with
+third
+configuration
+if
+build_and_check_pandas
+"3.10"
+"1.26.*"
+"===3.0.5"
+"62.*"
+"0.23"
+;
+then
+echo
+"[INFO]
+␣
+Third
+␣
+combo
+␣
+succeeded.
+␣
+Exiting."
+exit
+0
+fi
+Listing 1:
+Example installation script excerpt
+This approach allows us to create working environments for historical commits, enabling execution-based validation of our dataset.
+Test Generation.
+We use an Agentless-like reproduction test generation approach.
+A key difference is that we use the ground truth patch as context when generating the tests.
+Issue Generation.
+As discussed in the main paper, we use backtranslation to generate synthetic issues for commits that lack human-written GitHub issues.
+Our approach leverages both the code changes in the commit and the test execution results to create realistic, informative issue descriptions.
+The issue generation process follows these steps:
+1.
+Extract failing test functions from the execution results
+2.
+Analyze test outputs to identify error messages and expected behaviors
+3.
+Provide the
+LLM
+with commit message, code patch, and test execution results
+4.
+Guide the
+LLM
+to generate a concise, informative issue that describes the bug without revealing the solution
+For each commit, we extract and utilize specific components:
+•
+Commit metadata
+: Hash and commit message provide context about the change
+•
+Code patches
+: We separate non-test file changes (showing what was fixed) from test file changes (showing how to verify the fix)
+•
+Test execution
+: We include both old (failing) and new (passing) executions
+•
+Test functions
+: We extract relevant test functions that demonstrate the bug
+•
+Assertion failures
+: We extract and format the failing assertions from the old commit to show error details
+The prompt construction carefully organizes these components to give the LLM sufficient context while focusing attention on the most relevant information for issue generation.
+We carefully design our prompting strategy to ensure the generated issues resemble human-written ones, focusing on clarity, naturalness, and providing sufficient information for understanding the bug.
+⬇
+#
+Build
+the
+complete
+prompt
+with
+all
+components
+def
+get_prompt
+(
+commit
+,
+execution_result
+,
+issues
+=
+None
+):
+#
+Include
+commit
+hash
+and
+message
+#
+Include
+commit
+patch
+(non-test
+files)
+#
+Include
+test
+file
+changes
+#
+Include
+execution
+results
+from
+old
+and
+new
+commits
+#
+Include
+improved
+test
+functions
+#
+Include
+test
+function
+code
+#
+Include
+assertion
+failures
+#
+Include
+example
+issues
+and
+instructions
+Listing 2:
+Issue generation code structure
+The template below shows our prompt guidelines:
+⬇
+As
+you
+are
+trying
+to
+generate
+synthetic
+issues
+,
+you
+will
+follow
+these
+guidelines
+:
+1.
+Keep
+the
+issue
+concise
+and
+informative
+.
+2.
+Describe
+the
+failing
+test
+,
+including
+the
+input
+that
+causes
+the
+failure
+,
+the
+nature
+of
+the
+failure
+,
+and
+the
+expected
+behavior
+.
+Do
+NOT
+mention
+test
+functions
+or
+files
+directly
+.
+3.
+Do
+not
+reveal
+the
+solution
+to
+the
+problem
+in
+the
+issue
+.
+Only
+describe
+the
+bug
+and
+the
+expected
+behavior
+.
+4.
+If
+there
+are
+multiple
+failing
+tests
+,
+focus
+on
+the
+most
+informative
+one
+or
+a
+subset
+that
+best
+describes
+the
+general
+nature
+of
+the
+failure
+.
+5.
+Describe
+the
+expected
+output
+of
+the
+failing
+test
+:
+-
+For
+errors
+,
+describe
+the
+error
+message
+.
+-
+For
+failing
+tests
+,
+mention
+what
+is
+supposed
+to
+happen
+.
+6.
+Write
+the
+issue
+as
+a
+human
+would
+,
+using
+simple
+language
+without
+excessive
+formatting
+.
+7.
+Use
+concrete
+terms
+to
+describe
+the
+nature
+of
+the
+failure
+.
+Avoid
+vague
+terms
+like
+"
+specific
+output
+"
+or
+"
+certain
+data
+".
+8.
+INCLUDE
+test
+code
+to
+describe
+the
+bug
+but
+keep
+it
+brief
+and
+relevant
+.
+Truncate
+or
+simplify
+tests
+longer
+than
+5-6
+lines
+.
+9.
+Do
+not
+mention
+external
+files
+unless
+absolutely
+necessary
+.
+10.
+Format
+code
+snippets
+using
+triple
+backticks
+.
+The
+issue
+should
+include
+:
+1.
+A
+clear
+and
+concise
+title
+2.
+A
+description
+of
+the
+problem
+with
+detailed
+example
+buggy
+code
+3.
+Expected
+behavior
+4.
+Actual
+behavior
+or
+error
+message
+Listing 3:
+Issue generation template
+This approach enables us to generate high-quality synthetic issues that provide clear problem statements for our training data, even for commits that lack human-written issues.
+Below are examples of synthetic issues generated using our approach:
+⬇
+**
+Title
+:**
+Calling
+‘
+load
+()‘
+Before
+‘
+draft
+()‘
+Causes
+‘
+draft
+()‘
+to
+Fail
+for
+JPEG
+Images
+**
+Description
+:**
+When
+generating
+a
+thumbnail
+for
+a
+JPEG
+image
+using
+the
+‘
+thumbnail
+()‘
+method
+,
+the
+method
+calls
+‘
+load
+()‘
+before
+‘
+draft
+()‘.
+This
+sequence
+results
+in
+the
+‘
+draft
+()‘
+method
+returning
+‘
+None
+‘,
+which
+prevents
+the
+thumbnail
+from
+being
+properly
+optimized
+.
+**
+Example
+Code
+:**
+‘‘‘
+python
+from
+PIL
+import
+Image
+with
+Image
+.
+open
+("
+Tests
+/
+images
+/
+hopper
+.
+jpg
+")
+as
+im
+:
+im
+.
+thumbnail
+((64,
+64))
+‘‘‘
+**
+Expected
+Behavior
+:**
+The
+‘
+thumbnail
+()‘
+method
+should
+utilize
+the
+‘
+draft
+()‘
+method
+to
+optimize
+the
+image
+size
+before
+loading
+,
+ensuring
+that
+the
+thumbnail
+is
+resized
+correctly
+and
+efficiently
+.
+**
+Actual
+Behavior
+:**
+The
+‘
+draft
+()‘
+method
+returns
+‘
+None
+‘
+because
+‘
+load
+()‘
+is
+invoked
+before
+it
+.
+This
+prevents
+the
+thumbnail
+from
+being
+optimized
+,
+potentially
+leading
+to
+incorrect
+thumbnail
+sizes
+or
+unnecessary
+memory
+usage
+.
+Listing 4:
+Example synthetic issue for a PIL image thumbnail bug
+⬇
+**
+Title
+:**
+Unable
+to
+Register
+Route
+with
+Names
+Containing
+Both
+Dots
+and
+Colons
+**
+Description
+:**
+After
+merging
+branch
+’0.18’,
+attempting
+to
+register
+a
+route
+with
+a
+name
+that
+includes
+both
+dots
+(‘.‘)
+and
+colons
+(‘:‘)
+results
+in
+a
+‘
+ValueError
+‘.
+The
+recent
+changes
+were
+intended
+to
+allow
+route
+names
+to
+be
+a
+sequence
+of
+Python
+identifiers
+separated
+by
+dots
+or
+colons
+,
+but
+this
+combination
+is
+still
+causing
+issues
+.
+**
+Example
+Code
+:**
+‘‘‘
+python
+from
+aiohttp
+.
+web
+import
+UrlDispatcher
+,
+PlainRoute
+def
+handler
+(
+request
+):
+return
+’
+Hello
+’
+router
+=
+UrlDispatcher
+()
+#
+Attempting
+to
+register
+a
+route
+with
+both
+dots
+and
+colons
+in
+the
+name
+route
+=
+PlainRoute
+(’
+GET
+’,
+handler
+,
+’
+test
+.
+test
+:
+test
+’,
+’/
+handler
+/
+to
+/
+path
+’)
+router
+.
+register_route
+(
+route
+)
+‘‘‘
+**
+Expected
+Behavior
+:**
+Registering
+a
+route
+with
+a
+name
+like
+‘’
+test
+.
+test
+:
+test
+’‘
+should
+succeed
+without
+errors
+,
+as
+the
+name
+follows
+the
+updated
+rules
+allowing
+multiple
+identifiers
+separated
+by
+dots
+or
+colons
+.
+**
+Actual
+Behavior
+:**
+A
+‘
+ValueError
+‘
+is
+raised
+with
+the
+message
+:
+‘‘‘
+ValueError
+:
+Incorrect
+route
+name
+value
+,
+Route
+name
+should
+be
+a
+sequence
+of
+python
+identifiers
+separated
+by
+dot
+or
+column
+‘‘‘
+This
+prevents
+the
+registration
+of
+route
+names
+that
+include
+both
+dots
+and
+colons
+,
+contrary
+to
+the
+intended
+flexibility
+introduced
+in
+the
+recent
+commit
+.
+Listing 5:
+Example synthetic issue for a route name validation bug
+Patch Minimization.
+We identify that the ground-truth patches often contain irrelevant code changes that are not required to fix the bug, often making modifications to the style and structure of the programs.
+We implement a patch-minimization approach to identify the minimal set of code changes required to fix the bug by iteratively removing the code changes and checking whether the tests still pass.
+This allows us to collect fine-grained signal for evaluating localization capabilities of
+LLMs
+.
+Appendix B
+SFT Training
+Agent Details.
+Issue + Code
+Editing Agent
+Patch
+Trajectory
+Figure 9:
+Code-editing agent architecture: The agent takes an issue description and codebase as input and produces a patch that fixes the issue.
+We use R2E-Gym to train a general-purpose prompting agent. In particular, we train our code-editing agent on tasks from R2E-Gym, where given an executable environment $\mathcal{E}$ and problem description $\mathcal{D}$, the agent is asked to solve the provided issue using any means necessary. Particularly, unlike
+(Orwall,
+2024
+)
+, we do not rely on the use of specialized workflows. The agent is tasked to solve the entire task end-to-end, including writing its own reproduction scripts, finding the bug, proposing a fix and then testing its correctness. Similar to
+(Wang et al.,
+2024
+)
+, the agent is also provided with a finish tool, allowing it to submit a solution if it thinks it has completed the task.
+Agent and Tools.
+Similar to
+(Aleithan et al.,
+2024
+; Wang et al.,
+2024
+)
+, we adopt the traditional
+ReAct
+format
+(Yao et al.,
+2022
+)
+for agent-design. For
+AgentHub
+, we use a minimalistic set of four tools to enable the agent to perform diverse
+SWE
+tasks; 1)
+file_editor:
+for viewing and editing files, 2)
+search_tool:
+for searching a relevant term in a given file or folder, 3)
+execute_bash:
+allowing execution of non-interactive bash commands (
+e.g
+., for running test scripts), 4)
+submit:
+for ending the current trajectory while returning expected outputs. No internet or browser access is provided to the agent during the training process.
+Data Curation.
+For training, we use supervised finetuning with rejection sampling using trajectories from
+sonnet-3.5
+model for supervision. To avoid contamination, we only use a subset of R2E-Gym consisting of repos with no overlap with the SWE-Bench dataset. The resulting subset (R2E-Gym-lite) consists of 4538 executable environments across 10 repositories (Figure
+2
+). Overall, we collect a total of 3321 successful trajectories from 2048 unique test environments. For rejection sampling we use the unit tests from R2E-Gym environments (both synthetic and existing). For each trajectory, we use a maximum of $N=40$ steps. Also, we limit the number of tokens per-trajectory to 32K max tokens. Finally, we also use a maximum timeout of 10-min for the overall trajectory and 90 seconds for each action execution, in order to avoid cases where the agent launches a long-running background process. We collect all training data using a temperature of 0.2.
+Training Setup and Hyperparameters.
+For training, we use the
+Qwen-2.5-Coder
+7B, 14B and 32B series as the base model for training
+Swe
+-agents on R2E-Gym. For training we perform full SFT using the above collected trajectories using LLaMA-Factory
+(Zheng et al.,
+2024
+)
+. We train the overall model for a total of 2 epochs with a batch size of 8 and a learning rate of $1 \times 10^{-5}$. The warmup ratio for training was set to 0.1. Due to computational constraints, a maximum context length of 20K was used for training the agent. In the future, the use of context-parallelism can enable us to further push the performance when training
+Swe
+-agents on more complex tasks requiring larger-context lengths.
+Appendix C
+Inference Time Scaling
+C.1
+Execution-Based Testing Agents
+Issue + Code
+Testing Agent
+Test Patch
+Trajectory
+Figure 10:
+Testing agent architecture: The agent generates comprehensive test cases to verify if a candidate patch resolves the issue.
+Agent Details.
+We train a specialized
+testing-agent
+that generates reproduction test cases to determine whether a candidate patch resolves the issue (i.e., whether the patch passes the generated test suite). Specifically, we train the testing-agent (using
+Qwen-Coder
+-32B as base-model) to generate a comprehensive test script containing $M=10$ diverse tests that cover various inputs, corner cases, etc. We use the same agent scaffold from Sec.
+3
+for training the testing agent.
+Data Curation.
+For training, we use supervised finetuning using trajectories from
+sonnet-3.5
+model for supervision. Overall, we collect a total of 2203 test-generation trajectories from sonnet (both positive and negative trajectories with minimal rejection sampling). For each trajectory, we use a maximum of $N=40$ steps. Also, we limit the number of tokens per-trajectory to 20K max tokens. Finally, we also use a maximum timeout of 5-min for the overall trajectory and 60 seconds for each action execution, in order to avoid cases where the agent launches a long-running background process.
+Training Setup and Hyperparameters.
+For training, we use the
+Qwen-Coder
+-32B model as the base model. We then use the above collected training SFT trajectories to perform full finetuning with the
+Qwen-Coder
+-32B model using LLaMA-Factory
+(Zheng et al.,
+2024
+)
+. We train the overall model for a total of 2 epochs with a batch size of 8 and a learning rate of $1 \times 10^{-5}$. A maximum context length of 20K was used for training the agent. The warmup ratio for training was set to 0.1.
+In-Context Starter Code Demonstration
+.
+We provide the following in-context starter-code demonstration (from the
+Django
+repository) to the testing agent.
+⬇
+frame
+=
+shadowbox
+,
+backgroundcolor
+=\
+color
+{
+blue
+!5},
+rulecolor
+=\
+color
+{
+purple
+!40!
+black
+},
+basicstyle
+=\
+small
+\
+ttfamily
+,
+keywordstyle
+=\
+color
+{
+purple
+!70!
+black
+},
+commentstyle
+=\
+color
+{
+green
+!50!
+black
+},
+stringstyle
+=\
+color
+{
+red
+!60!
+black
+},
+breaklines
+=
+true
+,
+captionpos
+=
+b
+,
+caption
+=
+Incontext
+Demonstration
+for
+Testing
+Agent
+]
+import
+os
+import
+django
+from
+django
+.
+conf
+import
+settings
+from
+django
+.
+db
+import
+models
+from
+django
+.
+test
+import
+TestCase
+from
+django
+.
+test
+.
+utils
+import
+setup_test_environment
+#
+Configure
+Django
+settings
+before
+setup
+os
+.
+environ
+.
+setdefault
+(’
+DJANGO_SETTINGS_MODULE
+’,
+’
+tests
+.
+test_sqlite
+’)
+#
+Override
+settings
+settings
+.
+configure
+(
+DATABASES
+={
+”
+default
+”:
+{
+”
+ENGINE
+”:
+”
+django
+.
+db
+.
+backends
+.
+sqlite3
+”,
+”
+NAME
+”:
+”
+test
+.
+db
+”,
+”
+TEST
+”:
+{
+”
+NAME
+”:
+”
+test
+.
+db
+”,
+},
+}
+},
+INSTALLED_APPS
+=[”
+tests
+”],
+MIGRATION_MODULES
+={”
+tests
+”:
+None
+},
+#
+Disable
+migrations
+for
+the
+tests
+app
+)
+#
+Setup
+Django
+django
+.
+setup
+()
+setup_test_environment
+()
+#
+Define
+test
+models
+class
+ExampleModel
+(
+models
+.
+Model
+):
+example_char
+=
+models
+.
+CharField
+(
+max_length
+=255)
+example_int
+=
+models
+.
+IntegerField
+()
+class
+Meta
+:
+app_label
+=
+’
+tests
+’
+#
+Set
+the
+app_label
+to
+’
+tests
+’
+#
+Create
+the
+database
+tables
+from
+django
+.
+core
+.
+management
+import
+call_command
+call_command
+(’
+migrate
+’,
+run_syncdb
+=
+True
+)
+def
+add_test_data
+():
+”””
+Create
+test
+instances
+of
+the
+model
+”””
+ExampleModel
+.
+objects
+.
+create
+(
+example_char
+=”
+Test
+1”,
+example_int
+=1)
+ExampleModel
+.
+objects
+.
+create
+(
+example_char
+=”
+Test
+2”,
+example_int
+=2)
+#
+Add
+test
+data
+add_test_data
+()
+C.2
+Execution-Free Verifiers
+Trajectory
+(Issue + React-Loop + Patch)
+Trajectory Verifier
+YES/NO
+Figure 11:
+Execution-free verifier architecture: The verifier predicts whether a patch is correct based on the full trajectory without executing the code.
+Verifier Details.
+In addition to the execution-based “testing agents”, we also explore the execution-free outcome-supervised reward models (a.k.a verifiers)
+(Cobbe et al.,
+2021
+)
+. In particular, given a problem statement
+$\mathcal{D}$
+, agent-trajectory
+$\mathcal{T}=\{a_{1},o_{1},a_{2},o_{2},\ldots,a_{n},o_{n}\}$
+and output patch
+$\mathcal{O}$
+from the code-editing agent on the R2E-Gym environments, we train a
+Qwen2.5-Coder-14B
+model
+(Yang et al.,
+2024a
+)
+to output a scalar score value
+$s^{EF}\in[0,1]$
+predicting the probability of output patch being correct. Specifically, following
+(Pan et al.,
+2024
+)
+we output the correctness of each patch through output tokens
+YES
+(correct) and
+NO
+(incorrect). The overall reward score is then computed by normalizing the relative probability of
+YES
+token as
+$r=\mathrm{P}(\texttt{YES})/(\mathrm{P}(\texttt{YES})+\mathrm{P}(\texttt{NO}))$
+, where P(
+YES
+) and P(
+NO
+) are estimated through the log-probabilities of the corresponding token predictions.
+Training Data.
+We first use the trajectories collected for code-editing agent training §
+3
+in order to obtain a collection of positive and negative samples for verifier training.
+Following the best configuration from
+(Pan et al.,
+2024
+)
+, we also generate on-policy trajectories using our trained 32B model.
+We then filter the collected samples to have an equal number of positive and negative samples. The overall dataset consists of 5700 total trajectories including both positive and negative samples. For training, we follow the template from
+(Pan et al.,
+2024
+)
+, asking the LLM model to predict the output as YES for positive and NO for negative trajectories.
+Training Setup and Hyperparameters.
+For training, we use the
+Qwen-Coder
+-14B model as the base model. We then use the above collected training SFT trajectories to perform finetuning using LLaMA-Factory
+(Zheng et al.,
+2024
+)
+. Similar to
+(Pan et al.,
+2024
+)
+, we perform LoRA finetuning using a rank of 64. We train the overall model for a total of 2 epochs, batch size of 8 while using a learning rate of
+$1\mathrm{e}{-5}$
+. A maximum context length of 32K was used for training the agent. The warmup ratio for training was set to 0.1.
+C.3
+Execution-Based Analysis
+In our analysis of execution-based testing agents, we focus on two key metrics: distinguishability and toxicity of generated tests. These metrics help us understand the effectiveness and limitations of execution-based verification.
+Distinguishability Rate.
+The distinguishability rate measures a test’s ability to differentiate between correct and incorrect patches. A test is considered “distinguishing” if it behaves differently when applied to correct patches versus incorrect patches. In practical terms, this means the test can help us identify which patches are correct and which are not.
+For example, consider a test that passes for all correct patches but fails for all incorrect patches—this test has perfect distinguishability. Conversely, a test that passes (or fails) for both correct and incorrect patches provides no useful signal for distinguishing between them.
+Mathematically, for a given test $t$ and a set of patches $P$ divided into correct patches $P_{c}$ and incorrect patches $P_{i}$, we compute the distinguishability metric as:
+$$\text{Distinguish}(t)=\mathbbm{1}\left[\max_{p\in P_{i}}\text{Pass}(p,t)\neq\max_{p\in P_{c}}\text{Pass}(p,t)\right]\tag{3}$$
+where $\text{Pass}(p,t)$ indicates whether patch $p$ passes test $t$, and $\mathbbm{1}[\cdot]$ is the indicator function. This formula checks whether the best-performing incorrect patch behaves differently on the test compared to the best-performing correct patch. The distinguishability rate for a set of tests $T$ is then the average distinguishability across all tests:
+$$\text{DistinguishRate}(T)=\frac{1}{|T|}\sum_{t\in T}\text{Distinguish}(t)\tag{4}$$
+In our analysis, we found that most generated tests have low distinguishability rates—typically less than 20% of tests can effectively differentiate between correct and incorrect patches. This limitation significantly impacts the ability of execution-based verification to identify the best patches, especially as the number of candidate patches increases.
+Toxicity Rate.
+We define toxic tests as those that incorrectly favor incorrect patches over correct ones. The toxicity rate is the proportion of tests that exhibit this behavior. Mathematically:
+$$\text{Toxic}(t)=\mathbbm{1}\left[\max_{p\in P_{i}}\text{Pass}(p,t)>\max_{p\in P_{c}}\text{Pass}(p,t)\right]\tag{5}$$
+The toxicity rate for a set of tests $T$ is:
+$$\text{ToxicityRate}(T)=\frac{1}{|T|}\sum_{t\in T}\text{Toxic}(t)\tag{6}$$
+While toxic tests are generally rare, they can significantly impact verification reliability when present, with toxicity rates reaching up to 10% for some problems. These findings highlight the importance of generating diverse, high-quality tests and the value of combining execution-based verification with other approaches, such as execution-free verifiers, to achieve more robust results.
+C.4
+Execution-Free Analysis
+Figure
+12
+shows the limitations of the execution-free verifier.
+⬇
+Since
+we
+’ve:
+1.
+Successfully
+reproduced
+the
+issue
+2.
+Implemented
+a
+fix
+3.
+Verified
+the
+fix
+works
+4.
+Ensured
+edge
+cases
+are
+handled
+5.
+Maintained
+backward
+compatibility
+I
+believe
+we
+can
+now
+submit
+this
+as
+the
+solution:
+<function=finish>
+<parameter=command>submit</parameter>
+<parameter=result>Fixed
+the
+‘_check_homomorphism‘
+function...’
+⬇
+[
+ASSISTANT
+]
+Great
+!
+The
+fix
+works
+.
+Let
+’s
+analyze
+what
+we
+did:
+1.
+We
+identified
+the
+original
+code
+was
+failing
+because
+it
+was
+using
+a
+‘.inverse()‘
+method
+directly,
+which
+doesn’
+t
+exist
+for
+permutations
+.
+2.
+We
+fixed
+the
+issue
+by
+:
+-
+Using
+‘**-1‘
+for
+inverses
+,
+which
+works
+for
+both
+Permutations
+and
+FreeGroupElements
+⬇
+[
+ASSISTANT
+]
+The
+changes
+look
+good
+.
+I
+’ve
+made
+the
+following
+improvements:
+1.
+Simplified
+logic
+to
+check
+both
+the
+generator
+and
+inverse
+2.
+Used
+the
+‘**-1‘
+operator
+for
+Permutation
+and
+FreeGroupElement
+3.
+Maintained
+robust
+handling
+of
+inverted
+generators
+Let’
+s
+run
+the
+reproduce
+script
+again
+:
+<
+function
+=
+execute_bash
+>
+<
+parameter
+=
+cmd
+>
+python
+reproduce_issue
+.
+py
+</
+parameter
+>
+</
+function
+>
+[/
+ASSISTANT
+]
+⬇
+[
+USER
+]
+Exit
+code
+:
+0
+Execution
+output
+of
+[
+execute_bash
+]:
+[
+STDOUT
+]
+Testing
+with
+DihedralGroup
+(3):
+SUCCESS
+:
+Homomorphism
+created
+s
+uccessfully
+Testing
+with
+SymmetricGroup
+(3):
+SUCCESS
+:
+Homomorphism
+created
+s
+uccessfully
+Figure 12:
+Analyzing limitations of execution-free verifiers
+. Qualitative visualization analyzing top
+$k=4$
+sliding windows over agent trajectory (refer §
+4.2
+) with highest mean attention score (
+sympy__sympy-24443
+: SWE-Bench). We observe that the RM can be biased by the agent thought / actions, instead of relying on the final output patch.
+Figure 13:
+Pass@K
+plot for our agent and using Agentless tests respectively.
+Appendix D
+Example Testing Agent Outputs
+This section provides examples of test cases generated by our approach.
+D.1
+Example 1: SymPy Relational Parsing Tests
+The following example shows a truncated test suite for validating relational parsing in SymPy, demonstrating our approach’s ability to generate multiple test cases. This test was generated to address the issue in
+SymPy PR #24661
+, which fixes relational parsing in the SymPy library.
+⬇
+from
+sympy
+import
+Lt
+,
+Gt
+,
+Le
+,
+Ge
+,
+Eq
+,
+Ne
+def
+test_relational_parsing
+():
+#
+Test
+case
+1:
+Basic
+less
+than
+operation
+try
+:
+result
+=
+parse_expr
+(
+’1
+<
+2’
+,
+evaluate
+=
+False
+)
+expected
+=
+Lt
+(1,
+2,
+evaluate
+=
+False
+)
+if
+str
+(
+result
+)
+==
+str
+(
+expected
+):
+print
+(
+"Test
+Case
+1:
+Issue
+resolved"
+)
+else
+:
+print
+(
+"Test
+Case
+1:
+Issue
+reproduced"
+)
+except
+Exception
+as
+e
+:
+print
+(
+"Test
+Case
+1:
+Other
+issues"
+)
+#
+Test
+case
+2:
+Greater
+than
+operation
+try
+:
+result
+=
+parse_expr
+(
+’3
+>
+2’
+,
+evaluate
+=
+False
+)
+expected
+=
+Gt
+(3,
+2,
+evaluate
+=
+False
+)
+if
+str
+(
+result
+)
+==
+str
+(
+expected
+):
+print
+(
+"Test
+Case
+2:
+Issue
+resolved"
+)
+else
+:
+print
+(
+"Test
+Case
+2:
+Issue
+reproduced"
+)
+except
+Exception
+as
+e
+:
+print
+(
+"Test
+Case
+2:
+Other
+issues"
+)
+#
+...
+[6
+more
+test
+cases
+omitted
+for
+brevity]
+...
+#
+Test
+case
+9:
+Chained
+comparisons
+try
+:
+result
+=
+parse_expr
+(
+’1
+<
+x
+<
+2’
+,
+evaluate
+=
+False
+)
+if
+isinstance
+(
+result
+,
+bool
+):
+print
+(
+"Test
+Case
+9:
+Issue
+reproduced"
+)
+else
+:
+print
+(
+"Test
+Case
+9:
+Issue
+resolved"
+)
+except
+Exception
+as
+e
+:
+print
+(
+"Test
+Case
+9:
+Other
+issues"
+)
+Listing 6:
+Test cases for SymPy relational parsing (truncated). Successfully detects incorrect code from correct code.
+D.2
+Example 2: Django Model Choice Field Validation Tests
+The following truncated example demonstrates tests for Django’s ModelChoiceField validation with proper setup and teardown code. This test was generated to address the issue in
+Django PR #13933
+, which improves error messages in ModelChoiceField validation.
+⬇
+from
+django
+.
+forms
+import
+ModelChoiceField
+,
+ModelMultipleChoiceField
+from
+django
+.
+db
+import
+models
+from
+django
+.
+core
+.
+exceptions
+import
+ValidationError
+from
+django
+.
+test
+import
+TestCase
+from
+django
+.
+db
+import
+connection
+#
+Create
+a
+simple
+test
+model
+class
+TestModel
+(
+models
+.
+Model
+):
+name
+=
+models
+.
+CharField
+(
+max_length
+=100)
+def
+__str__
+(
+self
+):
+return
+self
+.
+name
+class
+Meta
+:
+app_label
+=
+’test_app’
+def
+setup_test_environment
+():
+#
+Create
+the
+test
+table
+with
+connection
+.
+schema_editor
+()
+as
+schema_editor
+:
+schema_editor
+.
+create_model
+(
+TestModel
+)
+#
+Create
+some
+test
+data
+TestModel
+.
+objects
+.
+create
+(
+name
+=
+"Option
+1"
+)
+TestModel
+.
+objects
+.
+create
+(
+name
+=
+"Option
+2"
+)
+TestModel
+.
+objects
+.
+create
+(
+name
+=
+"Option
+3"
+)
+def
+cleanup_test_environment
+():
+#
+Drop
+the
+test
+table
+with
+connection
+.
+schema_editor
+()
+as
+schema_editor
+:
+schema_editor
+.
+delete_model
+(
+TestModel
+)
+def
+test_modelchoice_validation
+():
+try
+:
+#
+Test
+Case
+1:
+Basic
+invalid
+choice
+for
+ModelChoiceField
+field
+=
+ModelChoiceField
+(
+queryset
+=
+TestModel
+.
+objects
+.
+all
+())
+try
+:
+field
+.
+clean
+(999)
+#
+Non-existent
+ID
+print
+(
+"Test
+Case
+1:
+Issue
+resolved"
+)
+except
+ValidationError
+as
+e
+:
+if
+"999"
+not
+in
+str
+(
+e
+):
+#
+Value
+should
+be
+in
+error
+message
+print
+(
+"Test
+Case
+1:
+Issue
+reproduced"
+)
+else
+:
+print
+(
+"Test
+Case
+1:
+Issue
+resolved"
+)
+#
+...
+[4
+more
+test
+cases
+omitted
+for
+brevity]
+...
+#
+Test
+Case
+6:
+Valid
+choice
+obj
+=
+TestModel
+.
+objects
+.
+first
+()
+try
+:
+result
+=
+field
+.
+clean
+(
+obj
+.
+id
+)
+if
+result
+==
+obj
+:
+print
+(
+"Test
+Case
+6:
+Issue
+resolved"
+)
+else
+:
+print
+(
+"Test
+Case
+6:
+Issue
+reproduced"
+)
+except
+ValidationError
+:
+print
+(
+"Test
+Case
+6:
+Issue
+reproduced"
+)
+except
+Exception
+as
+e
+:
+print
+(
+f
+"Unexpected
+error:
+{e}"
+)
+Listing 7:
+Test cases for Django ModelChoiceField validation (truncated). Most test cases error due to unhandled exceptions and do not distinguish.
+Appendix E
+Agent Trajectory Example
+This section provides a visual example of an agent’s trajectory while solving a software engineering task. The sequence shows the step-by-step process from problem statement to solution, demonstrating how our agent approaches and solves real-world programming issues.
+Figure 14:
+Problem statement presented to the agent, describing the issue that needs to be resolved.
+(a)
+Step 1: Initial analysis and exploration
+(b)
+Step 2: Detailed Exploration
+(c)
+Step 3: Reproducing the issue
+(d)
+Step 4: Running reproduction tests
+(e)
+Step 5: Implementing the fix
+(f)
+Step 6: Verifying the fix
+Figure 15:
+Short successful agent trajectory (using our 32B model) showing the step-by-step process of solving a software engineering task. The agent analyzes the problem, identifies the root cause, implements a solution, tests it, and verifies that it resolves the issue.
+This trajectory example illustrates several key aspects of our agent’s problem-solving approach:
+•
+Systematic Exploration:
+The agent methodically explores the codebase to understand the context and locate the issue.
+•
+Root Cause Analysis:
+Rather than addressing symptoms, the agent identifies the underlying cause of the problem using
+test_issue.py
+.
+•
+Solution Development:
+The agent formulates a clear plan before implementing changes.
+These capabilities enable our agent to effectively tackle complex software engineering tasks that require deep understanding of code structure, programming language semantics, and software design principles.
\ No newline at end of file
diff --git a/research/notes/reinforcement-learning-via-self-distillation-2.md b/research/notes/reinforcement-learning-via-self-distillation-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..a6a71ea5d651029611483184f2f239d2f34744f0
--- /dev/null
+++ b/research/notes/reinforcement-learning-via-self-distillation-2.md
@@ -0,0 +1,9413 @@
+---
+title: Reinforcement Learning via Self-Distillation
+id: reinforcement-learning-via-self-distillation-2
+tags:
+- deepread
+created: '2026-06-10T00:23:44.609515Z'
+source: https://arxiv.org/html/2601.20802v2
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:44.609224Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Reinforcement Learning via Self-Distillation
+Reinforcement Learning via Self-Distillation
+Jonas Hübotter$^{1}$, Frederike Lübeck$^{*,1,2}$, Lejs Behric$^{*,1}$, Anton Baumann$^{*,1}$, Marco Bagatella$^{1,2}$, Daniel Marta$^{1}$, Ido Hakimi$^{1}$, Idan Shenfeld$^{3}$, Thomas Kleine Buening$^{1}$, Carlos Guestrin$^{4}$, Andreas Krause$^{1}$
+$^{1}$ETH Zurich, $^{2}$Max Planck Institute for Intelligent Systems, $^{3}$MIT, $^{4}$Stanford
+https://github.com/lasgroup/SDPO
+$^{*}$Equal second authorship. Correspondence to
+jonas.huebotter@inf.ethz.ch
+.
+Abstract
+Large language models are increasingly post-trained with reinforcement learning in verifiable domains such as code and math.
+Yet, current methods for reinforcement learning with verifiable rewards (RLVR) learn only from a scalar outcome reward per attempt, creating a severe credit-assignment bottleneck.
+Many verifiable environments actually provide rich textual feedback, such as runtime errors or judge evaluations, that explain
+why
+an attempt failed.
+We formalize this setting as reinforcement learning with rich feedback and introduce
+Self-Distillation Policy Optimization
+(
+SDPO
+), which converts tokenized feedback into a dense learning signal without any external teacher or explicit reward model.
+SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy.
+In this way, SDPO leverages the model’s ability to retrospectively identify its own mistakes in-context.
+Across scientific reasoning, tool use, and competitive programming on LiveCodeBench v6, SDPO improves sample efficiency and final accuracy over strong RLVR baselines.
+Notably, SDPO also outperforms baselines in standard RLVR environments that only return scalar feedback by using successful rollouts as implicit feedback for failed attempts.
+Finally, applying SDPO to individual questions at test time accelerates discovery on difficult binary-reward tasks, achieving the same discovery probability as best-of-$k$ sampling or multi-turn conversations with $3\times$ fewer attempts.
+1
+Introduction
+Figure 1:
+SDPO substantially outperforms an improved version of Group Relative Policy Optimization (GRPO) on LCB v6 with Qwen3-8B.
+Further, SDPO achieves GRPO’s final accuracy in
+$4\times$
+fewer generations.
+Claude Sonnet 4 is the strongest instruct model on the public LCBv6 leaderboard.
+Shaded regions show the standard deviation across 3 seeds.
+Progress in deep reinforcement learning has shown that iterating on experience—acting, receiving feedback, and updating a policy—can unlock capabilities that are difficult to obtain from static supervision alone
+(Mnih et al.,
+2015
+; Silver et al.,
+2016
+;
+2017
+; Berner et al.,
+2019
+)
+.
+The same theme now appears in large language models (LLMs): large-scale post-training with reinforcement learning (RL) has substantially improved performance on reasoning-heavy tasks, especially in settings with programmatic or otherwise verifiable evaluation
+(Jaech et al.,
+2024
+; Guo et al.,
+2025
+; Kimi et al.,
+2025
+; Olmo et al.,
+2025
+)
+.
+Nevertheless, the dominant RL recipe for LLM post-training remains bottlenecked by credit assignment.
+Most current approaches operate in the setting of reinforcement learning with verifiable rewards (RLVR): given a question $x$, the model samples an answer $y\sim\pi_{\theta}(\cdot\mid x)$ and receives a scalar reward $r\in\mathbb{R}$, often binary (e.g., unit-tests pass/fail in code generation).
+Modern policy gradient RLVR methods such as Group Relative Policy Optimization
+(GRPO; Shao et al.,
+2024
+)
+estimate advantages from these sparse outcome rewards.
+Furthermore, when all rollouts in a group receive the same (often zero) reward, GRPO advantages collapse to zero and learning stalls.
+To overcome this sparsity, one might prefer distillation from a strong teacher
+(Guo et al.,
+2025
+; Yang et al.,
+2025a
+; Lu & Thinking Machines Lab,
+2025
+; Guha et al.,
+2026
+)
+, which provides dense, token-level supervision.
+However, strong teachers are often unavailable in online learning, where the goal is to raise the capability ceiling beyond existing models.
+In this work, we argue that the key limitation is not RL per se, but the information bottleneck imposed by scalar outcome rewards.
+Many verifiable environments expose
+rich tokenized feedback
+beyond scalar rewards
+$r$
+, such as runtime errors, failing unit tests, or evaluations from an LLM judge.
+This feedback not only reveals
+whether
+a rollout was wrong, but also
+what
+went wrong.
+We formalize this more general setting as
+Reinforcement Learning with Rich Feedback
+(
+RLRF
+) and illustrate its difference to RLVR in
+Figure 2
+.
+Here, feedback can be any tokenized representation of any state reached by an agentic system.
+The central question becomes: how can we convert rich feedback into effective credit assignment without requiring external supervision from a strong teacher?
+Figure 2:
+Comparison of RLVR and RLRF settings.
+In Reinforcement Learning with Verifiable Rewards (RLVR), the agent learns from a scalar reward
+$r$
+, which often acts as an information bottleneck by masking the underlying environment state.
+In contrast, Reinforcement Learning with Rich Feedback (RLRF) utilizes tokenized feedback.
+This provides a significantly richer signal than a scalar reward, as the feedback can encapsulate both the reward as well as detailed observations of the state (such as runtime errors from a code environment or feedback from an LLM judge).
+⬇
+Runtime
+Error
+ZeroDivisionError
+:
+division
+by
+zero
+Line
+73
+in
+separateSquares
+(
+Solution
+.
+py
+)
+Last
+Executed
+Input
+[[26,30,2],[11,23,1]]
+Figure 3:
+Example of feedback from our code environment, inspired by LeetCode. Listings
+LABEL:lst:feedback_example_wrong_answer
+,
+LABEL:lst:memory_error
+, and
+LABEL:lst:index_error
+in the appendix show examples of feedback in case of a wrong answer, a memory error, and an index error.
+Our starting point is the observation that LLMs already possess a powerful mechanism for using feedback: in-context learning
+(Brown et al.,
+2020
+; Wei et al.,
+2022
+)
+.
+When conditioned on feedback, the same model can often identify plausible mistakes and propose a corrected approach.
+A common example of such feedback is the summary of failed test cases on coding platforms like LeetCode (
+Figure 3
+).
+Many recent works leverage this capability to iteratively generate corrections
+(Chen et al.,
+2021a
+; Madaan et al.,
+2023
+; Shinn et al.,
+2023
+; Yao et al.,
+2024
+; Yuksekgonul et al.,
+2025
+; Lee et al.,
+2025
+)
+.
+In contrast, we use the current policy as a “self-teacher” that, rather than sampling a new response, re-evaluates the
+existing
+rollout after receiving rich feedback.
+Including the feedback in-context transforms the model’s next-token distribution, allowing the self-teacher to agree or disagree with the student’s original choices at specific tokens.
+This yields dense, logit-level credit assignment.
+For example, when provided with the feedback from
+Figure 3
+, the self-teacher can identify how the initial attempt should be modified to avoid the runtime error.
+Crucially, this mechanism incurs no sampling overhead: we simply re-compute the log-probabilities of the original attempt under the self-teacher’s feedback-augmented context.
+Building on this idea, we introduce
+Self-Distillation Policy Optimization
+(
+SDPO
+), an on-policy algorithm that performs RL via self-distillation.
+SDPO samples rollouts from the current policy, obtains rich environment feedback, and then minimizes a logit-level distillation loss that matches the current policy’s next-token distribution to that of the self-teacher.
+Conceptually, SDPO addresses the central limitation of applying distillation to online learning: the absence of a stronger external teacher.
+Instead of relying on a fixed teacher, SDPO leverages the model’s ability to recognize its own mistakes in hindsight.
+By conditioning the current policy on the rich feedback it just received, we construct a self-teacher that provides the dense supervision of distillation while retaining the exploration benefits of on-policy RL.
+Table 1
+summarizes how this positions SDPO relative to RLVR and distillation baselines.
+We include a comprehensive summary of related work in
+Section 6
+.
+We show that SDPO is a policy gradient algorithm whose advantages are estimated using the self-teacher.
+This enables the implementation of SDPO with minor changes to standard RLVR pipelines, simply by swapping out the advantages.
+| Method | Sampling | Signal | Feedback |
+|---|---|---|---|
+| SFT / Distillation (Hinton et al., 2015) | $\boldsymbol{\times}$ off-policy | ✓ rich | $\boldsymbol{\times}$ strong teacher |
+| On-Policy Distillation (Agarwal et al., 2024) | ✓ on-policy | ✓ rich | $\boldsymbol{\times}$ strong teacher |
+| RLVR (such as GRPO) (Lambert et al., 2025) | ✓ on-policy | $\boldsymbol{\times}$ weak | ✓ environment |
+| RL via Self-Distillation (SDPO) (ours) | ✓ on-policy | ✓ rich | ✓ environment |
+Table 1:
+Comparison of self-distillation to alternative methods for post-training LLMs.
+Summary of evaluation results.
+We evaluate SDPO in three online RL settings:
+•
+Learning without rich feedback
+(§
+3
+):
+We evaluate standard RLVR environments that do not return any feedback beyond scalar rewards.
+Here, SDPO treats successful attempts sampled in the current batch as “feedback” for failed attempts on the same question.
+We perform training runs on scientific reasoning and tool use, starting with Qwen3-8B and Olmo3-7B-Instruct.
+We find that SDPO outperforms a strong GRPO baseline that integrates recent improvements: 70.2% vs. 66.6% final accuracy on aggregate.
+SDPO achieves higher accuracy with up to
+$11\times$
+shorter generation lengths compared to GRPO, demonstrating that effective reasoning need not be verbose.
+•
+Learning with rich feedback
+(§
+4
+):
+We evaluate competitive programming problems from LiveCodeBench v6 with LeetCode-style feedback.
+As shown in
+Figure 1
+, SDPO substantially improves over GRPO, reaching a higher final accuracy (48.8% vs. 41.2%) and achieving GRPO’s final accuracy in
+$4\times$
+fewer generations.
+SDPO’s gains grow with model scale, suggesting that the ability for self-teaching emerges as models become stronger in-context learners.
+•
+Discovering novel solutions to hard tasks at test-time
+(§
+5
+):
+Finally, we demonstrate that SDPO can accelerate the discovery of solutions to difficult binary-reward questions.
+This contrasts with RLVR methods, which only begin learning once the first solution has been found.
+We leverage SDPO for
+Test-Time Self-Distillation
+, a form of test-time training where the model specializes to an individual test question.
+We consider very difficult LiveCodeBench questions, for which the base model’s pass@
+$64$
+is below 0.03, and show that SDPO accelerates the discovery of solutions by
+$3\times$
+.
+2
+SDPO: Self-Distillation Policy Optimization
+We propose an algorithm that uses the in-context learning ability of the current policy for assigning credit.
+Our key object is the self-teacher, $\pi_{\theta}(\cdot\mid x,f)$, which refers to the current policy (the “student”) prompted with the question $x$ and the rich feedback $f$.
+Next to the students’ original attempt $y$, $f$ may incorporate two key kinds of feedback: any environment output (such as runtime errors from a code environment) and a sample solution if $x$ was already solved with another attempt in the rollout group.$^{1}$
+$^{1}$In standard RLVR implementations a rollout group contains multiple simultaneous attempts for $x$.
+As discussed before, the self-teacher $\pi_{\theta}(\cdot\mid x,f)$ should have a higher accuracy than the student $\pi_{\theta}(\cdot\mid x)$ since it sees additional information in-context.
+This leads us to observe:
+We can use the same policy in two different roles: As the student for the initial attempt and as the teacher to determine the value of actions in hindsight.
+We introduce
+Self-Distillation Policy Optimization
+(
+SDPO
+) which repeatedly distills the self-teacher into the student.
+Given a question
+$x$
+, we first sample rollouts from the student
+$\pi_{\theta}$
+and obtain corresponding environment feedback.
+We then use the KL-divergence,
+$\mathrm{KL}(p\,\|\,q)=\sum_{i}p(i)\log\frac{p(i)}{q(i)}$
+, as a distance measure for the next-token distributions of student and teacher, and optimize a standard logit distillation loss:
+$$\mathcal{L}_{\mathrm{SDPO}}(\theta):=\sum_{t}\mathrm{KL}\bigl(\pi_{\theta}(\cdot\mid x,y_{<t})\,\|\,\mathrm{stopgrad}(\pi_{\theta}(\cdot\mid x,f,y_{<t}))\bigr)\tag{1}$$
+Algorithm 1
+SDPO
+1:
+Language model
+$\pi_{\theta}$
+; dataset with questions
+$x$
+; number of rollouts
+$G$
+per question; environment to obtain feedback for attempts.
+2:
+repeat
+3:
+Sample question
+$x$
+from dataset.
+4:
+Sample responses:
+$\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta}(\cdot\mid x)$
+.
+5:
+Evaluate responses to obtain feedback
+$f_{i}$
+.
+$\triangleright$
+Self-distillation:
+6:
+Compute log-probs of self-teacher
+$\log\pi_{\theta}(y_{i,t}\mid x,f_{i},y_{i,<t})$
+.
+7:
+Update
+$\theta$
+with gradient descent on
+$\mathcal{L}_{\mathrm{SDPO}}(\theta)$
+.
+8:
+until
+converged
+where the stopgrad operator blocks gradients from flowing through the teacher, and thus prevents it from regressing towards the student and ignoring
+$f$
+.
+The intuitive role of the teacher is to determine where and how the student’s original attempt
+$y$
+was wrong through retrospection based on the feedback
+$f$
+.
+Figure 4
+shows an example of self-teaching with Qwen3-8B as student and self-teacher.
+We summarize SDPO in
+Algorithm 1
+and display the teacher’s reprompt template in
+Table 2
+.
+Figure 4:
+Example of self-teaching with Qwen3-8B. The answer is generated by the model before seeing the feedback. Then, we re-evaluate the log-probs of the original attempt with the
+self-teacher
+after seeing the feedback. We show the per-token
+$\log\frac{\mathbb{P}(\text{self-teacher})}{\mathbb{P}(\text{student})}$
+, with red indicating negative values (
+self-teacher disagrees
+)
+and white indicating values around zero. Notably, in this example, Qwen3-8B identifies the error through retrospection without an explicit solution. Further, the activation is sparse, identifying where mistakes happen and adjusting to the students’ response distribution.
+User:
+prompt
+Correct solution:
+successful_previous_rollout
+The following is feedback from your unsuccessful earlier attempt:
+environment_output
+Correctly solve the original question.
+Assistant:
+original_response
+Table 2:
+Template for self-teacher.
+prompt
+is replaced with the question. A sample solution previously generated by the student is substituted for
+successful_previous_rollout
+(if available for this question; otherwise the paragraph is skipped).
+environment_output
+is replaced with the environment output (see, e.g.,
+Figure
+˜
+3
+) from the models’ original attempt (if it was not successful and there is no solution; otherwise the paragraph is skipped). If the models’ original attempt was successful, this attempt is passed as the correct solution.
+original_response
+is replaced with the models’ original attempt to re-evaluate its log-probabilities under the self-teacher.
+We can derive the SDPO gradient as follows (see
+Section A.1
+for details):
+Proposition 2.1
+.
+The gradient of
+$\mathcal{L}_{\mathrm{SDPO}}$
+is
+$$\nabla_{\theta}\mathcal{L}_{\mathrm{SDPO}}(\theta)=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t=1}^{|y|}\mathbb{E}_{\hat{y}_{t}\sim\pi_{\theta}(\cdot\mid x,y_{<t})}\left[\log\frac{\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})}{\pi_{\theta}(\hat{y}_{t}\mid x,f,y_{<t})}\cdot\nabla_{\theta}\log\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\right]\right].\tag{2}$$
+2.1
+Comparison to RLVR
+Note that the SDPO gradient is a (negated) logit-level policy gradient where the advantages are estimated using the self-teacher.
+[Footnote 2: See
+Section A.4
+for a detailed comparison of the SDPO gradient to the standard policy gradient.]
+We can therefore reuse standard RLVR implementations and simply swap out the advantages.
+Let
+$y_{i}$
+be the
+$i$
+-th rollout from a rollout group of size
+$G$
+for question
+$x$
+, then comparing GRPO and SDPO we have:
+$$A_{i,t}^{\mathrm{GRPO}}:=r_{i}-\mathrm{mean}\{r_{i}\}_{i=1}^{G}\;\text{(constant in $t$)},\quad A_{i,t}^{\mathrm{SDPO}}(\hat{y}_{i,t})=\log\frac{\pi_{\theta}(\hat{y}_{i,t}\mid x,f_{i},y_{i,<t})}{\pi_{\theta}(\hat{y}_{i,t}\mid x,y_{i,<t})}.$$
+The GRPO advantages are applied only to the sampled token
+$y_{i,t}$
+and are constant within a rollout
+$y_{i}$
+.
+[Footnote 3: We use the GRPO
+(Shao et al.,
+2024
+)
+advantage without normalization
+(Liu et al.,
+2025b
+)
+.]
+In contrast, the SDPO advantages are zero only for tokens where student and teacher perfectly agree.
+The SDPO advantage is positive for tokens which are more likely under the teacher while being negative for tokens which are less likely under the teacher.
+Thus, SDPO can be seen as a direct extension of standard RLVR methods in two ways:
+1.
+from 1-bit feedback to
+allowing arbitrary sequences of tokens as feedback
+, and
+2.
+leveraging this rich feedback to
+estimate dense logit-level advantages
+.
+This tight connection to RLVR methods also enables a straightforward extension of the SDPO gradient from
+Equation 2
+to off-policy data via PPO-style clipped importance sampling
+(Schulman et al.,
+2017
+)
+, see
+Section A.4
+.
+2.2
+Compute time & memory
+Figure 5:
+Time per step for SDPO vs GRPO (solid: without code environment, light: with code environment).
+The only computational overhead of SDPO compared to GRPO is the additional computation of log-probs from the self-teacher, which can be effectively parallelized and is substantially faster than sequential generation.
+Figure 5
+compares the compute time of SDPO and GRPO.
+As expected, the compute overhead of SDPO is relatively small.
+Here, we use a micro batch size of 2;
+[Footnote 4: The micro batch size corresponds to the number of rollouts we train on at a time while accumulating gradients.]
+compute time can be further reduced by using larger micro batch sizes.
+Naively computing the KL divergence between student and teacher requires holding full logits of both models in memory.
+To avoid this, we approximate the KL divergence in the SDPO loss by performing top-
+$K$
+distillation (i.e., only computing the top-
+$K$
+logits of the student and the corresponding logits of the teacher alongside a term capturing the tail probability; cf.
+Section A.3
+). With a reasonable choice of
+$K$
+(e.g.,
+$K=100$
+), this avoids virtually any memory overhead while capturing most of the information.
+2.3
+Stability improvements
+We find that two practical modifications significantly enhance the training stability of SDPO.
+First, we employ a regularized self-teacher, implemented either via an exponential moving average (EMA) of the student parameters or by interpolating the current teacher with the initial teacher (cf.
+Section
+˜
+A.2
+).
+As detailed later, both strategies effectively stabilize learning.
+Second, we adopt the symmetric Jensen-Shannon divergence for the distillation loss; this formulation has similarly been shown to improve stability in on-policy distillation from external teachers
+(Agarwal et al.,
+2024
+)
+.
+3
+Learning without Rich Environment Feedback
+We first evaluate SDPO in standard RLVR environments, where feedback is limited to scalar rewards.
+Instead of using the scalar reward, SDPO treats successful attempts sampled in the current batch as “feedback” for failed attempts on the same question.
+By comparing the student’s attempt with a correct solution, the self-teacher can identify where the student was wrong and provide dense credit assignment.
+Figure 6:
+Training progression of Olmo3-7B-Instruct on Chemistry. We report the average accuracy across 16 samples per question and a rolling average of response lengths over 5 steps. We report GRPO with the optimal hyperparameters for this model and task. We run each configuration for 3 seeds and report standard errors as shaded areas.
+3.1
+Experimental setting
+We evaluate tasks on which the model has not been explicitly fine-tuned:
+•
+Science Q&A
+(Chemistry, Physics, Biology, Materials science): Undergraduate-level scientific reasoning using reasoning subsets (L3) from SciKnowEval
+(Feng et al.,
+2024a
+)
+.
+•
+Tool use
+: Mapping a tool-API specification and user request to the correct tool call, using ToolAlpaca
+(Tang et al.,
+2023
+)
+.
+We perform a train-test split to test in-domain generalization.
+We use Qwen3-8B
+(Yang et al.,
+2025a
+)
+and Olmo3-7B-Instruct
+(Olmo et al.,
+2025
+)
+as initial checkpoints and report avg@16 relative to wall-clock training time, excluding initialization & validation.
+Baselines.
+We compare SDPO to an improved variant of
+GRPO
+(Shao et al.,
+2024
+)
+, which incorporates several recent modifications
+(Olmo et al.,
+2025
+; Khatri et al.,
+2026
+)
+such as asymmetric clipping
+(Yu et al.,
+2025
+)
+, avoiding biased normalization
+(Liu et al.,
+2025b
+)
+, and correcting for off-policy data when using efficient inference frameworks
+(Yao et al.,
+2025
+)
+.
+We integrate these modifications into a GRPO implementation that represents a strong baseline, as detailed in
+Equation
+˜
+14
+in
+Section
+˜
+A.4
+.
+GRPO enables off-policy training through PPO’s clipped importance weighting
+(Schulman et al.,
+2017
+)
+.
+We additionally report the special case of
+on-policy GRPO
+(matching the hyperparameters of vanilla SDPO).
+For both baselines, we perform a hyperparameter sweep and report results for the models that achieve the highest validation performance across all target tasks.
+Hyperparameters and training details are provided in
+Appendix
+˜
+E
+.
+We use the
+verl
+library
+(Sheng et al.,
+2025
+)
+for fast multi-GPU training.
+3.2
+Results
+Table
+˜
+3
+summarizes our results.
+We find that SDPO outperforms GRPO across almost all runs, often leading to substantial improvements.
+SDPO learns notably faster than GRPO, performing close to 5 hours of GRPO training after only 1 hour of training with SDPO in several cases.
+SDPO achieves a particularly substantial improvement over GRPO on the Chemistry task, as is displayed in Figure
+6
+(left)
+.
+With Olmo3-7B-Instruct,
+SDPO achieves the 5h GRPO accuracy in 50 minutes of wall-clock training time
+, a
+$6\times$
+speedup.
+Moreover, SDPO’s 5h accuracy is more than
+10 percentage points higher than that of GRPO.
+Chemistry
+Physics
+Biology
+Materials
+Tool use
+1h
+5h
+1h
+5h
+1h
+5h
+1h
+5h
+1h
+5h
+Qwen3-8B
+41.2
+59.2
+30.8
+58.9
+57.5
++ GRPO
+65.9
+65.9
+74.5
+74.5
+63.8
+63.8
+72.7
+72.7
+35.1
+35.1
+59.9
+74.3
+77.1
+77.1
+64.9
+64.9
+67.7
+67.7
++ GRPO (on-policy)
+63.3
+63.4
+63.6
+63.6
+49.8
+49.8
+73.9
+74.1
+60.2
+65.7
++
+SDPO
+(on-policy)
+73.2
+80.9
+66.6
+75.6
+50.6
+56.8
+72.1
+78.4
+68.0
+68.5
+Olmo3-7B-Instruct
+22.8
+37.7
+16.2
+36.7
+39.3
++ GRPO
+39.7
+56.7
+55.3
+63.3
+63.3
+35.6
+55.8
+70.9
+75.0
+75.0
+56.4
+65.0
++ GRPO (on-policy)
+51.4
+57.5
+62.7
+62.7
+49.8
+49.8
+73.3
+73.5
+56.8
+60.6
++
+SDPO
+(on-policy)
+68.0
+80.0
+59.9
+66.1
+48.0
+52.8
+52.8
+73.7
+79.1
+60.8
+62.1
+62.1
+Table 3:
+Comparison of SDPO and GRPO on reasoning-related benchmarks.
+We report the highest achieved avg@16 within 1 hour and 5 hours of wall-clock training time, respectively. Both SDPO and on-policy GRPO perform one gradient step per generation batch, while GRPO performs 4 off-policy mini batch steps. We select optimal hyperparameters for SDPO and baselines based on 5h accuracy. Each run is performed on a node with 4 NVIDIA GH200 GPUs. Together with initialization and validation, each run takes approximately 6 hours.
+We remark that our results with SDPO use strictly on-policy training (i.e., one gradient step per generation batch).
+Given the known efficiency gains of off-policy methods that perform multiple gradient updates per generation batch, we believe that studying SDPO with off-policy updates is an exciting direction for future work.
+Takeaway 1
+We demonstrate that SDPO can learn to reason effectively, generalizing to challenging reasoning tasks.
+Without requiring any modification to existing RLVR environments, SDPO outperforms GRPO substantially in several cases.
+3.3
+Self-distillation learns to reason concisely
+We consistently observe that SDPO produces substantially shorter generations than GRPO while achieving higher accuracy.
+SDPO’s responses are more than
+$3\times$
+shorter on average across tasks (cf.
+Table 8
+in
+Appendix D
+).
+On Chemistry with Olmo3-7B-Instruct, SDPO even achieves an
+$11\times$
+reduction in response length relative to GRPO while maintaining higher accuracy (Figure
+6
+(right)
+).
+While recent progress in RLVR has demonstrated that scaling response length is a powerful driver of emergent reasoning capabilities
+(Jaech et al.,
+2024
+; Guo et al.,
+2025
+; Muennighoff et al.,
+2025
+)
+, our results suggest that effective reasoning need not always be verbose. We find that SDPO improves the
+efficiency
+of reasoning.
+Qualitatively, we observe that the longer responses from GRPO often stem from “superficial” reasoning rather than necessary analytical steps.
+GRPO frequently generates filler phrases like “Hmm” and “Wait” or enters circular logical loops that repeat previous steps verbatim.
+Figure
+˜
+7
+displays a representative example of this phenomenon.
+Remarkably, SDPO’s generations remain concise and avoid these superficial patterns.
+This may be explained by SDPO’s dense credit assignment, which assigns a specific advantage to each next-token prediction, leading to sparse advantages (cf.
+Figure
+˜
+21
+in
+Appendix
+˜
+F
+).
+By improving the efficiency of reasoning, SDPO reduces inference generation time and demonstrates that reasoning performance can be improved by refining
+how
+the model reasons, not just how
+long
+it reasons.
+…
+Alternatively
+…
+Closer to D? No
+…
+Wait I’m going in circles
+…
+Wait, perhaps the correct answer is B
+…
+10
+1.85
+≈
+69.3
+\smash{10^{1.85}\approx 69.3}
+…
+Ah, this works
+…
+Wait I think I messed up
+…
+Hmm
+…
+10
+1.85
+≈
+69.3
+\smash{10^{1.85}\approx 69.3}
+…
+Thus, the correct answer is likely B: 1.85.
+<answer>
+B
+</answer>
+(a)
+GRPO (5,549 tokens)
+…
+At pH 7.4, all functional groups are neutral
+…
+maintaining a balance between hydrophobic and hydrophilic character
+…
+[The] overall polarity
+…
+keeps logD from being very high
+…
+or very low
+…
+[typically falling] in the 2.0-3.0 range, with 2.61 (C) being a reasonable estimate
+…
+<answer>
+C
+</answer>
+(b)
+SDPO (764 tokens)
+Figure 7:
+Example responses from GRPO and SDPO after 50 training steps to the following question: “What is the correct octanol/water distribution coefficient logD under the circumstance of pH 7.4 for the molecule
+O=C1O[C@@H](COc2ccon2)CN1c1ccc(C2=CCOCC2)c(F)c1
+?” The answer options are A: 1.32, B: 1.85, C: 2.61, D: 3.76. The correct answer is
+C
+.
+GRPO’s answer contains
+$5\times$
+“Hmm.”,
+$9\times$
+“No.”, and
+$25\times$
+“Wait”. Further, GRPO’s answer repeats calculations such as “
+$10^{1.85}\approx 69.3$
+”, which appears four times, and the model even explicitly generates “Wait I’m going in circles”.
+SDPO’s answer avoids any circular reasoning and is more than
+$7\times$
+shorter. The base model is Qwen3-8B.
+4
+Learning with Rich Environment Feedback
+We next evaluate SDPO on coding tasks.
+Coding is a canonical example of an RL environment that provides rich feedback, such as runtime errors and failed unit tests.
+Learning to solve these coding problems requires strong credit assignment since the student must identify its precise mistakes to avoid repeating them in the future.
+LiveCodeBench
+(LCB; Jain et al.,
+2025
+)
+provides a set of contest-style coding problems, ranging from simple to competition-level.
+We restrict our evaluation to the most recent LCBv6 subset of LCB, which contains 131 questions released between February and May 2025.
+We consider a setting with public and private unit tests, common for code contests and coding platforms like LeetCode, where the public tests are used for evaluation during training and the private tests are used for validation
+(Chen et al.,
+2022
+; Le et al.,
+2022
+; El-Kishky et al.,
+2025
+; Samadi et al.,
+2025
+)
+.
+[Footnote 5: We select public tests as a 50% random subset of private tests.]
+We use the Qwen3
+(Yang et al.,
+2025a
+)
+model family for our experiments, with Qwen3-8B as default unless otherwise specified.
+We report the average accuracy over 4 rollouts and use the same GRPO baseline as outlined in
+Section
+˜
+3.1
+.
+Results.
+Figure
+˜
+1
+compares the learning curves of SDPO and GRPO on LCBv6.
+We find that SDPO achieves a substantially higher final accuracy (48.8%) than GRPO (41.2%) while also outperforming the strongest instruct models on the public LCBv6 leaderboard:
+[Footnote 6: On the public leaderboard, the LCBv6 subset can be obtained by selecting February to May 2025.]
+Claude Sonnet 4 (40.5%) and Claude Opus 4 (39.7%).
+Furthermore, SDPO reaches the final accuracy of GRPO in
+$4\times$
+fewer generations.
+We include an extended comparison to other RLVR baselines that perform similarly to GRPO in
+Table
+˜
+9
+in the appendix.
+Differentiating between the easy, medium, and hard questions of LCB, we find that SDPO particularly improves over GRPO in solving medium and hard questions (cf.
+Figure
+˜
+15
+in the appendix).
+4.1
+Self-distillation benefits from stronger models
+Figure 8:
+SDPO improves with model size.
+We compare the final LCBv6 validation accuracy of SDPO and GRPO at train step 80, across model sizes from Qwen3.
+The ability of SDPO’s teacher to perform accurate retrospection appears to be an emergent phenomenon with scale.
+We include an additional scaling study with Qwen2.5-Instruct in the appendix (cf.
+Figure
+˜
+17
+) which further supports this finding.
+Error bars indicate the standard error across 3 seeds.
+A central question for our work is whether SDPO is sensitive to the in-context learning ability of the base model.
+Intuitively, we expect that SDPO benefits from a strong in-context learner, since this enables the teacher to perform more accurate retrospection.
+To answer this question, we perform a scaling study with different model sizes from the Qwen3
+(Yang et al.,
+2025a
+)
+family.
+As shown by extensive prior work, the ability to learn in-context increases with model size
+(e.g., Brown et al.,
+2020
+)
+.
+As depicted in
+Figure
+˜
+8
+, SDPO significantly outperforms GRPO on larger models while only slightly improving over GRPO on smaller models.
+To determine whether SDPO can also underperform GRPO on a model weaker than Qwen3-0.6B, we performed an additional scaling study with Qwen2.5-Instruct
+(Qwen et al.,
+2024
+)
+.
+While outperforming GRPO with Qwen2.5-7B and performing similarly with Qwen2.5-3B, we find that SDPO underperforms GRPO on Qwen2.5-1.5B, as seen in
+Figure 17
+in
+Appendix D
+.
+Takeaway 2
+Our results suggest that the marginal improvement of SDPO over GRPO is tightly coupled with the strength of the base model, and motivate future study on models stronger than Qwen3-8B.
+In the same way that in-context learning is an emergent phenomenon with scale, the self-teacher’s ability to perform accurate retrospection in SDPO appears to be emergent with scale.
+4.2
+Self-distillation performs dense credit assignment
+Figure 9:
+Dense credit assignment in SDPO in the example from
+Figure
+˜
+4
+. Shown in blue are tokens which become more likely under the self-teacher. The self-teacher identifies how the returned range has to be modified so that it does not contain
+n
+.
+Whereas GRPO assigns a constant advantage to each generated token, SDPO assigns an individual advantage to
+each possible next token
+along the generated sequence based on the agreement of student and teacher.
+At each position
+$t$
+in the generated sequence
+$y$
+, there are
+$|\mathcal{V}|$
+possible next tokens where
+$\mathcal{V}$
+is the vocabulary.
+In distillation, this level is typically called the
+logit-level
+since it corresponds to the logits of the model.
+In practice, we approximate the full next-token distribution by the top-
+$K$
+tokens plus the tail, and as such, SDPO assigns
+$|y|\cdot(K+1)$
+unique advantages per sequence.
+This is illustrated in
+Figure
+˜
+9
+and allows SDPO to perform dense credit assignment.
+A natural question is whether the performance gains of SDPO are due to leveraging rich feedback in RLRF or due to the dense credit assignment of SDPO.
+To answer this question, we ablate the performance of SDPO in three configurations:
+•
+Logit-level SDPO:
+credit assignment over the 100 most likely tokens (under the student) at each position.
+•
+Token-level SDPO:
+credit assignment over the most likely token at each position.
+•
+Sequence-level SDPO:
+We compute SDPO advantages for all generated tokens and average them to produce a single scalar advantage per sequence (as in GRPO). This does not perform denser credit assignment than GRPO but still leverages the rich feedback
+$f$
+.
+As shown in Figure
+10
+(left)
+, the dense credit assignment of logit-level SDPO leads to significant performance gains over token-level SDPO and sequence-level SDPO.
+Nevertheless, even sequence-level SDPO outperforms GRPO, indicating that leveraging rich feedback in RLRF can lead to substantial gains over RLVR methods even without dense credit assignment.
+4.3
+The self-teacher improves during training
+Figure 10:
+Left: Rich feedback in RLRF and dense credit assignment of SDPO are complementary.
+We compare logit-level, token-level, and sequence-level SDPO advantages to GRPO. While denser credit assignment in SDPO is beneficial (logit-level > token-level > sequence-level), even sequence-level SDPO significantly outperforms GRPO due to leveraging the rich feedback. Error bars indicate the standard error across 3 seeds.
+Right: The self-teacher improves during training.
+We display the generative accuracy of the self-teacher compared to student on the current training batch (with a rolling average over 5 steps). The final student score is taken at step 80. Notably, the performance of the student significantly surpasses the initial teacher’s accuracy. Error bars indicate the standard deviation across 3 seeds.
+Teacher | Accuracy | Avg accuracy
+$q_{\theta}$ | $36.1\pm 1.6$ | $29.8\pm 1.3$
+$q_{\theta_{\mathrm{ref}}}$ | $48.8\pm 0.7$ | $44.4\pm 0.2$
+Trust-region | $\mathbf{50.6}\pm 0.9$ | $\mathbf{45.6}\pm 0.2$
+EMA | $49.3\pm 0.3$ | $\mathbf{45.3}\pm 0.2$
+Table 4:
+Best/average accuracy until step 90 of various methods for teacher regularization. Trust-region and EMA teachers use
+$\alpha=0.01$
+. Training of the
+$q_{\theta}$
+teacher eventually diverges. Error ranges indicate standard errors across 3 seeds.
+Contrary to standard distillation, the self-teacher in SDPO is not frozen, but updated throughout training.
+This is a critical component of SDPO, since it enables the teacher to improve over time, which means that the student can learn from a stronger target.
+To investigate whether the self-teacher improves during training, we plot the average accuracy when
+generating
+using the self-teacher in Figure
+10
+(right)
+.
+We find that the self-teacher improves significantly during training.
+Most notably, the student’s accuracy surpasses the initial teacher’s accuracy in later stages of training.
+This demonstrates that SDPO enables true bootstrapping of a weak model to a strong model, without the initial self-teacher’s performance limiting the final student.
+As described in
+Section
+˜
+2.3
+, SDPO uses a regularized teacher to stabilize training.
+As can be seen in
+Table
+˜
+4
+, a non-regularized teacher significantly underperforms the regularized teachers.
+Furthermore, trust-region and EMA teachers outperform the teacher frozen at the initial teacher’s parameters, showing that the teacher improves through parameter sharing with the student.
+Yet, SDPO performs well even with a frozen teacher.
+4.4
+On-policy self-distillation avoids catastrophic forgetting
+Prior work has shown that a key benefit of on-policy algorithms, such as GRPO, is that models tend not to forget previously obtained capabilities
+(Shenfeld et al.,
+2026b
+; Chen et al.,
+2025b
+; Lu & Thinking Machines Lab,
+2025
+)
+.
+This is practically desirable since it enables continual training pipelines where a model is trained sequentially on diverse tasks without the need to retrain from scratch.
+To evaluate forgetting, we test the final checkpoints of GRPO and SDPO on diverse holdout tasks: IFEval
+(Zhou et al.,
+2023
+)
+, which tests the ability of a model to follow precise format instructions; ArenaHard-v2
+(Li et al.,
+2025a
+)
+, which is an LLM-judged benchmark of real-world instruction-following prompts derived from LMArena
+(Chiang et al.,
+2024
+)
+; and MMLU-Pro
+(Wang et al.,
+2024b
+)
+, which tests broad multi-task knowledge and reasoning.
+As displayed in
+Table
+˜
+5
+, SDPO learns the new task while mitigating degradation of initial capabilities, overall achieving a better performance–forgetting tradeoff than GRPO.
+Off-policy self-distillation baseline.
+As an additional baseline, we consider training the student via supervised fine-tuning (SFT) on successful generations from the self-teacher
+(Scheurer et al.,
+2023
+; Dou et al.,
+2024
+; Zhou et al.,
+2025
+)
+.
+[Footnote 7: SFT on a teacher’s predictions is a standard off-policy distillation approach
+(Kim & Rush,
+2016
+)
+.]
+This requires
+$2\times$
+the generations of SDPO for the same number of steps, since we have to generate from both the student and the teacher.
+We report SFT on the successes of the self-teacher, which achieves a higher accuracy than also including initial successes from the student in the SFT data.
+As shown in
+Table
+˜
+5
+, SFT on the self-teacher significantly underperforms SDPO on LCBv6, while leading to worse forgetting of prior capabilities.
+This mirrors prior findings on the instability of off-policy imitation
+(see, e.g., Agarwal et al.,
+2024
+)
+.
+Method | Task: LCBv6 | Holdout: IFEval | Holdout: ArenaHard-v2 (hard prompt) | Holdout: ArenaHard-v2 (creative writing) | Holdout: MMLU-Pro | Avg. (holdout)
+Base | 27.9 | 83.9 | 14.0 | 13.7 | 62.5 | 43.5
+SFT on self-teacher | 42.7 | 83.7 | 11.2 | 8.9 | 61.9 | 41.4
+GRPO | 41.2 | 82.2 | 12.0 | 10.8 | 62.3 | 41.8
+SDPO | 48.8 | 83.2 | 12.3 | 11.1 | 62.9 | 42.4
+Table 5:
+On-policy methods do not suffer from catastrophic forgetting.
+We compare the accuracy of the final checkpoint on the training task LCBv6 and on holdout tasks IFEval, ArenaHard-v2, and MMLU-Pro. We compare to a baseline that trains directly on responses generated by the initial self-teacher with SFT. Overall, SDPO achieves the best performance–forgetting tradeoff. We include additional baseline results in
+Table
+˜
+9
+in the appendix.
+4.5
+Can GRPO and SDPO be combined?
+GRPO utilizes Monte Carlo advantages, which are unbiased with respect to the objective of maximizing expected reward
+$J(\theta):=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[r(y\mid x)\right]$
+.
+In contrast, SDPO advantages are inherently biased with respect to
+$J(\theta)$
+due to being computed from rich feedback and a self-teacher.
+This dichotomy parallels the fundamental distinction between Monte Carlo and bootstrapped advantages in RL: while the latter are biased, they typically yield lower variance
+(Sutton & Barto,
+1998
+; Schulman et al.,
+2016
+)
+.
+This motivates a hybrid approach that combines reward-derived GRPO advantages with feedback-derived SDPO advantages:
+$$A_{i,t}^{\mathrm{SDPO+GRPO}}(\hat{y}_{i,t}):=\lambda A_{i,t}^{\mathrm{GRPO}}(\hat{y}_{i,t})+(1-\lambda)A_{i,t}^{\mathrm{SDPO}}(\hat{y}_{i,t}),\quad\lambda\in[0,1].\tag{3}$$
+Figure 11:
+We compare the LCBv6 validation accuracy at step 80, across model sizes from Qwen3.
+SDPO+GRPO significantly outperforms SDPO on the weaker Qwen3-0.6B, while slightly underperforming SDPO on stronger models.
+We use
+$\lambda=0.9$
+.
+Error bars indicate the standard error across 3 seeds.
+As shown in
+Figure
+˜
+11
+, SDPO+GRPO appears to be more robust to weaker models than SDPO.
+Intuitively, in a weaker model such as Qwen3-0.6B, the SDPO advantages are less reliable, and hence including the GRPO advantage helps to stabilize training.
+In contrast, we find that SDPO+GRPO slightly underperforms SDPO on stronger models such as Qwen3-8B.
+This suggests that the signal of GRPO, only informed by a scalar reward, can be actively harmful with a strong initial model.
+4.6
+Which feedback is most informative?
+To understand which type of rich feedback is most informative, we ablate the three types of feedback present in a verifiable environment like code generation: the sample solution (if a successful rollout is available in the current rollout group), the environment output (such as runtime errors), and the student’s original attempt.
+Sample solutions.
+Including a sample solution from a failed attempt’s rollout group (if available) closely mirrors the group-relative advantages of GRPO.
+We emphasize that these sample solutions are always generated by the student, as in GRPO, and do not require an expert model.
+They allow for disincentivizing unsuccessful approaches if the model is already able to solve the question.
+However, unlike GRPO where all tokens receive the same negative advantage, the self-teacher can identify specific mistakes and provide feedback on how to fix them.
+Environment output.
+The environment output describes the state of the environment after the student’s attempt.
+This is complementary to sample solutions since it can provide useful signal even if the student has never solved the question before (a setting we explore extensively in
+Section
+˜
+5
+).
+Leveraging environment output is a key differentiating factor between RLRF and RLVR settings.
+Student’s original attempt.
+The student’s original attempt $y$ does not have to be included in the reprompting template of the teacher.
+Indeed, we find that including it biases the teacher towards the student’s attempt (cf. Table 6).
+This reduces the entropy of the student’s distribution (particularly for initially uncertain tokens), thereby reducing exploration.
+| Feedback $f$ | Teacher before training: $\uparrow$ Acc. (%) | Teacher before training: $\downarrow$ Same output (%) | Student trained with SDPO: $\uparrow$ Acc. (%) | Student trained with SDPO: Avg. entropy |
+|---|---|---|---|---|
+| $f=$ output | $32.5\pm 0.5$ | $13.7\pm 0.6$ | $39.9\pm 1.1$ | $0.40\pm 0.0$ |
+| $f=$ own solution | $\mathbf{42.4}\pm 1.0$ | $12.1\pm 0.7$ | $42.6\pm 1.3$ | $0.41\pm 0.0$ |
+| $f=$ output + own solution | $\mathbf{42.5}\pm 1.2$ | $\mathbf{10.1}\pm 0.2$ | $\mathbf{48.3}\pm 1.4$ | $0.38\pm 0.0$ |
+| $f=y$ + output + own solution | $39.3\pm 0.8$ | $30.0\pm 0.9$ | $44.5\pm 1.3$ | $\emph{0.23}\pm 0.0$ |
+Table 6: Performance of varying kinds of feedback.
+We evaluate informativeness of feedback based on SDPO training (until step 60) as well as the direct impact on the self-teacher. “Same output” measures the percentage of cases where the teacher receives the same environment output as the student’s initial attempt (i.e., not exploring alternative approaches). We observe that environment output and sample solutions are complementary and each provide informative feedback. Naively including only solutions or initial attempts $y$ significantly reduces diversity in the teacher and student. We remark that the sample solutions are generated by the student, enabling similar group-relative advantage estimation to GRPO. Error bars indicate standard deviation across 3 seeds.
+We summarize results in Table 6, where we evaluate the effect on SDPO training as well as the direct impact on the self-teacher.
+We find that environment output & sample solutions are complementary, each providing informative feedback.
+Generally, we observe that performance is not sensitive to syntactic variations of the reprompting template from Table 2.
+5
+Solving Hard Questions via Test-Time Self-Distillation
+In Sections 3 and 4, we have demonstrated that SDPO can substantially improve over RLVR methods when performing “train-time RL” for reasoning tasks.
+We now turn to a test-time setting where the model is given only a single hard (binary-reward) question $x$ and must discover a solution as quickly as possible:
+Definition 5.1 (Discovery time). The discovery time is the number of trials needed until a solution is found (i.e., the smallest $k$ with the $k$-th attempt $y_{k}$ receiving reward 1).
+Based on this notion, we can define a measure of the efficacy of discovery:
+$$
+\begin{split}
+\mathrm{discovery@}k :=&\ \mathbb{P}(\text{discovery time} \leq k)\\
+=&\ \mathbb{P}\big(r(y_{1}\mid x)=1 \text{ or } r(y_{2}\mid x)=1 \text{ or } \ldots \text{ or } r(y_{k}\mid x)=1\big),
+\end{split}
+\tag{4}
+$$
+where the probability is over any randomness in the algorithm producing $y_{k}$ and the rewards.
+Thus, the discovery@$k$ metric quantifies the probability of discovering the solution within $k$ steps.
+[Footnote 8] Our proposed discovery@$k$ metric is a canonical metric in the study of runtime speedup (i.e., time until termination; Dolan & Moré, 2002).
+While prior work has studied discovery with continuous rewards (e.g., Novikov et al., 2025; Yuksekgonul et al., 2026), discovery with language models in sparse or binary-reward settings does not allow “hill-climbing” a continuous reward and has remained less well understood.
+The most naive approach to discovery in binary-reward tasks is to sample repeatedly i.i.d. from the base model, also known as best-of-$k$.
+The canonical pass@$k$ metric for best-of-$k$ sampling is exactly the probability of discovering at least one solution within $k$ independent samples from a fixed model, coinciding with discovery@$k$.
+The discovery@$k$ metric generalizes pass@$k$ to algorithms that sample attempts sequentially.
+A common sequential approach re-prompts the base model with additional context from previous attempts (Madaan et al., 2023; Shinn et al., 2023).
+We refer to this as multi-turn sampling.
+Here, the model itself does not change, only its context evolves over time.
+Performing RLVR on the question $x$ does not improve over best-of-$k$ sampling from the base model, since a binary reward provides no signal until the first solution has already been found.
+[Footnote 9] For this reason, several works consider explicitly constructing curricula of solvable questions (e.g., Zhao et al., 2025; Huang et al., 2026; Diaz-Bone et al., 2025; Hübotter et al., 2025b), which self-distillation avoids. Other work found that RLVR yields limited improvement on hard questions (Yue et al., 2025).
+An RLRF method like SDPO does not face the same limitation, as it receives rich feedback from the environment after each attempt.
+This rich feedback enables the model to repeatedly “correct” its mistakes as it encounters them and receives feedback, even before ever discovering a solution.
+In contrast to multi-turn sampling, SDPO repeatedly compresses context $c=(y_{k},f_{k})$ by distilling $\pi_{\theta}(\cdot\mid x,c)$ into a model $\pi_{\theta^{\prime}}(\cdot\mid x)$ as we illustrate in Figure 12.
+This self-distillation enables SDPO to continually learn over long contexts, whereas the memory bottleneck of transformers inherently limits the context length of multi-turn sampling (Vaswani et al., 2017).
+In this section, we seek to answer the question:
+Can repeatedly compressing context into model weights via self-distillation
+accelerate discovery for hard questions?
+Figure 12: Compressing context into model weights via self-distillation.
+We illustrate the process of distilling the interaction history (context $c$) into the model parameters $\theta$.
+The model $\pi_{\theta}$ repeatedly attempts a fixed hard question $x$, generating an answer $y$ and receiving feedback $f$.
+Rather than appending this history to the context window, the model updates its weights $\theta_{t}\to\theta_{t+1}$ with SDPO (batch size $1$) based on the feedback, effectively “fixing” mistakes by encoding $\pi_{\theta}(\cdot\mid x,c)$ directly into the policy $\pi_{\theta^{\prime}}(\cdot\mid x)$.
+5.1
+Experimental setting
+We consider a particularly challenging subset of questions from LCBv6 that are at Qwen3-8B’s performance ceiling and require significant test-time sampling to find any solution.
+Concretely, we define two groups using Qwen3-8B’s pass@$k$: hard tasks with $\text{pass@}64<0.5$ and very hard tasks with $\text{pass@}64<0.03$.
+Among these, we retain questions for which any of best-of-$k$, multi-turn, or SDPO find at least one solution within $512$ steps across $5$ seeds.
+This results in 19 hard and 9 very hard questions.
+For best-of-$k$ sampling under the base model, we report the standard $\text{pass}@k$ estimate (Chen et al., 2021b) from 2944 independent rollouts.
+As multi-turn sampling, we sequentially reprompt the model in-context using the concatenated feedback from previous attempts. To remain within Qwen3-8B’s 40k-token context limit, we employ a first-in, first-out sliding window, discarding the earliest feedback once the maximum prompt length (32k tokens) is reached.
+We ablate the multi-turn reprompting strategy in Figure 19 in Appendix D and find that retaining only past feedback while forgetting earlier attempts significantly outperforms the baseline that additionally retains past attempts.
+We evaluate SDPO with a batch size of 16. We ablate this choice in Figure 19 in Appendix D and find that overall performance differences are marginal, yet smaller batch sizes are beneficial for improvements at low generation budgets, while larger batch sizes result in more stable updates that still learn to solve questions at later stages of the run.
+5.2
+Results
+Figure 13: Self-distillation at test-time solves LiveCodeBench questions that neither the base model nor multi-turn conversations can solve.
+Left: Very hard questions (9 total) from LCBv6 where the base model achieves $\text{pass}@64<0.03$, i.e., in less than 3% of cases, sampling 64 responses yields any success.
+Right: Hard questions (19 total) from LCBv6 where the base model achieves $\text{pass}@64<0.5$.
+We report the $\text{discovery}@k$ metric, representing the probability of discovering at least one solution within $k$ total generations.
+Across both difficulty levels, SDPO achieves higher $\text{discovery}@k$ rates at almost all generation budgets, compared to the base model and a multi-turn conversation baseline that receives the feedback in-context. We report the mean and bootstrapped 90% confidence intervals of the mean across 5 random seeds per question.
+Figure 13 compares $\text{discovery}@k$ for SDPO, multi-turn sampling, and best-of-$k$ sampling on very hard (left) and hard (right) questions from LCBv6. Across both difficulty levels, SDPO achieves substantially higher $\text{discovery}@k$ rates at almost all generation budgets.
+On very hard tasks, multi-turn and best-of-$k$ largely fail to solve questions within the available generation budget, achieving discovery@2750 of only $35.6\%$ and $41.5\%$, respectively, whereas SDPO discovers a solution in $53.2\%$ of cases.
+SDPO not only solves more questions overall but also does so with substantially fewer attempts.
+Notably, to reach a $22\%$ discovery probability on very hard questions, SDPO requires approximately $3\times$ fewer generations than best-of-$k$ and multi-turn sampling.
+On hard tasks, SDPO reaches a $78\%$ discovery@2750 probability while achieving a $67\%$ discovery probability with roughly $2.4\times$ fewer generations than best-of-$k$ and multi-turn sampling. Overall, multi-turn and best-of-$k$ sampling solve only $68.4\%$ and $72.3\%$ of questions, respectively.
+The context window length for multi-turn sampling is reached after 837 ($\pm 466$) steps for hard questions and after 1007 ($\pm 349$) steps for very hard questions, offering a possible explanation for its diminishing gains at high generation budgets.
+Question 3 is only solved by SDPO.
+SDPO solves all questions that are solved by best-of-$k$ and multi-turn sampling. Beyond that, SDPO uniquely discovers a solution for Q3, which is solvable neither with multi-turn sampling nor with best-of-$k$ sampling within 2750 attempts. In contrast, SDPO first discovers a solution for Q3 after 321 attempts, which corresponds to 20 iteration steps of self-distillation based on feedback with a batch size of 16. We include detailed per-question results in Table 10 in Appendix D.
+The initial self-teacher does not solve hard questions.
+Notably, the self-teacher’s initial accuracy is $<1$% for almost all questions, and even exactly $0$% on $78$% of them (Table 11 in Appendix D).
+This shows that a single turn of in-context feedback is insufficient to solve the problem.
+Despite this, the self-teacher’s credit assignment is sufficiently effective for SDPO to iteratively refine the policy and eventually solve these questions.
+Takeaway 3
+We demonstrate that rich environment feedback enables SDPO to significantly accelerate discovery for hard questions.
+This is in contrast to RLVR methods, which only receive a binary reward signal, and therefore only begin learning once the first solution has already been found.
+6
+Related Work
+6.1
+Reinforcement Learning with LLMs
+Recently, large-scale RL training on diverse tasks has significantly improved the performance of LLMs on general reasoning tasks (Guo et al., 2025; Kimi et al., 2025; Olmo et al., 2025; Jaech et al., 2024; Lambert et al., 2025).
+This progress is primarily enabled by RLVR methods that use Monte Carlo estimates of rewards, such as STaR or GRPO (Zelikman et al., 2022; Shao et al., 2024), similar to the classical REINFORCE algorithm (Williams, 1992).
+While several traditional RLVR algorithms rely on learning separate value networks (Schulman et al., 2017), they incur substantial memory costs and retain the information bottleneck of scalar rewards.
+In the RLVR setting, it is common for an (outcome) reward to be given only at the end of a sequence.
+To improve credit assignment, several works learn so-called process reward models (PRMs) that estimate rewards for each step in the sequence (Lightman et al., 2023; Wang et al., 2024a; Setlur et al., 2025).
+Unlike our RLRF setting, PRMs are typically trained on scalar rewards, either on value estimates for intermediate states or on outcome rewards (Cui et al., 2025).
+Unlike the self-teacher in SDPO, PRMs are a distinct model from the student, introducing significant memory overhead.
+Our work shows that each language model is implicitly a PRM through retrospection if given rich feedback.
+Conceptually, our work is related to “bootstrapping your own latent” (BYOL; Grill et al., 2020) and “expert iteration” (Anthony et al., 2017) where a student is bootstrapped by repeatedly imitating an improved version of itself (called the “expert”).
+Canonically, the expert combines the student with test-time search, such as tree search (Anthony et al., 2017) or majority voting (Zuo et al., 2025).
+In contrast, SDPO leverages the student’s ability to learn from rich feedback provided in-context, which is related to “augmented views” in BYOL.
+6.2
+Learning from Rich Feedback and through Retrospection
+Beyond scalar outcome rewards, recent works have leveraged rich execution or verbal feedback to guide generation (Gehring et al., 2025; Feng et al., 2024b; Yuksekgonul et al., 2025).
+A primary line of research focuses on translating verbal feedback into reward functions for RL.
+This is often achieved by mapping feedback to discrete token-level rewards using an external frozen model (Wang et al., 2026), or by employing strong external LLMs to explicitly construct state-wise reward functions (Goyal et al., 2019; Xie et al., 2024; Urcelay et al., 2026).
+Alternatively, feedback can be utilized without explicit reward modeling.
+Several approaches focus on in-context improvement without integrating the process into the RL optimization loop (Chen et al., 2021a; Madaan et al., 2023; Shinn et al., 2023; Yao et al., 2024; Yuksekgonul et al., 2025; Lee et al., 2025).
+Others manually curate preference datasets by pairing responses before and after feedback to train with direct preference optimization (Stephan et al., 2024; Lee et al., 2024), though this requires additional generation and lacks the direct credit assignment of SDPO.
+Various recent works bootstrap thinking traces from known answers, using these answers as rich feedback (Zhou et al., 2026; Hatamizadeh et al., 2026; Zhang et al., 2025).
+A central object in several recent works is a feedback-conditioned policy $\pi_{\theta}(y\mid x,f)$, which learns answers $y$ that lead to feedback $f$ (Liu et al., 2023; Zhang et al., 2023; Luo et al., 2025), typically through supervised objectives.
+The idea behind these approaches is to condition the policy on desirable (i.e., positive) feedback at deployment.
+This approach is conceptually related to goal-conditioned RL (Schaul et al., 2015; Liu et al., 2025a), where one can learn from negative examples through goal relabeling (Andrychowicz et al., 2017).
+Feedback-conditioned policies view feedback as a goal, whereas RLRF views feedback as a state that can be used to determine whether the goal $x$ is achieved.
+Unlike SDPO, these methods do not use feedback for credit assignment in negative trajectories, but rather as a data transformation for goal relabeling.
+6.3
+Distillation
+Distillation is frequently employed as an alternative to supervised fine-tuning (SFT) when a strong teacher model is available.
+Distillation transfers capabilities by training a student to mimic the output distribution or intermediate representations of the teacher (Hinton et al., 2015; Romero et al., 2015; Kim & Rush, 2016; Sanh et al., 2019; Xie et al., 2020).
+While distillation is often performed on fixed off-policy datasets, recent works address the distribution shift between training and inference by exploring on-policy distillation, where the student learns from feedback on its own generations provided by an external teacher (Agarwal et al., 2024; Gu et al., 2024; Yang et al., 2025a; Lu & Thinking Machines Lab, 2025).
+This mitigates the train-test mismatch, which relates closely to earlier work on online imitation learning (Ross et al., 2011).
+6.4
+Self-Distillation
+The concept of self-distillation was first proposed by Snell et al. (2022) in a setting akin to supervised learning, introducing the idea of sampling from a model provided with extra context and training the same model to mimic these predictions without that context.
+This mechanism has proven effective for compressing behavior (Bai et al., 2022; Choi et al., 2022; Yang et al., 2024; 2025b) and factual information (Eyuboglu et al., 2026; Kujanpää et al., 2025; Cao et al., 2025a) into model weights.
+Beyond compressing a fixed context into model weights, recent works have used self-distillation to learn from environment feedback (Scheurer et al., 2023; Dou et al., 2024; Zhou et al., 2025; Mitra & Ulukus, 2025; Song et al., 2026).
+These approaches use an off-policy self-distillation objective, which we find to substantially underperform SDPO’s on-policy learning.
+Off-policy self-distillation trains the student on generations from the teacher, whereas SDPO trains the student to avoid mistakes in its own generations.
+In concurrent work, Chen et al. (2025c) apply on-policy self-distillation to grid world settings where feedback is a scalar reward, and a reflection stage in the self-teacher diagnoses possible mistakes, showing improved credit assignment compared to learning value networks for advantage estimation.
+Other concurrent work studies SDPO on a fixed dataset of expert demonstrations, without online environment interaction (Shenfeld et al., 2026a; Zhao et al., 2026).
+7
+Conclusion, Limitations, and Future Work
+We introduced Reinforcement Learning with Rich Feedback (RLRF), a paradigm where environments provide tokenized feedback beyond scalar rewards, and argued that this removes a key information bottleneck of RLVR.
+We then proposed Self-Distillation Policy Optimization (SDPO), which uses the current policy as a feedback-conditioned self-teacher and distills its corrected log-probabilities into the student.
+This leverages the model’s ability to learn from context for dense credit assignment.
+We further demonstrated that SDPO can be implemented as a minimal, drop-in modification to standard RLVR pipelines.
+Empirically, SDPO demonstrates superior sample efficiency and wall-clock convergence compared to GRPO on reasoning tasks, even when training in standard RLVR environments without rich feedback.
+SDPO’s gains grow with model scale, suggesting that the capacity for self-correction scales with the model’s in-context learning capabilities.
+Moreover, we show that performing SDPO at test time on individual hard binary-reward tasks accelerates the discovery of solutions compared to strong baselines.
+SDPO enables learning from rich feedback in a way that is arguably closer to human cognition: utilizing precise outcomes rather than just binary rewards.
+By allowing the model to determine retrospectively how it should have acted, we demonstrate that language models can convert diverse tokenized feedback into effective self-supervision.
+Limitations.
+Our findings show that SDPO’s performance depends on a model’s in-context learning ability, suggesting that SDPO is primarily applicable for RL-training stronger base models, while it can underperform GRPO on weaker models.
+Moreover, performance depends on the quality of the environment feedback. If the environment provides uninformative or misleading feedback, a model may not be able to learn from it through SDPO.
+Finally, SDPO adds a small computational overhead compared to GRPO for computing the log-probs of the retrospective model.
+While often negligible, this may be a larger overhead for smaller models with shorter generation lengths, where generation time is comparatively small.
+Future Work.
+Our work highlights several exciting directions for future research:
+•
+Long-horizon and agentic settings.
+RLRF is particularly appealing when trajectories are long or expose information about intermediate states.
+Evaluating SDPO in agentic environments is a natural next step.
+•
+Training dynamics at scale.
+Beyond our evaluation on LiveCodeBench, it would be particularly interesting to scale SDPO to large multi-task RL training runs and further study its scaling properties with frontier base models.
+•
+Beyond verifiable rewards.
+While we focused on verifiable code generation, many tasks provide textual feedback without a ground-truth verifier.
+Investigating whether SDPO’s retrospection mechanism can improve alignment in open-ended text generation or continuous-reward tasks remains an open empirical question.
+•
+Behavioral differences in reasoning.
+We observed that SDPO induces qualitatively different reasoning patterns than GRPO, notably avoiding the latter’s tendency toward verbosity and superficial reasoning.
+Future work should systematically study how individual aspects, such as the reprompt template, influence behavior.
+Author Contributions
+Jonas Hübotter
+conceived of the project in summer 2025 and has been working on it full-time since then, leading the team.
+Jonas proposed the conceptual framework of self-distillation for credit assignment with input from Lejs, implemented the algorithm with help from others, led the quantitative experiments on LCBv6, and led the writing of the paper.
+Frederike Lübeck led the design of the code environment, led the design and evaluation of the TTT setting in Section 5 with input from Jonas, contributed to the project direction in discussions, and contributed significantly to the writing of the paper.
+Lejs Behric
+noted the dense credit assignment of knowledge distillation with strong teacher models in discussions with Jonas, inspiring the idea of self-distillation. Further, Lejs led the evaluation of different teacher templates, co-led the development of a tool for qualitative analysis of runs with Marco and Daniel, helped implement parts of the algorithm, and contributed to the project direction in discussions.
+Anton Baumann joined in December 2025 and led the evaluation of SDPO without rich feedback in Section 3 with input from Jonas, and contributed to the writing of the paper.
+Marco Bagatella and Daniel Marta
+co-led the development of a tool for qualitative analysis of runs with Lejs, contributed to the training infrastructure, and contributed to the project direction in discussions.
+Ido Hakimi
+significantly contributed to the initial codebase and experimental setup, contributed early algorithmic ideas, and contributed to the project direction in discussions.
+Idan Shenfeld, Thomas Kleine Buening, Carlos Guestrin, and Andreas Krause supported this project, with Idan and Carlos joining in December 2025. They made significant contributions to the project direction in discussions and gave valuable advice on our presentation. Thomas and Idan, in particular, significantly contributed to the development of core algorithmic ideas and design of experiments. Thomas further evaluated checkpoints on holdout benchmarks. Carlos suggested the qualitative analysis of reasoning traces in Figure 7 and the presentation of TTT results in Section 5. Andreas pointed out valuable connections to existing work in RL which shaped the direction of the project.
+Acknowledgments
+We would like to thank Akira Yoshiyama, Yassir Akram, Parnian Kassraie, Jonathan Thomm, Roman Vorushin, Afra Amini, Imanol Schlag, Yu Sun, and Moritz Hardt for helpful discussions.
+We thank Eduard Durech for helpful conversations regarding the scaling of RL fine-tuning and for his technical guidance on distributed infrastructure and long-context optimization.
+We are grateful to Ruixu Zhou from Tsinghua University & the Tencent Hunyuan Team for pointing out an error in the initially derived gradient estimator.
+Furthermore, we would like to thank Leander Diaz-Bone for supporting dataset generation.
+This project was supported through the Swiss AI compute grant a156 and, in part, compute grant infra01.
+JH was supported by the Swiss National Science Foundation under NCCR Automation, grant agreement 51NF40 180545.
+FL and MB were supported by the ETH-MPI Center for Learning Systems.
+TKB and IH were supported by an ETH AI Center Postdoctoral Fellowship.
+DM was supported by the Knut and Alice Wallenberg Foundation.
+References
+Agarwal et al. (2024)
+Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos Garea, Matthieu Geist, and Olivier Bachem.
+On-policy distillation of language models: Learning from self-generated mistakes.
+In
+ICLR
+, 2024.
+Akyürek et al. (2025)
+Ekin Akyürek, Mehul Damani, Adam Zweiger, Linlu Qiu, Han Guo, Jyothish Pari, Yoon Kim, and Jacob Andreas.
+The surprising effectiveness of test-time training for few-shot learning.
+In
+ICML
+, 2025.
+Amini et al. (2025)
+Afra Amini, Tim Vieira, and Ryan Cotterell.
+Better estimation of the Kullback–Leibler divergence between language models.
+In
+NeurIPS
+, 2025.
+Andrychowicz et al. (2017)
+Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, and Wojciech Zaremba.
+Hindsight experience replay.
+In
+NeurIPS
+, 2017.
+Anthony et al. (2017)
+Thomas Anthony, Zheng Tian, and David Barber.
+Thinking fast and slow with deep learning and tree search.
+In
+NeurIPS
+, 2017.
+Bai et al. (2022)
+Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, et al.
+Constitutional AI: Harmlessness from AI feedback.
+arXiv preprint arXiv:2212.08073
+, 2022.
+Behrouz et al. (2025)
+Ali Behrouz, Peilin Zhong, and Vahab Mirrokni.
+Titans: Learning to memorize at test time.
+In
+NeurIPS
+, 2025.
+Berner et al. (2019)
+Christopher Berner, Greg Brockman, Brooke Chan, Vicki Cheung, Przemysław Debiak, Christy Dennison, David Farhi, Quirin Fischer, Shariq Hashme, Chris Hesse, et al.
+Dota 2 with large scale deep reinforcement learning.
+arXiv preprint arXiv:1912.06680
+, 2019.
+Brown et al. (2020)
+Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al.
+Language models are few-shot learners.
+arXiv preprint arXiv:2005.14165
+, 2020.
+Cao et al. (2025a)
+Bowen Cao, Deng Cai, and Wai Lam.
+Infiniteicl: Breaking the limit of context window size via long short-term memory transformation.
+In
+ACL
+, 2025a.
+Cao et al. (2025b)
+Meng Cao, Shuyuan Zhang, Xiao-Wen Chang, and Doina Precup.
+SCAR: Shapley credit assignment for more efficient RLHF.
+arXiv preprint arXiv:2505.20417
+, 2025b.
+Chan et al. (2024)
+Alex J Chan, Hao Sun, Samuel Holt, and Mihaela Van Der Schaar.
+Dense reward for free in reinforcement learning from human feedback.
+In
+ICML
+, 2024.
+Chen et al. (2025a)
+Aili Chen, Aonian Li, Bangwei Gong, Binyang Jiang, Bo Fei, Bo Yang, Boji Shan, Changqing Yu, Chao Wang, Cheng Zhu, et al.
+Minimax-m1: Scaling test-time compute efficiently with lightning attention.
+arXiv preprint arXiv:2506.13585
+, 2025a.
+Chen et al. (2022)
+Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen.
+Codet: Code generation with generated tests.
+In
+ICLR
+, 2022.
+Chen et al. (2025b)
+Howard Chen, Noam Razin, Karthik Narasimhan, and Danqi Chen.
+Retaining by doing: The role of on-policy data in mitigating forgetting.
+arXiv preprint arXiv:2510.18874
+, 2025b.
+Chen et al. (2021a)
+Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Misha Laskin, Pieter Abbeel, Aravind Srinivas, and Igor Mordatch.
+Decision transformer: Reinforcement learning via sequence modeling.
+In
+NeurIPS
+, 2021a.
+Chen et al. (2021b)
+Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde De Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.
+Evaluating large language models trained on code.
+arXiv preprint arXiv:2107.03374
+, 2021b.
+Chen et al. (2025c)
+Wentse Chen, Jiayu Chen, Fahim Tajwar, Hao Zhu, Xintong Duan, Ruslan Salakhutdinov, and Jeff Schneider.
+Retrospective in-context learning for temporal credit assignment with large language models.
+In
+NeurIPS
+, 2025c.
+Chiang et al. (2024)
+Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Banghua Zhu, Hao Zhang, Michael Jordan, Joseph E Gonzalez, et al.
+Chatbot Arena: An open platform for evaluating LLMs by human preference.
+In
+ICML
+, 2024.
+Choi et al. (2022)
+Eunbi Choi, Yongrae Jo, Joel Jang, and Minjoon Seo.
+Prompt injection: Parameterization of fixed inputs.
+arXiv preprint arXiv:2206.11349
+, 2022.
+Cui et al. (2025)
+Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, et al.
+Process reinforcement through implicit rewards.
+arXiv preprint arXiv:2502.01456
+, 2025.
+Diaz-Bone et al. (2025)
+Leander Diaz-Bone, Marco Bagatella, Jonas Hübotter, and Andreas Krause.
+Discover: Automated curricula for sparse-reward reinforcement learning.
+In
+NeurIPS
+, 2025.
+Dolan & Moré (2002)
+Elizabeth D Dolan and Jorge J Moré.
+Benchmarking optimization software with performance profiles.
+Mathematical programming
+, 91(2), 2002.
+Dou et al. (2024)
+Zi-Yi Dou, Cheng-Fu Yang, Xueqing Wu, Kai-Wei Chang, and Nanyun Peng.
+Re-rest: Reflection-reinforced self-training for language agents.
+In
+EMNLP
+, 2024.
+El-Kishky et al. (2025)
+Ahmed El-Kishky, Alexander Wei, Andre Saraiva, Borys Minaiev, Daniel Selsam, David Dohan, Francis Song, Hunter Lightman, Ignasi Clavera, Jakub Pachocki, et al.
+Competitive programming with large reasoning models.
+arXiv preprint arXiv:2502.06807
+, 2025.
+Eyuboglu et al. (2026)
+Sabri Eyuboglu, Ryan Ehrlich, Simran Arora, Neel Guha, Dylan Zinsley, Emily Liu, Will Tennien, Atri Rudra, James Zou, Azalia Mirhoseini, et al.
+Cartridges: Lightweight and general-purpose long context representations via self-study.
+In
+ICLR
+, 2026.
+Feng et al. (2024a)
+Kehua Feng, Keyan Ding, Weijie Wang, Xiang Zhuang, Zeyuan Wang, Ming Qin, Yu Zhao, Jianhua Yao, Qiang Zhang, and Huajun Chen.
+Sciknoweval: Evaluating multi-level scientific knowledge of large language models.
+arXiv preprint arXiv:2406.09098
+, 2024a.
+Feng et al. (2024b)
+Xidong Feng, Bo Liu, Yan Song, Haotian Fu, Ziyu Wan, Girish A Koushik, Zhiyuan Hu, Mengyue Yang, Ying Wen, and Jun Wang.
+Natural language reinforcement learning.
+arXiv preprint arXiv:2411.14251
+, 2024b.
+Gehring et al. (2025)
+Jonas Gehring, Kunhao Zheng, Jade Copet, Vegard Mella, Quentin Carbonneaux, Taco Cohen, and Gabriel Synnaeve.
+Rlef: Grounding code llms in execution feedback with reinforcement learning.
+In
+ICML
+, 2025.
+Goyal et al. (2019)
+Prasoon Goyal, Scott Niekum, and Raymond J Mooney.
+Using natural language for reward shaping in reinforcement learning.
+In
+IJCAI
+, 2019.
+Grill et al. (2020)
+Jean-Bastien Grill, Florian Strub, Florent Altché, Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al.
+Bootstrap your own latent-a new approach to self-supervised learning.
+In
+NeurIPS
+, 2020.
+Gu et al. (2024)
+Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang.
+Minillm: Knowledge distillation of large language models.
+2024.
+Guha et al. (2026)
+Etash Guha, Ryan Marten, Sedrick Keh, Negin Raoof, Georgios Smyrnis, Hritik Bansal, Marianna Nezhurina, Jean Mercat, Trung Vu, Zayne Sprague, et al.
+Openthoughts: Data recipes for reasoning models.
+In
+ICLR
+, 2026.
+Guo et al. (2025)
+Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al.
+Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning.
+arXiv preprint arXiv:2501.12948
+, 2025.
+Haarnoja et al. (2018)
+Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine.
+Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor.
+In
+ICML
+, 2018.
+Hardt & Sun (2024)
+Moritz Hardt and Yu Sun.
+Test-time training on nearest neighbors for large language models.
+In
+ICLR
+, 2024.
+Hatamizadeh et al. (2026)
+Ali Hatamizadeh, Syeda Nahida Akter, Shrimai Prabhumoye, Jan Kautz, Mostofa Patwary, Mohammad Shoeybi, Bryan Catanzaro, and Yejin Choi.
+Rlp: Reinforcement as a pretraining objective.
+In
+ICLR
+, 2026.
+Hinton et al. (2015)
+Geoffrey Hinton, Oriol Vinyals, and Jeff Dean.
+Distilling the knowledge in a neural network.
+arXiv preprint arXiv:1503.02531
+, 2015.
+Huang et al. (2026)
+Chengsong Huang, Wenhao Yu, Xiaoyang Wang, Hongming Zhang, Zongxia Li, Ruosen Li, Jiaxin Huang, Haitao Mi, and Dong Yu.
+R-zero: Self-evolving reasoning llm from zero data.
+In
+ICLR
+, 2026.
+Hübotter et al. (2026)
+Jonas Hübotter, Patrik Wolf, Alexander Shevchenko, Dennis Jüni, Andreas Krause, and Gil Kur.
+Specialization after generalization: Towards understanding test-time training in foundation models.
+In
+ICLR
+, 2026.
+Hübotter et al. (2025a)
+Jonas Hübotter, Sascha Bongni, Ido Hakimi, and Andreas Krause.
+Efficiently learning at test-time: Active fine-tuning of llms.
+In
+ICLR
+, 2025a.
+Hübotter et al. (2025b)
+Jonas Hübotter, Leander Diaz-Bone, Ido Hakimi, Andreas Krause, and Moritz Hardt.
+Learning on the job: Test-time curricula for targeted reinforcement learning.
+arXiv preprint arXiv:2510.04786
+, 2025b.
+Jaech et al. (2024)
+Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, et al.
+Openai o1 system card.
+arXiv preprint arXiv:2412.16720
+, 2024.
+Jain et al. (2025)
+Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica.
+Livecodebench: Holistic and contamination free evaluation of large language models for code.
+In
+ICLR
+, 2025.
+Kaelbling et al. (1998)
+Leslie Pack Kaelbling, Michael L Littman, and Anthony R Cassandra.
+Planning and acting in partially observable stochastic domains.
+Artificial intelligence
+, 101(1-2), 1998.
+Kazemnejad et al. (2025)
+Amirhossein Kazemnejad, Milad Aghajohari, Eva Portelance, Alessandro Sordoni, Siva Reddy, Aaron Courville, and Nicolas Le Roux.
+Vineppo: Refining credit assignment in rl training of llms.
+In
+ICML
+, 2025.
+Khatri et al. (2026)
+Devvrit Khatri, Lovish Madaan, Rishabh Tiwari, Rachit Bansal, Sai Surya Duvvuri, Manzil Zaheer, Inderjit S Dhillon, David Brandfonbrener, and Rishabh Agarwal.
+The art of scaling reinforcement learning compute for llms.
+In
+ICLR
+, 2026.
+Kim & Rush (2016)
+Yoon Kim and Alexander M Rush.
+Sequence-level knowledge distillation.
+In
+EMNLP
+, 2016.
+Kimi et al. (2025)
+Kimi, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al.
+Kimi k1.5: Scaling reinforcement learning with llms.
+arXiv preprint arXiv:2501.12599
+, 2025.
+Kujanpää et al. (2025)
+Kalle Kujanpää, Pekka Marttinen, Harri Valpola, and Alexander Ilin.
+Efficient knowledge injection in LLMs via self-distillation.
+TMLR
+, 2025.
+Kwon et al. (2023)
+Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. Gonzalez, Hao Zhang, and Ion Stoica.
+Efficient memory management for large language model serving with pagedattention.
+In
+SOSP
+, 2023.
+Lambert et al. (2025)
+Nathan Lambert, Jacob Morrison, Valentina Pyatkin, Shengyi Huang, Hamish Ivison, Faeze Brahman, Lester James V Miranda, Alisa Liu, Nouha Dziri, Shane Lyu, et al.
+Tulu 3: Pushing frontiers in open language model post-training.
+In
+COLM
+, 2025.
+Le et al. (2022)
+Hung Le, Yue Wang, Akhilesh Deepak Gotmare, Silvio Savarese, and Steven Chu Hong Hoi.
+Coderl: Mastering code generation through pretrained models and deep reinforcement learning.
+In
+NeurIPS
+, 2022.
+Lee et al. (2024)
+Kyungjae Lee, Dasol Hwang, Sunghyun Park, Youngsoo Jang, and Moontae Lee.
+Reinforcement learning from reflective feedback (rlrf): Aligning and improving llms via fine-grained self-reflection.
+arXiv preprint arXiv:2403.14238
+, 2024.
+Lee et al. (2025)
+Yoonho Lee, Joseph Boen, and Chelsea Finn.
+Feedback descent: Open-ended text optimization via pairwise comparison.
+arXiv preprint arXiv:2511.07919
+, 2025.
+Levine (2018)
+Sergey Levine.
+Reinforcement learning and control as probabilistic inference: Tutorial and review.
+arXiv preprint arXiv:1805.00909
+, 2018.
+Li et al. (2025a)
+Tianle Li, Wei-Lin Chiang, Evan Frick, Lisa Dunlap, Tianhao Wu, Banghua Zhu, Joseph E Gonzalez, and Ion Stoica.
+From crowdsourced data to high-quality benchmarks: Arena-hard and benchbuilder pipeline.
+In
+ICML
+, 2025a.
+Li et al. (2025b)
+Yi-Chen Li, Tian Xu, Yang Yu, Xuqin Zhang, Xiong-Hui Chen, Zhongxiang Ling, Ningjing Chao, Lei Yuan, and Zhi-Hua Zhou.
+Generalist reward models: Found inside large language models.
+arXiv preprint arXiv:2506.23235
+, 2025b.
+Lightman et al. (2023)
+Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe.
+Let’s verify step by step.
+In
+ICLR
+, 2023.
+Liu et al. (2025a)
+Grace Liu, Michael Tang, and Benjamin Eysenbach.
+A single goal is all you need: Skills and exploration emerge from contrastive rl without rewards, demonstrations, or subgoals.
+In
+ICLR
+, 2025a.
+Liu et al. (2023)
+Hao Liu, Carmelo Sferrazza, and Pieter Abbeel.
+Chain of hindsight aligns language models with feedback.
+arXiv preprint arXiv:2302.02676
+, 2023.
+Liu et al. (2025b)
+Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin.
+Understanding r1-zero-like training: A critical perspective.
+In
+COLM
+, 2025b.
+Lu & Thinking Machines Lab (2025)
+Kevin Lu and Thinking Machines Lab.
+On-policy distillation.
+Thinking Machines Lab: Connectionism
+, 2025.
+URL
+https://thinkingmachines.ai/blog/on-policy-distillation
+.
+Luo et al. (2025)
+Renjie Luo, Zichen Liu, Xiangyan Liu, Chao Du, Min Lin, Wenhu Chen, Wei Lu, and Tianyu Pang.
+Language models can learn from verbal feedback without scalar rewards.
+arXiv preprint arXiv:2509.22638
+, 2025.
+Madaan et al. (2023)
+Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al.
+Self-refine: Iterative refinement with self-feedback.
+In
+NeurIPS
+, 2023.
+Mitra & Ulukus (2025)
+Purbesh Mitra and Sennur Ulukus.
+Semantic soft bootstrapping: Long context reasoning in llms without reinforcement learning.
+arXiv preprint arXiv:2512.05105
+, 2025.
+Mnih et al. (2015)
+Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Andrei A. Rusu, Joel Veness, Marc G. Bellemare, Alex Graves, Martin Riedmiller, Andreas K. Fidjeland, Georg Ostrovski, et al.
+Human-level control through deep reinforcement learning.
+Nature
+, 518(7540), 2015.
+Muennighoff et al. (2025)
+Niklas Muennighoff, Zitong Yang, Weijia Shi, Xiang Lisa Li, Li Fei-Fei, Hannaneh Hajishirzi, Luke Zettlemoyer, Percy Liang, Emmanuel Candès, and Tatsunori B Hashimoto.
+s1: Simple test-time scaling.
+In
+EMNLP
+, 2025.
+Ng et al. (2000)
+Andrew Y Ng, Stuart Russell, et al.
+Algorithms for inverse reinforcement learning.
+In
+ICML
+, 2000.
+Novikov et al. (2025)
+Alexander Novikov, Ngân Vũ, Marvin Eisenberger, Emilien Dupont, Po-Sen Huang, Adam Zsolt Wagner, Sergey Shirobokov, Borislav Kozlovskii, Francisco JR Ruiz, Abbas Mehrabian, et al.
+Alphaevolve: A coding agent for scientific and algorithmic discovery.
+arXiv preprint arXiv:2506.13131
+, 2025.
+Olmo et al. (2025)
+Team Olmo, Allyson Ettinger, Amanda Bertsch, Bailey Kuehl, David Graham, David Heineman, Dirk Groeneveld, Faeze Brahman, Finbarr Timbers, Hamish Ivison, et al.
+Olmo 3.
+arXiv preprint arXiv:2512.13961
+, 2025.
+Peng et al. (2019)
+Xue Bin Peng, Aviral Kumar, Grace Zhang, and Sergey Levine.
+Advantage-weighted regression: Simple and scalable off-policy reinforcement learning.
+arXiv preprint arXiv:1910.00177
+, 2019.
+Qwen et al. (2024)
+Qwen, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, et al.
+Qwen2.5 technical report.
+arXiv preprint arXiv:2412.15115
+, 2024.
+Rafailov et al. (2023)
+Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn.
+Direct preference optimization: Your language model is secretly a reward model.
+In
+NeurIPS
+, 2023.
+Romero et al. (2015)
+Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio.
+Fitnets: Hints for thin deep nets.
+In
+ICLR
+, 2015.
+Ross et al. (2011)
+Stéphane Ross, Geoffrey Gordon, and Drew Bagnell.
+A reduction of imitation learning and structured prediction to no-regret online learning.
+In
+AISTATS
+, 2011.
+Samadi et al. (2025)
+Mehrzad Samadi, Aleksander Ficek, Sean Narenthiran, Siddhartha Jain, Wasi Uddin Ahmad, Somshubra Majumdar, Vahid Noroozi, and Boris Ginsburg.
+Scaling test-time compute to achieve ioi gold medal with open-weight models.
+arXiv preprint arXiv:2510.14232
+, 2025.
+Sanh et al. (2019)
+Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf.
+Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter.
+arXiv preprint arXiv:1910.01108
+, 2019.
+Schaul et al. (2015)
+Tom Schaul, Daniel Horgan, Karol Gregor, and David Silver.
+Universal value function approximators.
+In
+ICML
+, 2015.
+Scheurer et al. (2023)
+Jérémy Scheurer, Jon Ander Campos, Tomasz Korbak, Jun Shern Chan, Angelica Chen, Kyunghyun Cho, and Ethan Perez.
+Training language models with language feedback at scale.
+arXiv preprint arXiv:2303.16755
+, 2023.
+Schulman et al. (2015)
+John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp Moritz.
+Trust region policy optimization.
+In
+ICML
+, 2015.
+Schulman et al. (2016)
+John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel.
+High-dimensional continuous control using generalized advantage estimation.
+In
+ICLR
+, 2016.
+Schulman et al. (2017)
+John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017.
+Setlur et al. (2025)
+Amrith Setlur, Chirag Nagpal, Adam Fisch, Xinyang Geng, Jacob Eisenstein, Rishabh Agarwal, Alekh Agarwal, Jonathan Berant, and Aviral Kumar.
+Rewarding progress: Scaling automated process verifiers for llm reasoning.
+In
+ICLR
+, 2025.
+Shao et al. (2024)
+Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Yang Wu, et al.
+Deepseekmath: Pushing the limits of mathematical reasoning in open language models.
+arXiv preprint arXiv:2402.03300
+, 2024.
+Shenfeld et al. (2026a)
+Idan Shenfeld, Mehul Damani, Jonas Hübotter, and Pulkit Agrawal.
+Self-distillation enables continual learning.
+arXiv preprint arXiv:2601.19897
+, 2026a.
+Shenfeld et al. (2026b)
+Idan Shenfeld, Jyothish Pari, and Pulkit Agrawal.
+Rl’s razor: Why online reinforcement learning forgets less.
+In
+ICLR
+, 2026b.
+Sheng et al. (2025)
+Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu.
+Hybridflow: A flexible and efficient rlhf framework.
+In
+EuroSys
+, 2025.
+Shinn et al. (2023)
+Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao.
+Reflexion: Language agents with verbal reinforcement learning.
+In
+NeurIPS
+, 2023.
+Silver et al. (2016)
+David Silver, Aja Huang, Chris J. Maddison, Arthur Guez, Laurent Sifre, George van den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al.
+Mastering the game of go with deep neural networks and tree search.
+Nature
+, 529(7587), 2016.
+Silver et al. (2017)
+David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore Graepel, et al.
+Mastering chess and shogi by self-play with a general reinforcement learning algorithm.
+arXiv preprint arXiv:1712.01815
+, 2017.
+Snell et al. (2022)
+Charlie Snell, Dan Klein, and Ruiqi Zhong.
+Learning by distilling context.
+arXiv preprint arXiv:2209.15189
+, 2022.
+Song et al. (2026)
+Yuda Song, Lili Chen, Fahim Tajwar, Remi Munos, Deepak Pathak, J Andrew Bagnell, Aarti Singh, and Andrea Zanette.
+Expanding the capabilities of reinforcement learning via text feedback.
+arXiv preprint arXiv:2602.02482
+, 2026.
+Stephan et al. (2024)
+Moritz Stephan, Alexander Khazatsky, Eric Mitchell, Annie S Chen, Sheryl Hsu, Archit Sharma, and Chelsea Finn.
+Rlvf: Learning from verbal feedback without overgeneralization.
+In
+ICML
+, 2024.
+Sun et al. (2020)
+Yu Sun, Xiaolong Wang, Zhuang Liu, John Miller, Alexei Efros, and Moritz Hardt.
+Test-time training with self-supervision for generalization under distribution shifts.
+In
+ICML
+, 2020.
+Sun et al. (2025)
+Yu Sun, Xinhao Li, Karan Dalal, Jiarui Xu, Arjun Vikram, Genghan Zhang, Yann Dubois, Xinlei Chen, Xiaolong Wang, Sanmi Koyejo, et al.
+Learning to (learn at test time): Rnns with expressive hidden states.
+In
+ICML
+, 2025.
+Sutton & Barto (1998)
+Richard S Sutton and Andrew G Barto.
+Reinforcement learning: An introduction
+.
+MIT press, 1998.
+Tandon et al. (2025)
+Arnuv Tandon, Karan Dalal, Xinhao Li, Daniel Koceja, Marcel Rød, Sam Buchanan, Xiaolong Wang, Jure Leskovec, Sanmi Koyejo, Tatsunori Hashimoto, et al.
+End-to-end test-time training for long context.
+arXiv preprint arXiv:2512.23675
+, 2025.
+Tang et al. (2023)
+Qiaoyu Tang, Ziliang Deng, Hongyu Lin, Xianpei Han, Qiao Liang, Boxi Cao, and Le Sun.
+Toolalpaca: Generalized tool learning for language models with 3000 simulated cases.
+arXiv preprint arXiv:2306.05301
+, 2023.
+Urcelay et al. (2026)
+Belen Martin Urcelay, Andreas Krause, and Giorgia Ramponi.
+From words to rewards: Leveraging natural language for reinforcement learning.
+In
+TMLR
+, 2026.
+Vaswani et al. (2017)
+Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin.
+Attention is all you need.
+In
+NeurIPS
+, 2017.
+Wang et al. (2026)
+Hanyang Wang, Lu Wang, Chaoyun Zhang, Tianjun Mao, Si Qin, Qingwei Lin, Saravan Rajmohan, and Dongmei Zhang.
+Text2grad: Reinforcement learning from natural language feedback.
+In
+ICLR
+, 2026.
+Wang et al. (2024a)
+Peiyi Wang, Lei Li, Zhihong Shao, RX Xu, Damai Dai, Yifei Li, Deli Chen, Yu Wu, and Zhifang Sui.
+Math-shepherd: Verify and reinforce llms step-by-step without human annotations.
+In
+ACL
+, 2024a.
+Wang et al. (2025)
+Shenzhi Wang, Le Yu, Chang Gao, Chujie Zheng, Shixuan Liu, Rui Lu, Kai Dang, Xionghui Chen, Jianxin Yang, Zhenru Zhang, et al.
+Beyond the 80/20 rule: High-entropy minority tokens drive effective reinforcement learning for llm reasoning.
+In
+NeurIPS
+, 2025.
+Wang et al. (2024b)
+Yubo Wang, Xueguang Ma, Ge Zhang, Yuansheng Ni, Abhranil Chandra, Shiguang Guo, Weiming Ren, Aaran Arulraj, Xuan He, Ziyan Jiang, et al.
+Mmlu-pro: A more robust and challenging multi-task language understanding benchmark.
+In
+NeurIPS
+, 2024b.
+Wei et al. (2022)
+Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al.
+Chain-of-thought prompting elicits reasoning in large language models.
+In
+NeurIPS
+, 2022.
+Williams (1992)
+Ronald J Williams.
+Simple statistical gradient-following algorithms for connectionist reinforcement learning.
+Machine learning
+, 8(3), 1992.
+Xie et al. (2020)
+Qizhe Xie, Minh-Thang Luong, Eduard Hovy, and Quoc V Le.
+Self-training with noisy student improves imagenet classification.
+In
+CVPR
+, 2020.
+Xie et al. (2024)
+Tianbao Xie, Siheng Zhao, Chen Henry Wu, Yitao Liu, Qian Luo, Victor Zhong, Yanchao Yang, and Tao Yu.
+Text2reward: Reward shaping with language models for reinforcement learning.
+In
+ICLR
+, 2024.
+Yang et al. (2025a)
+An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al.
+Qwen3 technical report.
+arXiv preprint arXiv:2505.09388
+, 2025a.
+Yang et al. (2025b)
+Wenkai Yang, Yankai Lin, Jie Zhou, and Ji-Rong Wen.
+Distilling rule-based knowledge into large language models.
+In
+COLING
+, 2025b.
+Yang et al. (2024)
+Zhaorui Yang, Tianyu Pang, Haozhe Feng, Han Wang, Wei Chen, Minfeng Zhu, and Qian Liu.
+Self-distillation bridges distribution gap in language model fine-tuning.
+In
+ACL
+, 2024.
+Yao et al. (2025)
+Feng Yao, Liyuan Liu, Dinghuai Zhang, Chengyu Dong, Jingbo Shang, and Jianfeng Gao.
+Your efficient rl framework secretly brings you off-policy rl training, 2025.
+URL
+https://fengyao.notion.site/off-policy-rl
+.
+Yao et al. (2024)
+Weiran Yao, Shelby Heinecke, Juan Carlos Niebles, Zhiwei Liu, Yihao Feng, Le Xue, Rithesh Murthy, Zeyuan Chen, Jianguo Zhang, Devansh Arpit, et al.
+Retroformer: Retrospective large language agents with policy gradient optimization.
+In
+ICLR
+, 2024.
+Yu et al. (2025)
+Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Weinan Dai, Tiantian Fan, Gaohong Liu, Lingjun Liu, et al.
+Dapo: An open-source llm reinforcement learning system at scale.
+In
+NeurIPS
+, 2025.
+Yue et al. (2025)
+Yang Yue, Zhiqi Chen, Rui Lu, Andrew Zhao, Zhaokai Wang, Shiji Song, and Gao Huang.
+Does reinforcement learning really incentivize reasoning capacity in llms beyond the base model?
+In
+NeurIPS
+, 2025.
+Yuksekgonul et al. (2025)
+Mert Yuksekgonul, Federico Bianchi, Joseph Boen, Sheng Liu, Pan Lu, Zhi Huang, Carlos Guestrin, and James Zou.
+Optimizing generative ai by backpropagating language model feedback.
+Nature
+, 639:609–616, 2025.
+Yuksekgonul et al. (2026)
+Mert Yuksekgonul, Daniel Koceja, Xinhao Li, Federico Bianchi, Jed McCaleb, Xiaolong Wang, Jan Kautz, Yejin Choi, James Zou, Carlos Guestrin, et al.
+Learning to discover at test time.
+arXiv preprint arXiv:2601.16175
+, 2026.
+Zelikman et al. (2022)
+Eric Zelikman, Yuhuai Wu, Jesse Mu, and Noah D Goodman.
+Star: Bootstrapping reasoning with reasoning.
+In
+NeurIPS
+, 2022.
+Zhang et al. (2025)
+Kai Zhang, Xiangchao Chen, Bo Liu, Tianci Xue, Zeyi Liao, Zhihan Liu, Xiyao Wang, Yuting Ning, Zhaorun Chen, Xiaohan Fu, et al.
+Agent learning via early experience.
+arXiv preprint arXiv:2510.08558
+, 2025.
+Zhang et al. (2023)
+Tianjun Zhang, Fangchen Liu, Justin Wong, Pieter Abbeel, and Joseph E Gonzalez.
+The wisdom of hindsight makes language models better instruction followers.
+In
+ICML
+, 2023.
+Zhao et al. (2025)
+Andrew Zhao, Yiran Wu, Yang Yue, Tong Wu, Quentin Xu, Matthieu Lin, Shenzhi Wang, Qingyun Wu, Zilong Zheng, and Gao Huang.
+Absolute zero: Reinforced self-play reasoning with zero data.
+In
+NeurIPS
+, 2025.
+Zhao et al. (2026)
+Siyan Zhao, Zhihui Xie, Mengchen Liu, Jing Huang, Guan Pang, Feiyu Chen, and Aditya Grover.
+Self-distilled reasoner: On-policy self-distillation for large language models.
+arXiv preprint arXiv:2601.18734
+, 2026.
+Zheng et al. (2025a)
+Chujie Zheng, Shixuan Liu, Mingze Li, Xiong-Hui Chen, Bowen Yu, Chang Gao, Kai Dang, Yuqiong Liu, Rui Men, An Yang, et al.
+Group sequence policy optimization.
+arXiv preprint arXiv:2507.18071
+, 2025a.
+Zheng et al. (2025b)
+Tianyu Zheng, Tianshun Xing, Qingshui Gu, Taoran Liang, Xingwei Qu, Xin Zhou, Yizhi Li, Zhoufutu Wen, Chenghua Lin, Wenhao Huang, et al.
+First return, entropy-eliciting explore.
+arXiv preprint arXiv:2507.07017
+, 2025b.
+Zhou et al. (2023)
+Jeffrey Zhou, Tianjian Lu, Swaroop Mishra, Siddhartha Brahma, Sujoy Basu, Yi Luan, Denny Zhou, and Le Hou.
+Instruction-following evaluation for large language models.
+arXiv preprint arXiv:2311.07911
+, 2023.
+Zhou et al. (2025)
+Ruiyang Zhou, Shuozhe Li, Amy Zhang, and Liu Leqi.
+Expo: Unlocking hard reasoning with self-explanation-guided reinforcement learning.
+In
+NeurIPS
+, 2025.
+Zhou et al. (2026)
+Xiangxin Zhou, Zichen Liu, Anya Sims, Haonan Wang, Tianyu Pang, Chongxuan Li, Liang Wang, Min Lin, and Chao Du.
+Reinforcing general reasoning without verifiers.
+In
+ICLR
+, 2026.
+Ziebart et al. (2008)
+Brian D Ziebart, Andrew L Maas, J Andrew Bagnell, Anind K Dey, et al.
+Maximum entropy inverse reinforcement learning.
+In
+AAAI
+, 2008.
+Zuo et al. (2025)
+Yuxin Zuo, Kaiyan Zhang, Shang Qu, Li Sheng, Xuekai Zhu, Biqing Qi, Youbang Sun, Ganqu Cui, Ning Ding, and Bowen Zhou.
+Ttrl: Test-time reinforcement learning.
+In
+NeurIPS
+, 2025.
+Contents
+section.1table.caption.4section.2subsection.2.1subsection.2.2subsection.2.3section.3subsection.3.1subsection.3.1subsection.3.2subsection.3.3section.4section.4subsection.4.1subsection.4.2subsection.4.3subsection.4.4subsection.4.4subsection.4.5subsection.4.6subsection.4.6subsection.4.6subsection.4.6section.5subsection.5.1subsection.5.2figure.caption.20figure.caption.20section.6subsection.6.1subsection.6.2subsection.6.3subsection.6.4section.7section.7section.7appendix.Asubsection.A.1subsection.A.1equation.6subsection.A.2subsection.A.3subsection.A.4appendix.Bsubsection.B.1subsection.B.2appendix.Cappendix.Cappendix.Cappendix.Cappendix.Csubsection.C.1subsection.C.1equation.19appendix.Dsubsection.D.1subsection.D.2subsubsection.D.2.1subsubsection.D.2.2subsubsection.D.2.3subsection.D.3appendix.Esubsection.E.1subsection.E.2subsubsection.E.2.1subsection.E.3appendix.Fsubsection.F.1subsection.F.2subsection.F.3subsection.F.4
+Appendix A
+Implementation of SDPO
+The following pseudocode in Figure 14 outlines the implementation of SDPO:
+⬇
+def compute_sdpo_loss(batch, teacher_context, loss_mask):
+    """
+    Computes probabilities of response y under the self-teacher
+    and the per-logit SDPO loss.
+    """
+    # Compute model probabilities for response y
+    logprobs_student = compute_log_prob(batch)  # (T, V)
+    probs_student = logprobs_student.exp()  # (T, V)
+
+    # Compute self-teacher probabilities for response y
+    teacher_batch = reprompt(batch, teacher_context)
+    logprobs_teacher = compute_log_prob(teacher_batch).detach()  # (T, V)
+
+    # Compute SDPO loss: per-token divergence
+    per_token_loss = divergence(logprobs_student, logprobs_teacher)  # (T,)
+    return agg_loss(per_token_loss, loss_mask, loss_agg_mode="token-mean")
+Figure 14:
+The pseudo-code of SDPO within a standard RL training pipeline. Omitted here is the filtering to top-$K$ logprobs for student and teacher (including a tail term) as described in Section A.3. Further, we omit here any importance sampling weights to correct for off-policy data.
+`reprompt` modifies the batch to incorporate teacher context (i.e., rich feedback).
+`divergence` implements any per-token divergence such as reverse-KL, forward-KL, or Jensen-Shannon.
+In the following, we provide further details on:
+• The gradient estimator used in our implementation (Section A.1)
+• Teacher regularization (Section A.2)
+• Approximating logit-distillation with the top-$K$ logits for saving GPU memory (Section A.3)
+• Generalizing PPO-style policy gradient algorithms to logit-level advantages (Section A.4)
+To disambiguate the notation of the self-teacher, we use $q_{\theta}(\cdot\mid x,f):=\pi_{\theta}(\cdot\mid\mathrm{reprompt}(x,f))$ in the following.
+Here, `reprompt` denotes the reprompt template of the self-teacher.
+A.1
+Gradient Estimators
+In this section, we discuss two possible gradient estimators for the KL divergence between the current policy $\pi_{\theta}(y\mid x)$ and the teacher policy $q_{\theta}(y\mid x,f)$.
+Per-token estimator.
+Deriving the gradient of the SDPO loss as defined in Equation 1:
+$$\mathcal{L}_{\mathrm{token}}(\theta):=\mathbb{E}_{y\sim\mathrm{stopgrad}(\pi_{\theta}(\cdot\mid x))}\left[\sum_{t=1}^{T}\mathrm{KL}\big(\pi_{\theta}(\cdot\mid x,y_{<t})\,\|\,\mathrm{stopgrad}(\pi_{\theta}(\cdot\mid x,f,y_{<t}))\big)\right] \tag{5}$$
+leads to the following estimator (see a detailed proof in Section B.1), which corresponds to the sum of gradients of the KL divergence at each token:
+$$\nabla\mathcal{L}_{\mathrm{token}}(\theta)=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t=1}^{T}\mathbb{E}_{\hat{y}_{t}\sim\pi_{\theta}(\cdot\mid x,y_{<t})}\left[\nabla_{\theta}\log\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\cdot\log\frac{\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})}{\pi_{\theta}(\hat{y}_{t}\mid x,f,y_{<t})}\right]\right]. \tag{6}$$
+This corresponds to the estimator presented in Proposition 2.1.
+This gradient estimator effectively assumes that the sampling distribution generating $y$ is fixed.
+Sequence-level estimator.
+An alternative self-distillation objective minimizes the sequence-level KL divergence between student and self-teacher, i.e.,
+$$\begin{aligned}\mathcal{L}_{\mathrm{seq}}(\theta):=\mathrm{KL}\left(\pi_{\theta}\,\|\,q_{\theta}\right)&=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\log\frac{\pi_{\theta}(y\mid x)}{q_{\theta}(y\mid x,f)}\right]\\
+&=\sum_{t=1}^{T}\mathbb{E}_{s_{t}\sim\Pi_{\theta}}\left[\mathrm{KL}\left(\pi_{\theta}(\cdot\mid s_{t})\,\|\,q_{\theta}(\cdot\mid s_{t},f)\right)\right],\end{aligned} \tag{7}$$
+where $s_{t}=(x,y_{<t})$ is the prefix (“state”) at step $t$ and $\Pi_{\theta}$ denotes the prefix distribution under policy $\pi_{\theta}$.
+Estimating the gradient of this objective additionally takes into account how the choice of $y_{t}$ influences future states $y_{>t}$ (due to the additional dependence on $\Pi_{\theta}$).
+Amini et al. (2025) show that the corresponding gradient estimator is given by
+$$\nabla\mathcal{L}_{\mathrm{seq}}(\theta)=\nabla\mathcal{L}_{\mathrm{token}}(\theta)+\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t=1}^{T}\mathrm{KL}\left(\pi_{\theta}(\cdot\mid s_{t})\,\|\,q_{\theta}(\cdot\mid s_{t},f)\right)\nabla_{\theta}\log\Pi_{\theta}(s_{t})\right]. \tag{8}$$
+The additional term of the sequence-level gradient captures how prefixes influence the self-distillation divergence of future tokens.
+We also experimented with this sequence-level gradient estimator but did not find measurable gains relative to its additional complexity.
+A.2
+Regularized teacher
+In contrast to standard distillation, the teacher in SDPO changes throughout training. This bootstrapping enables the teacher to improve, but it may also lead to training instability.
+To stabilize training, we seek to prevent the teacher $q$ from quickly diverging from the initial teacher $q_{\theta_{\mathrm{ref}}}$.
+We can achieve this by placing an explicit trust-region constraint on $q$ (Schulman et al., 2015; Peng et al., 2019), that is:
+$$\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,\quad\epsilon>0. \tag{9}$$
+This trust-region can be implemented in two ways:
+1. Explicit trust-region: We can define the teacher as the policy closest to $q_{\theta}$ while satisfying the trust-region constraint.
+This teacher can be expressed as
+$$q(y_{t}\mid x,f,y_{<t})\propto\exp\big((1-\alpha)\log q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})+\alpha\log q_{\theta}(y_{t}\mid x,f,y_{<t})\big), \tag{10}$$
+with $\alpha\in(0,1)$ the inverse Lagrange multiplier for the trust-region constraint.
+We include a full derivation in Section B.2.
+We can plug this explicitly constrained teacher directly into the SDPO objective.
+2. Exponential moving average (EMA): Alternatively, we can stabilize the teacher’s parameters directly; parameterizing $q_{\theta^{\prime}}$ by $\theta^{\prime}$ and updating as $\theta^{\prime}\leftarrow(1-\alpha)\theta^{\prime}+\alpha\theta$ with $\alpha\in(0,1)$.
+Note that each implementation has a different practical advantage:
+The EMA teacher requires additional GPU memory for $\theta^{\prime}$ yet does not introduce any runtime overhead.
+In contrast, the trust-region teacher requires an additional log-prob computation with $q_{\theta_{\mathrm{ref}}}$ yet does not require additional GPU memory if $\theta_{\mathrm{ref}}$ is used for explicit KL regularization.
+A.3
+Approximate Logit Distillation
+To save GPU memory, we perform distillation only on the top-$K$ tokens predicted by the student:
+\begin{align}
+\mathcal{L}_{\mathrm{SDPO}}(\theta)&=\sum_{t=1}^{T}\mathrm{KL}\big(\pi_{\theta}(\cdot\mid x,y_{<t})\,\|\,\mathrm{stopgrad}(q_{\theta}(\cdot\mid x,f,y_{<t}))\big)\\
+&\approx\sum_{t=1}^{T}\sum_{\hat{y}_{t}\in\mathrm{top}_{K}(\pi_{\theta})}\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\cdot\log\frac{\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})}{\mathrm{stopgrad}(q_{\theta}(\hat{y}_{t}\mid x,f,y_{<t}))}\notag\\
+&\quad+\underbrace{\Big(1-\textstyle\sum_{\hat{y}_{t}\in\mathrm{top}_{K}(\pi_{\theta})}\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\Big)\cdot\log\frac{1-\textstyle\sum_{\hat{y}_{t}\in\mathrm{top}_{K}(\pi_{\theta})}\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})}{\mathrm{stopgrad}\Big(1-\textstyle\sum_{\hat{y}_{t}\in\mathrm{top}_{K}(\pi_{\theta})}q_{\theta}(\hat{y}_{t}\mid x,f,y_{<t})\Big)}}_{\text{tail}}
+\end{align}
+(13)
+Here, the top-$K$ is with respect to the student.
+Without top-$K$ distillation, we would have to keep two copies of logits in memory: one each for the teacher and the student.
+Top-$K$ distillation avoids virtually any memory overhead without impacting performance significantly, since most tokens of the vocabulary are not informative at a given time.
+A.4
+Off-Policy Training: Generalization to Logit-Level Losses
+PPO-style clipping (Schulman et al., 2017) with truncated importance sampling (Yao et al., 2025), clip-higher (Yu et al., 2025), and fixed length normalization (Liu et al., 2025b):
+$$\mathcal{L}_{\mathrm{token}}(\theta):=-{\color[rgb]{1,.5,0}\frac{1}{\sum_{i=1}^{G}|y_{i}|}}\sum_{i=1}^{G}\sum_{t=1}^{|y_{i}|}{\color[rgb]{0.94921875,0.328125,0.35546875}\min\left(w^{\mathrm{TIS}}_{i,t},\rho\right)}\min\left(w_{i,t}A_{i,t},\text{clip}(w_{i,t},1-\varepsilon_{\text{low}},1+{\color[rgb]{0.34765625,0.734375,0.16796875}\varepsilon_{\text{high}}})A_{i,t}\right),\tag{14}$$
+with $w_{i,t}:=\frac{\pi_{\theta}(y_{i,t}\mid x,y_{i,<t})}{\pi_{\theta_{\mathrm{old}}}(y_{i,t}\mid x,y_{i,<t})}$, $w^{\mathrm{TIS}}_{i,t}:=\frac{\pi_{\theta_{\mathrm{old}}}(y_{i,t}\mid x,y_{i,<t})}{\pi_{\theta_{\mathrm{old}}}^{\mathrm{rollout}}(y_{i,t}\mid x,y_{i,<t})}$, and where $A_{i,t}$ denotes the per-token advantage.
+We extend this to a logit-level loss:
+$$\begin{aligned}
+\mathcal{L}_{\mathrm{logit}}(\theta):={}&-{\color[rgb]{1,.5,0}\frac{1}{\sum_{i=1}^{G}|y_{i}|}}\sum_{i=1}^{G}\sum_{t=1}^{|y_{i}|}{\color[rgb]{0.16796875,0.3125,0.66796875}\sum_{\hat{y}_{i,t}}}\ {\color[rgb]{0.94921875,0.328125,0.35546875}\min\left(\pi_{\theta_{\mathrm{old}}}(\hat{y}_{i,t}\mid x,y_{i,<t}),\rho\,\pi_{\theta_{\mathrm{old}}}^{\mathrm{rollout}}(\hat{y}_{i,t}\mid x,y_{i,<t})\right)}\\
+&\min\left(w_{i,t}(\hat{y}_{i,t})A_{i,t}(\hat{y}_{i,t}),\text{clip}(w_{i,t}(\hat{y}_{i,t}),1-\varepsilon_{\text{low}},1+{\color[rgb]{0.34765625,0.734375,0.16796875}\varepsilon_{\text{high}}})A_{i,t}(\hat{y}_{i,t})\right),
+\end{aligned}\tag{15}$$
+where the sum over $\hat{y}_{i,t}$ ranges over all possible tokens at position $t$ for rollout $i$ (or the $K$ most likely under $\pi_{\theta_{\mathrm{old}}}$, cf. Section A.3).
+The TIS changes since we explicitly weight each logit by its probability under $\pi_{\theta_{\mathrm{old}}}$ rather than relying on a Monte Carlo estimate of the expectation over next-token predictions.
+Here, $A_{i,t}(\hat{y}_{i,t})$ is a per-logit advantage.
+In our experiments for SDPO, we apply the TIS term on a token-level rather than logit-level.
+Appendix B
+Theoretical Analysis
+This section is organized as follows:
+• Section B.1 derives the SDPO gradient from Proposition 2.1.
+• Section B.2 derives the trust-region regularized teacher discussed in Section A.2.
+To disambiguate the notation of the self-teacher, we use $q_{\theta}(\cdot\mid x,f):=\pi_{\theta}(\cdot\mid\mathrm{reprompt}(x,f))$ in the following.
+Here, $\mathrm{reprompt}$ denotes the reprompt template of the self-teacher.
+B.1
+Proof of Proposition 2.1
+Proof.
+In the following, we derive the gradient of $\mathcal{L}_{\mathrm{SDPO}}$.
+$$\begin{aligned}
+\nabla_{\theta}\,\mathcal{L}_{\mathrm{SDPO}}(\theta)
+&=\nabla_{\theta}\,\sum_{t=1}^{T}\mathrm{KL}\big(\pi_{\theta}(\cdot\mid x,y_{<t})\,\|\,\mathrm{stopgrad}(q_{\theta}(\cdot\mid x,f,y_{<t}))\big)\\
+&=\nabla_{\theta}\,\sum_{t=1}^{T}\sum_{\hat{y}_{t}}\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\log\left(\frac{\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})}{\mathrm{stopgrad}(q_{\theta}(\hat{y}_{t}\mid x,f,y_{<t}))}\right)
+\end{aligned}$$
+Let $A_{t,k}:=\log\left(\frac{\mathrm{stopgrad}(q_{\theta}(\hat{y}_{t}\mid x,f,y_{<t}))}{\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})}\right)$. Then,
+$$\begin{aligned}
+\nabla_{\theta}\,\mathcal{L}_{\mathrm{SDPO}}(\theta)
+&=-\nabla_{\theta}\,\sum_{t=1}^{T}\sum_{\hat{y}_{t}}\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})A_{t,k}\\
+&=-\sum_{t=1}^{T}\sum_{\hat{y}_{t}}\Big[\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\nabla_{\theta}\,A_{t,k}+A_{t,k}\nabla_{\theta}\,\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\Big].
+\end{aligned}$$
+We have that $\nabla_{\theta}\,A_{t,k}=-\nabla_{\theta}\,\log\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})$ is the negative score function. Using the score trick, $\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\nabla_{\theta}\,\log\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})=\nabla_{\theta}\,\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})$. Hence, the first term simplifies to
+$$-\sum_{t=1}^{T}\sum_{\hat{y}_{t}}\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\nabla_{\theta}\,A_{t,k}=\sum_{t=1}^{T}\sum_{\hat{y}_{t}}\nabla_{\theta}\,\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})=\sum_{t=1}^{T}\nabla_{\theta}\,\underbrace{\sum_{\hat{y}_{t}}\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})}_{=1}=0.$$
+Thus, the gradient of $\mathcal{L}_{\mathrm{SDPO}}$ is
+$$\begin{aligned}
+\nabla_{\theta}\,\mathcal{L}_{\mathrm{SDPO}}
+&=-\sum_{t=1}^{T}\sum_{\hat{y}_{t}}A_{t,k}\nabla_{\theta}\,\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\\
+&=-\sum_{t=1}^{T}\sum_{\hat{y}_{t}}\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\Big(A_{t,k}\nabla_{\theta}\,\log\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\Big)\\
+&=-\sum_{t=1}^{T}\mathbb{E}_{\hat{y}_{t}\sim\pi_{\theta}(\cdot\mid x,y_{<t})}\left[A_{t,k}\nabla_{\theta}\,\log\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\right].
+\end{aligned}$$
+∎
+Notably, the above implies that the gradient of $\mathcal{L}_{\mathrm{SDPO}}$ is equivalent to the gradient of the loss if $A_{t,k}=\mathrm{stopgrad}\left(\log\frac{q_{\theta}(y_{t}\mid x,f,y_{<t})}{\pi_{\theta}(y_{t}\mid x,y_{<t})}\right)$.
+B.2
+Trust-region Teacher
+To stabilize training, we seek to prevent the teacher $q$ from diverging from the initial teacher $q_{\theta_{\mathrm{ref}}}$.
+We can achieve this by placing an explicit trust-region constraint on the teacher $q$ (Schulman et al., 2015; Peng et al., 2019), that is:
+$$\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,\quad\epsilon>0.\tag{16}$$
+In the following, we derive a teacher $q$ which satisfies the trust-region constraint while staying close to the target $q_{\theta}$.
+The following optimization problem characterizes such a $q$ (Peng et al., 2019):
+$$\begin{split}
+\operatorname*{arg\,max}_{q\in\Delta}\ &\sum_{t}\sum_{y_{t}}q(y_{t}\mid x,f,y_{<t})\log\frac{q_{\theta}(y_{t}\mid x,f,y_{<t})}{q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})}\\
+\text{s.t.}\ &\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,
+\end{split}\tag{17}$$
+where $\Delta$ denotes the probability simplex.
+Intuitively, the solution is the $q$ satisfying the trust-region constraint, which is closest to $q_{\theta}$ (i.e., has minimal cross-entropy to $q_{\theta}$) while being farthest from $q_{\theta_{\mathrm{ref}}}$ (i.e., has maximal cross-entropy to $q_{\theta_{\mathrm{ref}}}$).
+Proposition B.1.
+The solution to Equation 17 can be expressed in closed form as
+$$q^{*}(y_{t}\mid x,f,y_{<t})\propto\exp\!\big((1-\alpha)\log q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})+\alpha\log q_{\theta}(y_{t}\mid x,f,y_{<t})\big).\tag{18}$$
+Proof.
+To simplify notation, we omit the conditioning in the following.
+The Lagrangian (with $\lambda\geq 0$ for the KL constraint and $\nu$ for normalization) is
+$$\mathcal{L}(q,\lambda,\nu)=\sum_{t}\sum_{y_{t}}q(y_{t})\log\frac{q_{\theta}(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}-\lambda\Big(\sum_{y_{t}}q(y_{t})\log\frac{q(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}-\epsilon\Big)+\nu\Big(\sum_{y_{t}}q(y_{t})-1\Big).$$
+Stationarity gives, for all $y_{t}$,
+$$0=\frac{\partial\mathcal{L}}{\partial q(y_{t})}=\log\frac{q_{\theta}(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}-\lambda\Big(\log\frac{q(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}+1\Big)+\nu.$$
+Let $\alpha:=1/\lambda$. Then, the solution to Equation 17 can be characterized in closed form as
+$$\begin{aligned}
+q^{*}(y_{t})
+&\propto q_{\theta_{\mathrm{ref}}}(y_{t})\exp\!\Big(\alpha\log\tfrac{q_{\theta}(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}\Big)\\
+&\propto\exp\!\big((1-\alpha)\log q_{\theta_{\mathrm{ref}}}(y_{t})+\alpha\log q_{\theta}(y_{t})\big).
+\end{aligned}$$
+∎
+Chen et al. (2025c) perform a similar derivation, but use the reference $\pi_{\theta_{\mathrm{ref}}}$, which we observe to underperform compared to the reference $q_{\theta_{\mathrm{ref}}}$.
+Appendix C
+Additional Related Work
+Value networks and Monte Carlo advantage estimation.
+Several prior approaches aim to improve credit assignment but face the same information bottleneck as GRPO. Classical RL frequently trains value networks which provide token-level advantages, but which are themselves learned from scalar rewards (Schulman et al., 2016; 2017). Furthermore, value networks incur significant computational and memory overhead and are therefore typically not used to train LLMs.
+Other recent work estimates token-level advantages by performing additional generations starting from various positions in the original attempt (Kazemnejad et al., 2025; Zheng et al., 2025b).
+While this can learn with fewer gradient steps than GRPO, it still uses only scalar rewards as signal and requires costly additional generations.
+Dense credit assignment with a reward model.
+Several recent works study dense (per-token) reward assignment given access to an external reward model, typically by exploiting the reward model’s internal structure (Chan et al., 2024; Cao et al., 2025b).
+Relatedly, Li et al. (2025b) argue that a token-level reward signal is implicit in an LLM’s logits by linking next-token prediction to offline inverse reinforcement learning, effectively yielding a training-free reward model for RL fine-tuning.
+Partial observability.
+From the perspective of classical RL, many verifiable domains for LLMs are naturally partially observable: executing a proposed solution induces a latent environment state (e.g., failing tests or states of an agentic system) that is revealed only through rich feedback.
+This aligns with the formalism of partially observable Markov decision processes (POMDPs), where agents must act under incomplete observations of state (Kaelbling et al., 1998; Sutton & Barto, 1998).
+By contrast, RLVR and RLHF pipelines typically discard this observation channel and learn only from terminal scalar rewards or pairwise preferences.
+Relation to test-time training.
+Our setting from Section 5 can be seen as a special case of test-time training where the model itself is updated at test-time using self-distillation.
+Updating the model at test-time is known as test-time training (Sun et al., 2020; 2025; Hardt & Sun, 2024; Hübotter et al., 2025a; b; Akyürek et al., 2025; Behrouz et al., 2025; Tandon et al., 2025; Hübotter et al., 2026).
+Unlike prior work, self-distillation uses the in-context learning ability of the current model to attribute credit after receiving feedback.
+This can be seen as simulating long-context reasoning with periodic compression of context into the model weights.
+C.1
+SDPO as Maximum Entropy RL
+The SDPO objective resembles the objective in maximum entropy RL (e.g., Levine, 2018; Haarnoja et al., 2018) with a particular choice of reward function.
+Maximum Entropy RL
+Consider optimizing
+$$\operatorname*{arg\,max}_{\theta}\ \mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t}r(y_{t}\mid x,y_{<t})\right]+\lambda\,\mathrm{H}\left[\pi_{\theta}(\cdot\mid x)\right],\quad\lambda>0\tag{19}$$
+where $\pi_{\theta}(y\mid x)=\prod_{t=1}^{T}\pi_{\theta}(y_{t}\mid x,y_{<t})$ and $\mathrm{H}\left[\pi_{\theta}(\cdot\mid x)\right]=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[-\log\pi_{\theta}(y\mid x)\right]$ is the entropy of the policy.
+Here, $r(y_{t}\mid x,y_{<t})$ is an arbitrary reward function, possibly “dense” (i.e., per-token).
+Equation 19 is known as maximum entropy RL.
+It is known that this objective is equivalent to solving a variational inference problem, which we discuss next.
+To this end, we define a Bernoulli random variable $\mathcal{C}$ which is $1$ if the attempt $y$ is correct and $0$ otherwise.
+We then define its distribution as $p(\mathcal{C}=1\mid x,y)\propto\exp\!\big(\tfrac{1}{\lambda}\sum_{t}r(y_{t}\mid x,y_{<t})\big)$.
+Further assuming w.l.o.g. that the “prior” over responses is uniform, we can express the posterior conditioned on the event of correctness as
+$$\pi^{\star}(y\mid x):=p(y\mid x,\mathcal{C}=1)\propto p(\mathcal{C}=1\mid x,y)\propto\exp\!\left(\frac{1}{\lambda}\sum_{t}r(y_{t}\mid x,y_{<t})\right).\tag{20}$$
+Then, Equation 19 is equivalent to minimizing the KL divergence with respect to $\pi^{\star}$:
+$$\operatorname*{arg\,min}_{\theta}\ \sum_{t}\mathrm{KL}\left(\pi_{\theta}(y_{t}\mid x,y_{<t})\,\|\,\pi^{\star}(y_{t}\mid x,y_{<t})\right).\tag{21}$$
+SDPO optimizes an implicit reward defined by the teacher
+Note that Equation 21 is equivalent to the SDPO objective (Equation 1) with implicit reward $r(y_{t}\mid x,y_{<t})=\log q(y_{t}\mid x,f,y_{<t})$ and $\lambda=1$.
+In this sense, SDPO can be seen as a maximum entropy RL algorithm with dense rewards constructed implicitly through the retrospective model.
+This also points to a connection of SDPO to inverse RL (Ng et al., 2000; Ziebart et al., 2008; Rafailov et al., 2023), where the goal is to recover an unknown reward function.
+In SDPO, the student learns an implicit reward function defined by the retrospective model.
+Appendix D
+Additional Results & Ablations
+This section is organized as follows:
+• Section D.1 contains results and ablations for Section 3.
+• Section D.2 contains results and ablations for Section 4.
+• Section D.3 contains results and ablations for Section 5.
+D.1
+Learning without rich environment feedback
+• Table 7 reports results when optimal hyperparameters are selected for each model/task combination.
+• Table 8 compares average response lengths of SDPO and GRPO.
+| Model / method (cells: 1h / 5h) | Chemistry | Physics | Biology | Materials | Tool use |
+|---|---|---|---|---|---|
+| Qwen3-8B | 41.2 | 59.2 | 30.8 | 58.9 | 57.5 |
+| + GRPO | 65.9 / 74.5 | 62.9 / 74.5 | 35.1 / 59.9 | 74.3 / 77.1 | 61.7 / 68.1 |
+| + GRPO (on-policy) | 52.2 / 71.6 | 62.9 / 74.8 | 49.8 / 49.8 | 73.3 / 75.8 | 61.7 / 68.1 |
+| + SDPO (on-policy) | 73.2 / 80.9 | 70.6 / 80.6 | 50.6 / 56.8 | 72.1 / 78.3 | 56.4 / 68.5 |
+| Olmo3-7B-Instruct | 22.8 | 37.7 | 16.2 | 36.7 | 39.3 |
+| + GRPO | 53.1 / 67.7 | 55.3 / 63.3 | 35.6 / 55.8 | 73.8 / 78.1 | 56.4 / 65.0 |
+| + GRPO (on-policy) | 47.1 / 65.4 | 62.7 / 62.7 | 49.8 / 49.8 | 67.9 / 74.4 | 56.0 / 61.3 |
+| + SDPO (on-policy) | 68.0 / 80.0 | 60.3 / 71.4 | 48.0 / 52.8 | 75.3 / 79.2 | 57.3 / 62.5 |
+Table 7:
+Comparison of SDPO and GRPO on reasoning-related benchmarks.
+We report the highest achieved avg@16 within 1 hour and 5 hours of wall-clock training time, respectively. Both SDPO and on-policy GRPO perform one gradient step per generation batch, while GRPO performs 4 off-policy mini batch steps. We select optimal hyperparameters for SDPO and baselines based on 5h accuracy. We perform this selection independently for each model and dataset. Each run is performed on a node with 4 NVIDIA GH200 GPUs. Together with initialization and validation, each run takes approximately 6 hours.
+As opposed to Table 3, which selects globally optimal hyperparameters per method, this table selects optimal hyperparameters individually for each model/task combination based on 5h accuracy.
+The hyperparameter grid is described in Section E.2.1.
+| Model | GRPO | SDPO | Reduction of SDPO |
+|---|---|---|---|
+| Qwen3-8B | 820.8 | 255.8 | $3.2\times$ |
+| Olmo3-7B-Instruct | 1095.4 | 343.9 | $3.2\times$ |
+Table 8:
+Average response lengths of SDPO and GRPO (averaged across tasks from Section 3). Both algorithms are evaluated in the on-policy setting.
+D.2
+Learning with rich environment feedback
+D.2.1
+Additional Results
+Figure 15:
+Average accuracy during training until step 80, stratified by difficulty. Error bars show standard deviation across 3 seeds.
+Figure 15 shows the average accuracy of SDPO and GRPO stratified by question difficulty. LCB differentiates between easy, medium, and hard questions.
+As displayed, SDPO significantly improves over GRPO in solving medium and hard questions, highlighting the importance of rich feedback for challenging tasks. Note that this categorization of questions is different from the one in Section 5.
+In Figure 16, we compare different train batch sizes and numbers of rollouts for training GRPO and SDPO on LCBv6.
+Figure 16:
+Accuracy (pass@1) for varying train batch sizes (4, 8, 16, 32) and number of rollouts (4, 8) for training SDPO and GRPO with Qwen3-8B (Yang et al., 2025a) on LCBv6, $\pm$ stderr across 3 seeds. Different shades of the same color correspond to different runs.
+Complementing the results shown in Figure 8, we show additional results using Qwen2.5-Instruct (Qwen et al., 2024) in Figure 17.
+Figure 17:
+Average validation accuracy by model size, $\pm$ std across 3 seeds. With Qwen2.5-Instruct (Qwen et al., 2024) and Qwen3 (Yang et al., 2025a) on LCBv6. Until step 65 for Qwen2.5 and until step 80 for Qwen3.
+D.2.2
+Training Stability
+Figure 18 shows diverse metrics logged during training, including the loss, entropy, average gradient norm, and average response length.
+Figure 18:
+Loss, entropy, avg. gradient norm and avg. response length during training of SDPO on LCBv6 (Section 4).
+D.2.3
+Baselines
+Table 9 compares the performance on LCBv6 of various baselines, including two variants of GRPO, GSPO, and CISPO, to SDPO.
+| Method | Accuracy | Avg accuracy |
+|---|---|---|
+| GRPO | $41.2\pm 0.8$ | $38.2\pm 0.0$ |
+| + only high-entropy tokens (Wang et al., 2025) | $37.8\pm 2.2$ | $35.9\pm 0.1$ |
+| GSPO (Zheng et al., 2025a) | $40.1\pm 2.3$ | $37.7\pm 0.1$ |
+| CISPO (Chen et al., 2025a) | $41.2\pm 1.8$ | $37.8\pm 0.1$ |
+| SDPO | $\mathbf{48.8}\pm 0.6$ | $\mathbf{43.8}\pm 0.0$ |
+Table 9:
+Performance on LCBv6 at/until training step 80 with std over 3 seeds. We compare to GSPO (Zheng et al., 2025a) and CISPO (Chen et al., 2025a). With Qwen3-8B.
+D.3
+Test-time self-distillation
+Complementing the results shown in Section 5, we show the discovery@$k$ curves for all hard questions in Figure 20, and report the mean number of generations until the first discovery in Table 10. Further, Table 11 shows the per-question accuracy of the self-teacher at the initial training step of SDPO. In Figure 19, we ablate the choice of batch size for SDPO and the in-context reprompting strategy for multi-turn sampling.
+In the selection of hard questions, we have discarded one malformed question (Q9) where the coding environment did not correctly validate the solution due to rounding inaccuracies, which led to failures even with correct logic.
+| Question | SDPO | Best-of-$k$ | Multi-turn | Speedup (Best-of-$k$ $\rightarrow$ SDPO) |
+|---|---|---|---|---|
+| 1 | 104 | 98 | 59 | 0.9$\times$ |
+| 3* | 1987 | $\geq 2750$ | $\geq 2750$ | 1.4$\times$ |
+| 10* | 938 | $\geq 2750$ | 1706 | 2.9$\times$ |
+| 43 | 111 | 109 | 111 | 1.0$\times$ |
+| 46* | 1852 | 1466 | 1315 | 0.8$\times$ |
+| 59 | 172 | 123 | 76 | 0.7$\times$ |
+| 69 | 280 | 134 | 134 | 0.5$\times$ |
+| 74* | 1948 | 1466 | 2405 | 0.8$\times$ |
+| 86 | 85 | 421 | 335 | 5.0$\times$ |
+| 91* | 1360 | $\geq 2750$ | 2384 | 2.0$\times$ |
+| 92* | 1575 | $\geq 2750$ | 2203 | 1.8$\times$ |
+| 95* | 1948 | 1466 | 1794 | 0.8$\times$ |
+| 100 | 277 | 294 | 1596 | 1.1$\times$ |
+| 103* | 2246 | $\geq 2750$ | 2210 | 1.2$\times$ |
+| 111 | 85 | 95 | 39 | 1.1$\times$ |
+| 120 | 24 | 327 | 70 | 13.6$\times$ |
+| 125* | 1795 | 1466 | 2320 | 0.8$\times$ |
+| 127 | 28 | 368 | 61 | 13.1$\times$ |
+| 129 | 168 | 173 | 104 | 1.0$\times$ |
+| Hard tasks | 894 | 1145 | 1141 | 1.3$\times$ |
+| Very hard tasks | 1739 | 2180 | 2121 | 1.2$\times$ |
+Table 10:
+Mean number of generations until first success per question for SDPO, best-of-$k$ sampling, and multi-turn sampling. For the mean calculation, values are truncated at the maximum budget of 2750 generations. Very hard tasks ($\text{pass}@64<0.03$) are marked with an asterisk (*). Averaged over all questions, SDPO achieves successes faster than the baselines, reaching a speedup of up to $13.6\times$ on individual questions compared to best-of-$k$ sampling.
+| Question | Initial Teacher Accuracy (%) |
+|---|---|
+| 1 | 0.00 |
+| 3 | 0.00 |
+| 10 | 0.00 |
+| 43 | 6.25 |
+| 46 | 0.00 |
+| 59 | 0.00 |
+| 69 | 3.12 |
+| 74 | 0.00 |
+| 86 | 0.00 |
+| 91 | 0.00 |
+| 92 | 0.00 |
+| 95 | 0.00 |
+| 100 | 0.00 |
+| 103 | 0.00 |
+| 111 | 0.00 |
+| 120 | 0.00 |
+| 125 | 0.00 |
+| 127 | 1.23 |
+| 129 | 0.06 |
+Table 11:
+Average accuracy of the retrospective teacher at the first step for each question.
+These scores represent the percentage of successful solutions generated when the base model is reprompted with feedback in a single-turn interaction. For the majority of these hard and very hard tasks, the teacher accuracy is near or exactly 0%. Despite this, the self-distilled token-level advantages are sufficiently rich for SDPO to iteratively refine its policy and solve these questions over successive updates.
+Figure 19:
+Ablations of self-distillation at test-time on hard tasks.
+Left: Impact of SDPO batch size on $\text{pass}@k$ curves. While smaller batch sizes (8 and 16) can lead to slightly earlier discoveries at very low generation budgets ($k<2^{6}$), larger batch sizes (16, 32) result in more stable updates that significantly improve the discovery rate as the budget scales.
+Right: Comparison of multi-turn reprompting templates on a subset of hard questions. The “Only feedback” template concatenates the feedback from previous attempts using a first-in, first-out sliding window. The “Attempts + Feedback” template concatenates the full turn, also using a sliding window. Including only the feedback substantially outperforms concatenating full conversations.
+Figure 20:
+Individual task results for self-distillation at test-time.
+$\text{Discovery}@k$ for each of the 19 questions evaluated in Section 5. In most cases, SDPO finds a successful solution significantly earlier than both the base model and the multi-turn baseline. Notably, for one question (Q3) where the base model and the multi-turn baseline maintain a $\text{discovery}@k$ of zero for the entire budget up to 2750, SDPO discovers a solution after 321 attempts. Curves represent the mean and 90% confidence intervals across 5 random seeds per question.
+Appendix E
+Experiment Details
+E.1
+Technical setup
+All experiments were conducted on a single node equipped with four NVIDIA GH200 GPUs, for a total of 378GB VRAM. Our environment is built on top of the NVIDIA PyTorch container nvcr.io/nvidia/pytorch:25.02-py3, with CUDA 12.8 and PyTorch v2.7.0.
+Our implementation is based on the verl library (Sheng et al., 2025). We use PyTorch Fully Sharded Data Parallel (FSDP2) for distributed training. For rollout generation, we employ vLLM (Kwon et al., 2023), which enables efficient batched inference on the multi-GPU node.
+E.2
+Hyperparameters
+We summarize hyperparameters used for SDPO in Table 12 and those used for GRPO in Table 13.
+Parameters
+Without Feedback
+With Feedback
+TTT
+Section
+3
+Section
+4
+Section
+5
+General
+Model
+Qwen/Qwen3-8B
+Qwen/Qwen3-8B
+Qwen/Qwen3-8B
+allenai/Olmo3-7B-Instruct
+Thinking
+False
+False
+False
+Data
+Max. prompt length
+2048
+2048
+2048
+Max. response length
+8192
+8192
+8192
+Batching
+Question batch size
+32
+32
+1
+Mini batch size
+32
+1
+1
+Number of rollouts
+8
+8
+16
+Rollout
+Inference engine
+vllm
+vllm
+vllm
+Temperature
+1.0
+1.0
+1.0
+Validation
+Number of rollouts
+16
+4
+-
+Temperature
+0.6
+0.6
+-
+Top-$p$
+0.95
+0.95
+-
+SDPO loss
+Top-$K$ distillation
+100
+20
+20
+Distillation divergence
+Jensen–Shannon
+Reverse-KL
+Reverse-KL
+Clip advantages
+–
+–
+5.0
+Teacher-EMA update rate
+0.05
+0.01
+0.01
+Rollout importance sampling clip
+2
+2
+2
+Training
+Optimizer
+AdamW
+AdamW
+AdamW
+Learning rate
+$1\times 10^{-5}$ (constant)
+$1\times 10^{-6}$ (constant)
+$1\times 10^{-6}$ (constant)
+Warmup steps
+10
+0
+0
+Weight decay
+0.01
+0.01
+0.01
+Gradient Clip Norm
+1.0
+1.0
+1.0
+Table 12:
+Hyperparameters used for
+SDPO
+for each experimental setup.
+Parameters
+Experiment 1
+Section
+3
+General
+Model
+Qwen/Qwen3-8B
+allenai/Olmo3-7B-Instruct
+Thinking
+False
+Data
+Max. prompt length
+2048
+Max. response length
+8192
+Batching
+Question batch size
+32
+Mini batch size
+8 (default) / 32 (on-policy)
+Number of rollouts
+8
+Rollout
+Inference engine
+vllm
+Temperature
+1.0
+Validation
+Temperature
+0.6
+Top-$p$
+0.95
+Number of rollouts
+16
+Loss
+$\epsilon$-high
+0.28
+Rollout importance sampling clip
+2
+KL coefficient ($\lambda$)
+0.0
+Training
+Optimizer
+AdamW
+Learning rate
+$1\times 10^{-6}$ (default) / $1\times 10^{-5}$ (on-policy)
+Warmup steps
+10
+Weight decay
+0.01
+Gradient Clip Norm
+1.0
+Table 13:
+Hyperparameters used for
+GRPO
+.
+E.2.1
+Details on Hyperparameter Selection (Section 3)
+For GRPO in the experiments in Section 3, we perform a grid search over learning rates $\{10^{-5},10^{-6}\}$ and minibatch sizes $\{8,32\}$. For on-policy GRPO, we search over the same learning rates while fixing the minibatch size to 32. For SDPO, we grid-search over KL variants (forward KL, Jensen–Shannon), learning rates $\{10^{-5},10^{-6}\}$, and minibatch sizes $\{8,32\}$.
+For each method (GRPO, on-policy GRPO, and SDPO), we select a single hyperparameter configuration that achieves the highest validation accuracy within the first 5 hours of training, evaluated across all datasets and models used in Section 3.
+We further report results obtained by selecting the optimal hyperparameter configuration separately for each model and dataset in Table 3.
+E.3
+User Templates
+For multiple-choice questions and tool use, the model must be prompted in a task-specific manner. We therefore provide the prompt templates used for these settings below.
+⬇
+Given
+a
+question
+and
+four
+options
+,
+please
+select
+the
+right
+answer
+.
+Respond
+in
+the
+following
+format
+:
+<
+reasoning
+>
+...
+</
+reasoning
+>
+<
+answer
+>
+...
+</
+answer
+>
+For
+the
+answer
+,
+only
+output
+the
+letter
+corresponding
+to
+the
+correct
+option
+(
+A
+,
+B
+,
+C
+,
+or
+D
+),
+and
+nothing
+else
+.
+Do
+not
+restate
+the
+answer
+text
+.
+For
+example
+,
+if
+the
+answer
+is
+"
+A
+",
+just
+output
+:
+<
+answer
+>
+A
+</
+answer
+>
+Listing 1:
+System prompt: Multiple Choice Questions
+⬇
+{
+question
+}
+Please
+reason
+step
+by
+step
+.
+Listing 2:
+User prompt: Multiple Choice Questions
+⬇
+Your
+task
+is
+to
+answer
+the
+user
+’
+s
+question
+using
+available
+tools
+.
+You
+have
+access
+to
+the
+following
+tools
+:
+Name
+:
+Axolotl
+Description
+:
+Collection
+of
+axolotl
+pictures
+and
+facts
+Documentation
+:
+getRandomAxolotlImage
+:
+Retrieve
+a
+random
+axolotl
+image
+with
+information
+on
+the
+image
+source
+.
+Parameters
+:
+{}
+Output
+:
+Successful
+response
+.
+-
+Format
+:
+application
+/
+json
+-
+Structure
+:
+Object
+{
+url
+,
+source
+,
+description
+}
+searchAxolotlImages
+:
+Search
+for
+axolotl
+images
+based
+on
+specific
+criteria
+such
+as
+color
+,
+gender
+,
+and
+size
+.
+Parameters
+:
+{"
+color
+":
+"
+string
+.
+One
+of
+:
+[
+wild
+,
+leucistic
+,
+albino
+].
+The
+color
+of
+the
+axolotl
+(
+e
+.
+g
+.,
+’
+wild
+’,
+’
+leucistic
+’,
+’
+albino
+’,
+etc
+.).",
+"
+gender
+":
+"
+string
+.
+One
+of
+:
+[
+male
+,
+female
+].
+The
+gender
+of
+the
+axolotl
+(’
+male
+’,
+’
+female
+’).",
+"
+size
+":
+"
+string
+.
+One
+of
+:
+[
+small
+,
+medium
+,
+large
+].
+The
+size
+of
+the
+axolotl
+(’
+small
+’,
+’
+medium
+’,
+’
+large
+’).",
+"
+page
+":
+"
+integer
+.
+The
+page
+number
+for
+pagination
+purposes
+."}
+Output
+:
+Successful
+response
+.
+-
+Format
+:
+application
+/
+json
+-
+Structure
+:
+Object
+{
+results
+:
+Array
+[
+Object
+{
+url
+,
+source
+,
+description
+}],
+pagination
+:
+Object
+{
+current_page
+,
+total_pages
+,
+total_results
+}}
+getAxolotlFacts
+:
+Retrieve
+interesting
+facts
+about
+axolotls
+such
+as
+their
+habits
+,
+habitats
+,
+and
+physical
+characteristics
+.
+Parameters
+:
+{"
+category
+":
+"
+string
+.
+One
+of
+:
+[
+habits
+,
+habitat
+,
+physical
+characteristics
+].
+The
+category
+of
+facts
+to
+retrieve
+(
+e
+.
+g
+.,
+’
+habits
+’,
+’
+habitat
+’,
+’
+physical
+characteristics
+’).",
+"
+limit
+":
+"
+integer
+.
+The
+maximum
+number
+of
+facts
+to
+return
+."}
+Output
+:
+Successful
+response
+.
+-
+Format
+:
+application
+/
+json
+-
+Structure
+:
+Array
+[
+Object
+{
+fact
+,
+source
+}]
+Use
+the
+following
+format
+:
+Thought
+:
+you
+should
+always
+think
+about
+what
+to
+do
+Action
+:
+the
+action
+to
+take
+,
+should
+be
+one
+of
+the
+tool
+names
+.
+Action
+Input
+:
+the
+input
+to
+the
+action
+,
+must
+be
+in
+JSON
+format
+.
+All
+of
+the
+action
+input
+must
+be
+realistic
+and
+from
+the
+user
+.
+Begin
+!
+Question
+:
+Hey
+,
+can
+you
+show
+me
+a
+random
+picture
+of
+an
+axolotl
+?
+Listing 3:
+Example user prompt: Tool use
+Appendix F
+Qualitative Examples
+F.1
+Visualization of Advantages
+Figure 21 compares the advantages of SDPO and GRPO in a representative example.
+Figure 21:
+Visualization of advantages in SDPO and GRPO with Olmo3-7B-Instruct in a batch from the Chemistry task of Section 3. Each row corresponds to the beginning of a response. The color indicates the advantage value at that token position, with positive advantages shown in blue and negative advantages shown in red.
+F.2
+Examples
+Below, we show an example from training SDPO on LCBv6 using Qwen3-8B.
+⬇
+[Prompt]
+You
+are
+a
+coding
+expert.
+You
+will
+be
+given
+a
+coding
+problem,
+and
+you
+need
+to
+write
+a
+correct
+Python
+program
+that
+matches
+the
+specification
+and
+passes
+all
+tests.
+The
+time
+limit
+is
+1
+second.
+You
+may
+start
+by
+outlining
+your
+thought
+process.
+In
+the
+end,
+please
+provide
+the
+complete
+code
+in
+a
+code
+block
+enclosed
+with
+‘‘‘
+‘‘‘.
+You
+are
+given
+a
+binary
+string
+s
+of
+length
+n,
+where:
+’1’
+represents
+an
+active
+section.
+’0’
+represents
+an
+inactive
+section.
+You
+can
+perform
+at
+most
+one
+trade
+to
+maximize
+the
+number
+of
+active
+sections
+in
+s.
+In
+a
+trade,
+you:
+Convert
+a
+contiguous
+block
+of
+’1’s
+that
+is
+surrounded
+by
+’0’s
+to
+all
+’0’s.
+Afterward,
+convert
+a
+contiguous
+block
+of
+’0’s
+that
+is
+surrounded
+by
+’1’s
+to
+all
+’1’s.
+Return
+the
+maximum
+number
+of
+active
+sections
+in
+s
+after
+making
+the
+optimal
+trade.
+Note:
+Treat
+s
+as
+if
+it
+is
+augmented
+with
+a
+’1’
+at
+both
+ends,
+forming
+t
+=
+’1’
++
+s
++
+’1’.
+The
+augmented
+’1’s
+do
+not
+contribute
+to
+the
+final
+count.
+Example
+1:
+Input:
+s
+=
+"01"
+Output:
+1
+Explanation:
+Because
+there
+is
+no
+block
+of
+’1’s
+surrounded
+by
+’0’s,
+no
+valid
+trade
+is
+possible.
+The
+maximum
+number
+of
+active
+sections
+is
+1.
+Example
+2:
+Input:
+s
+=
+"0100"
+Output:
+4
+Explanation:
+String
+"0100"
+->
+Augmented
+to
+"101001".
+Choose
+"0100",
+convert
+"101001"
+->
+"100001"
+->
+"111111".
+The
+final
+string
+without
+augmentation
+is
+"1111".
+The
+maximum
+number
+of
+active
+sections
+is
+4.
+Example
+3:
+Input:
+s
+=
+"1000100"
+Output:
+7
+Explanation:
+String
+"1000100"
+->
+Augmented
+to
+"110001001".
+Choose
+"000100",
+convert
+"110001001"
+->
+"110000001"
+->
+"111111111".
+The
+final
+string
+without
+augmentation
+is
+"1111111".
+The
+maximum
+number
+of
+active
+sections
+is
+7.
+Example
+4:
+Input:
+s
+=
+"01010"
+Output:
+4
+Explanation:
+String
+"01010"
+->
+Augmented
+to
+"1010101".
+Choose
+"010",
+convert
+"1010101"
+->
+"1000101"
+->
+"1111101".
+The
+final
+string
+without
+augmentation
+is
+"11110".
+The
+maximum
+number
+of
+active
+sections
+is
+4.
+Constraints:
+1
+<=
+n
+==
+s.length
+<=
+10^5
+s[i]
+is
+either
+’0’
+or
+’1’
+Your
+solution
+should
+have
+the
+following
+signature:
+‘‘‘python
+def
+maxActiveSectionsAfterTrade(s:
+str)
+->
+int:
+‘‘‘
+[Response]
+<think>
+</think>
+To
+solve
+this
+problem,
+we
+need
+to
+understand
+the
+structure
+of
+the
+string
+and
+the
+trade
+operation.
+Here’s
+the
+plan:
+###
+Key
+Insights:
+1.
+We
+are
+given
+a
+binary
+string
+‘s‘,
+and
+we
+are
+allowed
+**at
+most
+one
+trade**.
+2.
+A
+**trade**
+consists
+of
+two
+steps:
+-
+Convert
+a
+contiguous
+block
+of
+‘’1’‘
+that
+is
+**surrounded
+by
+‘’0’‘
+into
+‘’0’‘
+(i.e.,
+flip
+it).
+-
+Then
+convert
+a
+contiguous
+block
+of
+‘’0’‘
+that
+is
+**surrounded
+by
+‘’1’‘
+into
+‘’1’‘
+(i.e.,
+flip
+it).
+3.
+The
+actual
+count
+of
+active
+sections
+is
+the
+number
+of
+‘’1’‘
+characters
+**after**
+the
+trade.
+The
+augmented
+‘’1’‘
+at
+the
+beginning
+and
+end
+do
+not
+count
+in
+the
+final
+result.
+###
+Strategy:
+-
+We
+simulate
+the
+trade
+by
+considering
+each
+possible
+way
+to
+flip
+a
+block
+of
+‘’1’‘
+(bounded
+by
+‘’0’‘)
+and
+then
+a
+block
+of
+‘’0’‘
+(bounded
+by
+‘’1’‘).
+-
+For
+each
+possible
+valid
+block,
+simulate
+the
+trade
+and
+compute
+the
+resulting
+number
+of
+‘’1’‘
+characters.
+-
+We
+only
+need
+to
+consider
+the
+original
+string
+‘s‘,
+not
+the
+augmented
+one
+(since
+we
+can
+simulate
+the
+trade
+directly
+on
+‘s‘
+by
+checking
+boundaries).
+We’ll
+use
+a
+sliding
+window
+technique
+to
+find
+contiguous
+blocks
+of
+‘’1’‘
+and
+‘’0’‘
+and
+simulate
+the
+trade.
+---
+###
+Implementation
+Steps:
+1.
+Simulate
+the
+trade
+by
+checking
+for
+a
+block
+of
+‘’1’‘
+surrounded
+by
+‘’0’‘
+or
+at
+the
+edges
+(we
+simulate
+the
+augmented
+‘1‘
+at
+both
+sides).
+2.
+For
+each
+such
+block,
+simulate
+converting
+it
+to
+‘’0’‘,
+then
+convert
+a
+‘’0’‘
+block
+(again,
+surrounded
+by
+‘’1’‘)
+to
+‘’1’‘.
+3.
+Count
+the
+number
+of
+‘’1’‘
+in
+the
+final
+string
+and
+track
+the
+maximum.
+---
+###
+Python
+Code:
+‘‘‘python
+def
+maxActiveSectionsAfterTrade(s:
+str)
+->
+int:
+if
+not
+s:
+return
+0
+max_ones
+=
+0
+n
+=
+len(s)
+#
+We
+are
+simulating
+the
+augmented
+’1’s
+on
+both
+ends,
+so
+we
+treat
+the
+boundaries
+as
+’1’s
+#
+So
+’0’
+at
+index
+0
+or
+n-1
+is
+considered
+surrounded
+by
+’1’s
+#
+Step
+1:
+Find
+all
+candidate
+positions
+for
+the
+first
+part
+of
+the
+trade
+#
+(convert
+a
+block
+of
+1s
+surrounded
+by
+0s
+into
+0s)
+for
+i
+in
+range(n):
+if
+s[i]
+==
+’1’:
+start
+=
+i
+while
+i
+<
+n
+and
+s[i]
+==
+’1’:
+i
++=
+1
+end
+=
+i
+-
+1
+#
+Check
+if
+this
+block
+is
+surrounded
+by
+’0’s
+left_boundary
+=
+(start
+==
+0
+or
+s[start
+-
+1]
+==
+’0’)
+right_boundary
+=
+(end
+==
+n
+-
+1
+or
+s[end
++
+1]
+==
+’0’)
+if
+left_boundary
+and
+right_boundary:
+#
+This
+is
+a
+candidate
+block
+of
+’1’s
+to
+flip
+#
+Now,
+look
+for
+a
+block
+of
+’0’s
+surrounded
+by
+’1’s
+in
+the
+remaining
+string
+#
+Replace
+the
+’1’
+block
+with
+’0’s
+#
+Now
+find
+a
+’0’
+block
+surrounded
+by
+’1’s
+#
+We
+can
+simulate
+the
+remaining
+string
+temp
+=
+list(s)
+for
+j
+in
+range(start,
+end
++
+1):
+temp[j]
+=
+’0’
+#
+Now
+find
+a
+contiguous
+block
+of
+’0’s
+surrounded
+by
+’1’s
+max_trade_ones
+=
+0
+for
+k
+in
+range(n):
+if
+temp[k]
+==
+’0’:
+start0
+=
+k
+while
+k
+<
+n
+and
+temp[k]
+==
+’0’:
+k
++=
+1
+end0
+=
+k
+-
+1
+#
+Check
+if
+this
+block
+is
+surrounded
+by
+’1’s
+left_0
+=
+(start0
+==
+0
+or
+temp[start0
+-
+1]
+==
+’1’)
+right_0
+=
+(end0
+==
+n
+-
+1
+or
+temp[end0
++
+1]
+==
+’1’)
+if
+left_0
+and
+right_0:
+#
+Flip
+this
+block
+of
+’0’s
+to
+’1’s
+for
+j
+in
+range(start0,
+end0
++
+1):
+temp[j]
+=
+’1’
+#
+Count
+the
+number
+of
+’1’s
+count
+=
+sum(temp)
+max_ones
+=
+max(max_ones,
+count)
+break
+#
+Only
+take
+the
+first
+valid
+block
+for
+this
+trade
+#
+After
+trying
+this
+’1’
+block,
+restore
+the
+original
+string
+#
+No
+need,
+we
+just
+used
+a
+copy
+return
+max_ones
+‘‘‘
+---
+###
+Optimization
+Note:
+The
+above
+approach
+is
+**not
+optimal**
+for
+very
+large
+strings.
+A
+more
+efficient
+method
+would
+involve
+precomputing
+the
+positions
+of
+blocks
+of
+‘’1’‘
+and
+‘’0’‘,
+and
+checking
+valid
+trades
+between
+these
+blocks
+in
+linear
+time.
+But
+with
+the
+constraints
+given
+(up
+to
+10^5),
+we
+need
+to
+ensure
+linear
+or
+near-linear
+performance.
+The
+current
+code
+is
+correct
+but
+**not
+efficient**
+for
+large
+inputs.
+Would
+you
+like
+me
+to
+provide
+a
+more
+optimized
+version
+that
+runs
+in
+**O(n)**
+time
+and
+uses
+only
+one
+pass
+through
+the
+string?
+[Feedback]
+b’Runtime
+Error\nTypeError:
+unsupported
+operand
+type(s)
+for
++:
+\’int\’
+and
+\’str\’\nLine
+48
+in
+maxActiveSectionsAfterTrade
+(Solution.py)\n\nLast
+Executed
+Input\n"11000"’
+F.3
+Environment Feedback
+We show three examples of feedback in our coding environment, inspired by LeetCode.
+⬇
+Test
+Case
+3:
+Wrong
+Answer
+Input
+[[3,6,1],[4,21,4]]
+Output
+6
+Expected
+22.87500
+Test
+Case
+6:
+Wrong
+Answer
+Input
+[[12,25,3],[3,14,2]]
+Output
+14
+Expected
+25.83333
+Listing 4:
+Example of feedback “Wrong Answer” from our code environment in case of a wrong answer,
+inspired by LeetCode
+⬇
+Runtime
+Error
+MemoryError
+:
+Line
+91
+in
+<
+module
+>
+(
+Solution
+.
+py
+)
+Line
+25
+in
+solve
+(
+Solution
+.
+py
+)
+Last
+Executed
+Input
+10
+633
+9312
+1314
+8548
+8857
+1062
+6410
+3289
+8594
+1263
+8549
+733
+3858
+5973
+...
+(3
+more
+lines
+)
+Listing 5:
+Example of feedback “Memory Error” from our code environment in case of a memory error,
+inspired by LeetCode
+⬇
+Runtime
+Error
+IndexError
+:
+list
+index
+out
+of
+range
+Line
+28
+in
+sortMatrix
+(
+Solution
+.
+py
+)
+Last
+Executed
+Input
+[[-1,-1,-1,-1,-1,-1,-1,-1,...
+Listing 6:
+Example of feedback “Index Error” from our code environment in case of an index error,
+inspired by LeetCode
+F.4
+Illustrative Example
+Figure
+22
+shows an illustrative example of the dense credit assignment in SDPO.
+Figure 22:
+Dense credit assignment through self-teaching in SDPO.
+The answer is generated by the model (Qwen3-8B) before seeing the feedback. Then, we re-evaluate the log-probs of the original attempt with the self-teacher after seeing the feedback. We show the per-token $\log\left(\mathbb{P}(\text{self-teacher})/\mathbb{P}(\text{student})\right)$, with red indicating negative values (self-teacher disagrees), blue indicating positive values (teacher reinforces), and white indicating values around zero. Using binary rewards, GRPO would assign the same, negative advantage to all tokens in the sequence. In contrast, SDPO turns the feedback into dense credit assignment across the sequence. The first row shows the tokens of the generated response. The 3 other rows show the top-$k$ logits of the self-teacher that are used during self-distillation, suggesting alternative tokens. Notably, in this example, the self-teacher identifies the error through retrospection without an explicit solution. The credit assignment on the generated sequence, and the alternative top-$k$ logits correctly show that replacing `set` with `dict` maintains the order of elements. Further, in the seventh shown position, the model also identifies an alternative solution path which starts with the `seen` token, instead of directly returning the output. The activation is sparse, identifying where mistakes happen and adjusting to the student’s response distribution for specifically these few tokens.
\ No newline at end of file
diff --git a/research/notes/reinforcement-learning-via-self-distillation.md b/research/notes/reinforcement-learning-via-self-distillation.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc6551c6fb063d8f164275bc416bf2a9e23a0887
--- /dev/null
+++ b/research/notes/reinforcement-learning-via-self-distillation.md
@@ -0,0 +1,10247 @@
+---
+title: Reinforcement Learning via Self-Distillation
+id: reinforcement-learning-via-self-distillation
+tags:
+- deepread
+created: '2026-06-10T00:00:39.997764Z'
+source: https://arxiv.org/html/2601.20802v1
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:00:39.997626Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Reinforcement Learning via Self-Distillation
+Reinforcement Learning via Self-Distillation
+Jonas Hübotter$^{1}$, Frederike Lübeck$^{*,1,2}$, Lejs Behric$^{*,1}$, Anton Baumann$^{*,1}$, Marco Bagatella$^{1,2}$, Daniel Marta$^{1}$, Ido Hakimi$^{1}$, Idan Shenfeld$^{3}$, Thomas Kleine Buening$^{1}$, Carlos Guestrin$^{4}$, Andreas Krause$^{1}$
+$^{1}$ETH Zurich, $^{2}$Max Planck Institute for Intelligent Systems, $^{3}$MIT, $^{4}$Stanford
+https://github.com/lasgroup/SDPO
+$^{*}$Equal second authorship. Correspondence to jonas.huebotter@inf.ethz.ch.
+Abstract
+Large language models are increasingly post-trained with reinforcement learning in verifiable domains such as code and math.
+Yet, current methods for reinforcement learning with verifiable rewards (RLVR) learn only from a scalar outcome reward per attempt, creating a severe credit-assignment bottleneck.
+Many verifiable environments actually provide rich textual feedback, such as runtime errors or judge evaluations, that explains why an attempt failed.
+We formalize this setting as reinforcement learning with rich feedback and introduce Self-Distillation Policy Optimization (SDPO), which converts tokenized feedback into a dense learning signal without any external teacher or explicit reward model.
+SDPO treats the current model conditioned on feedback as a self-teacher and distills its feedback-informed next-token predictions back into the policy.
+In this way, SDPO leverages the model’s ability to retrospectively identify its own mistakes in-context.
+Across scientific reasoning, tool use, and competitive programming on LiveCodeBench v6, SDPO improves sample efficiency and final accuracy over strong RLVR baselines.
+Notably, SDPO also outperforms baselines in standard RLVR environments that only return scalar feedback by using successful rollouts as implicit feedback for failed attempts.
+Finally, applying SDPO to individual questions at test time accelerates discovery on difficult binary-reward tasks, achieving the same discovery probability as best-of-$k$ sampling or multi-turn conversations with $3\times$ fewer attempts.
+1
+Introduction
+Figure 1:
+SDPO substantially outperforms an improved version of Group Relative Policy Optimization (GRPO) on LCB v6 with Qwen3-8B.
+Further, SDPO achieves GRPO’s final accuracy in $4\times$ fewer generations.
+Claude Sonnet 4 is the strongest instruct model on the public LCBv6 leaderboard.
+Shaded regions show the standard deviation across 3 seeds.
+Progress in deep reinforcement learning has shown that iterating on experience—acting, receiving feedback, and updating a policy—can unlock capabilities that are difficult to obtain from static supervision alone (Mnih et al., 2015; Silver et al., 2016; 2017; Berner et al., 2019).
+The same theme now appears in large language models (LLMs): large-scale post-training with reinforcement learning (RL) has substantially improved performance on reasoning-heavy tasks, especially in settings with programmatic or otherwise verifiable evaluation (Jaech et al., 2024; Guo et al., 2025; Kimi et al., 2025; Olmo et al., 2025).
+Nevertheless, the dominant RL recipe for LLM post-training remains bottlenecked by credit assignment.
+Most current approaches operate in the setting of reinforcement learning with verifiable rewards (RLVR): given a question $x$, the model samples an answer $y\sim\pi_{\theta}(\cdot\mid x)$ and receives a scalar reward $r\in\mathbb{R}$, often binary (e.g., unit-tests pass/fail in code generation).
+Modern policy gradient RLVR methods such as Group Relative Policy Optimization (GRPO; Shao et al., 2024) estimate advantages from these sparse outcome rewards.
+Furthermore, when all rollouts in a group receive the same (often zero) reward, GRPO advantages collapse to zero and learning stalls.
+To overcome this sparsity, one might prefer distillation from a strong teacher (Guo et al., 2025; Yang et al., 2025; Lu & Thinking Machines Lab, 2025; Guha et al., 2026), which provides dense, token-level supervision.
+However, strong teachers are often unavailable in online learning, where the goal is to raise the capability ceiling beyond existing models.
+In this work, we argue that the key limitation is not RL per se, but the information bottleneck imposed by scalar outcome rewards.
+Many verifiable environments expose rich tokenized feedback beyond scalar rewards $r$, such as runtime errors, failing unit tests, or evaluations from an LLM judge.
+This feedback not only reveals whether a rollout was wrong, but also what went wrong.
+We formalize this more general setting as Reinforcement Learning with Rich Feedback (RLRF) and illustrate its difference from RLVR in Figure 2.
+Here, feedback can be any tokenized representation of any state reached by an agentic system.
+The central question becomes: how can we convert rich feedback into effective credit assignment without requiring external supervision from a strong teacher?
+Figure 2:
+Comparison of RLVR and RLRF settings.
+In Reinforcement Learning with Verifiable Rewards (RLVR), the agent learns from a scalar reward $r$, which often acts as an information bottleneck by masking the underlying environment state.
+In contrast, Reinforcement Learning with Rich Feedback (RLRF) utilizes tokenized feedback.
+This provides a significantly richer signal than a scalar reward, as the feedback can encapsulate both the reward as well as detailed observations of the state (such as runtime errors from a code environment or feedback from an LLM judge).
+⬇
+Runtime
+Error
+ZeroDivisionError
+:
+division
+by
+zero
+Line
+73
+in
+separateSquares
+(
+Solution
+.
+py
+)
+Last
+Executed
+Input
+[[26,30,2],[11,23,1]]
+Figure 3:
+Example of feedback from our code environment, inspired by LeetCode. Listings 4, 5, and 6 in the appendix show examples of feedback in case of a wrong answer, a memory error, and an index error.
+Our starting point is the observation that LLMs already possess a powerful mechanism for using feedback: in-context learning (Brown et al., 2020; Wei et al., 2022).
+When conditioned on feedback, the same model can often identify plausible mistakes and propose a corrected approach.
+A common example of such feedback is the summary of failed test cases on coding platforms like LeetCode (Figure 3).
+Many recent works leverage this capability to iteratively generate corrections (Chen et al., 2021a; Madaan et al., 2023; Shinn et al., 2023; Yao et al., 2024; Yuksekgonul et al., 2025; Lee et al., 2025).
+In contrast, we use the current policy as a “self-teacher” that, rather than sampling a new response, re-evaluates the existing rollout after receiving rich feedback.
+Including the feedback in-context transforms the model’s next-token distribution, allowing the self-teacher to agree or disagree with the student’s original choices at specific tokens.
+This yields dense, logit-level credit assignment.
+For example, when provided with the feedback from Figure 3, the self-teacher can identify how the initial attempt should be modified to avoid the runtime error.
+Crucially, this mechanism incurs no sampling overhead: we simply re-compute the log-probabilities of the original attempt under the self-teacher’s feedback-augmented context.
+Building on this idea, we introduce Self-Distillation Policy Optimization (SDPO), an on-policy algorithm that performs RL via self-distillation.
+SDPO samples rollouts from the current policy, obtains rich environment feedback, and then minimizes a logit-level distillation loss that matches the current policy’s next-token distribution to that of the self-teacher.
+Conceptually, SDPO addresses the central limitation of applying distillation to online learning: the absence of a stronger external teacher.
+Instead of relying on a fixed teacher, SDPO leverages the model’s ability to recognize its own mistakes in hindsight.
+By conditioning the current policy on the rich feedback it just received, we construct a self-teacher that provides the dense supervision of distillation while retaining the exploration benefits of on-policy RL.
+Table 1 summarizes how this positions SDPO relative to RLVR and distillation baselines.
+We include a comprehensive summary of related work in Section 6.
+We show that SDPO is a policy gradient algorithm whose advantages are estimated using the self-teacher.
+This enables the implementation of SDPO with minor changes to standard RLVR pipelines, simply by swapping out the advantages.
+| Method | Sampling | Signal | Feedback |
+| --- | --- | --- | --- |
+| SFT / Distillation (Hinton et al., 2015) | × off-policy | ✓ rich | × strong teacher |
+| On-Policy Distillation (Agarwal et al., 2024) | ✓ on-policy | ✓ rich | × strong teacher |
+| RLVR (such as GRPO) (Lambert et al., 2025) | ✓ on-policy | × weak | ✓ environment |
+| RL via Self-Distillation (SDPO) (ours) | ✓ on-policy | ✓ rich | ✓ environment |
+
+Table 1: Comparison of self-distillation to alternative methods for post-training LLMs.
+Summary of evaluation results.
+We evaluate SDPO in three online RL settings:
+• Learning without rich feedback (§3):
+We evaluate standard RLVR environments that do not return any feedback beyond scalar rewards.
+Here, SDPO treats successful attempts sampled in the current batch as “feedback” for failed attempts on the same question.
+We perform training runs on scientific reasoning and tool use, starting with Qwen3-8B and Olmo3-7B-Instruct.
+We find that SDPO outperforms a strong GRPO baseline that integrates recent improvements: 68.8% vs. 64.1% final accuracy on aggregate.
+SDPO achieves higher accuracy with up to $7\times$ shorter generation lengths compared to GRPO, demonstrating that effective reasoning need not be verbose.
+• Learning with rich feedback (§4):
+We evaluate competitive programming problems from LiveCodeBench v6 with LeetCode-style feedback.
+As shown in Figure 1, SDPO substantially improves over GRPO, reaching a higher final accuracy (48.8% vs. 41.2%) and achieving GRPO’s final accuracy in $4\times$ fewer generations.
+SDPO’s gains grow with model scale, suggesting that the ability for self-teaching emerges as models become stronger in-context learners.
+• Discovering novel solutions to hard tasks at test-time (§5):
+Finally, we demonstrate that SDPO can accelerate the discovery of solutions to difficult binary-reward questions.
+This contrasts with RLVR methods, which only begin learning once the first solution has been found.
+We leverage SDPO for Test-Time Self-Distillation, a form of test-time training where the model specializes to an individual test question.
+We consider very difficult LiveCodeBench questions, for which the base model’s pass@64 is below 0.03, and show that SDPO accelerates the discovery of solutions by $3\times$.
+2
+SDPO: Self-Distillation Policy Optimization
+We propose an algorithm that uses the in-context learning ability of the current policy for assigning credit.
+Our key object is the self-teacher, $\pi_{\theta}(\cdot\mid x,f)$, which refers to the current policy (the “student”) prompted with the question $x$ and the rich feedback $f$.
+Next to the student’s original attempt $y$, $f$ may incorporate two key kinds of feedback: any environment output (such as runtime errors from a code environment) and a sample solution if $x$ was already solved with another attempt in the rollout group.[^1]
+
+[^1]: In standard RLVR implementations a rollout group contains multiple simultaneous attempts for $x$.
+As discussed before, the self-teacher $\pi_{\theta}(\cdot\mid x,f)$ should have a higher accuracy than the student $\pi_{\theta}(\cdot\mid x)$ since it sees additional information in-context.
+This leads us to observe:
+We can use the same policy in two different roles: As the student for the initial attempt and as the teacher to determine the value of actions in hindsight.
+We introduce Self-Distillation Policy Optimization (SDPO), which repeatedly distills the self-teacher into the student.
+Given a question $x$, we first sample rollouts from the student $\pi_{\theta}$ and obtain corresponding environment feedback.
+We then use the KL-divergence, $\mathrm{KL}\left(p\,\|\,q\right)=\sum_{i}p(i)\log\frac{p(i)}{q(i)}$, as a distance measure for the next-token distributions of student and teacher, and optimize a standard logit distillation loss:
+$$\mathcal{L}_{\mathrm{SDPO}}(\theta):=\sum_{t}\mathrm{KL}\Big(\pi_{\theta}(\cdot\mid x,y_{<t})\,\Big\|\,\mathrm{stopgrad}\big(\pi_{\theta}(\cdot\mid x,f,y_{<t})\big)\Big)\tag{1}$$
+Algorithm 1 SDPO
+1: Language model $\pi_{\theta}$; dataset with questions $x$; number of rollouts $G$ per question; environment to obtain feedback for attempts.
+2: repeat
+3: Sample question $x$ from dataset.
+4: Sample responses: $\{y_{i}\}_{i=1}^{G}\sim\pi_{\theta}(\cdot\mid x)$.
+5: Evaluate responses to obtain feedback $f_{i}$.
+$\triangleright$ Self-distillation:
+6: Compute log-probs of self-teacher $\log\pi_{\theta}(y_{i,t}\mid x,f_{i},y_{i,<t})$.
+7: Update $\theta$ with gradient descent on $\mathcal{L}_{\mathrm{SDPO}}(\theta)$.
+8: until converged
+where the stopgrad operator blocks gradients from flowing through the teacher, and thus prevents it from regressing towards the student and ignoring $f$.
+The intuitive role of the teacher is to determine where and how the student's original attempt $y$ was wrong through retrospection based on the feedback $f$.
+Figure 4 shows an example of self-teaching with Qwen3-8B as student and self-teacher.
+We summarize SDPO in Algorithm 1 and display the teacher's reprompt template in Table 2.
+Figure 4: Example of self-teaching with Qwen3-8B. The answer is generated by the model before seeing the feedback. Then, we re-evaluate the log-probs of the original attempt with the self-teacher after seeing the feedback. We show the per-token $\log\big(\mathbb{P}(\text{self-teacher})/\mathbb{P}(\text{student})\big)$, with red indicating negative values (self-teacher disagrees) and white indicating values around zero. Notably, in this example, Qwen3-8B identifies the error through retrospection without an explicit solution. Further, the activation is sparse, identifying where mistakes happen and adjusting to the student's response distribution.
+User:
+prompt
+Correct solution:
+successful_previous_rollout
+The following is feedback from your unsuccessful earlier attempt:
+environment_output
+Correctly solve the original question.
+Assistant:
+original_response
+Table 2:
+Template for self-teacher.
+prompt
+is replaced with the question. A sample solution previously generated by the student is substituted for
+successful_previous_rollout
+(if available for this question; otherwise the paragraph is skipped).
+environment_output
+is replaced with the environment output (see, e.g.,
+Figure
+˜
+3
+) from the models’ original attempt (if it was not successful and there is no solution; otherwise the paragraph is skipped). If the models’ original attempt was successful, this attempt is passed as the correct solution.
+original_response
+is replaced with the models’ original attempt to re-evaluate its log-probabilities under the self-teacher.
+We can derive the SDPO gradient as follows (see Section B.1 for details):
+Proposition 2.1. Let $\mathcal{V}$ be the set of tokens in the vocabulary. The gradient of $\mathcal{L}_{\mathrm{SDPO}}$ is
+$$\nabla_{\theta}\,\mathcal{L}_{\mathrm{SDPO}}(\theta)=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t=1}^{|y|}\sum_{\hat{y}_{t}\in\mathcal{V}}\nabla_{\theta}\log\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})\cdot\log\frac{\pi_{\theta}(\hat{y}_{t}\mid x,y_{<t})}{\pi_{\theta}(\hat{y}_{t}\mid x,f,y_{<t})}\right].\tag{2}$$
+2.1
+Comparison to RLVR
+Note that the SDPO gradient is a (negated) logit-level policy gradient where the advantages are estimated using the self-teacher.
+[Footnote 2: See Section A.3 for a detailed comparison of the SDPO gradient to the standard policy gradient.]
+We can therefore reuse standard RLVR implementations and simply swap out the advantages.
+Let $y_{i}$ be the $i$-th rollout from a rollout group of size $G$ for question $x$, then we have:
+$$A_{i,t}^{\mathrm{GRPO}}(\hat{y}_{i,t}):=\mathbbm{1}\{y_{i,t}=\hat{y}_{i,t}\}\left(r_{i}-\mathrm{mean}\{r_{i}\}_{i=1}^{G}\right),\qquad A_{i,t}^{\mathrm{SDPO}}(\hat{y}_{i,t})=\log\frac{\pi_{\theta}(\hat{y}_{i,t}\mid x,f_{i},y_{i,<t})}{\pi_{\theta}(\hat{y}_{i,t}\mid x,y_{i,<t})}.$$
+The GRPO advantages are zero on any non-generated token and constant within a rollout $y_{i}$.
+[Footnote 3: We use the GRPO (Shao et al., 2024) advantage without normalization (Liu et al., 2025b).]
+In contrast, the SDPO advantages are zero only for tokens where student and teacher perfectly agree.
+The SDPO advantage is positive for tokens which are more likely under the teacher while being negative for tokens which are less likely under the teacher.
+Thus, SDPO can be seen as a direct extension of standard RLVR methods in two ways:
+1.
+from 1-bit feedback to
+allowing arbitrary sequences of tokens as feedback
+, and
+2.
+leveraging this rich feedback to
+estimate dense logit-level advantages
+.
+This tight connection to RLVR methods also enables a straightforward extension of the SDPO gradient from
+Equation
+˜
+2
+to off-policy data via PPO-style clipped importance sampling
+(Schulman et al.,
+2017
+)
+, see
+Section
+˜
+A.3
+.
+2.2
+Compute time & memory
+Figure 5:
+Time per step for SDPO vs GRPO (solid: without code environment, light: with code environment).
+The only computational overhead of SDPO compared to GRPO is the additional computation of log-probs from the self-teacher, which can be effectively parallelized and is substantially faster than sequential generation.
+Figure
+˜
+5
+compares the compute time of SDPO and GRPO.
+As expected, the compute overhead of SDPO is relatively small.
+Here, we use a micro batch size of 2 [Footnote 4: The micro batch size corresponds to the number of rollouts we train on at a time while accumulating gradients.]; compute time can be further reduced by using larger micro batch sizes.
+Naively computing the KL divergence between student and teacher requires holding full logits of both models in memory.
+To avoid this, we approximate the KL divergence in the SDPO loss by performing top-$K$ distillation (i.e., only computing the top-$K$ logits of the student and the corresponding logits of the teacher alongside a term capturing the tail probability; cf. Section A.2). With a reasonable choice of $K$ (e.g., $K=100$), this avoids virtually any memory overhead while capturing most of the information.
+2.3
+Stability improvements
+We find that two practical modifications significantly enhance the training stability of SDPO.
+First, we employ a regularized self-teacher, implemented either via an exponential moving average (EMA) of the student parameters or by interpolating the current teacher with the initial teacher (cf.
+Section
+˜
+A.1
+).
+As detailed later, both strategies effectively stabilize learning.
+Second, we adopt the symmetric Jensen-Shannon divergence for the distillation loss; this formulation has similarly been shown to improve stability in on-policy distillation from external teachers
+(Agarwal et al.,
+2024
+)
+.
+3
+Learning without Rich Environment Feedback
+We first evaluate SDPO in standard RLVR environments, where feedback is limited to scalar rewards.
+Instead of using the scalar reward, SDPO treats successful attempts sampled in the current batch as “feedback” for failed attempts on the same question.
+By comparing the student’s attempt with a correct solution, the self-teacher can identify where the student was wrong and provide dense credit assignment.
+Figure 6:
+Training progression of Olmo3-7B-Instruct on Chemistry. We report the average accuracy across 16 samples per question and a rolling average of response lengths over 5 steps. We report GRPO with the optimal hyperparameters for this model and task.
+3.1
+Experimental setting
+We evaluate tasks on which the model has not been explicitly fine-tuned:
+•
+Science Q&A
+(Chemistry, Physics, Biology, Materials science): Undergraduate-level scientific reasoning using reasoning subsets (L3) from SciKnowEval
+(Feng et al.,
+2024
+)
+.
+•
+Tool use
+: Mapping a tool-API specification and user request to the correct tool call, using ToolAlpaca
+(Tang et al.,
+2023
+)
+.
+We perform a train-test split to test in-domain generalization.
+We use Qwen3-8B
+(Yang et al.,
+2025
+)
+and Olmo3-7B-Instruct
+(Olmo et al.,
+2025
+)
+as initial checkpoints and report avg@16 relative to wall-clock training time, excluding initialization & validation.
+Baselines.
+We compare SDPO to an improved variant of
+GRPO
+(Shao et al.,
+2024
+)
+, which incorporates several recent modifications
+(Olmo et al.,
+2025
+; Khatri et al.,
+2026
+)
+such as asymmetric clipping
+(Yu et al.,
+2025
+)
+, avoiding biased normalization
+(Liu et al.,
+2025b
+)
+, and correcting for off-policy data when using efficient inference frameworks
+(Yao et al.,
+2025
+)
+.
+We integrate these modifications into a GRPO implementation that represents a strong baseline, as detailed in
+Equation
+˜
+10
+in
+Section
+˜
+A.3
+.
+GRPO enables off-policy training through PPO’s clipped importance weighting
+(Schulman et al.,
+2017
+)
+.
+We additionally report the special case of
+on-policy GRPO
+(matching the hyperparameters of vanilla SDPO).
+For both baselines, we perform a hyperparameter sweep and report results for the models that achieve the highest validation performance across all target tasks.
+Hyperparameters and training details are provided in
+Appendix
+˜
+E
+.
+We use the
+verl
+library
+(Sheng et al.,
+2025
+)
+for fast multi-GPU training.
+3.2
+Results
+Table
+˜
+3
+summarizes our results.
+We find that SDPO outperforms GRPO across almost all runs, often leading to substantial improvements.
+SDPO learns notably faster than GRPO, performing close to 5 hours of GRPO training after only 1 hour of training with SDPO in several cases.
+SDPO achieves a particularly substantial improvement over GRPO on the Chemistry task, as is displayed in Figure
+6
+(left)
+.
+With Olmo3-7B-Instruct, SDPO achieves the 5h GRPO accuracy in 30 minutes of wall-clock training time, a $10\times$ speedup.
+Moreover, SDPO’s 5h accuracy is more than 20 %-points higher than that of GRPO.
+Chemistry
+Physics
+Biology
+Materials
+Tool use
+1h
+5h
+1h
+5h
+1h
+5h
+1h
+5h
+1h
+5h
+Qwen3-8B
+35.6
+59.2
+27.9
+58.9
+57.5
++ GRPO
+54.7
+54.7
+60.0
+60.0
+63.8
+63.8
+72.7
+72.7
+34.3
+34.3
+51.8
+51.8
+74.3
+77.1
+77.1
+64.9
+64.9
+67.7
+67.7
++ GRPO (on-policy)
+54.2
+69.6
+63.6
+63.6
+44.4
+44.4
+73.9
+74.1
+60.2
+65.7
++
+SDPO
+(on-policy)
+60.0
+70.1
+66.6
+75.6
+51.5
+52.9
+72.1
+78.4
+68.0
+68.5
+Olmo3-7B-Instruct
+18.8
+37.7
+18.1
+36.7
+39.3
++ GRPO
+32.7
+46.8
+46.8
+55.3
+63.3
+63.3
+47.8
+62.0
+70.9
+75.0
+75.0
+56.4
+65.0
++ GRPO (on-policy)
+48.8
+54.3
+62.7
+62.7
+54.2
+63.8
+73.3
+73.5
+56.8
+60.6
++
+SDPO
+(on-policy)
+59.2
+76.8
+59.9
+66.1
+56.1
+58.3
+58.3
+73.7
+79.1
+60.8
+62.1
+62.1
+Table 3:
+Comparison of SDPO and GRPO on reasoning-related benchmarks.
+We report the highest achieved avg@16 within 1 hour and 5 hours of wall-clock training time, respectively. Both SDPO and on-policy GRPO perform one gradient step per generation batch, while GRPO performs 4 off-policy mini batch steps. We select optimal hyperparameters for SDPO and baselines based on 5h accuracy. Each run is performed on a node with 4 NVIDIA GH200 GPUs. Together with initialization and validation, each run takes approximately 6 hours.
+We remark that our results with SDPO use strictly on-policy training (i.e., one gradient step per generation batch).
+Given the known efficiency gains of off-policy methods that perform multiple gradient updates per generation batch, we believe that studying SDPO with off-policy updates is an exciting direction for future work.
+Takeaway 1
+We demonstrate that SDPO can learn to reason effectively, generalizing to challenging reasoning tasks.
+Without requiring any modification to existing RLVR environments, SDPO outperforms GRPO substantially in several cases.
+3.3
+Self-distillation learns to reason concisely
+We consistently observe that SDPO produces substantially shorter generations than GRPO while achieving higher accuracy.
+SDPO’s responses are more than
+3
+×
+3\times
+shorter on average across tasks (cf.
+Table
+˜
+8
+in
+Appendix
+˜
+D
+).
+On Chemistry with Olmo3-7B-Instruct, SDPO even achieves a
+7
+×
+7\times
+reduction in response length relative to GRPO while maintaining higher accuracy (Figure
+6
+(right)
+).
+While recent progress in RLVR has demonstrated that scaling response length is a powerful driver of emergent reasoning capabilities
+(Jaech et al.,
+2024
+; Guo et al.,
+2025
+; Muennighoff et al.,
+2025
+)
+, our results suggest that effective reasoning need not always be verbose. We find that SDPO improves the
+efficiency
+of reasoning.
+Qualitatively, we observe that the longer responses from GRPO often stem from “superficial” reasoning rather than necessary cognitive steps.
+GRPO frequently generates filler phrases like “Hmm” and “Wait” or enters circular logical loops that repeat previous steps verbatim.
+Figure
+˜
+7
+displays a representative example of this phenomenon.
+Remarkably, SDPO’s generations remain concise and avoid these superficial patterns.
+This may be explained by SDPO’s dense credit assignment, which assigns a specific advantage to each next-token prediction, leading to sparse advantages (cf.
+Figure
+˜
+21
+in
+Appendix
+˜
+F
+).
+By improving the efficiency of reasoning, SDPO reduces inference generation time and demonstrates that reasoning performance can be improved by refining
+how
+the model reasons, not just how
+long
+it reasons.
+…
+Alternatively
+…
+Closer to D? No
+…
+Wait I’m going in circles
+…
+Wait, perhaps the correct answer is B
+…
+10
+1.85
+≈
+69.3
+\smash{10^{1.85}\approx 69.3}
+…
+Ah, this works
+…
+Wait I think I messed up
+…
+Hmm
+…
+10
+1.85
+≈
+69.3
+\smash{10^{1.85}\approx 69.3}
+…
+Thus, the correct answer is likely B: 1.85.
+<answer>
+B
+</answer>
+(a)
+GRPO (5,549 tokens)
+…
+At pH 7.4, all functional groups are neutral
+…
+maintaining a balance between hydrophobic and hydrophilic character
+…
+[The] overall polarity
+…
+keeps logD from being very high
+…
+or very low
+…
+[typically falling] in the 2.0-3.0 range, with 2.61 (C) being a reasonable estimate
+…
+<answer>
+C
+</answer>
+(b)
+SDPO (764 tokens)
+Figure 7:
+Example responses from GRPO and SDPO after 50 training steps to the following question: “What is the correct octanol/water distribution coefficient logD under the circumstance of pH 7.4 for the molecule
+O=C1O[C@@H](COc2ccon2)CN1c1ccc(C2=CCOCC2)c(F)c1
+?” The answer options are A: 1.32, B: 1.85, C: 2.61, D: 3.76. The correct answer is
+C
+.
+GRPO’s answer contains
+5
+×
+5\times
+“Hmm.”,
+9
+×
+9\times
+“No.”, and
+25
+×
+25\times
+“Wait”. Further, GRPO’s answer repeats calculations such as “
+10
+1.85
+≈
+69.3
+\smash{10^{1.85}\approx 69.3}
+”, which appears four times, and the model even explicitly generates “Wait I’m going in circles”.
+SDPO’s answer avoids any circular reasoning and is more than
+7
+×
+7\times
+shorter. The base model is Qwen3-8B.
+4
+Learning with Rich Environment Feedback
+We next evaluate SDPO on coding tasks.
+Coding is a canonical example of an RL environment that provides rich feedback, such as runtime errors and failed unit tests.
+Learning to solve these coding problems requires strong credit assignment since the student must identify its precise mistakes to avoid repeating them in the future.
+LiveCodeBench
+(LCB; Jain et al.,
+2025
+)
+provides a set of contest-style coding problems, ranging from simple to competition-level.
+We restrict our evaluation to the most recent LCBv6 subset of LCB, which contains 131 questions released between February and May 2025.
+We consider a setting with public and private unit tests, common for code contests and coding platforms like LeetCode, where the public tests are used for evaluation during training and the private tests are used for validation
+(Chen et al.,
+2022
+; Le et al.,
+2022
+; El-Kishky et al.,
+2025
+; Samadi et al.,
+2025
+)
+.
+5
+5
+5
+We select public tests as a 50% random subset of private tests.
+We use the Qwen3
+(Yang et al.,
+2025
+)
+model family for our experiments, with Qwen3-8B as default unless otherwise specified.
+We report the average accuracy over 4 rollouts and use the same GRPO baseline as outlined in
+Section
+˜
+3.1
+.
+Results.
+Figure
+˜
+1
+compares the learning curves of SDPO and GRPO on LCBv6.
+We find that SDPO achieves a substantially higher final accuracy (48.8%) than GRPO (41.2%) while also outperforming the strongest instruct models on the public LCBv6 leaderboard:
+6
+6
+6
+On the public leaderboard, the LCBv6 subset can be obtained by selecting February to May 2025.
+Claude Sonnet 4 (40.5%) and Claude Opus 4 (39.7%).
+Furthermore, SDPO reaches the final accuracy of GRPO in
+4
+×
+4\times
+fewer generations.
+We include an extended comparison to other RLVR baselines that perform similarly to GRPO in
+Table
+˜
+9
+in the appendix.
+Differentiating between the easy, medium, and hard questions of LCB, we find that SDPO particularly improves over GRPO in solving medium and hard questions (cf.
+Figure
+˜
+15
+in the appendix).
+4.1
+Self-distillation benefits from stronger models
+Figure 8:
+SDPO improves with model size.
+We compare the final LCBv6 validation accuracy of SDPO and GRPO at train step 80, across model sizes from Qwen3.
+The ability of SDPO’s teacher to perform accurate retrospection appears to be an emergent phenomenon with scale.
+We include an additional scaling study with Qwen2.5-Instruct in the appendix (cf.
+Figure
+˜
+17
+) which further supports this finding.
+Error bars indicate the standard error across 3 seeds.
+A central question for our work is whether SDPO is sensitive to the in-context learning ability of the base model.
+Intuitively, we expect that SDPO benefits from a strong in-context learner, since this enables the teacher to perform more accurate retrospection.
+To answer this question, we perform a scaling study with different model sizes from the Qwen3
+(Yang et al.,
+2025
+)
+family.
+As shown by extensive prior work, the ability to learn in-context increases with model size
+(e.g., Brown et al.,
+2020
+)
+.
+As depicted in
+Figure
+˜
+8
+, SDPO significantly outperforms GRPO on larger models while only slightly improving over GRPO on smaller models.
+To determine whether SDPO can also underperform GRPO on a model weaker than Qwen3-0.6B, we performed an additional scaling study with Qwen2.5-Instruct
+(Qwen et al.,
+2024
+)
+.
+While outperforming GRPO with Qwen2.5-7B and performing similarly with Qwen2.5-3B, we find that SDPO underperforms GRPO on Qwen2.5-1.5B, as seen in
+Figure
+˜
+17
+in
+Appendix
+˜
+D
+.
+Takeaway 2
+Our results suggest that the marginal improvement of SDPO over GRPO is tightly coupled with the strength of the base model, and motivate future study on models stronger than Qwen3-8B.
+In the same way that in-context learning is an emergent phenomenon with scale, the self-teacher’s ability to perform accurate retrospection in SDPO appears to be emergent with scale.
+4.2
+Self-distillation performs dense credit assignment
+Figure 9:
+Dense credit assignment in SDPO in the example from
+Figure
+˜
+4
+. Shown in blue are tokens which become more likely under the self-teacher. The self-teacher identifies how the returned range has to be modified so that it does not contain
+n
+.
+Whereas GRPO assigns a constant advantage to each generated token, SDPO assigns an individual advantage to
+each possible next token
+along the generated sequence based on the agreement of student and teacher.
+At each position
+t
+t
+in the generated sequence
+y
+y
+, there are
+|
+𝒱
+|
+|\mathcal{V}|
+possible next tokens where
+𝒱
+\mathcal{V}
+is the vocabulary.
+In distillation, this level is typically called the
+logit-level
+since it corresponds to the logits of the model.
+In practice, we approximate the full next-token distribution by the top-
+K
+K
+tokens, and as such, SDPO assigns
+|
+y
+|
+⋅
+K
+|y|\cdot K
+unique advantages per sequence.
+This is illustrated in
+Figure
+˜
+9
+and allows SDPO to perform dense credit assignment.
+A natural question is whether the performance gains of SDPO are due to leveraging rich feedback in RLRF or due to the dense credit assignment of SDPO.
+To answer this question, we ablate the performance of SDPO in three configurations:
+•
+Logit-level SDPO:
+credit assignment over the 100 most likely tokens (under the student) at each position.
+•
+Token-level SDPO:
+credit assignment over the most likely token at each position.
+•
+Sequence-level SDPO:
+We compute SDPO advantages for all generated tokens and average them to produce a single scalar advantage per sequence (as in GRPO). This does not perform denser credit assignment than GRPO but still leverages the rich feedback
+f
+f
+.
+As shown in Figure
+10
+(left)
+, the dense credit assignment of logit-level SDPO leads to significant performance gains over token-level SDPO and sequence-level SDPO.
+Nevertheless, even sequence-level SDPO outperforms GRPO, indicating that leveraging rich feedback in RLRF can lead to substantial gains over RLVR methods even without dense credit assignment.
+4.3
+The self-teacher improves during training
+Figure 10:
+Left: Rich feedback in RLRF and dense credit assignment of SDPO are complementary.
+We compare logit-level, token-level, and sequence-level SDPO advantages to GRPO. While denser credit assignment in SDPO is beneficial (logit-level > token-level > sequence-level), even sequence-level SDPO significantly outperforms GRPO due to leveraging the rich feedback. Error bars indicate the standard error across 3 seeds.
+Right: The self-teacher improves during training.
+We display the generative accuracy of the self-teacher compared to student on the current training batch (with a rolling average over 5 steps). The final student score is taken at step 80. Notably, the performance of the student significantly surpasses the initial teacher’s accuracy. Error bars indicate the standard deviation across 3 seeds.
+| Teacher | Accuracy | Avg accuracy |
+|---|---|---|
+| $q_{\theta}$ | $36.1\pm 1.6$ | $29.8\pm 1.3$ |
+| $q_{\theta_{\mathrm{ref}}}$ | $48.8\pm 0.7$ | $44.4\pm 0.2$ |
+| Trust-region | $\mathbf{50.6}\pm 0.9$ | $\mathbf{45.6}\pm 0.2$ |
+| EMA | $49.3\pm 0.3$ | $\mathbf{45.3}\pm 0.2$ |
+Table 4:
+Best/average accuracy until step 90 of various methods for teacher regularization. Trust-region and EMA teachers use
+α
+=
+0.01
+\alpha=0.01
+. Training of the
+q
+θ
+q_{\theta}
+eventually diverges. Error ranges indicate standard errors across 3 seeds.
+Contrary to standard distillation, the self-teacher in SDPO is not frozen, but updated throughout training.
+This is a critical component of SDPO, since it enables the teacher to improve over time, which means that the student can learn from a stronger target.
+To investigate whether the self-teacher improves during training, we plot the average accuracy when
+generating
+using the self-teacher in Figure
+10
+(right)
+.
+We find that the self-teacher improves significantly during training.
+Most notably, the student’s accuracy surpasses the initial teacher’s accuracy in later stages of training.
+This demonstrates that SDPO enables true bootstrapping of a weak model to a strong model, without the initial self-teacher’s performance limiting the final student.
+As described in
+Section
+˜
+2.3
+, SDPO uses a regularized teacher to stabilize training.
+As can be seen in
+Table
+˜
+4
+, a non-regularized teacher significantly underperforms the regularized teachers.
+Furthermore, trust-region and EMA teachers outperform the teacher frozen at the initial teacher’s parameters, showing that the teacher improves through parameter sharing with the student.
+Yet, SDPO performs well even with a frozen teacher.
+4.4
+On-policy self-distillation avoids catastrophic forgetting
+Prior work has shown that a key benefit of on-policy algorithms, such as GRPO, is that models tend not to forget previously obtained capabilities
+(Shenfeld et al.,
+2026
+; Chen et al.,
+2025b
+; Lu & Thinking Machines Lab,
+2025
+)
+.
+This is practically desirable since it enables continual training pipelines where a model is trained sequentially on diverse tasks without the need to retrain from scratch.
+To evaluate forgetting, we test the final checkpoints of GRPO and SDPO on diverse holdout tasks: IFEval
+(Zhou et al.,
+2023
+)
+, which tests the ability of a model to follow precise format instructions; ArenaHard-v2
+(Li et al.,
+2025
+)
+, which is an LLM-judged benchmark of real-world instruction-following prompts derived from LMArena
+(Chiang et al.,
+2024
+)
+; and MMLU-Pro
+(Wang et al.,
+2024b
+)
+, which tests broad multi-task knowledge and reasoning.
+As displayed in
+Table
+˜
+5
+, SDPO learns the new task while mitigating degradation of initial capabilities, overall achieving a better performance–forgetting tradeoff than GRPO.
+Off-policy self-distillation baseline.
+As an additional baseline, we consider training the student via supervised fine-tuning (SFT) on successful generations from the self-teacher
+(Scheurer et al.,
+2023
+; Dou et al.,
+2024
+)
+.
+7
+7
+7
+SFT on a teacher’s predictions is a standard off-policy distillation approach
+(Kim & Rush,
+2016
+)
+.
+This requires
+2
+×
+2\times
+the generations of SDPO for the same number of steps, since we have to generate from both the student and the teacher.
+We report SFT on the successes of the self-teacher, which achieves a higher accuracy than also including initial successes from the student in the SFT data.
+As shown in
+Table
+˜
+5
+, SFT on the self-teacher significantly underperforms SDPO on LCBv6, while leading to worse forgetting of prior capabilities.
+This mirrors prior findings on the instability of off-policy imitation
+(see, e.g., Agarwal et al.,
+2024
+)
+.
+| | Task: LCBv6 | Holdout: IFEval | Holdout: ArenaHard-v2 (hard prompt) | Holdout: ArenaHard-v2 (creative writing) | Holdout: MMLU-Pro | Avg. (holdout) |
+|---|---|---|---|---|---|---|
+| Base | 27.9 | 83.9 | 14.0 | 13.7 | 62.5 | 43.5 |
+| SFT on self-teacher | 42.7 | 83.7 | 11.2 | 8.9 | 61.9 | 41.4 |
+| GRPO | 41.2 | 82.2 | 12.0 | 10.8 | 62.3 | 41.8 |
+| SDPO | 48.8 | 83.2 | 12.3 | 11.1 | 62.9 | 42.4 |
+Table 5:
+On-policy methods do not suffer from catastrophic forgetting.
+We compare the accuracy of the final checkpoint on the training task LCBv6 and on holdout tasks IFEval, ArenaHard-v2, and MMLU-Pro. We compare to a baseline that trains directly on responses generated by the initial self-teacher with SFT. Overall, SDPO achieves the best performance–forgetting tradeoff. We include additional baseline results in
+Table
+˜
+9
+in the appendix.
+4.5
+Can GRPO and SDPO be combined?
+GRPO utilizes Monte Carlo advantages, which are unbiased with respect to the objective of maximizing expected reward $J(\theta):=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[r(y\mid x)\right]$.
+In contrast, SDPO advantages are inherently biased with respect to $J(\theta)$ due to being computed from rich feedback and a self-teacher.
+This dichotomy parallels the fundamental distinction between Monte Carlo and bootstrapped advantages in RL: while the latter are biased, they typically yield lower variance
+(Sutton & Barto,
+1998
+; Schulman et al.,
+2016
+)
+.
+This motivates a hybrid approach that combines reward-derived GRPO advantages with feedback-derived SDPO advantages:
+$$A_{i,t}^{\mathrm{SDPO+GRPO}}(\hat{y}_{i,t}):=\lambda A_{i,t}^{\mathrm{GRPO}}(\hat{y}_{i,t})+(1-\lambda)A_{i,t}^{\mathrm{SDPO}}(\hat{y}_{i,t}),\qquad\lambda\in[0,1].\tag{3}$$
+Figure 11: We compare the LCBv6 validation accuracy at step 80, across model sizes from Qwen3.
+SDPO+GRPO significantly outperforms SDPO on the weaker Qwen3-0.6B, while slightly underperforming SDPO on stronger models.
+We use $\lambda=0.9$.
+Error bars indicate the standard error across 3 seeds.
+As shown in
+Figure
+˜
+11
+, SDPO+GRPO appears to be more robust to weaker models than SDPO.
+Intuitively, in a weaker model such as Qwen3-0.6B, the SDPO advantages are less reliable, and hence including the GRPO advantage helps to stabilize training.
+In contrast, we find that SDPO+GRPO slightly underperforms SDPO on stronger models such as Qwen3-8B.
+This suggests that the signal of GRPO, only informed by a scalar reward, can be actively harmful with a strong initial model.
+4.6
+Which feedback is most informative?
+To understand which type of rich feedback is most informative, we ablate the three types of feedback present in a verifiable environment like code generation: the sample solution (if a successful rollout is available in the current rollout group), the environment output (such as runtime errors), and the student’s original attempt.
+Sample solutions.
+Including a sample solution from a failed attempt’s rollout group (if available) closely mirrors the group-relative advantages of GRPO.
+We emphasize that these sample solutions are always generated by the student, as in GRPO, and do not require an expert model.
+They allow for disincentivizing unsuccessful approaches if the model is already able to solve the question.
+However, unlike GRPO where all tokens receive the same negative advantage, the self-teacher can identify specific mistakes and provide feedback on how to fix them.
+Environment output.
+The environment output describes the state of the environment after the student’s attempt.
+This is complementary to sample solutions since it can provide useful signal even if the student has never solved the question before (a setting we explore extensively in
+Section
+˜
+5
+).
+Leveraging environment output is a key differentiating factor between RLRF and RLVR settings.
+Student’s original attempt.
+The student’s original attempt
+y
+y
+does not have to be included in the reprompting template of the teacher.
+Indeed, we find that including it biases the teacher towards the student’s attempt (cf.
+Table
+˜
+6
+).
+This reduces the entropy of the student’s distribution (particularly for initially uncertain tokens), thereby reducing exploration.
+| Feedback | Teacher before training: $\uparrow$ Acc. (%) | Teacher before training: $\downarrow$ Same output (%) | Student trained with SDPO: $\uparrow$ Acc. (%) | Student trained with SDPO: Avg. entropy |
+|---|---|---|---|---|
+| $f=$ output | $32.5\pm 0.5$ | $13.7\pm 0.6$ | $39.8\pm 0.2$ | $0.40\pm 0.0$ |
+| $f=$ solution | $\mathbf{42.4}\pm 1.0$ | $12.1\pm 0.7$ | $36.8\pm 2.7$ | $\emph{0.07}\pm 0.0$ |
+| $f=$ output + solution | $\mathbf{42.5}\pm 1.2$ | $\mathbf{10.1}\pm 0.2$ | $\mathbf{48.9}\pm 0.9$ | $0.37\pm 0.0$ |
+| $f=$ $y$ + output + solution | $39.3\pm 0.8$ | $30.0\pm 0.9$ | $44.5\pm 1.8$ | $\emph{0.23}\pm 0.0$ |
+Table 6:
+Performance of varying kinds of feedback.
+We evaluate informativeness of feedback based on SDPO training (until step 70) as well as the direct impact on the self-teacher. “Same output” measures the percentage of cases where the teacher receives the same environment output as the student’s initial attempt (i.e., not exploring alternative approaches). We observe that environment output and sample solutions are complementary and each provide informative feedback. Naively including only solutions or initial attempts
+y
+y
+significantly reduces diversity in the teacher and student. We remark that the sample solutions are generated by the student, enabling similar group-relative advantage estimation to GRPO. Error bars indicate standard deviation across 3 seeds.
+We summarize results in Table 6, where we evaluate the effect on SDPO training as well as the direct impact on the self-teacher.
+We find that environment output & sample solutions are complementary, each providing informative feedback.
+Generally, we observe that performance is not sensitive to syntactic variations of the reprompting template from Table 2.
+5 Solving Hard Questions via Test-Time Self-Distillation
+In Sections 3 and 4, we have demonstrated that SDPO can substantially improve over RLVR methods when performing “train-time RL” for reasoning tasks.
+We now turn to a test-time setting where the model is given only a single hard (binary-reward) question $x$ and must discover a solution as quickly as possible:
+Definition 5.1 (Discovery time).
+The discovery time is the number of trials needed until a solution is found (i.e., the smallest $k$ with the $k$-th attempt $y_{k}$ receiving reward 1).
+Based on this notion, we can define a measure of the efficacy of discovery:
+$$\begin{split}\mathrm{discovery@}k:=&\ \mathbb{P}(\text{discovery time $\leq k$})\\
+=&\ \mathbb{P}(\text{$r(y_{1}\mid x)=1$ or $r(y_{2}\mid x)=1$ or \ldots or $r(y_{k}\mid x)=1$}),\end{split}\tag{4}$$
+where the probability is over any randomness in the algorithm producing $y_{k}$ and the rewards.
+Thus, the discovery@$k$ metric quantifies the probability of discovering the solution within $k$ steps.
+[Footnote 8] Our proposed discovery@$k$ metric is a canonical metric in the study of runtime speedup (i.e., time until termination, Dolan & Moré (2002)).
+While prior work has studied discovery with continuous rewards (e.g., Novikov et al., 2025; Yuksekgonul et al., 2026), discovery with language models in sparse or binary-reward settings does not allow “hill-climbing” a continuous reward and has remained less well understood.
+The most naive approach to discovery in binary-reward tasks is to sample repeatedly i.i.d. from the base model, also known as best-of-$k$.
+The canonical pass@$k$ metric for best-of-$k$ sampling is exactly the probability of discovering at least one solution within $k$ independent samples from a fixed model, coinciding with discovery@$k$.
+The discovery@$k$ metric generalizes pass@$k$ to algorithms that sample attempts sequentially.
+A common sequential approach re-prompts the base model with additional context from previous attempts (Madaan et al., 2023; Shinn et al., 2023).
+We refer to this as multi-turn sampling.
+Here, the model itself does not change, only its context evolves over time.
+Performing RLVR on the question $x$ does not improve over best-of-$k$ sampling from the base model, since a binary reward provides no signal until the first solution has already been found.
+[Footnote 9] For this reason, several works consider explicitly constructing curricula of solvable questions (e.g., Zhao et al., 2025; Huang et al., 2026; Diaz-Bone et al., 2025; Hübotter et al., 2025b), which self-distillation avoids. Other work found that RLVR yields limited improvement on hard questions (Yue et al., 2025).
+An RLRF method like SDPO does not face the same limitation, as it receives rich feedback from the environment after each attempt.
+This rich feedback enables the model to repeatedly “correct” its mistakes as it encounters them and receives feedback, even before ever discovering a solution.
+In contrast to multi-turn sampling, SDPO repeatedly compresses context $c=(y_{k},f_{k})$ by distilling $\pi_{\theta}(\cdot\mid x,c)$ into a model $\pi_{\theta^{\prime}}(\cdot\mid x)$, as we illustrate in Figure 12.
+This self-distillation enables SDPO to continually learn over long contexts, whereas the memory bottleneck of transformers inherently limits the context length of multi-turn sampling (Vaswani et al., 2017).
+In this section, we seek to answer the question:
+Can repeatedly compressing context into model weights via self-distillation
+accelerate discovery for hard questions?
+Figure 12:
+Compressing context into model weights via self-distillation.
+We illustrate the process of distilling the interaction history (context $c$) into the model parameters $\theta$.
+The model $\pi_{\theta}$ repeatedly attempts a fixed hard question $x$, generating an answer $y$ and receiving feedback $f$.
+Rather than appending this history to the context window, the model updates its weights $\theta_{t}\to\theta_{t+1}$ with SDPO (batch size $1$) based on the feedback, effectively “fixing” mistakes by encoding $\pi_{\theta}(\cdot\mid x,c)$ directly into the policy $\pi_{\theta^{\prime}}(\cdot\mid x)$.
+5.1 Experimental setting
+We consider a particularly challenging subset of questions from LCBv6 that are at Qwen3-8B’s performance ceiling and require significant test-time sampling to find any solution.
+Concretely, we define two groups using Qwen3-8B’s pass@$k$: hard tasks with $\text{pass@}64<0.5$ and very hard tasks with $\text{pass@}64<0.03$.
+Among these, we retain questions for which any of best-of-$k$, multi-turn, or SDPO find at least one solution within $512$ steps across $5$ seeds.
+This results in 19 hard and 9 very hard questions.
+For best-of-$k$ sampling under the base model, we report the standard $\text{pass}@k$ estimate (Chen et al., 2021b) from 2944 independent rollouts.
+As multi-turn sampling, we sequentially reprompt the model in-context using the concatenated feedback from previous attempts. To remain within Qwen3-8B’s 40k-token context limit, we employ a first-in, first-out sliding window, discarding the earliest feedback once the maximum prompt length (32k tokens) is reached.
+We ablate the multi-turn reprompting strategy in Figure 19 in Appendix D and find that retaining only past feedback while forgetting earlier attempts significantly outperforms the baseline that additionally retains past attempts.
+We evaluate SDPO with a batch size of 16. We ablate this choice in Figure 19 in Appendix D and find that overall performance differences are marginal, yet smaller batch sizes are beneficial for improvements at low generation budgets, while larger batch sizes result in more stable updates that still learn to solve questions at later stages into the run.
+5.2 Results
+Figure 13:
+Self-distillation at test-time solves LiveCodeBench questions that neither the base model nor multi-turn conversations can solve.
+Left: Very hard questions (9 total) from LCBv6 where the base model achieves $\text{pass}@64<0.03$, i.e., in less than 3% of cases, sampling 64 responses yields any success.
+Right: Hard questions (19 total) from LCBv6 where the base model achieves $\text{pass}@64<0.5$.
+We report the $\text{discovery}@k$ metric, representing the probability of discovering at least one solution within $k$ total generations.
+Across both difficulty levels, SDPO achieves higher $\text{discovery}@k$ rates at almost all generation budgets, compared to the base model and a multi-turn conversation baseline that receives the feedback in-context. We report the mean and bootstrapped 90% confidence intervals of the mean across 5 random seeds per question.
+Figure 13 compares $\text{discovery}@k$ for SDPO, multi-turn sampling, and best-of-$k$ sampling on very hard (left) and hard (right) questions from LCBv6. Across both difficulty levels, SDPO achieves substantially higher $\text{discovery}@k$ rates at almost all generation budgets.
+On very hard tasks, multi-turn and best-of-$k$ largely fail to solve questions within the available generation budget, achieving discovery@2750 of only $35.6\%$ and $41.5\%$, respectively, whereas SDPO discovers a solution in $53.2\%$ of cases.
+SDPO not only solves more questions overall but also does so with substantially fewer attempts.
+Notably, to reach a $22\%$ discovery probability on very hard questions, SDPO requires approximately $3\times$ fewer generations than best-of-$k$ and multi-turn sampling.
+On hard tasks, SDPO reaches a $78\%$ discovery@2750 probability while achieving a $67\%$ discovery probability with roughly $2.4\times$ fewer generations than best-of-$k$ and multi-turn sampling. Overall, multi-turn and best-of-$k$ sampling solve only $68.4\%$ and $72.3\%$ of questions, respectively.
+The context window length for multi-turn sampling is reached after 837 ($\pm 466$) steps for hard questions and after 1007 ($\pm 349$) steps for very hard questions, offering a possible explanation for its diminishing gains at high generation budgets.
+Question 3 is only solved by SDPO.
+SDPO solves all questions that are solved by best-of-$k$ and multi-turn sampling. Beyond that, SDPO uniquely discovers a solution for Q3, which is neither solvable with multi-turn sampling nor with best-of-$k$ sampling within 2750 attempts. In contrast, SDPO first discovers a solution for Q3 after 321 attempts, which corresponds to 20 iteration steps of self-distillation based on feedback with a batch size of 16. We include detailed per-question results in Table 10 in Appendix D.
+The initial self-teacher does not solve hard questions.
+Notably, the self-teacher’s initial accuracy is $<1$% for almost all questions, and even exactly 0% on 78% of them (Table 11 in Appendix D).
+This shows that a single turn of in-context feedback is insufficient to solve the problem.
+Despite this, the self-teacher’s credit assignment is sufficiently effective for SDPO to iteratively refine the policy and eventually solve these questions.
+Takeaway 3
+We demonstrate that rich environment feedback enables SDPO to significantly accelerate discovery for hard questions.
+This is in contrast to RLVR methods, which only receive a binary reward signal, and therefore only begin learning once the first solution has already been found.
+6 Related Work
+6.1 Reinforcement Learning with LLMs
+Recently, large-scale RL training on diverse tasks has significantly improved the performance of LLMs on general reasoning tasks (Guo et al., 2025; Kimi et al., 2025; Olmo et al., 2025; Jaech et al., 2024; Lambert et al., 2025).
+This progress is primarily enabled by RLVR methods that use Monte Carlo estimates of rewards, such as STaR or GRPO (Zelikman et al., 2022; Shao et al., 2024), similar to the classical REINFORCE algorithm (Williams, 1992).
+While several traditional RLVR algorithms rely on learning separate value networks (Schulman et al., 2017), they incur substantial memory costs and retain the information bottleneck of scalar rewards.
+In the RLVR setting, it is common for an (outcome) reward to be given only at the end of a sequence.
+To improve credit assignment, several works learn so-called process reward models (PRMs) that estimate rewards for each step in the sequence (Lightman et al., 2023; Wang et al., 2024a; Setlur et al., 2025).
+Unlike our RLRF setting, PRMs are typically trained on scalar rewards, either on value estimates for intermediate states or on outcome rewards (Cui et al., 2025).
+Unlike the self-teacher in SDPO, PRMs are a distinct model from the student, introducing significant memory overhead.
+Our work shows that each language model is implicitly a PRM through retrospection if given rich feedback.
+Conceptually, our work is related to “expert iteration” (Anthony et al., 2017) where a student is bootstrapped by repeatedly imitating an improved version of itself (called the “expert”).
+Canonically, the expert combines the student with test-time search, such as tree search (Anthony et al., 2017) or majority voting (Zuo et al., 2025).
+In contrast, SDPO leverages the student’s ability to learn from rich feedback provided in-context.
+6.2 Learning from Rich Feedback and through Retrospection
+Beyond scalar outcome rewards, recent works have leveraged rich execution or verbal feedback to guide generation (Gehring et al., 2025; Yuksekgonul et al., 2025).
+A primary line of research focuses on translating verbal feedback into reward functions for RL.
+This is often achieved by mapping feedback to discrete token-level rewards using an external frozen model (Wang et al., 2026), or by employing strong external LLMs to explicitly construct state-wise reward functions (Goyal et al., 2019; Xie et al., 2024; Urcelay et al., 2026).
+Alternatively, feedback can be utilized without explicit reward modeling.
+Several approaches focus on in-context improvement without integrating the process into the RL optimization loop (Chen et al., 2021a; Madaan et al., 2023; Shinn et al., 2023; Yao et al., 2024; Yuksekgonul et al., 2025; Lee et al., 2025).
+Others manually curate preference datasets by pairing responses before and after feedback to train with direct preference optimization (Stephan et al., 2024; Lee et al., 2024), though this requires additional generation and lacks the direct credit assignment of SDPO.
+Various recent works bootstrap thinking traces from known answers, using these answers as rich feedback (Zhou et al., 2026; Hatamizadeh et al., 2026; Zhang et al., 2025).
+A central object in several recent works is a feedback-conditioned policy $\pi_{\theta}(y\mid x,f)$, which learns answers $y$ that lead to feedback $f$ (Liu et al., 2023; Zhang et al., 2023; Luo et al., 2025), typically through supervised objectives.
+The idea behind these approaches is to deploy a policy conditioned on desirable (i.e., positive) feedback.
+This approach is conceptually related to goal-conditioned RL (Schaul et al., 2015; Liu et al., 2025a), where one can learn from negative examples through goal relabeling (Andrychowicz et al., 2017).
+Feedback-conditioned policies view feedback as a goal, whereas RLRF views feedback as a state that can be used to determine whether the goal $x$ is achieved.
+Unlike SDPO, these methods do not use feedback for credit assignment in negative trajectories, but rather as a data transformation for goal relabeling.
+6.3 Distillation
+Distillation is frequently employed as an alternative to supervised fine-tuning (SFT) when a strong teacher model is available.
+This approach transfers capabilities by training a student to mimic the output distribution or intermediate representations of the teacher (Hinton et al., 2015; Romero et al., 2015; Kim & Rush, 2016; Sanh et al., 2019; Xie et al., 2020).
+Distillation is typically performed on fixed off-policy datasets.
+To address the distribution shift between training and inference, recent works explore on-policy distillation, where the student learns from feedback of an external teacher on its own generations (Agarwal et al., 2024; Gu et al., 2024; Yang et al., 2025; Lu & Thinking Machines Lab, 2025).
+This mitigates the train-test mismatch, which relates closely to earlier work on online imitation learning (Ross et al., 2011).
+6.4 Self-Distillation
+The concept of self-distillation was first proposed by Snell et al. (2022) in a setting akin to supervised learning, introducing the idea of sampling from a model provided with extra context and training the same model to mimic these predictions without that context.
+This mechanism has proven effective for compressing behavior (Bai et al., 2022; Choi et al., 2022) and factual information (Eyuboglu et al., 2026; Kujanpää et al., 2025) into model weights.
+Beyond compressing a fixed context into model weights, recent works have used self-distillation to learn from environment feedback (Scheurer et al., 2023; Dou et al., 2024; Mitra & Ulukus, 2025).
+These approaches use an off-policy self-distillation objective, which substantially underperforms SDPO’s on-policy learning.
+Off-policy self-distillation trains the student on generations from the teacher, whereas SDPO trains the student to avoid mistakes in its own generations.
+In concurrent work, Chen et al. (2025c) apply on-policy self-distillation to grid world settings where feedback is a scalar reward, and a reflection stage in the self-teacher diagnoses possible mistakes, showing improved credit assignment compared to learning value networks for advantage estimation.
+7 Conclusion, Limitations, and Future Work
+We introduced Reinforcement Learning with Rich Feedback (RLRF), a paradigm where environments provide tokenized feedback beyond scalar rewards, and argued that this removes a key information bottleneck of RLVR.
+We then proposed Self-Distillation Policy Optimization (SDPO), which uses the current policy as a feedback-conditioned self-teacher and distills its corrected log-probabilities into the student.
+This leverages the model’s ability to learn from context for dense credit assignment.
+We further demonstrated that SDPO can be implemented as a minimal, drop-in modification to standard RLVR pipelines.
+Empirically, SDPO demonstrates superior sample efficiency and wall-clock convergence compared to GRPO on reasoning tasks, even when training in standard RLVR environments without rich feedback.
+SDPO’s gains grow with model scale, suggesting that the capacity for self-correction scales with the model’s in-context learning capabilities.
+Moreover, we show that performing SDPO at test-time on individual hard binary-reward tasks accelerates the discovery of solutions compared to strong baselines.
+SDPO enables learning from rich feedback in a way that parallels human cognition: utilizing precise outcomes rather than just binary rewards.
+By allowing the model to determine retrospectively how it should have acted, we demonstrate that language models can convert diverse tokenized feedback into effective self-supervision.
+Limitations.
+Our findings show that SDPO’s performance depends on a model’s in-context learning ability, suggesting that SDPO is primarily applicable for RL-training stronger base models, while it can underperform GRPO on weaker models.
+Moreover, performance depends on the quality of the environment feedback. If the environment provides uninformative or misleading feedback, a model may not be able to learn from it through SDPO.
+Finally, SDPO adds a small computational overhead compared to GRPO for computing the log-probs of the retrospective model.
+While often negligible, this may be a larger overhead for smaller models with shorter generation lengths, where generation time is comparatively small.
+Future Work.
+Our work highlights several exciting directions for future research:
+• Long-horizon and agentic settings. RLRF is particularly appealing when trajectories are long or expose information about intermediate states. Evaluating SDPO in agentic environments is a natural next step.
+• Training dynamics at scale. Beyond our evaluation on LiveCodeBench, it would be particularly interesting to scale SDPO to large multi-task RL training runs and further study its scaling properties with frontier base models.
+• Beyond verifiable rewards. While we focused on verifiable code generation, many tasks provide textual feedback without a ground-truth verifier. Investigating whether SDPO’s retrospection mechanism can improve alignment in open-ended text generation or continuous-reward tasks remains an open empirical question.
+• Behavioral differences in reasoning. We observed that SDPO induces qualitatively different reasoning patterns than GRPO, notably avoiding the latter’s tendency toward verbosity and superficial reasoning. Future work should systematically study how individual aspects, such as the reprompt template, influence behavior.
+Author Contributions
+Jonas Hübotter conceived of the project in summer 2025 and has been working on it full-time since then, leading the team.
+Jonas proposed the conceptual framework of self-distillation for credit assignment with input from Lejs, implemented the algorithm with help from others, led the quantitative experiments on LCBv6, and led the writing of the paper.
+Frederike Lübeck led the design of the code environment, led the design and evaluation of the TTT setting in Section 5 with input from Jonas, contributed to the project direction in discussions, and contributed significantly to the writing of the paper.
+Lejs Behric noted the dense credit assignment of knowledge distillation with strong teacher models in discussions with Jonas, inspiring the idea of self-distillation. Further, Lejs led the evaluation of different teacher templates, co-led the development of a tool for qualitative analysis of runs with Marco and Daniel, helped implement parts of the algorithm, and contributed to the project direction in discussions.
+Anton Baumann joined in December 2025 and led the evaluation of SDPO without rich feedback in Section 3 with input from Jonas, and contributed to the writing of the paper.
+Marco Bagatella and Daniel Marta co-led the development of a tool for qualitative analysis of runs with Lejs, contributed to the training infrastructure, and contributed to the project direction in discussions.
+Ido Hakimi significantly contributed to the initial codebase and experimental setup, contributed early algorithmic ideas, and contributed to the project direction in discussions.
+Idan Shenfeld, Thomas Kleine Buening, Carlos Guestrin, and Andreas Krause supported this project, with Idan and Carlos joining in December 2025. They made significant contributions to the project direction in discussions and gave valuable advice on our presentation. Thomas and Idan, in particular, significantly contributed to the development of core algorithmic ideas and design of experiments. Thomas further evaluated checkpoints on holdout benchmarks. Carlos suggested the qualitative analysis of reasoning traces in Figure 7 and the presentation of TTT results in Section 5. Andreas pointed out valuable connections to existing work in RL which shaped the direction of the project.
+Acknowledgments
+We would like to thank Akira Yoshiyama, Yassir Akram, Parnian Kassraie, Jonathan Thomm, Roman Vorushin, Afra Amini, Imanol Schlag, Yu Sun, and Moritz Hardt for helpful discussions.
+We thank Eduard Durech for helpful conversations regarding the scaling of RL fine-tuning and for his technical guidance on distributed infrastructure and long-context optimization.
+Furthermore, we would like to thank Leander Diaz-Bone for supporting dataset generation.
+This project was supported through the Swiss AI compute grant a156 and, in part, compute grant infra01.
+JH was supported by the Swiss National Science Foundation under NCCR Automation, grant agreement 51NF40 180545.
+FL and MB were supported by the ETH-MPI Center for Learning Systems.
+TKB and IH were supported by an ETH AI Center Postdoctoral Fellowship.
+DM was supported by the Knut and Alice Wallenberg Foundation.
+References
+Agarwal et al. (2024)
+Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos Garea, Matthieu Geist, and Olivier Bachem.
+On-policy distillation of language models: Learning from self-generated mistakes.
+In
+ICLR
+, 2024.
+Akyürek et al. (2025)
+Ekin Akyürek, Mehul Damani, Adam Zweiger, Linlu Qiu, Han Guo, Jyothish Pari, Yoon Kim, and Jacob Andreas.
+The surprising effectiveness of test-time training for few-shot learning.
+In
+ICML
+, 2025.
+Andrychowicz et al. (2017)
+Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, and Wojciech Zaremba.
+Hindsight experience replay.
+In
+NeurIPS
+, 2017.
+Anthony et al. (2017)
+Thomas Anthony, Zheng Tian, and David Barber.
+Thinking fast and slow with deep learning and tree search.
+In
+NeurIPS
+, 2017.
+Bai et al. (2022)
+Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, et al.
+Constitutional ai: Harmlessness from ai feedback.
+arXiv preprint arXiv:2212.08073
+, 2022.
+Behrouz et al. (2025)
+Ali Behrouz, Peilin Zhong, and Vahab Mirrokni.
+Titans: Learning to memorize at test time.
+In
+NeurIPS
+, 2025.
+Berner et al. (2019)
+Christopher Berner, Greg Brockman, Brooke Chan, Vicki Cheung, Przemysław Debiak, Christy Dennison, David Farhi, Quirin Fischer, Shariq Hashme, Chris Hesse, et al.
+Dota 2 with large scale deep reinforcement learning.
+arXiv preprint arXiv:1912.06680
+, 2019.
+Boyd & Vandenberghe (2004)
+Stephen Boyd and Lieven Vandenberghe.
+Convex optimization
+.
+Cambridge university press, 2004.
+Brown et al. (2020)
+Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al.
+Language models are few-shot learners.
+arXiv preprint arXiv:2005.14165
+, 2020.
+Bubeck (2015)
+Sébastien Bubeck.
+Convex optimization: Algorithms and complexity.
+Foundations and Trends® in Machine Learning
+, 2015.
+Cao et al. (2025)
+Meng Cao, Shuyuan Zhang, Xiao-Wen Chang, and Doina Precup.
+Scar: Shapley credit assignment for more efficient rlhf.
+arXiv preprint arXiv:2505.20417
+, 2025.
+Chan et al. (2024)
+Alex J Chan, Hao Sun, Samuel Holt, and Mihaela Van Der Schaar.
+Dense reward for free in reinforcement learning from human feedback.
+In
+ICML
+, 2024.
+Chen et al. (2025a)
+Aili Chen, Aonian Li, Bangwei Gong, Binyang Jiang, Bo Fei, Bo Yang, Boji Shan, Changqing Yu, Chao Wang, Cheng Zhu, et al.
+Minimax-m1: Scaling test-time compute efficiently with lightning attention.
+arXiv preprint arXiv:2506.13585
+, 2025a.
+Chen et al. (2022)
+Bei Chen, Fengji Zhang, Anh Nguyen, Daoguang Zan, Zeqi Lin, Jian-Guang Lou, and Weizhu Chen.
+Codet: Code generation with generated tests.
+In
+ICLR
+, 2022.
+Chen et al. (2025b)
+Howard Chen, Noam Razin, Karthik Narasimhan, and Danqi Chen.
+Retaining by doing: The role of on-policy data in mitigating forgetting.
+arXiv preprint arXiv:2510.18874
+, 2025b.
+Chen et al. (2021a)
+Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Misha Laskin, Pieter Abbeel, Aravind Srinivas, and Igor Mordatch.
+Decision transformer: Reinforcement learning via sequence modeling.
+In
+NeurIPS
+, 2021a.
+Chen et al. (2021b)
+Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde De Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al.
+Evaluating large language models trained on code.
+arXiv preprint arXiv:2107.03374
+, 2021b.
+Chen et al. (2025c)
+Wentse Chen, Jiayu Chen, Fahim Tajwar, Hao Zhu, Xintong Duan, Ruslan Salakhutdinov, and Jeff Schneider.
+Retrospective in-context learning for temporal credit assignment with large language models.
+In
+NeurIPS
+, 2025c.
+Chiang et al. (2024)
+Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Banghua Zhu, Hao Zhang, Michael Jordan, Joseph E Gonzalez, et al.
+Chatbot arena: An open platform for evaluating llms by human preference.
+In
+ICML
+, 2024.
+Choi et al. (2022)
+Eunbi Choi, Yongrae Jo, Joel Jang, and Minjoon Seo.
+Prompt injection: Parameterization of fixed inputs.
+arXiv preprint arXiv:2206.11349
+, 2022.
+Cui et al. (2025)
+Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, et al.
+Process reinforcement through implicit rewards.
+arXiv preprint arXiv:2502.01456
+, 2025.
+Diaz-Bone et al. (2025)
+Leander Diaz-Bone, Marco Bagatella, Jonas Hübotter, and Andreas Krause.
+Discover: Automated curricula for sparse-reward reinforcement learning.
+In
+NeurIPS
+, 2025.
+Dolan & Moré (2002)
+Elizabeth D Dolan and Jorge J Moré.
+Benchmarking optimization software with performance profiles.
+Mathematical programming
+, 91(2), 2002.
+Dou et al. (2024)
+Zi-Yi Dou, Cheng-Fu Yang, Xueqing Wu, Kai-Wei Chang, and Nanyun Peng.
+Re-rest: Reflection-reinforced self-training for language agents.
+In
+EMNLP
+, 2024.
+El-Kishky et al. (2025)
+Ahmed El-Kishky, Alexander Wei, Andre Saraiva, Borys Minaiev, Daniel Selsam, David Dohan, Francis Song, Hunter Lightman, Ignasi Clavera, Jakub Pachocki, et al.
+Competitive programming with large reasoning models.
+arXiv preprint arXiv:2502.06807
+, 2025.
+Eyuboglu et al. (2026)
+Sabri Eyuboglu, Ryan Ehrlich, Simran Arora, Neel Guha, Dylan Zinsley, Emily Liu, Will Tennien, Atri Rudra, James Zou, Azalia Mirhoseini, et al.
+Cartridges: Lightweight and general-purpose long context representations via self-study.
+In
+ICLR
+, 2026.
+Feng et al. (2024)
+Kehua Feng, Keyan Ding, Weijie Wang, Xiang Zhuang, Zeyuan Wang, Ming Qin, Yu Zhao, Jianhua Yao, Qiang Zhang, and Huajun Chen.
+Sciknoweval: Evaluating multi-level scientific knowledge of large language models.
+arXiv preprint arXiv:2406.09098
+, 2024.
+Gehring et al. (2025)
+Jonas Gehring, Kunhao Zheng, Jade Copet, Vegard Mella, Quentin Carbonneaux, Taco Cohen, and Gabriel Synnaeve.
+Rlef: Grounding code llms in execution feedback with reinforcement learning.
+In
+ICML
+, 2025.
+Goyal et al. (2019)
+Prasoon Goyal, Scott Niekum, and Raymond J Mooney.
+Using natural language for reward shaping in reinforcement learning.
+In
+IJCAI
+, 2019.
+Gu et al. (2024)
+Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang.
+Minillm: Knowledge distillation of large language models.
+2024.
+Guha et al. (2026)
+Etash Guha, Ryan Marten, Sedrick Keh, Negin Raoof, Georgios Smyrnis, Hritik Bansal, Marianna Nezhurina, Jean Mercat, Trung Vu, Zayne Sprague, et al.
+Openthoughts: Data recipes for reasoning models.
+In
+ICLR
+, 2026.
+Guo et al. (2025)
+Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al.
+Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning.
+arXiv preprint arXiv:2501.12948
+, 2025.
+Haarnoja et al. (2018)
+Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine.
+Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor.
+In
+ICML
+, 2018.
+Hardt & Sun (2024)
+Moritz Hardt and Yu Sun.
+Test-time training on nearest neighbors for large language models.
+In
+ICLR
+, 2024.
+Hatamizadeh et al. (2026)
+Ali Hatamizadeh, Syeda Nahida Akter, Shrimai Prabhumoye, Jan Kautz, Mostofa Patwary, Mohammad Shoeybi, Bryan Catanzaro, and Yejin Choi.
+Rlp: Reinforcement as a pretraining objective.
+In
+ICLR
+, 2026.
+Hinton et al. (2015)
+Geoffrey Hinton, Oriol Vinyals, and Jeff Dean.
+Distilling the knowledge in a neural network.
+arXiv preprint arXiv:1503.02531
+, 2015.
+Huang et al. (2026)
+Chengsong Huang, Wenhao Yu, Xiaoyang Wang, Hongming Zhang, Zongxia Li, Ruosen Li, Jiaxin Huang, Haitao Mi, and Dong Yu.
+R-zero: Self-evolving reasoning llm from zero data.
+In
+ICLR
+, 2026.
+Hübotter et al. (2026)
+Jonas Hübotter, Patrik Wolf, Alexander Shevchenko, Dennis Jüni, Andreas Krause, and Gil Kur.
+Specialization after generalization: Towards understanding test-time training in foundation models.
+In
+ICLR
+, 2026.
+Hübotter et al. (2025a)
+Jonas Hübotter, Sascha Bongni, Ido Hakimi, and Andreas Krause.
+Efficiently learning at test-time: Active fine-tuning of llms.
+In
+ICLR
+, 2025a.
+Hübotter et al. (2025b)
+Jonas Hübotter, Leander Diaz-Bone, Ido Hakimi, Andreas Krause, and Moritz Hardt.
+Learning on the job: Test-time curricula for targeted reinforcement learning.
+arXiv preprint arXiv:2510.04786
+, 2025b.
+Jaech et al. (2024)
+Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, et al.
+Openai o1 system card.
+arXiv preprint arXiv:2412.16720
+, 2024.
+Jain et al. (2025)
+Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, and Ion Stoica.
+Livecodebench: Holistic and contamination free evaluation of large language models for code.
+In
+ICLR
+, 2025.
+Kaelbling et al. (1998)
+Leslie Pack Kaelbling, Michael L Littman, and Anthony R Cassandra.
+Planning and acting in partially observable stochastic domains.
+Artificial intelligence
+, 101(1-2), 1998.
+Kazemnejad et al. (2025)
+Amirhossein Kazemnejad, Milad Aghajohari, Eva Portelance, Alessandro Sordoni, Siva Reddy, Aaron Courville, and Nicolas Le Roux.
+Vineppo: Refining credit assignment in rl training of llms.
+In
+ICML
+, 2025.
+Khatri et al. (2026)
+Devvrit Khatri, Lovish Madaan, Rishabh Tiwari, Rachit Bansal, Sai Surya Duvvuri, Manzil Zaheer, Inderjit S Dhillon, David Brandfonbrener, and Rishabh Agarwal.
+The art of scaling reinforcement learning compute for llms.
+In
+ICLR
+, 2026.
+Kim & Rush (2016)
+Yoon Kim and Alexander M Rush.
+Sequence-level knowledge distillation.
+In
+EMNLP
+, 2016.
+Kimi et al. (2025)
+Kimi, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al.
+Kimi k1.5: Scaling reinforcement learning with llms.
+arXiv preprint arXiv:2501.12599
+, 2025.
+Kujanpää et al. (2025)
+Kalle Kujanpää, Pekka Marttinen, Harri Valpola, and Alexander Ilin.
+Efficient knowledge injection in LLMs via self-distillation.
+TMLR
+, 2025.
+Kwon et al. (2023)
+Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. Gonzalez, Hao Zhang, and Ion Stoica.
+Efficient memory management for large language model serving with pagedattention.
+In
+SOSP
+, 2023.
+Lambert et al. (2025)
+Nathan Lambert, Jacob Morrison, Valentina Pyatkin, Shengyi Huang, Hamish Ivison, Faeze Brahman, Lester James V Miranda, Alisa Liu, Nouha Dziri, Shane Lyu, et al.
+Tulu 3: Pushing frontiers in open language model post-training.
+In
+COLM
+, 2025.
+Le et al. (2022)
+Hung Le, Yue Wang, Akhilesh Deepak Gotmare, Silvio Savarese, and Steven Chu Hong Hoi.
+Coderl: Mastering code generation through pretrained models and deep reinforcement learning.
+In
+NeurIPS
+, 2022.
+Lee et al. (2024)
+Kyungjae Lee, Dasol Hwang, Sunghyun Park, Youngsoo Jang, and Moontae Lee.
+Reinforcement learning from reflective feedback (rlrf): Aligning and improving llms via fine-grained self-reflection.
+arXiv preprint arXiv:2403.14238
+, 2024.
+Lee et al. (2025)
+Yoonho Lee, Joseph Boen, and Chelsea Finn.
+Feedback descent: Open-ended text optimization via pairwise comparison.
+arXiv preprint arXiv:2511.07919
+, 2025.
+Levine (2018)
+Sergey Levine.
+Reinforcement learning and control as probabilistic inference: Tutorial and review.
+arXiv preprint arXiv:1805.00909
+, 2018.
+Li et al. (2025)
+Tianle Li, Wei-Lin Chiang, Evan Frick, Lisa Dunlap, Tianhao Wu, Banghua Zhu, Joseph E Gonzalez, and Ion Stoica.
+From crowdsourced data to high-quality benchmarks: Arena-hard and benchbuilder pipeline.
+In
+ICML
+, 2025.
+Lightman et al. (2023)
+Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe.
+Let’s verify step by step.
+In
+ICLR
+, 2023.
+Liu et al. (2025a)
+Grace Liu, Michael Tang, and Benjamin Eysenbach.
+A single goal is all you need: Skills and exploration emerge from contrastive rl without rewards, demonstrations, or subgoals.
+In
+ICLR
+, 2025a.
+Liu et al. (2023)
+Hao Liu, Carmelo Sferrazza, and Pieter Abbeel.
+Chain of hindsight aligns language models with feedback.
+arXiv preprint arXiv:2302.02676
+, 2023.
+Liu et al. (2025b)
+Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin.
+Understanding r1-zero-like training: A critical perspective.
+In
+COLM
+, 2025b.
+Lu & Thinking Machines Lab (2025)
+Kevin Lu and Thinking Machines Lab.
+On-policy distillation.
+Thinking Machines Lab: Connectionism
+, 2025.
+URL
+https://thinkingmachines.ai/blog/on-policy-distillation
+.
+Luo et al. (2025)
+Renjie Luo, Zichen Liu, Xiangyan Liu, Chao Du, Min Lin, Wenhu Chen, Wei Lu, and Tianyu Pang.
+Language models can learn from verbal feedback without scalar rewards.
+arXiv preprint arXiv:2509.22638
+, 2025.
+Madaan et al. (2023)
+Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al.
+Self-refine: Iterative refinement with self-feedback.
+In
+NeurIPS
+, 2023.
+Mitra & Ulukus (2025)
+Purbesh Mitra and Sennur Ulukus.
+Semantic soft bootstrapping: Long context reasoning in llms without reinforcement learning.
+arXiv preprint arXiv:2512.05105
+, 2025.
+Mnih et al. (2015)
+Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Andrei A. Rusu, Joel Veness, Marc G. Bellemare, Alex Graves, Martin Riedmiller, Andreas K. Fidjeland, Georg Ostrovski, et al.
+Human-level control through deep reinforcement learning.
+Nature
+, 518(7540), 2015.
+Muennighoff et al. (2025)
+Niklas Muennighoff, Zitong Yang, Weijia Shi, Xiang Lisa Li, Li Fei-Fei, Hannaneh Hajishirzi, Luke Zettlemoyer, Percy Liang, Emmanuel Candès, and Tatsunori B Hashimoto.
+s1: Simple test-time scaling.
+In
+EMNLP
+, 2025.
+Ng et al. (2000)
+Andrew Y Ng, Stuart Russell, et al.
+Algorithms for inverse reinforcement learning.
+In
+ICML
+, 2000.
+Novikov et al. (2025)
+Alexander Novikov, Ngân Vũ, Marvin Eisenberger, Emilien Dupont, Po-Sen Huang, Adam Zsolt Wagner, Sergey Shirobokov, Borislav Kozlovskii, Francisco JR Ruiz, Abbas Mehrabian, et al.
+Alphaevolve: A coding agent for scientific and algorithmic discovery.
+arXiv preprint arXiv:2506.13131
+, 2025.
+Olmo et al. (2025)
+Team Olmo, Allyson Ettinger, Amanda Bertsch, Bailey Kuehl, David Graham, David Heineman, Dirk Groeneveld, Faeze Brahman, Finbarr Timbers, Hamish Ivison, et al.
+Olmo 3.
+arXiv preprint arXiv:2512.13961
+, 2025.
+Peng et al. (2019)
+Xue Bin Peng, Aviral Kumar, Grace Zhang, and Sergey Levine.
+Advantage-weighted regression: Simple and scalable off-policy reinforcement learning.
+arXiv preprint arXiv:1910.00177
+, 2019.
+Qwen et al. (2024)
+Qwen, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, et al.
+Qwen2.5 technical report.
+arXiv preprint arXiv:2412.15115
+, 2024.
+Rafailov et al. (2023)
+Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn.
+Direct preference optimization: Your language model is secretly a reward model.
+In
+NeurIPS
+, 2023.
+Romero et al. (2015)
+Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio.
+Fitnets: Hints for thin deep nets.
+In
+ICLR
+, 2015.
+Ross et al. (2011)
+Stéphane Ross, Geoffrey Gordon, and Drew Bagnell.
+A reduction of imitation learning and structured prediction to no-regret online learning.
+In
+AISTATS
+, 2011.
+Samadi et al. (2025)
+Mehrzad Samadi, Aleksander Ficek, Sean Narenthiran, Siddhartha Jain, Wasi Uddin Ahmad, Somshubra Majumdar, Vahid Noroozi, and Boris Ginsburg.
+Scaling test-time compute to achieve ioi gold medal with open-weight models.
+arXiv preprint arXiv:2510.14232
+, 2025.
+Sanh et al. (2019)
+Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf.
+Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter.
+arXiv preprint arXiv:1910.01108
+, 2019.
+Schaul et al. (2015)
+Tom Schaul, Daniel Horgan, Karol Gregor, and David Silver.
+Universal value function approximators.
+In
+ICML
+, 2015.
+Scheurer et al. (2023)
+Jérémy Scheurer, Jon Ander Campos, Tomasz Korbak, Jun Shern Chan, Angelica Chen, Kyunghyun Cho, and Ethan Perez.
+Training language models with language feedback at scale.
+arXiv preprint arXiv:2303.16755
+, 2023.
+Schulman et al. (2015)
+John Schulman, Sergey Levine, Pieter Abbeel, Michael Jordan, and Philipp Moritz.
+Trust region policy optimization.
+In
+ICML
+, 2015.
+Schulman et al. (2016)
+John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel.
+High-dimensional continuous control using generalized advantage estimation.
+In
+ICLR
+, 2016.
+Schulman et al. (2017)
+John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017.
+Setlur et al. (2025)
+Amrith Setlur, Chirag Nagpal, Adam Fisch, Xinyang Geng, Jacob Eisenstein, Rishabh Agarwal, Alekh Agarwal, Jonathan Berant, and Aviral Kumar.
+Rewarding progress: Scaling automated process verifiers for llm reasoning.
+In
+ICLR
+, 2025.
+Shao et al. (2024)
+Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Yang Wu, et al.
+Deepseekmath: Pushing the limits of mathematical reasoning in open language models.
+arXiv preprint arXiv:2402.03300
+, 2024.
+Shenfeld et al. (2026)
+Idan Shenfeld, Jyothish Pari, and Pulkit Agrawal.
+Rl’s razor: Why online reinforcement learning forgets less.
+In
+ICLR
+, 2026.
+Sheng et al. (2025)
+Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu.
+Hybridflow: A flexible and efficient rlhf framework.
+In
+EuroSys
+, 2025.
+Shinn et al. (2023)
+Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao.
+Reflexion: Language agents with verbal reinforcement learning.
+In
+NeurIPS
+, 2023.
+Silver et al. (2016)
+David Silver, Aja Huang, Chris J. Maddison, Arthur Guez, Laurent Sifre, George van den Driessche, Julian Schrittwieser, Ioannis Antonoglou, Veda Panneershelvam, Marc Lanctot, et al.
+Mastering the game of go with deep neural networks and tree search.
+Nature
+, 529(7587), 2016.
+Silver et al. (2017)
+David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore Graepel, et al.
+Mastering chess and shogi by self-play with a general reinforcement learning algorithm.
+arXiv preprint arXiv:1712.01815
+, 2017.
+Snell et al. (2022)
+Charlie Snell, Dan Klein, and Ruiqi Zhong.
+Learning by distilling context.
+arXiv preprint arXiv:2209.15189
+, 2022.
+Stephan et al. (2024)
+Moritz Stephan, Alexander Khazatsky, Eric Mitchell, Annie S Chen, Sheryl Hsu, Archit Sharma, and Chelsea Finn.
+Rlvf: Learning from verbal feedback without overgeneralization.
+In
+ICML
+, 2024.
+Sun et al. (2020)
+Yu Sun, Xiaolong Wang, Zhuang Liu, John Miller, Alexei Efros, and Moritz Hardt.
+Test-time training with self-supervision for generalization under distribution shifts.
+In
+ICML
+, 2020.
+Sun et al. (2025)
+Yu Sun, Xinhao Li, Karan Dalal, Jiarui Xu, Arjun Vikram, Genghan Zhang, Yann Dubois, Xinlei Chen, Xiaolong Wang, Sanmi Koyejo, et al.
+Learning to (learn at test time): Rnns with expressive hidden states.
+In
+ICML
+, 2025.
+Sutton & Barto (1998)
+Richard S Sutton and Andrew G Barto.
+Reinforcement learning: An introduction
+.
+MIT press, 1998.
+Tandon et al. (2025)
+Arnuv Tandon, Karan Dalal, Xinhao Li, Daniel Koceja, Marcel Rød, Sam Buchanan, Xiaolong Wang, Jure Leskovec, Sanmi Koyejo, Tatsunori Hashimoto, et al.
+End-to-end test-time training for long context.
+arXiv preprint arXiv:2512.23675
+, 2025.
+Tang et al. (2023)
+Qiaoyu Tang, Ziliang Deng, Hongyu Lin, Xianpei Han, Qiao Liang, Boxi Cao, and Le Sun.
+Toolalpaca: Generalized tool learning for language models with 3000 simulated cases.
+arXiv preprint arXiv:2306.05301
+, 2023.
+Urcelay et al. (2026)
+Belen Martin Urcelay, Andreas Krause, and Giorgia Ramponi.
+From words to rewards: Leveraging natural language for reinforcement learning.
+In
+TMLR
+, 2026.
+Vaswani et al. (2017)
+Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin.
+Attention is all you need.
+In
+NeurIPS
+, 2017.
+Wainwright & Jordan (2008)
+Martin J Wainwright and Michael I Jordan.
+Graphical models, exponential families, and variational inference.
+Foundations and Trends® in Machine Learning
+, 2008.
+Wang et al. (2026)
+Hanyang Wang, Lu Wang, Chaoyun Zhang, Tianjun Mao, Si Qin, Qingwei Lin, Saravan Rajmohan, and Dongmei Zhang.
+Text2grad: Reinforcement learning from natural language feedback.
+In
+ICLR
+, 2026.
+Wang et al. (2024a)
+Peiyi Wang, Lei Li, Zhihong Shao, RX Xu, Damai Dai, Yifei Li, Deli Chen, Yu Wu, and Zhifang Sui.
+Math-shepherd: Verify and reinforce llms step-by-step without human annotations.
+In
+ACL
+, 2024a.
+Wang et al. (2025)
+Shenzhi Wang, Le Yu, Chang Gao, Chujie Zheng, Shixuan Liu, Rui Lu, Kai Dang, Xionghui Chen, Jianxin Yang, Zhenru Zhang, et al.
+Beyond the 80/20 rule: High-entropy minority tokens drive effective reinforcement learning for llm reasoning.
+In
+NeurIPS
+, 2025.
+Wang et al. (2024b)
+Yubo Wang, Xueguang Ma, Ge Zhang, Yuansheng Ni, Abhranil Chandra, Shiguang Guo, Weiming Ren, Aaran Arulraj, Xuan He, Ziyan Jiang, et al.
+Mmlu-pro: A more robust and challenging multi-task language understanding benchmark.
+In
+NeurIPS
+, 2024b.
+Wei et al. (2022)
+Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al.
+Chain-of-thought prompting elicits reasoning in large language models.
+In
+NeurIPS
+, 2022.
+Williams (1992)
+Ronald J Williams.
+Simple statistical gradient-following algorithms for connectionist reinforcement learning.
+Machine learning
+, 8(3), 1992.
+Xie et al. (2020)
+Qizhe Xie, Minh-Thang Luong, Eduard Hovy, and Quoc V Le.
+Self-training with noisy student improves imagenet classification.
+In
+CVPR
+, 2020.
+Xie et al. (2024)
+Tianbao Xie, Siheng Zhao, Chen Henry Wu, Yitao Liu, Qian Luo, Victor Zhong, Yanchao Yang, and Tao Yu.
+Text2reward: Reward shaping with language models for reinforcement learning.
+In
+ICLR
+, 2024.
+Yang et al. (2025)
+An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al.
+Qwen3 technical report.
+arXiv preprint arXiv:2505.09388
+, 2025.
+Yao et al. (2025)
+Feng Yao, Liyuan Liu, Dinghuai Zhang, Chengyu Dong, Jingbo Shang, and Jianfeng Gao.
+Your efficient rl framework secretly brings you off-policy rl training, 2025.
+URL
+https://fengyao.notion.site/off-policy-rl
+.
+Yao et al. (2024)
+Weiran Yao, Shelby Heinecke, Juan Carlos Niebles, Zhiwei Liu, Yihao Feng, Le Xue, Rithesh Murthy, Zeyuan Chen, Jianguo Zhang, Devansh Arpit, et al.
+Retroformer: Retrospective large language agents with policy gradient optimization.
+In
+ICLR
+, 2024.
+Yu et al. (2025)
+Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Weinan Dai, Tiantian Fan, Gaohong Liu, Lingjun Liu, et al.
+Dapo: An open-source llm reinforcement learning system at scale.
+In
+NeurIPS
+, 2025.
+Yue et al. (2025)
+Yang Yue, Zhiqi Chen, Rui Lu, Andrew Zhao, Zhaokai Wang, Shiji Song, and Gao Huang.
+Does reinforcement learning really incentivize reasoning capacity in llms beyond the base model?
+In
+NeurIPS
+, 2025.
+Yuksekgonul et al. (2025)
+Mert Yuksekgonul, Federico Bianchi, Joseph Boen, Sheng Liu, Pan Lu, Zhi Huang, Carlos Guestrin, and James Zou.
+Optimizing generative ai by backpropagating language model feedback.
+Nature
+, 639:609–616, 2025.
+Yuksekgonul et al. (2026)
+Mert Yuksekgonul, Daniel Koceja, Xinhao Li, Federico Bianchi, Jed McCaleb, Xiaolong Wang, Jan Kautz, Yejin Choi, James Zou, Carlos Guestrin, et al.
+Learning to discover at test time.
+arXiv preprint arXiv:2601.16175
+, 2026.
+Zelikman et al. (2022)
+Eric Zelikman, Yuhuai Wu, Jesse Mu, and Noah D Goodman.
+Star: Bootstrapping reasoning with reasoning.
+In
+NeurIPS
+, 2022.
+Zhang et al. (2025)
+Kai Zhang, Xiangchao Chen, Bo Liu, Tianci Xue, Zeyi Liao, Zhihan Liu, Xiyao Wang, Yuting Ning, Zhaorun Chen, Xiaohan Fu, et al.
+Agent learning via early experience.
+arXiv preprint arXiv:2510.08558
+, 2025.
+Zhang et al. (2023)
+Tianjun Zhang, Fangchen Liu, Justin Wong, Pieter Abbeel, and Joseph E Gonzalez.
+The wisdom of hindsight makes language models better instruction followers.
+In
+ICML
+, 2023.
+Zhao et al. (2025)
+Andrew Zhao, Yiran Wu, Yang Yue, Tong Wu, Quentin Xu, Matthieu Lin, Shenzhi Wang, Qingyun Wu, Zilong Zheng, and Gao Huang.
+Absolute zero: Reinforced self-play reasoning with zero data.
+In
+NeurIPS
+, 2025.
+Zheng et al. (2025a)
+Chujie Zheng, Shixuan Liu, Mingze Li, Xiong-Hui Chen, Bowen Yu, Chang Gao, Kai Dang, Yuqiong Liu, Rui Men, An Yang, et al.
+Group sequence policy optimization.
+arXiv preprint arXiv:2507.18071
+, 2025a.
+Zheng et al. (2025b)
+Tianyu Zheng, Tianshun Xing, Qingshui Gu, Taoran Liang, Xingwei Qu, Xin Zhou, Yizhi Li, Zhoufutu Wen, Chenghua Lin, Wenhao Huang, et al.
+First return, entropy-eliciting explore.
+arXiv preprint arXiv:2507.07017
+, 2025b.
+Zhou et al. (2023)
+Jeffrey Zhou, Tianjian Lu, Swaroop Mishra, Siddhartha Brahma, Sujoy Basu, Yi Luan, Denny Zhou, and Le Hou.
+Instruction-following evaluation for large language models.
+arXiv preprint arXiv:2311.07911
+, 2023.
+Zhou et al. (2026)
+Xiangxin Zhou, Zichen Liu, Anya Sims, Haonan Wang, Tianyu Pang, Chongxuan Li, Liang Wang, Min Lin, and Chao Du.
+Reinforcing general reasoning without verifiers.
+In
+ICLR
+, 2026.
+Ziebart et al. (2008)
+Brian D Ziebart, Andrew L Maas, J Andrew Bagnell, Anind K Dey, et al.
+Maximum entropy inverse reinforcement learning.
+In
+AAAI
+, 2008.
+Zuo et al. (2025)
+Yuxin Zuo, Kaiyan Zhang, Shang Qu, Li Sheng, Xuekai Zhu, Biqing Qi, Youbang Sun, Ganqu Cui, Ning Ding, and Bowen Zhou.
+Ttrl: Test-time reinforcement learning.
+In
+NeurIPS
+, 2025.
+Contents
+section.1table.caption.4section.2subsection.2.1subsection.2.2subsection.2.3section.3subsection.3.1subsection.3.1subsection.3.2subsection.3.3section.4section.4subsection.4.1subsection.4.2subsection.4.3subsection.4.4subsection.4.4subsection.4.5subsection.4.6subsection.4.6subsection.4.6subsection.4.6section.5subsection.5.1subsection.5.2figure.caption.20figure.caption.20section.6subsection.6.1subsection.6.2subsection.6.3subsection.6.4section.7section.7section.7appendix.Asubsection.A.1subsection.A.2subsection.A.3appendix.Bsubsection.B.1subsection.B.2subsection.B.3appendix.Cappendix.Cappendix.Cappendix.Cappendix.Csubsection.C.1subsection.C.1equation.22appendix.Dsubsection.D.1subsection.D.2subsubsection.D.2.1subsubsection.D.2.2subsubsection.D.2.3subsection.D.3appendix.Esubsection.E.1subsection.E.2subsubsection.E.2.1subsection.E.3appendix.Fsubsection.F.1subsection.F.2subsection.F.3subsection.F.4
+Appendix A
+Implementation of SDPO
+The following pseudocode in Figure 14 outlines the implementation of SDPO:
+⬇
+def compute_sdpo_loss(batch, teacher_context, loss_mask):
+    """
+    Computes probabilities of response y under the self-teacher
+    and the per-logit SDPO loss.
+    """
+    # Compute model probabilities for response y
+    logprobs_student = compute_log_prob(batch)  # (T, V)
+    probs_student = logprobs_student.exp()  # (T, V)
+
+    # Compute self-teacher probabilities for response y
+    teacher_batch = reprompt(batch, teacher_context)
+    logprobs_teacher = compute_log_prob(teacher_batch).detach()  # (T, V)
+
+    # Compute SDPO loss: per-token divergence
+    per_token_loss = divergence(logprobs_student, logprobs_teacher)  # (T,)
+    return agg_loss(per_token_loss, loss_mask, loss_agg_mode="token-mean")
+Figure 14: The pseudo-code of SDPO within a standard RL training pipeline. Omitted here is the filtering to the top-$K$ logprobs for student and teacher (including a tail term) as described in Section A.2. Further, we omit here any importance sampling weights to correct for off-policy data.
+`reprompt` modifies the batch to incorporate teacher context (i.e., rich feedback).
+`divergence` implements any per-token divergence such as reverse-KL, forward-KL, or Jensen-Shannon.
+In the following, we provide further details on:
+• Teacher regularization (Section A.1)
+• Approximating logit-distillation with the top-$K$ logits for saving GPU memory (Section A.2)
+• Generalizing PPO-style policy gradient algorithms to logit-level advantages (Section A.3)
+To disambiguate the notation of the self-teacher, we use $q_{\theta}(\cdot\mid x,f):=\pi_{\theta}(\cdot\mid\mathrm{reprompt}(x,f))$ in the following.
+Here, `reprompt` denotes the reprompt template of the self-teacher.
+A.1
+Regularized teacher
+In contrast to standard distillation, the teacher in SDPO changes throughout training. This bootstrapping enables the teacher to improve, but it may also lead to training instability.
+To stabilize training, we seek to prevent the teacher $q$ from quickly diverging from the initial teacher $q_{\theta_{\mathrm{ref}}}$.
+We can achieve this by placing an explicit trust-region constraint on $q$ (Schulman et al., 2015; Peng et al., 2019), that is:
+$$\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,\quad\epsilon>0. \tag{5}$$
+This trust-region can be implemented in two ways:
+1. Explicit trust-region: We can define the teacher as the policy closest to $q_{\theta}$ while satisfying the trust-region constraint.
+This teacher can be expressed as
+$$q(y_{t}\mid x,f,y_{<t})\propto\exp\!\big((1-\alpha)\log q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})+\alpha\log q_{\theta}(y_{t}\mid x,f,y_{<t})\big), \tag{6}$$
+with $\alpha\in(0,1)$ the inverse Lagrange multiplier for the trust-region constraint.
+We include a full derivation in Section B.2.
+We can plug this explicitly constrained teacher directly into the SDPO objective.
+2. Exponential moving average (EMA): Alternatively, we can stabilize the teacher’s parameters directly; parameterizing $q_{\theta^{\prime}}$ by $\theta^{\prime}$ and updating as $\theta^{\prime}\leftarrow(1-\alpha)\theta^{\prime}+\alpha\theta$ with $\alpha\in(0,1)$.
+Under mild smoothness assumptions, this EMA teacher remains implicitly within a trust-region around the initial teacher (cf. Section B.3).
+Note that each implementation has a different practical advantage:
+The EMA teacher requires additional GPU memory for $\theta^{\prime}$ yet does not introduce any runtime overhead.
+In contrast, the trust-region teacher requires an additional log-prob computation with $q_{\theta_{\mathrm{ref}}}$ yet does not require additional GPU memory if $\theta_{\mathrm{ref}}$ is used for explicit KL regularization.
+A.2
+Approximate Logit Distillation
+To save GPU memory, we perform distillation only on the top-$K$ tokens predicted by the student:
+$$\begin{aligned}
+\mathcal{L}_{\mathrm{SDPO}}(\theta)
+&=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\sum_{t=1}^{T}\mathrm{KL}\big(\pi_{\theta}(y_{t}\mid x,y_{<t})\,\|\,\mathrm{stopgrad}(q_{\theta}(y_{t}\mid x,f,y_{<t}))\big)\\
+&\approx\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\sum_{t=1}^{T}\Bigg[\sum_{y_{t}\in\mathrm{top}_{K}(\pi_{\theta})}\pi_{\theta}(y_{t}\mid x,y_{<t})\cdot\log\frac{\pi_{\theta}(y_{t}\mid x,y_{<t})}{\mathrm{stopgrad}(q_{\theta}(y_{t}\mid x,f,y_{<t}))}\\
+&\qquad+\underbrace{\Big(1-\sum_{y_{t}\in\mathrm{top}_{K}(\pi_{\theta})}\pi_{\theta}(y_{t}\mid x,y_{<t})\Big)\cdot\log\frac{1-\sum_{y_{t}\in\mathrm{top}_{K}(\pi_{\theta})}\pi_{\theta}(y_{t}\mid x,y_{<t})}{\mathrm{stopgrad}\Big(1-\sum_{y_{t}\in\mathrm{top}_{K}(\pi_{\theta})}q_{\theta}(y_{t}\mid x,f,y_{<t})\Big)}}_{\text{tail}}\Bigg]
+\end{aligned} \tag{9}$$
+Here, the top-$K$ is with respect to the student.
+Without top-$K$ distillation, we would have to keep two copies of logits in memory: one for teacher and student each.
+Top-$K$ distillation avoids virtually any memory overhead without impacting performance significantly, since most tokens of the vocabulary are not informative at a given time.
+A.3
+Off-Policy Training: Generalization to Logit-Level Losses
+PPO-style clipping (Schulman et al., 2017) with truncated importance sampling (Yao et al., 2025), clip-higher (Yu et al., 2025), fixed length normalization (Liu et al., 2025b):
+$$\mathcal{L}_{\mathrm{token}}(\theta):=-\frac{1}{\sum_{i=1}^{G}|y_{i}|}\sum_{i=1}^{G}\sum_{t=1}^{|y_{i}|}\min\left(w^{\mathrm{TIS}}_{i,t},\rho\right)\min\left(w_{i,t}A_{i,t},\ \mathrm{clip}(w_{i,t},1-\varepsilon_{\mathrm{low}},1+\varepsilon_{\mathrm{high}})A_{i,t}\right), \tag{10}$$
+with $w_{i,t}:=\frac{\pi_{\theta}(y_{i,t}\mid x,y_{i,<t})}{\pi_{\theta_{\mathrm{old}}}(y_{i,t}\mid x,y_{i,<t})}$, $w^{\mathrm{TIS}}_{i,t}:=\frac{\pi_{\theta_{\mathrm{old}}}(y_{i,t}\mid x,y_{i,<t})}{\pi_{\theta_{\mathrm{old}}}^{\mathrm{rollout}}(y_{i,t}\mid x,y_{i,<t})}$, and $A_{i,t}$ denotes the per-token advantage.
+We extend this to a logit-level loss:
+$$\begin{aligned}
+\mathcal{L}_{\mathrm{logit}}(\theta):=-\frac{1}{\sum_{i=1}^{G}|y_{i}|}\sum_{i=1}^{G}\sum_{t=1}^{|y_{i}|}\sum_{\hat{y}_{i,t}}\ &\min\left(\pi_{\theta_{\mathrm{old}}}(\hat{y}_{i,t}\mid x,y_{i,<t}),\ \rho\,\pi_{\theta_{\mathrm{old}}}^{\mathrm{rollout}}(\hat{y}_{i,t}\mid x,y_{i,<t})\right)\\
+&\cdot\min\left(w_{i,t}(\hat{y}_{i,t})A_{i,t}(\hat{y}_{i,t}),\ \mathrm{clip}(w_{i,t}(\hat{y}_{i,t}),1-\varepsilon_{\mathrm{low}},1+\varepsilon_{\mathrm{high}})A_{i,t}(\hat{y}_{i,t})\right),
+\end{aligned} \tag{11}$$
+where $\hat{y}_{i,t}$ ranges over all possible tokens at position $t$ for rollout $i$ (or the $K$ most likely under $\pi_{\theta_{\mathrm{old}}}$, cf. Section A.2).
+The TIS changes since we explicitly weight each logit by its probability under $\pi_{\theta_{\mathrm{old}}}$ rather than relying on a Monte Carlo estimate of the expectation over next-token predictions.
+Here, $A_{i,t}(\hat{y}_{i,t})$ is a per-logit advantage.
+In our experiments for SDPO, we apply the TIS term on a token-level rather than logit-level.
+Appendix B
+Theoretical Analysis
+This section is organized as follows:
+• Section B.1 derives the SDPO gradient from Proposition 2.1.
+• Section B.2 derives the trust-region regularized teacher discussed in Section A.1.
+• Section B.3 shows that the EMA teacher, as discussed in Section A.1, implements a trust-region constraint.
+To disambiguate the notation of the self-teacher, we use $q_{\theta}(\cdot\mid x,f):=\pi_{\theta}(\cdot\mid\mathrm{reprompt}(x,f))$ in the following.
+Here, `reprompt` denotes the reprompt template of the self-teacher.
+B.1
+Gradient Estimator
+Proof of Proposition 2.1.
+In the following, we derive the gradient of $\mathcal{L}_{\mathrm{SDPO}}$.
+$$\begin{aligned}
+\boldsymbol{\nabla}_{\!\!\theta}\,\mathcal{L}_{\mathrm{SDPO}}(\theta)
+&=\boldsymbol{\nabla}_{\!\!\theta}\,\sum_{t=1}^{T}\mathrm{KL}\big(\pi_{\theta}(y_{t}\mid x,y_{<t})\,\|\,\mathrm{stopgrad}(q_{\theta}(y_{t}\mid x,f,y_{<t}))\big)\\
+&=\boldsymbol{\nabla}_{\!\!\theta}\,\sum_{t=1}^{T}\sum_{y_{t}}\pi_{\theta}(y_{t}\mid x,y_{<t})\log\left(\frac{\pi_{\theta}(y_{t}\mid x,y_{<t})}{\mathrm{stopgrad}(q_{\theta}(y_{t}\mid x,f,y_{<t}))}\right)
+\end{aligned}$$
+Let $A_{t,k}:=\log\left(\frac{\mathrm{stopgrad}(q_{\theta}(y_{t}\mid x,f,y_{<t}))}{\pi_{\theta}(y_{t}\mid x,y_{<t})}\right)$. Then,
+$$\begin{aligned}
+\boldsymbol{\nabla}_{\!\!\theta}\,\mathcal{L}_{\mathrm{SDPO}}(\theta)
+&=-\boldsymbol{\nabla}_{\!\!\theta}\,\sum_{t=1}^{T}\sum_{y_{t}}\pi_{\theta}(y_{t}\mid x,y_{<t})\,A_{t,k}\\
+&=-\sum_{t=1}^{T}\sum_{y_{t}}\Big(\pi_{\theta}(y_{t}\mid x,y_{<t})\,\boldsymbol{\nabla}_{\!\!\theta}\,A_{t,k}+A_{t,k}\,\boldsymbol{\nabla}_{\!\!\theta}\,\pi_{\theta}(y_{t}\mid x,y_{<t})\Big).
+\end{aligned}$$
+We have that $\boldsymbol{\nabla}_{\!\!\theta}\,A_{t,k}=-\boldsymbol{\nabla}_{\!\!\theta}\,\log\pi_{\theta}(y_{t}\mid x,y_{<t})$ is the negative score function. Using the score trick, $\pi_{\theta}(y_{t}\mid x,y_{<t})\,\boldsymbol{\nabla}_{\!\!\theta}\,\log\pi_{\theta}(y_{t}\mid x,y_{<t})=\boldsymbol{\nabla}_{\!\!\theta}\,\pi_{\theta}(y_{t}\mid x,y_{<t})$. Hence, the first term simplifies to
+$$\begin{aligned}
+-\sum_{t=1}^{T}\sum_{y_{t}}\pi_{\theta}(y_{t}\mid x,y_{<t})\,\boldsymbol{\nabla}_{\!\!\theta}\,A_{t,k}
+&=\sum_{t=1}^{T}\sum_{y_{t}}\boldsymbol{\nabla}_{\!\!\theta}\,\pi_{\theta}(y_{t}\mid x,y_{<t})
+=\sum_{t=1}^{T}\boldsymbol{\nabla}_{\!\!\theta}\,\underbrace{\sum_{y_{t}}\pi_{\theta}(y_{t}\mid x,y_{<t})}_{=1}=0.
+\end{aligned}$$
+Thus, the gradient of $\mathcal{L}_{\mathrm{SDPO}}$ is
+$$\begin{aligned}
+\boldsymbol{\nabla}_{\!\!\theta}\,\mathcal{L}_{\mathrm{SDPO}}
+&=-\sum_{t=1}^{T}\sum_{y_{t}}A_{t,k}\,\boldsymbol{\nabla}_{\!\!\theta}\,\pi_{\theta}(y_{t}\mid x,y_{<t})\\
+&=-\sum_{t=1}^{T}\sum_{y_{t}}\pi_{\theta}(y_{t}\mid x,y_{<t})\Big(A_{t,k}\,\boldsymbol{\nabla}_{\!\!\theta}\,\log\pi_{\theta}(y_{t}\mid x,y_{<t})\Big)\\
+&=-\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t=1}^{|y|}\sum_{y_{t}}\boldsymbol{\nabla}_{\!\!\theta}\,\log\pi_{\theta}(y_{t}\mid x,y_{<t})\cdot A_{t,k}\right]\\
+&=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t=1}^{|y|}\sum_{y_{t}}\boldsymbol{\nabla}_{\!\!\theta}\,\log\pi_{\theta}(y_{t}\mid x,y_{<t})\cdot\log\frac{\pi_{\theta}(y_{t}\mid x,y_{<t})}{q_{\theta}(y_{t}\mid x,f,y_{<t})}\right].
+\end{aligned}$$
+∎
+Notably, the above implies that the gradient of $\mathcal{L}_{\mathrm{SDPO}}$ is equivalent to the gradient of the loss if $-A_{t,k}=\mathrm{stopgrad}\left(\log\frac{\pi_{\theta}(y_{t}\mid x,y_{<t})}{q_{\theta}(y_{t}\mid x,f,y_{<t})}\right)$.
+B.2
+Trust-region Teacher
+To stabilize training, we seek to prevent the teacher $q$ from diverging from the initial teacher $q_{\theta_{\mathrm{ref}}}$.
+We can achieve this by placing an explicit trust-region constraint on the teacher $q$ (Schulman et al., 2015; Peng et al., 2019), that is:
+$$\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,\quad\epsilon>0. \tag{12}$$
+In the following, we derive a teacher $q$ which satisfies the trust-region constraint while staying close to the target $q_{\theta}$.
+The following optimization problem characterizes such a $q$ (Peng et al., 2019):
+$$\begin{split}\operatorname*{arg\,max}_{q\in\Delta}\ &\sum_{t}\sum_{y_{t}}q(y_{t}\mid x,f,y_{<t})\log\frac{q_{\theta}(y_{t}\mid x,f,y_{<t})}{q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})}\\
+\text{s.t.}\ &\sum_{t}\mathrm{KL}\left(q(y_{t}\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})\right)\leq\epsilon,\end{split} \tag{13}$$
+where $\Delta$ denotes the probability simplex.
+Intuitively, the solution is the $q$ satisfying the trust-region constraint, which is closest to $q_{\theta}$ (i.e., has minimal cross-entropy to $q_{\theta}$) while being farthest from $q_{\theta_{\mathrm{ref}}}$ (i.e., has maximal cross-entropy to $q_{\theta_{\mathrm{ref}}}$).
+Proposition B.1.
+The solution to Equation 13 can be expressed in closed form as
+$$q^{*}(y_{t}\mid x,f,y_{<t})\propto\exp\!\big((1-\alpha)\log q_{\theta_{\mathrm{ref}}}(y_{t}\mid x,f,y_{<t})+\alpha\log q_{\theta}(y_{t}\mid x,f,y_{<t})\big). \tag{14}$$
+Proof.
+To simplify notation, we omit the conditioning in the following.
+The Lagrangian (with $\lambda\geq 0$ for the KL constraint and $\nu$ for normalization) is
+$$\mathcal{L}(q,\lambda,\nu)=\sum_{t}\sum_{y_{t}}q(y_{t})\log\frac{q_{\theta}(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}-\lambda\Big(\sum_{y_{t}}q(y_{t})\log\frac{q(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}-\epsilon\Big)+\nu\Big(\sum_{y_{t}}q(y_{t})-1\Big).$$
+Stationarity gives, for all $y_{t}$,
+$$0=\frac{\partial\mathcal{L}}{\partial q(y_{t})}=\log\frac{q_{\theta}(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}-\lambda\Big(\log\frac{q(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}+1\Big)+\nu.$$
+Let $\alpha:=1/\lambda$. Then, the solution to Equation 13 can be characterized in closed form as
+$$\begin{aligned}
+q^{*}(y_{t})
+&\propto q_{\theta_{\mathrm{ref}}}(y_{t})\exp\!\Big(\alpha\log\tfrac{q_{\theta}(y_{t})}{q_{\theta_{\mathrm{ref}}}(y_{t})}\Big)\\
+&\propto\exp\!\big((1-\alpha)\log q_{\theta_{\mathrm{ref}}}(y_{t})+\alpha\log q_{\theta}(y_{t})\big).
+\end{aligned}$$
+∎
+Chen et al. (2025c) perform a similar derivation, but use reference $\pi_{\theta_{\mathrm{ref}}}$, which we observe to underperform compared to the reference $q_{\theta_{\mathrm{ref}}}$.
+B.3
+EMA Teacher as an Implicit Trust Region
+To stabilize training, an alternative to the explicit trust-region teacher in Appendix B.2 is to parameterize the teacher as an exponential moving average (EMA) of the student’s parameters $\theta_{k}$:
+$$\theta^{\prime}_{k}=(1-\alpha)\,\theta^{\prime}_{k-1}+\alpha\,\theta_{k},\qquad\alpha\in(0,1), \tag{15}$$
+with initialization $\theta_{0}=\theta^{\prime}_{0}=\theta_{\mathrm{ref}}$. While Equation 15 constrains the teacher in parameter space (and thus does not, in general, impose an explicit KL trust region in distribution space), under a mild smoothness condition we can show that the EMA teacher $q_{\theta^{\prime}}$ remains within an approximate trust region relative to the reference teacher $q_{\theta_{\mathrm{ref}}}$.
+Let $q_{\theta}(y_{t}\mid x,f,y_{<t})$ be a softmax distribution with logits $z_{\theta}(x,f,y_{<t})\in\mathbb{R}^{|\mathcal{V}|}$:
+$$q_{\theta}(y_{t}\mid x,f,y_{<t})=\mathrm{softmax}\big(z_{\theta}(x,f,y_{<t})\big)_{y_{t}}. \tag{16}$$
+Assume the logits are $L$-Lipschitz in parameters:
+$$\big\|z_{\theta_{1}}(x,f,y_{<t})-z_{\theta_{2}}(x,f,y_{<t})\big\|_{2}\leq L\,\big\|\theta_{1}-\theta_{2}\big\|_{2},\qquad\forall(x,f,t,\theta_{1},\theta_{2}). \tag{17}$$
+Proposition B.2 (EMA yields an approximate trust region).
+Let $\theta_{0}=\theta^{\prime}_{0}=\theta_{\mathrm{ref}}$, $k\geq 1$, and update $\theta^{\prime}_{k}$ by Equation 15. We assume:
+• The logits are $L$-Lipschitz in parameters (cf. Equation 17).
+• The student does not diverge from the initial model, i.e., $\|\theta_{i}-\theta_{0}\|_{2}\leq R_{\mathrm{ref}}$ for all $i\leq k$.
+Then, for any $(x,f)$ and any sequence length $T$, the EMA teacher $q_{\theta^{\prime}_{k}}$ implicitly satisfies the trust-region constraint
+$$\sum_{t=1}^{T}\mathrm{KL}\left(q_{\theta^{\prime}_{k}}(\cdot\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(\cdot\mid x,f,y_{<t})\right)\leq\epsilon_{k},\qquad\epsilon_{k}=\frac{L^{2}}{4}\,T\,R_{\mathrm{ref}}^{2}\,\big(1-(1-\alpha)^{k}\big)^{2}. \tag{18}$$
+Proof.
+Fix $(x,f,t)$ and define $p=\mathrm{softmax}(a)$ and $q=\mathrm{softmax}(b)$ with $a=z_{\theta^{\prime}_{k}}(x,f,y_{<t})$ and $b=z_{\theta_{\mathrm{ref}}}(x,f,y_{<t})$.
+A standard smoothness bound for softmax distributions
+[Footnote 10: Let $p=\mathrm{softmax}(a)$ and $q=\mathrm{softmax}(b)$, and define the log-partition $A(z)=\log\sum_{i}\exp(z_{i})$. For the categorical exponential family, the KL divergence admits the (primal) Bregman form $\mathrm{KL}\left(p\|q\right)=A(b)-A(a)-\langle\boldsymbol{\nabla}A(a),\,b-a\rangle$ (Wainwright & Jordan, 2008, Eq. (5.10)). Moreover, $\boldsymbol{\nabla}A(z)=\mathrm{softmax}(z)$ and $\boldsymbol{\nabla}^{2}A(z)=\mathrm{diag}(p)-pp^{\top}$, here with $p=\mathrm{softmax}(z)$ (Boyd & Vandenberghe, 2004). For any $v\in\mathbb{R}^{|\mathcal{V}|}$,
+$$v^{\top}\boldsymbol{\nabla}^{2}A(z)\,v=\sum_{i}p_{i}v_{i}^{2}-\Big(\sum_{i}p_{i}v_{i}\Big)^{2}=\mathrm{Var}_{i\sim p}(v_{i})\leq\frac{(\max_{i}v_{i}-\min_{i}v_{i})^{2}}{4}\leq\frac{\|v\|_{2}^{2}}{2},$$
+so $\|\boldsymbol{\nabla}^{2}A(z)\|_{\mathrm{op}}\leq\tfrac{1}{2}$ and hence $A$ is $(1/2)$-smooth in $\|\cdot\|_{2}$. Applying the standard smoothness inequality for $\beta$-smooth functions (Bubeck, 2015, Lemma 3.4) yields $A(b)\leq A(a)+\langle\boldsymbol{\nabla}A(a),b-a\rangle+\tfrac{\beta}{2}\|b-a\|_{2}^{2}$; setting $\beta=\tfrac{1}{2}$ gives $\mathrm{KL}\left(p\|q\right)\leq\tfrac{1}{4}\|a-b\|_{2}^{2}$.]
+implies
+$$\mathrm{KL}\left(p\|q\right)\leq\frac{1}{4}\,\|a-b\|_{2}^{2}. \tag{19}$$
+By Equation 17, this gives
+$$\mathrm{KL}\left(q_{\theta^{\prime}_{k}}(\cdot\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(\cdot\mid x,f,y_{<t})\right)\leq\frac{L^{2}}{4}\,\|\theta^{\prime}_{k}-\theta_{\mathrm{ref}}\|_{2}^{2}. \tag{20}$$
+Unrolling the EMA recursion with $\theta^{\prime}_{0}=\theta_{\mathrm{ref}}$ yields
+$$\theta^{\prime}_{k}-\theta_{\mathrm{ref}}=\sum_{i=1}^{k}w_{i}\,(\theta_{i}-\theta_{\mathrm{ref}}),\qquad w_{i}=\alpha(1-\alpha)^{k-i},\qquad\sum_{i=1}^{k}w_{i}=1-(1-\alpha)^{k}.$$
+By Cauchy–Schwarz,
+$$\Big\|\sum_{i=1}^{k}w_{i}(\theta_{i}-\theta_{\mathrm{ref}})\Big\|_{2}^{2}\leq\Big(\sum_{i=1}^{k}w_{i}\Big)\Big(\sum_{i=1}^{k}w_{i}\,\|\theta_{i}-\theta_{\mathrm{ref}}\|_{2}^{2}\Big)=\big(1-(1-\alpha)^{k}\big)\sum_{i=1}^{k}w_{i}\,\|\theta_{i}-\theta_{\mathrm{ref}}\|_{2}^{2}. \tag{21}$$
+Combining Equations 20 and 21 and summing over $t=1,\dots,T$ gives
+$$\sum_{t=1}^{T}\mathrm{KL}\left(q_{\theta^{\prime}_{k}}(\cdot\mid x,f,y_{<t})\,\|\,q_{\theta_{\mathrm{ref}}}(\cdot\mid x,f,y_{<t})\right)\leq\frac{L^{2}}{4}\,T\,\big(1-(1-\alpha)^{k}\big)\,\sum_{i=1}^{k}\alpha(1-\alpha)^{k-i}\,\big\|\theta_{i}-\theta_{\mathrm{ref}}\big\|_{2}^{2}.$$
+Finally, if $\|\theta_{i}-\theta_{\mathrm{ref}}\|_{2}\leq R_{\mathrm{ref}}$ for all $i\leq k$, then $\sum_{i=1}^{k}w_{i}\,\|\theta_{i}-\theta_{\mathrm{ref}}\|_{2}^{2}\leq R_{\mathrm{ref}}^{2}\sum_{i=1}^{k}w_{i}=R_{\mathrm{ref}}^{2}\big(1-(1-\alpha)^{k}\big)$, which yields Equation 18.
+∎
+To summarize, Equation 18 recovers the same form as the explicit trust-region constraint $\sum_{t}\mathrm{KL}\left(\cdot\,\|\,q_{\theta_{\mathrm{ref}}}\right)\leq\epsilon$ (cf. Section B.2), with an effective radius $\epsilon_{k}$ controlled by the EMA rate $\alpha$ and the cumulative deviation from the reference.
+For fixed $k$ and small $\alpha$, $1-(1-\alpha)^{k}\approx\alpha k$, so the reference divergence scales as $\epsilon_{k}=O(\alpha^{2}k^{2})$ when $\theta_{i}$ stays within a bounded neighborhood of $\theta_{\mathrm{ref}}$.
+Appendix C
+Additional Related Work
+Value networks and Monte Carlo advantage estimation.
+Several prior approaches aim to improve credit assignment but face the same information bottleneck as GRPO. Classical RL frequently trains value networks which provide token-level advantages, but themselves are learned from scalar rewards (Schulman et al., 2016; 2017). Furthermore, value networks incur significant computational and memory overhead and are therefore typically not used to train LLMs.
+Other recent work estimates token-level advantages by performing additional generations starting from various positions in the original attempt (Kazemnejad et al., 2025; Zheng et al., 2025b).
+While this can learn with fewer gradient steps than GRPO, it still uses only scalar rewards as signal and requires costly additional generations.
+Dense credit assignment with a reward model.
+Several recent works have explored assigning dense (per-token) rewards given access to an external reward model, leveraging internal structure of the reward model (Chan et al., 2024; Cao et al., 2025).
+Partial observability.
+From the perspective of classical RL, many verifiable domains for LLMs are naturally partially observable: executing a proposed solution induces a latent environment state (e.g., failing tests or states of an agentic system) that is revealed only through rich feedback.
+This aligns with the formalism of partially observable Markov decision processes (POMDPs), where agents must act under incomplete observations of state (Kaelbling et al., 1998; Sutton & Barto, 1998).
+By contrast, RLVR and RLHF pipelines typically discard this observation channel and learn only from terminal scalar rewards or pairwise preferences.
+Relation to test-time training.
+Our setting from Section 5 can be seen as a special case of test-time training where the model itself is updated at test time using self-distillation.
+Updating the model at test time is known as test-time training (Sun et al., 2020; 2025; Hardt & Sun, 2024; Hübotter et al., 2025a;b; Akyürek et al., 2025; Behrouz et al., 2025; Tandon et al., 2025; Hübotter et al., 2026).
+Unlike prior work, self-distillation uses the in-context learning ability of the current model to attribute credit after receiving feedback.
+This can be seen as simulating long-context reasoning with periodic compression of context into the model weights.
+C.1
+SDPO as Maximum Entropy RL
+The SDPO objective resembles the objective in maximum entropy RL (e.g., Levine, 2018; Haarnoja et al., 2018) with a particular choice of reward function.
+Maximum Entropy RL
+Consider optimizing
+$$\operatorname*{arg\,max}_{\theta}\ \mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[\sum_{t}r(y_{t}\mid x,y_{<t})\right]+\lambda\,\mathrm{H}\left[\pi_{\theta}(\cdot\mid x)\right],\quad\lambda>0 \tag{22}$$
+where $\pi_{\theta}(y\mid x)=\prod_{t=1}^{T}\pi_{\theta}(y_{t}\mid x,y_{<t})$ and $\mathrm{H}\left[\pi_{\theta}(\cdot\mid x)\right]=\mathbb{E}_{y\sim\pi_{\theta}(\cdot\mid x)}\left[-\log\pi_{\theta}(y\mid x)\right]$ is the entropy of the policy.
+Here, $r(y_{t}\mid x,y_{<t})$ is an arbitrary reward function, possibly “dense” (i.e., per-token).
+Equation 22 is known as maximum entropy RL.
+It is known that this objective is equivalent to solving a variational inference problem, which we discuss next.
+To this end, we define a Bernoulli random variable $\mathcal{C}$ which is $1$ if the attempt $y$ is correct and $0$ otherwise.
+We then define its distribution as $p(\mathcal{C}=1\mid x,y)\propto\exp\big(\tfrac{1}{\lambda}\sum_{t}r(y_{t}\mid x,y_{<t})\big)$.
+Further assuming w.l.o.g. that the “prior” over responses is uniform, we can express the posterior conditioned on the event of correctness as
+$$\pi^{\star}(y\mid x):=p(y\mid x,\mathcal{C}=1)\propto p(\mathcal{C}=1\mid x,y)\propto\exp\!\left(\frac{1}{\lambda}\sum_{t}r(y_{t}\mid x,y_{<t})\right). \tag{23}$$
+Then, Equation 22 is equivalent to minimizing the KL divergence with respect to $\pi^{\star}$:
+$$\operatorname*{arg\,min}_{\theta}\ \sum_{t}\mathrm{KL}\left(\pi_{\theta}(y_{t}\mid x,y_{<t})\,\|\,\pi^{\star}(y_{t}\mid x,y_{<t})\right). \tag{24}$$
+SDPO optimizes an implicit reward defined by the teacher
+Note that Equation 24 is equivalent to the SDPO objective (Equation 1) with implicit reward $r(y_{t}\mid x,y_{<t})=\log q(y_{t}\mid x,f,y_{<t})$ and $\lambda=1$.
+In this sense, SDPO can be seen as a maximum entropy RL algorithm with dense rewards constructed implicitly through the retrospective model.
+This also points to a connection of SDPO to inverse RL (Ng et al., 2000; Ziebart et al., 2008; Rafailov et al., 2023), where the goal is to recover an unknown reward function.
+In SDPO, the student learns an implicit reward function defined by the retrospective model.
+Appendix D
+Additional Results & Ablations
+This section is organized as follows:
+• Section D.1 contains results and ablations for Section 3.
+• Section D.2 contains results and ablations for Section 4.
+• Section D.3 contains results and ablations for Section 5.
+D.1
+Learning without rich environment feedback
+• Table 7 reports results when optimal hyperparameters are selected for each model/task combination.
+• Table 8 compares average response lengths of SDPO and GRPO.
+Chemistry
+Physics
+Biology
+Materials
+Tool use
+1h
+5h
+1h
+5h
+1h
+5h
+1h
+5h
+1h
+5h
+Qwen3-8B
+35.6
+59.2
+27.9
+58.9
+57.5
++ GRPO
+54.2
+69.6
+62.9
+74.5
+34.3
+51.8
+74.3
+77.1
+61.7
+68.1
++ GRPO (on-policy)
+54.2
+69.6
+62.9
+74.8
+30.3
+49.4
+73.3
+75.8
+61.7
+68.1
++
+SDPO
+(on-policy)
+59.9
+70.1
+70.6
+80.6
+53.1
+53.1
+72.1
+78.3
+56.4
+68.5
+Olmo3-7B-Instruct
+18.8
+37.7
+18.1
+36.7
+39.3
++ GRPO
+42.7
+54.3
+55.3
+63.3
+54.2
+63.8
+73.8
+78.1
+56.4
+65.0
++ GRPO (on-policy)
+48.8
+54.3
+62.7
+62.7
+54.2
+63.8
+67.9
+74.4
+56.0
+61.3
++
+SDPO
+(on-policy)
+59.2
+76.8
+60.3
+71.4
+56.1
+58.3
+75.3
+79.2
+57.3
+62.5
+Table 7:
+Comparison of SDPO and GRPO on reasoning-related benchmarks.
+We report the highest achieved avg@16 within 1 hour and 5 hours of wall-clock training time, respectively. Both SDPO and on-policy GRPO perform one gradient step per generation batch, while GRPO performs 4 off-policy mini batch steps. We select optimal hyperparameters for SDPO and baselines based on 5h accuracy. We perform this selection independently for each model and dataset. Each run is performed on a node with 4 NVIDIA GH200 GPUs. Together with initialization and validation, each run takes approximately 6 hours.
+As opposed to Table 3, which selects globally optimal hyperparameters per method, this table selects optimal hyperparameters individually for each model/task combination based on 5h accuracy.
+The hyperparameter grid is described in Section E.2.1.
+| Model | GRPO | SDPO | Reduction of SDPO |
+|---|---|---|---|
+| Qwen3-8B | 820.8 | 255.8 | $3.2\times$ |
+| Olmo3-7B-Instruct | 1095.4 | 343.9 | $3.2\times$ |
+Table 8:
+Average response lengths of SDPO and GRPO (averaged across tasks from Section 3). Both algorithms are evaluated in the on-policy setting.
+D.2
+Learning with rich environment feedback
+D.2.1
+Additional Results
+Figure 15:
+Average accuracy during training until step 80, stratified by difficulty. Error bars show standard deviation across 3 seeds.
+Figure 15 shows the average accuracy of SDPO and GRPO stratified by question difficulty. LCB differentiates between easy, medium, and hard questions.
+As displayed, SDPO significantly improves over GRPO in solving medium and hard questions, highlighting the importance of rich feedback for challenging tasks. Note that this categorization of questions is different from the one in Section 5.
+In Figure 16, we compare different train batch sizes and numbers of rollouts for training GRPO and SDPO on LCBv6.
+Figure 16:
+Accuracy (pass@1) for varying train batch sizes (4, 8, 16, 32) and number of rollouts (4, 8) for training SDPO and GRPO with Qwen3-8B (Yang et al., 2025) on LCBv6, $\pm$ stderr across 3 seeds. Different shades of the same color correspond to different runs.
+Complementing the results shown in Figure 8, we show additional results using Qwen2.5-Instruct (Qwen et al., 2024) in Figure 17.
+Figure 17:
+Average validation accuracy by model size, $\pm$ std across 3 seeds. With Qwen2.5-Instruct (Qwen et al., 2024) and Qwen3 (Yang et al., 2025) on LCBv6. Until step 65 for Qwen2.5 and until step 80 for Qwen3.
+D.2.2
+Training Stability
+Figure 18 shows diverse metrics logged during training, including the loss, entropy, average gradient norm, and average response length.
+Figure 18:
+Loss, entropy, avg. gradient norm and avg. response length during training of SDPO on LCBv6 (Section 4).
+D.2.3
+Baselines
+Table 9 compares the performance on LCBv6 of various baselines, including two variants of GRPO, GSPO, and CISPO, to SDPO.
+| Method | Accuracy | Avg accuracy |
+|---|---|---|
+| GRPO | $41.2\pm 0.8$ | $38.2\pm 0.0$ |
+| + only high-entropy tokens (Wang et al., 2025) | $37.8\pm 2.2$ | $35.9\pm 0.1$ |
+| GSPO (Zheng et al., 2025a) | $40.1\pm 2.3$ | $37.7\pm 0.1$ |
+| CISPO (Chen et al., 2025a) | $41.2\pm 1.8$ | $37.8\pm 0.1$ |
+| SDPO | $\mathbf{48.8}\pm 0.6$ | $\mathbf{43.8}\pm 0.0$ |
+Table 9:
+Performance on LCBv6 at/until training step 80 with std over 3 seeds. We compare to GSPO (Zheng et al., 2025a) and CISPO (Chen et al., 2025a). With Qwen3-8B.
+D.3
+Test-time self-distillation
+Complementing the results shown in Section 5, we show the discovery@$k$ curves for all hard questions in Figure 20, and report the mean number of generations until the first discovery in Table 10. Further, Table 11 shows the per-question accuracy of the self-teacher at the initial training step of SDPO. In Figure 19, we ablate the choice of batch size for SDPO and the in-context reprompting strategy for multi-turn sampling.
+In the selection of hard questions, we have discarded one malformed question (Q9) where the coding environment did not correctly validate the solution due to rounding inaccuracies, which led to failures even with correct logic.
+| Question | SDPO | Best-of-$k$ | Multi-turn | Speedup (Best-of-$k$ $\rightarrow$ SDPO) |
+|---|---|---|---|---|
+| 1 | 104 | 98 | 59 | 0.9$\times$ |
+| 3* | 1987 | $\geq 2750$ | $\geq 2750$ | 1.4$\times$ |
+| 10* | 938 | $\geq 2750$ | 1706 | 2.9$\times$ |
+| 43 | 111 | 109 | 111 | 1.0$\times$ |
+| 46* | 1852 | 1466 | 1315 | 0.8$\times$ |
+| 59 | 172 | 123 | 76 | 0.7$\times$ |
+| 69 | 280 | 134 | 134 | 0.5$\times$ |
+| 74* | 1948 | 1466 | 2405 | 0.8$\times$ |
+| 86 | 85 | 421 | 335 | 5.0$\times$ |
+| 91* | 1360 | $\geq 2750$ | 2384 | 2.0$\times$ |
+| 92* | 1575 | $\geq 2750$ | 2203 | 1.8$\times$ |
+| 95* | 1948 | 1466 | 1794 | 0.8$\times$ |
+| 100 | 277 | 294 | 1596 | 1.1$\times$ |
+| 103* | 2246 | $\geq 2750$ | 2210 | 1.2$\times$ |
+| 111 | 85 | 95 | 39 | 1.1$\times$ |
+| 120 | 24 | 327 | 70 | 13.6$\times$ |
+| 125* | 1795 | 1466 | 2320 | 0.8$\times$ |
+| 127 | 28 | 368 | 61 | 13.1$\times$ |
+| 129 | 168 | 173 | 104 | 1.0$\times$ |
+| Hard tasks | 894 | 1145 | 1141 | 1.3$\times$ |
+| Very hard tasks | 1739 | 2180 | 2121 | 1.2$\times$ |
+Table 10:
+Mean number of generations until first success per question for SDPO, best-of-$k$ sampling, and multi-turn sampling. For the mean calculation, values are truncated at the maximum budget of 2750 generations. Very hard tasks ($\text{pass}@64<0.03$) are marked with an asterisk (*). Averaged over all questions, SDPO achieves successes faster than the baselines, reaching a speedup of up to $13.6\times$ on individual questions compared to best-of-$k$ sampling.
+| Question | Initial Teacher Accuracy (%) |
+|---|---|
+| 1 | 0.00 |
+| 3 | 0.00 |
+| 10 | 0.00 |
+| 43 | 6.25 |
+| 46 | 0.00 |
+| 59 | 0.00 |
+| 69 | 3.12 |
+| 74 | 0.00 |
+| 86 | 0.00 |
+| 91 | 0.00 |
+| 92 | 0.00 |
+| 95 | 0.00 |
+| 100 | 0.00 |
+| 103 | 0.00 |
+| 111 | 0.00 |
+| 120 | 0.00 |
+| 125 | 0.00 |
+| 127 | 1.23 |
+| 129 | 0.06 |
+Table 11:
+Average accuracy of the retrospective teacher at the first step for each question.
+These scores represent the percentage of successful solutions generated when the base model is reprompted with feedback in a single-turn interaction. For the majority of these hard and very hard tasks, the teacher accuracy is near or exactly 0%. Despite this, the self-distilled token-level advantages are sufficiently rich for SDPO to iteratively refine its policy and solve these questions over successive updates.
+Figure 19:
+Ablations of self-distillation at test time on hard tasks.
+Left:
+Impact of SDPO batch size on $\text{pass}@k$ curves. While smaller batch sizes (8 and 16) can lead to slightly earlier discoveries at very low generation budgets ($k<2^{6}$), larger batch sizes (16, 32) result in more stable updates that significantly improve the discovery rate as the budget scales.
+Right:
+Comparison of multi-turn reprompting templates on a subset of hard questions. The “Only feedback” template concatenates the feedback from previous attempts using a first-in, first-out sliding window. The “Attempts + Feedback” template concatenates the full turn, also using a sliding window. Including only the feedback substantially outperforms concatenating full conversations.
+Figure 20:
+Individual task results for self-distillation at test time.
+$\text{Discovery}@k$ for each of the 19 questions evaluated in Section 5. In most cases, SDPO finds a successful solution significantly earlier than both the base model and the multi-turn baseline. Notably, for one question (Q3) where the base model and the multi-turn baseline maintain a $\text{discovery}@k$ of zero for the entire budget up to 2750, SDPO discovers a solution after 321 attempts. Curves represent the mean and 90% confidence intervals across 5 random seeds per question.
+Appendix E
+Experiment Details
+E.1
+Technical setup
+All experiments were conducted on a single node equipped with four NVIDIA
+GH200 GPUs, for a total of 378GB VRAM. Our environment is built on top of the NVIDIA PyTorch container
+nvcr.io/nvidia/pytorch:25.02-py3
+, with CUDA 12.8 and PyTorch v2.7.0.
+Our implementation is based on the
+verl
+library
+(Sheng et al.,
+2025
+)
+. We use PyTorch Fully Sharded Data Parallel (FSDP2) for distributed training. For rollout generation, we employ
+vLLM
+(Kwon et al.,
+2023
+)
+, which enables efficient batched inference on the multi-GPU node.
+E.2
+Hyperparameters
+We summarize hyperparameters used for SDPO in Table 12 and those used for GRPO in Table 13.
+Parameters
+Without Feedback
+With Feedback
+TTT
+Section
+3
+Section
+4
+Section
+5
+General
+Model
+Qwen/Qwen3-8B
+Qwen/Qwen3-8B
+Qwen/Qwen3-8B
+allenai/Olmo3-7B-Instruct
+Thinking
+False
+False
+False
+Data
+Max. prompt length
+2048
+2048
+2048
+Max. response length
+8192
+8192
+8192
+Batching
+Question batch size
+32
+32
+1
+Mini batch size
+32
+1
+1
+Number of rollouts
+8
+8
+16
+Rollout
+Inference engine
+vllm
+vllm
+vllm
+Temperature
+1.0
+1.0
+1.0
+Validation
+Number of rollouts
+16
+4
+-
+Temperature
+0.6
+0.6
+-
+Top-
+p
+p
+0.95
+0.95
+-
+SDPO loss
+Top-
+K
+K
+distillation
+100
+20
+20
+Distillation divergence
+Jensen–Shannon
+Reverse-KL
+Reverse-KL
+Clip advantages
+–
+–
+5.0
+Teacher-EMA update rate
+0.05
+0.01
+0.01
+Rollout importance sampling clip
+2
+2
+2
+Training
+Optimizer
+AdamW
+AdamW
+AdamW
+Learning rate
+$1\times 10^{-5}$ (constant)
+$1\times 10^{-6}$ (constant)
+$1\times 10^{-6}$ (constant)
+Warmup steps
+10
+0
+0
+Weight decay
+0.01
+0.01
+0.01
+Gradient Clip Norm
+1.0
+1.0
+1.0
+Table 12:
+Hyperparameters used for
+SDPO
+for each experimental setup.
+Parameters
+Experiment 1
+Section
+3
+General
+Model
+Qwen/Qwen3-8B
+allenai/Olmo3-7B-Instruct
+Thinking
+False
+Data
+Max. prompt length
+2048
+Max. response length
+8192
+Batching
+Question batch size
+32
+Mini batch size
+8 (default) / 32 (on-policy)
+Number of rollouts
+8
+Rollout
+Inference engine
+vllm
+Temperature
+1.0
+Validation
+Temperature
+0.6
+Top-
+p
+p
+0.95
+Number of rollouts
+16
+Loss
+ϵ
+\epsilon
+-high
+0.28
+Rollout importance sampling clip
+2
+KL coefficient (
+λ
+\lambda
+)
+0.0
+Training
+Optimizer
+AdamW
+Learning rate
+$1\times 10^{-6}$ (default) / $1\times 10^{-5}$ (on-policy)
+Warmup steps
+10
+Weight decay
+0.01
+Gradient Clip Norm
+1.0
+Table 13:
+Hyperparameters used for
+GRPO
+.
+E.2.1
+Details on Hyperparameter Selection (Section 3)
+For GRPO in the experiments in Section 3, we perform a grid search over learning rates $\{10^{-5},10^{-6}\}$ and minibatch sizes $\{8,32\}$. For on-policy GRPO, we search over the same learning rates while fixing the minibatch size to 32. For SDPO, we grid-search over KL variants (forward KL, Jensen–Shannon), learning rates $\{10^{-5},10^{-6}\}$, and minibatch sizes $\{8,32\}$.
+For each method (GRPO, on-policy GRPO, and SDPO), we select a single hyperparameter configuration that achieves the highest validation accuracy within the first 5 hours of training, evaluated across all datasets and models used in Section 3.
+We further report results obtained by selecting the optimal hyperparameter configuration separately for each model and dataset in Table 3.
+E.3
+User Templates
+For multiple-choice questions and tool use, the model must be prompted in a task-specific manner. We therefore provide the prompt templates used for these settings below.
+⬇
+Given
+a
+question
+and
+four
+options
+,
+please
+select
+the
+right
+answer
+.
+Respond
+in
+the
+following
+format
+:
+<
+reasoning
+>
+...
+</
+reasoning
+>
+<
+answer
+>
+...
+</
+answer
+>
+For
+the
+answer
+,
+only
+output
+the
+letter
+corresponding
+to
+the
+correct
+option
+(
+A
+,
+B
+,
+C
+,
+or
+D
+),
+and
+nothing
+else
+.
+Do
+not
+restate
+the
+answer
+text
+.
+For
+example
+,
+if
+the
+answer
+is
+"
+A
+",
+just
+output
+:
+<
+answer
+>
+A
+</
+answer
+>
+Listing 1:
+System prompt: Multiple Choice Questions
+⬇
+{
+question
+}
+Please
+reason
+step
+by
+step
+.
+Listing 2:
+User prompt: Multiple Choice Questions
+⬇
+You
+are
+a
+helpful
+function
+-
+calling
+AI
+assistant
+.
+You
+are
+provided
+with
+function
+signatures
+within
+<
+functions
+></
+functions
+>
+XML
+tags
+.
+You
+may
+call
+one
+or
+more
+functions
+to
+assist
+with
+the
+user
+query
+.
+Output
+any
+function
+calls
+within
+<
+function_calls
+></
+function_calls
+>
+XML
+tags
+.
+Do
+not
+make
+assumptions
+about
+what
+values
+to
+plug
+into
+functions
+.
+Listing 3:
+System prompt: Tool use
+⬇
+Your
+task
+is
+to
+answer
+the
+user
+’
+s
+question
+using
+available
+tools
+.
+You
+have
+access
+to
+the
+following
+tools
+:
+Name
+:
+Axolotl
+Description
+:
+Collection
+of
+axolotl
+pictures
+and
+facts
+Documentation
+:
+getRandomAxolotlImage
+:
+Retrieve
+a
+random
+axolotl
+image
+with
+information
+on
+the
+image
+source
+.
+Parameters
+:
+{}
+Output
+:
+Successful
+response
+.
+-
+Format
+:
+application
+/
+json
+-
+Structure
+:
+Object
+{
+url
+,
+source
+,
+description
+}
+searchAxolotlImages
+:
+Search
+for
+axolotl
+images
+based
+on
+specific
+criteria
+such
+as
+color
+,
+gender
+,
+and
+size
+.
+Parameters
+:
+{"
+color
+":
+"
+string
+.
+One
+of
+:
+[
+wild
+,
+leucistic
+,
+albino
+].
+The
+color
+of
+the
+axolotl
+(
+e
+.
+g
+.,
+’
+wild
+’,
+’
+leucistic
+’,
+’
+albino
+’,
+etc
+.).",
+"
+gender
+":
+"
+string
+.
+One
+of
+:
+[
+male
+,
+female
+].
+The
+gender
+of
+the
+axolotl
+(’
+male
+’,
+’
+female
+’).",
+"
+size
+":
+"
+string
+.
+One
+of
+:
+[
+small
+,
+medium
+,
+large
+].
+The
+size
+of
+the
+axolotl
+(’
+small
+’,
+’
+medium
+’,
+’
+large
+’).",
+"
+page
+":
+"
+integer
+.
+The
+page
+number
+for
+pagination
+purposes
+."}
+Output
+:
+Successful
+response
+.
+-
+Format
+:
+application
+/
+json
+-
+Structure
+:
+Object
+{
+results
+:
+Array
+[
+Object
+{
+url
+,
+source
+,
+description
+}],
+pagination
+:
+Object
+{
+current_page
+,
+total_pages
+,
+total_results
+}}
+getAxolotlFacts
+:
+Retrieve
+interesting
+facts
+about
+axolotls
+such
+as
+their
+habits
+,
+habitats
+,
+and
+physical
+characteristics
+.
+Parameters
+:
+{"
+category
+":
+"
+string
+.
+One
+of
+:
+[
+habits
+,
+habitat
+,
+physical
+characteristics
+].
+The
+category
+of
+facts
+to
+retrieve
+(
+e
+.
+g
+.,
+’
+habits
+’,
+’
+habitat
+’,
+’
+physical
+characteristics
+’).",
+"
+limit
+":
+"
+integer
+.
+The
+maximum
+number
+of
+facts
+to
+return
+."}
+Output
+:
+Successful
+response
+.
+-
+Format
+:
+application
+/
+json
+-
+Structure
+:
+Array
+[
+Object
+{
+fact
+,
+source
+}]
+Use
+the
+following
+format
+:
+Thought
+:
+you
+should
+always
+think
+about
+what
+to
+do
+Action
+:
+the
+action
+to
+take
+,
+should
+be
+one
+of
+the
+tool
+names
+.
+Action
+Input
+:
+the
+input
+to
+the
+action
+,
+must
+be
+in
+JSON
+format
+.
+All
+of
+the
+action
+input
+must
+be
+realistic
+and
+from
+the
+user
+.
+Begin
+!
+Question
+:
+Hey
+,
+can
+you
+show
+me
+a
+random
+picture
+of
+an
+axolotl
+?
+Listing 4:
+Example user prompt: Tool use
+Appendix F
+Qualitative Examples
+F.1
+Visualization of Advantages
+Figure 21 compares the advantages of SDPO and GRPO in a representative example.
+Figure 21:
+Visualization of advantages in SDPO and GRPO with Olmo3-7B-Instruct in a batch from the Chemistry task of Section 3. Each row corresponds to the beginning of a response. The color indicates the advantage value at that token position, with positive advantages shown in blue and negative advantages shown in red.
+F.2
+Examples
+Below, we show an example from training SDPO on LCBv6 using Qwen3-8B.
+⬇
+[Prompt]
+You
+are
+a
+coding
+expert.
+You
+will
+be
+given
+a
+coding
+problem,
+and
+you
+need
+to
+write
+a
+correct
+Python
+program
+that
+matches
+the
+specification
+and
+passes
+all
+tests.
+The
+time
+limit
+is
+1
+second.
+You
+may
+start
+by
+outlining
+your
+thought
+process.
+In
+the
+end,
+please
+provide
+the
+complete
+code
+in
+a
+code
+block
+enclosed
+with
+‘‘‘
+‘‘‘.
+You
+are
+given
+a
+binary
+string
+s
+of
+length
+n,
+where:
+’1’
+represents
+an
+active
+section.
+’0’
+represents
+an
+inactive
+section.
+You
+can
+perform
+at
+most
+one
+trade
+to
+maximize
+the
+number
+of
+active
+sections
+in
+s.
+In
+a
+trade,
+you:
+Convert
+a
+contiguous
+block
+of
+’1’s
+that
+is
+surrounded
+by
+’0’s
+to
+all
+’0’s.
+Afterward,
+convert
+a
+contiguous
+block
+of
+’0’s
+that
+is
+surrounded
+by
+’1’s
+to
+all
+’1’s.
+Return
+the
+maximum
+number
+of
+active
+sections
+in
+s
+after
+making
+the
+optimal
+trade.
+Note:
+Treat
+s
+as
+if
+it
+is
+augmented
+with
+a
+’1’
+at
+both
+ends,
+forming
+t
+=
+’1’
++
+s
++
+’1’.
+The
+augmented
+’1’s
+do
+not
+contribute
+to
+the
+final
+count.
+Example
+1:
+Input:
+s
+=
+"01"
+Output:
+1
+Explanation:
+Because
+there
+is
+no
+block
+of
+’1’s
+surrounded
+by
+’0’s,
+no
+valid
+trade
+is
+possible.
+The
+maximum
+number
+of
+active
+sections
+is
+1.
+Example
+2:
+Input:
+s
+=
+"0100"
+Output:
+4
+Explanation:
+String
+"0100"
+->
+Augmented
+to
+"101001".
+Choose
+"0100",
+convert
+"101001"
+->
+"100001"
+->
+"111111".
+The
+final
+string
+without
+augmentation
+is
+"1111".
+The
+maximum
+number
+of
+active
+sections
+is
+4.
+Example
+3:
+Input:
+s
+=
+"1000100"
+Output:
+7
+Explanation:
+String
+"1000100"
+->
+Augmented
+to
+"110001001".
+Choose
+"000100",
+convert
+"110001001"
+->
+"110000001"
+->
+"111111111".
+The
+final
+string
+without
+augmentation
+is
+"1111111".
+The
+maximum
+number
+of
+active
+sections
+is
+7.
+Example
+4:
+Input:
+s
+=
+"01010"
+Output:
+4
+Explanation:
+String
+"01010"
+->
+Augmented
+to
+"1010101".
+Choose
+"010",
+convert
+"1010101"
+->
+"1000101"
+->
+"1111101".
+The
+final
+string
+without
+augmentation
+is
+"11110".
+The
+maximum
+number
+of
+active
+sections
+is
+4.
+Constraints:
+1
+<=
+n
+==
+s.length
+<=
+10^5
+s[i]
+is
+either
+’0’
+or
+’1’
+Your
+solution
+should
+have
+the
+following
+signature:
+‘‘‘python
+def
+maxActiveSectionsAfterTrade(s:
+str)
+->
+int:
+‘‘‘
+[Response]
+<think>
+</think>
+To
+solve
+this
+problem,
+we
+need
+to
+understand
+the
+structure
+of
+the
+string
+and
+the
+trade
+operation.
+Here’s
+the
+plan:
+###
+Key
+Insights:
+1.
+We
+are
+given
+a
+binary
+string
+‘s‘,
+and
+we
+are
+allowed
+**at
+most
+one
+trade**.
+2.
+A
+**trade**
+consists
+of
+two
+steps:
+-
+Convert
+a
+contiguous
+block
+of
+‘’1’‘
+that
+is
+**surrounded
+by
+‘’0’‘
+into
+‘’0’‘
+(i.e.,
+flip
+it).
+-
+Then
+convert
+a
+contiguous
+block
+of
+‘’0’‘
+that
+is
+**surrounded
+by
+‘’1’‘
+into
+‘’1’‘
+(i.e.,
+flip
+it).
+3.
+The
+actual
+count
+of
+active
+sections
+is
+the
+number
+of
+‘’1’‘
+characters
+**after**
+the
+trade.
+The
+augmented
+‘’1’‘
+at
+the
+beginning
+and
+end
+do
+not
+count
+in
+the
+final
+result.
+###
+Strategy:
+-
+We
+simulate
+the
+trade
+by
+considering
+each
+possible
+way
+to
+flip
+a
+block
+of
+‘’1’‘
+(bounded
+by
+‘’0’‘)
+and
+then
+a
+block
+of
+‘’0’‘
+(bounded
+by
+‘’1’‘).
+-
+For
+each
+possible
+valid
+block,
+simulate
+the
+trade
+and
+compute
+the
+resulting
+number
+of
+‘’1’‘
+characters.
+-
+We
+only
+need
+to
+consider
+the
+original
+string
+‘s‘,
+not
+the
+augmented
+one
+(since
+we
+can
+simulate
+the
+trade
+directly
+on
+‘s‘
+by
+checking
+boundaries).
+We’ll
+use
+a
+sliding
+window
+technique
+to
+find
+contiguous
+blocks
+of
+‘’1’‘
+and
+‘’0’‘
+and
+simulate
+the
+trade.
+---
+###
+Implementation
+Steps:
+1.
+Simulate
+the
+trade
+by
+checking
+for
+a
+block
+of
+‘’1’‘
+surrounded
+by
+‘’0’‘
+or
+at
+the
+edges
+(we
+simulate
+the
+augmented
+‘1‘
+at
+both
+sides).
+2.
+For
+each
+such
+block,
+simulate
+converting
+it
+to
+‘’0’‘,
+then
+convert
+a
+‘’0’‘
+block
+(again,
+surrounded
+by
+‘’1’‘)
+to
+‘’1’‘.
+3.
+Count
+the
+number
+of
+‘’1’‘
+in
+the
+final
+string
+and
+track
+the
+maximum.
+---
+###
+Python
+Code:
+‘‘‘python
+def
+maxActiveSectionsAfterTrade(s:
+str)
+->
+int:
+if
+not
+s:
+return
+0
+max_ones
+=
+0
+n
+=
+len(s)
+#
+We
+are
+simulating
+the
+augmented
+’1’s
+on
+both
+ends,
+so
+we
+treat
+the
+boundaries
+as
+’1’s
+#
+So
+’0’
+at
+index
+0
+or
+n-1
+is
+considered
+surrounded
+by
+’1’s
+#
+Step
+1:
+Find
+all
+candidate
+positions
+for
+the
+first
+part
+of
+the
+trade
+#
+(convert
+a
+block
+of
+1s
+surrounded
+by
+0s
+into
+0s)
+for
+i
+in
+range(n):
+if
+s[i]
+==
+’1’:
+start
+=
+i
+while
+i
+<
+n
+and
+s[i]
+==
+’1’:
+i
++=
+1
+end
+=
+i
+-
+1
+#
+Check
+if
+this
+block
+is
+surrounded
+by
+’0’s
+left_boundary
+=
+(start
+==
+0
+or
+s[start
+-
+1]
+==
+’0’)
+right_boundary
+=
+(end
+==
+n
+-
+1
+or
+s[end
++
+1]
+==
+’0’)
+if
+left_boundary
+and
+right_boundary:
+#
+This
+is
+a
+candidate
+block
+of
+’1’s
+to
+flip
+#
+Now,
+look
+for
+a
+block
+of
+’0’s
+surrounded
+by
+’1’s
+in
+the
+remaining
+string
+#
+Replace
+the
+’1’
+block
+with
+’0’s
+#
+Now
+find
+a
+’0’
+block
+surrounded
+by
+’1’s
+#
+We
+can
+simulate
+the
+remaining
+string
+temp
+=
+list(s)
+for
+j
+in
+range(start,
+end
++
+1):
+temp[j]
+=
+’0’
+#
+Now
+find
+a
+contiguous
+block
+of
+’0’s
+surrounded
+by
+’1’s
+max_trade_ones
+=
+0
+for
+k
+in
+range(n):
+if
+temp[k]
+==
+’0’:
+start0
+=
+k
+while
+k
+<
+n
+and
+temp[k]
+==
+’0’:
+k
++=
+1
+end0
+=
+k
+-
+1
+#
+Check
+if
+this
+block
+is
+surrounded
+by
+’1’s
+left_0
+=
+(start0
+==
+0
+or
+temp[start0
+-
+1]
+==
+’1’)
+right_0
+=
+(end0
+==
+n
+-
+1
+or
+temp[end0
++
+1]
+==
+’1’)
+if
+left_0
+and
+right_0:
+#
+Flip
+this
+block
+of
+’0’s
+to
+’1’s
+for
+j
+in
+range(start0,
+end0
++
+1):
+temp[j]
+=
+’1’
+#
+Count
+the
+number
+of
+’1’s
+count
+=
+sum(temp)
+max_ones
+=
+max(max_ones,
+count)
+break
+#
+Only
+take
+the
+first
+valid
+block
+for
+this
+trade
+#
+After
+trying
+this
+’1’
+block,
+restore
+the
+original
+string
+#
+No
+need,
+we
+just
+used
+a
+copy
+return
+max_ones
+‘‘‘
+---
+###
+Optimization
+Note:
+The
+above
+approach
+is
+**not
+optimal**
+for
+very
+large
+strings.
+A
+more
+efficient
+method
+would
+involve
+precomputing
+the
+positions
+of
+blocks
+of
+‘’1’‘
+and
+‘’0’‘,
+and
+checking
+valid
+trades
+between
+these
+blocks
+in
+linear
+time.
+But
+with
+the
+constraints
+given
+(up
+to
+10^5),
+we
+need
+to
+ensure
+linear
+or
+near-linear
+performance.
+The
+current
+code
+is
+correct
+but
+**not
+efficient**
+for
+large
+inputs.
+Would
+you
+like
+me
+to
+provide
+a
+more
+optimized
+version
+that
+runs
+in
+**O(n)**
+time
+and
+uses
+only
+one
+pass
+through
+the
+string?
+[Feedback]
+b’Runtime
+Error\nTypeError:
+unsupported
+operand
+type(s)
+for
++:
+\’int\’
+and
+\’str\’\nLine
+48
+in
+maxActiveSectionsAfterTrade
+(Solution.py)\n\nLast
+Executed
+Input\n"11000"’
+F.3
+Environment Feedback
+We show three examples of feedback in our coding environment, inspired by LeetCode.
+⬇
+Test
+Case
+3:
+Wrong
+Answer
+Input
+[[3,6,1],[4,21,4]]
+Output
+6
+Expected
+22.87500
+Test
+Case
+6:
+Wrong
+Answer
+Input
+[[12,25,3],[3,14,2]]
+Output
+14
+Expected
+25.83333
+Listing 5:
+Example of feedback “Wrong Answer” from our code environment in case of a wrong answer,
+inspired by LeetCode
+⬇
+Runtime
+Error
+MemoryError
+:
+Line
+91
+in
+<
+module
+>
+(
+Solution
+.
+py
+)
+Line
+25
+in
+solve
+(
+Solution
+.
+py
+)
+Last
+Executed
+Input
+10
+633
+9312
+1314
+8548
+8857
+1062
+6410
+3289
+8594
+1263
+8549
+733
+3858
+5973
+...
+(3
+more
+lines
+)
+Listing 6:
+Example of feedback “Memory Error” from our code environment in case of a wrong answer,
+inspired by LeetCode
+⬇
+Runtime
+Error
+IndexError
+:
+list
+index
+out
+of
+range
+Line
+28
+in
+sortMatrix
+(
+Solution
+.
+py
+)
+Last
+Executed
+Input
+[[-1,-1,-1,-1,-1,-1,-1,-1,...
+Listing 7:
+Example of feedback “Index Error” from our code environment in case of a wrong answer,
+inspired by LeetCode
+F.4
+Illustrative Example
+Figure
+22
+shows an illustrative example of the dense credit assignment in SDPO.
+Figure 22:
+Dense credit assignment through self-teaching in SDPO.
+The answer is generated by the model (Qwen3-8B) before seeing the feedback. Then, we re-evaluate the log-probs of the original attempt with the self-teacher after seeing the feedback. We show the per-token $\log\bigl(\mathbb{P}(\text{self-teacher})/\mathbb{P}(\text{student})\bigr)$, with red indicating negative values (self-teacher disagrees), blue indicating positive values (teacher reinforces), and white indicating values around zero. Using binary rewards, GRPO would assign the same, negative advantage to all tokens in the sequence. In contrast, SDPO turns the feedback into dense credit assignment across the sequence. The first row shows the tokens of the generated response. The 3 other rows show the top-$k$ logits of the self-teacher that are used during self-distillation, suggesting alternative tokens. Notably, in this example, the self-teacher identifies the error through retrospection without an explicit solution. The credit assignment on the generated sequence, and the alternative top-$k$ logits correctly show that replacing `set` with `dict` maintains the order of elements. Further, in the seventh shown position, the model also identifies an alternative solution path which starts with the `seen` token, instead of directly returning the output. The activation is sparse, identifying where mistakes happen and adjusting to the students’ response distribution for specifically these few tokens.
\ No newline at end of file
diff --git a/research/notes/search-arxiv-e-print-repository.md b/research/notes/search-arxiv-e-print-repository.md
new file mode 100644
index 0000000000000000000000000000000000000000..39db4f43946ed342c9cf478c265e93fddd34a95e
--- /dev/null
+++ b/research/notes/search-arxiv-e-print-repository.md
@@ -0,0 +1,145 @@
+---
+title: Search | arXiv e-print repository
+id: search-arxiv-e-print-repository
+tags:
+- deepread
+created: '2026-06-10T00:36:40.541465Z'
+source: https://arxiv.org/search/?searchtype=all&query=DiLoCo+scaling+laws&start=0
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:36:40.541307Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Search | arXiv e-print repository
+Showing 1–1 of 1 results for all:
+DiLoCo scaling laws
+Search v0.5.6 released 2020-02-24
+Search term or terms
+Field
+All fields
+Title
+Author(s)
+Abstract
+Comments
+Journal reference
+ACM classification
+MSC classification
+Report number
+arXiv identifier
+DOI
+ORCID
+License (URI)
+arXiv author ID
+Help pages
+Full text
+Search
+Show abstracts
+Hide abstracts
+Advanced Search
+All fields
+Title
+Author(s)
+Abstract
+Comments
+Journal reference
+ACM classification
+MSC classification
+Report number
+arXiv identifier
+DOI
+ORCID
+License (URI)
+arXiv author ID
+Help pages
+Full text
+Show abstracts
+Hide abstracts
+25
+50
+100
+200
+results per page
+.
+Sort results by
+Announcement date (newest first)
+Announcement date (oldest first)
+Submission date (newest first)
+Submission date (oldest first)
+Relevance
+Go
+arXiv:2503.09799
+[
+pdf
+,
+other
+]
+cs.LG
+cs.CL
+cs.DC
+Communication-Efficient Language Model Training
+Scales
+Reliably and Robustly:
+Scaling
+Laws
+for
+DiLoCo
+Authors:
+Zachary Charles
+,
+Gabriel Teston
+,
+Lucio Dery
+,
+Keith Rush
+,
+Nova Fallen
+,
+Zachary Garrett
+,
+Arthur Szlam
+,
+Arthur Douillard
+Abstract
+:
+As we
+scale
+to more massive machine learning models, the frequent synchronization demands inherent in data-parallel approaches create significant slowdowns, posing a critical challenge to further…
+▽ More
+As we
+scale
+to more massive machine learning models, the frequent synchronization demands inherent in data-parallel approaches create significant slowdowns, posing a critical challenge to further
+scaling
+. Recent work develops an approach (
+DiLoCo
+) that relaxes synchronization demands without compromising model quality. However, these works do not carefully analyze how
+DiLoCo's
+behavior changes with model size. In this work, we study the
+scaling
+law
+behavior of
+DiLoCo
+when training LLMs under a fixed compute budget. We focus on how algorithmic factors, including number of model replicas, hyperparameters, and token budget affect training in ways that can be accurately predicted via
+scaling
+laws
+. We find that
+DiLoCo
+scales
+both predictably and robustly with model size. When well-tuned,
+DiLoCo
+scales
+better than data-parallel training with model size, and can outperform data-parallel training even at small model sizes. Our results showcase a more general set of benefits of
+DiLoCo
+than previously documented, including increased optimal batch sizes, improved downstream generalization with
+scale
+, and improved evaluation loss for a fixed token budget.
+△ Less
+Submitted
+12 March, 2025;
+originally announced
+March 2025.
+Search v0.5.6 released 2020-02-24
\ No newline at end of file
diff --git a/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models-2.md b/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..271ca988c6b17e28c1365a005b52b7eef3b0dbd4
--- /dev/null
+++ b/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models-2.md
@@ -0,0 +1,3185 @@
+---
+title: 'Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models'
+id: self-distilled-reasoner-on-policy-self-distillation-for-large-language-models-2
+tags:
+- deepread
+created: '2026-06-10T00:23:45.929035Z'
+source: https://arxiv.org/html/2601.18734v3
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:45.928794Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models
+Title:
+Content selection saved. Describe the issue below:
+Description:
+License: CC BY 4.0
+arXiv:2601.18734v3 [cs.LG] 20 Mar 2026
+Self-Distilled Reasoner:
+On-Policy Self-Distillation for Large Language Models
+Siyan Zhao
+†
+Zhihui Xie
+Mengchen Liu
+Jing Huang
+Guan Pang
+Feiyu Chen
+∗,‡
+Aditya Grover
+∗
+Abstract
+Knowledge distillation improves large language model (LLM) reasoning by compressing the knowledge of a teacher LLM to train smaller LLMs. On-policy distillation advances this approach by having the student sample its own trajectories while a teacher LLM provides dense token-level supervision, addressing the distribution mismatch between training and inference in off-policy distillation methods. However, on-policy distillation typically requires a separate, often larger, teacher LLM and does not explicitly leverage ground-truth solutions available in reasoning datasets. Inspired by the intuition that a sufficiently capable LLM can rationalize external privileged reasoning traces and teach its weaker self, we introduce
+On-Policy Self-Distillation
+(OPSD), a learning algorithm where a single LLM acts as both teacher and student with different contexts. The teacher policy conditions on privileged information (e.g., verified reasoning traces) while the student policy sees only the question; training minimizes the per-token divergence between these distributions over the student’s own rollouts. We demonstrate the efficacy of our method on multiple mathematical reasoning benchmarks, achieving superior token efficiency compared to reinforcement learning methods and better performance over off-policy distillation methods. Code repo:
+https://github.com/siyan-zhao/OPSD
+.
+Machine Learning, ICML
+1
+Introduction
+Figure 1
+:
+Overview of On-Policy Self-Distillation (OPSD):
+Given a reasoning dataset $\mathcal{S}=\{(x_{i},y_{i}^{\star})\}_{i=1}^{N}$, we instantiate two policies from the same LLM: a student policy $p_{S}(\cdot\mid x)$ and a teacher policy $p_{T}(\cdot\mid x,y^{\star})$. The student generates an on-policy response $\hat{y}\sim p_{S}(\cdot\mid x)$. Both policies then evaluate this trajectory to produce next-token distributions $p_{S}(\cdot\mid x,\hat{y}_{<n})$ and $p_{T}(\cdot\mid x,y^{\star},\hat{y}_{<n})$ at each step $n$. The learning objective minimizes the per-token divergence $D(p_{T}\|p_{S})$ along the student’s rollout. The divergence here can be forward KL, reverse KL or JSD. Crucially, gradients backpropagate only through the student’s logits, allowing the model to self-distil.
+Recent advances in large language models (LLMs) have demonstrated impressive capabilities in reasoning and instruction following. Achieving these capabilities during post-training typically relies on reinforcement learning methods such as Reinforcement Learning with Verifiable Rewards (RLVR) (e.g., GRPO
+(
+shao2024deepseekmath
+;
+guo2025deepseek
+;
+team2025kimi
+;
+rastogi2025magistral
+;
+yu2025dapo
+)
+), supervised fine-tuning (SFT) on high-quality reasoning datasets
+(
+guha2025openthoughtsdatarecipesreasoning
+;
+team2025kimi
+;
+xiao2026mimov2flashtechnicalreport
+)
+, or knowledge distillation, where recent work has shown that distillation from advanced teacher models can outperform RL in both performance and training efficiency
+(
+qwen3
+;
+xiao2026mimov2flashtechnicalreport
+;
+lu2025onpolicydistillation
+)
+.
+Despite their respective successes, each approach has inherent limitations. RLVR suffers from inefficiencies including: (1) sampling a group of responses per prompt is computationally expensive and can introduce high variance in estimating the true value function; moreover, when all samples are either correct or incorrect, the gradient signal vanishes
+(
+yu2025dapo
+;
+zhao2025inpainting
+)
+; and (2) the reward signal is sparse and uniformly applied across all tokens in the generated output, neglecting fine-grained token-level feedback. Supervised fine-tuning suffers from exposure bias and weaker generalization
+(
+agarwal2024policy
+;
+chu2025sft
+)
+. Traditional knowledge distillation provides dense token-level supervision from a teacher model but relies on off-policy data
+(
+hinton2015distillingknowledgeneuralnetwork
+)
+. Recent advances in on-policy distillation—where a student model samples its own trajectories while a teacher policy provides dense token-level supervision—have demonstrated superior sample efficiency by combining the distributional realism of on-policy training with dense feedback
+(
+agarwal2024policy
+;
+lu2025onpolicydistillation
+)
+.
+While on-policy distillation has shown strong performance, it relies on a distinct teacher model to supervise the student. Given that modern LLMs already exhibit strong reasoning capabilities, we ask this research question:
+can a model effectively serve as its own teacher through self-distillation?
+Our approach is inspired by human learning: after solving a problem incorrectly, a student can examine the correct solution, rationalize its steps, and identify where their reasoning failed. Prior work has shown that for LLMs, evaluation is often easier than generation
+(
+sun2024easy
+;
+naor1996evaluation
+)
+. We hypothesize that rationalization—explaining a given correct answer—is similarly easier than generation. Motivated by this, we instantiate both the teacher and student policies from a single LLM. The teacher policy is provided with privileged information $y^{\star}$, such as the ground-truth answer or a reference chain-of-thought, while the student policy conditions only on the problem $x$. Concretely, the teacher policy $p_{T}(\cdot\mid x,y^{\star})$ conditions on both the problem and the privileged answer, whereas the student policy $p_{S}(\cdot\mid x)$ observes only the problem. We preserve the on-policy training paradigm by sampling trajectories $\hat{y}$ exclusively from the student policy, which then receives dense, token-level supervision from the privileged teacher policy.
+We therefore propose On-Policy Self-Distillation (OPSD), a framework in which a single model plays both teacher and student roles. The student samples its own trajectories $\hat{y}\sim p_{S}(\cdot\mid x)$; we then compute the per-token divergence between the student and teacher distributions and minimize it over the student’s own rollouts. This formulation (i) uses on-policy supervision (the student’s own trajectories), (ii) provides dense per-token feedback, (iii) exploits ground-truth solutions $y^{\star}$, and (iv) requires no separate teacher model. The learning process is captured by the loss
+$$\mathcal{L}_{\mathrm{OPSD}}(\theta)=\mathbb{E}_{(x,y^{\star})\sim\mathcal{S}}\;\mathbb{E}_{\hat{y}\sim p_{S}(\cdot\mid x)}\sum_{n=1}^{|\hat{y}|}D\!\Bigl(p_{T}\!\left(\cdot\mid x,y^{\star},\hat{y}_{<n}\right)\;\Big\|\;p_{S}\!\left(\cdot\mid x,\hat{y}_{<n}\right)\Bigr). \tag{1}$$
+In summary, our contributions are as follows:
+•
+We introduce On-Policy Self-Distillation (OPSD), a novel framework that enables a single model to act as both teacher and student, leveraging ground-truth answers to provide dense token-level supervision on student rollouts.
+•
+We introduce a per-token pointwise KL clipping mechanism that stabilizes training and improves performance as we find stylistic tokens can dominate the training signal of math tokens.
+•
+We evaluate OPSD on three competition-level mathematical reasoning tasks, demonstrating that it matches the performance of GRPO with significantly improved token efficiency and outperforms supervised fine-tuning.
+•
+We analyze the impact of different divergence objectives, the effect of student generation length, and student–teacher generation styles.
+SFT/Off-Policy
+GRPO
+On-Policy
+On-Policy
+Distillation
+Distillation
+Self-Distillation (Ours)
+On-Policy Data
+✗
+✓
+✓
+✓
+Dense Learning Signal
+✓
+✗
+✓
+✓
+Low Sampling Cost
+✓
+✗
+✓
+✓
+No External Teacher
+✓
+✓
+✗
+✓
+Table 1
+:
+Comparison of training methods for reasoning tasks. On-Policy Self-Distillation (OPSD) combines the advantages of on-policy training with dense feedback without requiring an external teacher model.
+2
+Background
+2.1
+Knowledge Distillation for Autoregressive Large Language Models
+Knowledge distillation transfers knowledge from a larger teacher model to a smaller student model by training the student to mimic the teacher’s behavior
+(
+hinton2015distillingknowledgeneuralnetwork
+;
+kim2016sequence
+;
+sanh2019distilbert
+)
+. The core insight is that the teacher’s soft probability distribution over classes contains richer information than hard labels alone, as it reveals the teacher’s learned similarities between classes. For auto-regressive language models, given a dataset $\mathcal{S}=\{(x,y^{\star})\}$ where $x$ denotes an input and $y^{\star}$ is the corresponding reference output, both teacher $p_{T}$ and student $p_{S}$ define token-level distributions over vocabulary $\mathcal{V}$. Traditional supervised distillation minimizes a divergence $D$ between teacher and student distributions averaged over a fixed dataset:
+$$\mathcal{L}_{\text{Supervised Distillation}}(\theta)=\mathbb{E}_{(x,y)\sim\mathcal{S}}\bigl[D(p_{T}\|p_{S})(y|x)\bigr], \tag{2}$$
+where
+$D(p_{T}\|p_{S})(y|x)=\frac{1}{|y|}\sum_{n=1}^{|y|}D\big(p_{T}(\cdot|y_{<n},x)\,\|\,p_{S}(\cdot|y_{<n},x)\big)$
+measures per-token discrepancy. However, this off-policy approach suffers from distribution mismatch: the student encounters different partial sequences
+y
+<
+n
+y_{<n}
+during auto-regressive generation at inference than those seen during training on the fixed dataset, leading to compounding errors. On-policy distillation
+(
+agarwal2024policy
+;
+lu2025onpolicydistillation
+;
+xuspeculative
+)
+addresses this by training the student on its own generated sequences
+y
+^
+∼
+p
+S
+(
+⋅
+|
+x
+)
+\hat{y}\sim p_{S}(\cdot|x)
+, obtaining dense token-level feedback from the teacher on these on-policy samples:
+$$\mathcal{L}_{\text{On-Policy Distillation}}(\theta)=\mathbb{E}_{x\sim\mathcal{S}}\Big[\mathbb{E}_{\hat{y}\sim p_{S}(\cdot|x)}\big[D(p_{T}\|p_{S})(\hat{y}|x)\big]\Big].\qquad(3)$$
+This approach connects distillation to imitation learning
+(
+ross2011reduction
+)
+, where the student iteratively improves by learning from the teacher’s guidance on its own outputs, combining the on-policy relevance of reinforcement learning with the dense reward signal of supervised learning, thereby mitigating exposure bias while maintaining computational efficiency.
+2.2
+Reinforcement Learning with Verifiable Rewards
+Reinforcement learning with verifiable rewards (RLVR) has emerged as a popular approach for post-training large language models, particularly on tasks with easily verifiable outcomes such as mathematics and coding, using algorithms like Proximal Policy Optimization (PPO)
+(
+schulman2017proximal
+)
+and Group Relative Policy Optimization (GRPO)
+(
+shao2024deepseekmath
+)
+.
+GRPO trains by sampling a group of
+G
+G
+responses
+{
+o
+1
+,
+o
+2
+,
+…
+,
+o
+G
+}
+\{o_{1},o_{2},\ldots,o_{G}\}
+from the current policy
+π
+θ
+\pi_{\theta}
+for each problem
+x
+x
+. Each response
+o
+i
+o_{i}
+receives a binary reward
+r
+i
+∈
+{
+0
+,
+1
+}
+r_{i}\in\{0,1\}
+indicating correctness. The method then assigns advantages to all tokens
+k
+=
+1
+,
+…
+,
+|
+o
+i
+|
+k=1,\ldots,|o_{i}|
+within response
+o
+i
+o_{i}
+using a group-normalized reward:
+$$A_{i}=\frac{r_{i}-\text{mean}(\{r_{j}\}_{j=1}^{G})}{\text{std}(\{r_{j}\}_{j=1}^{G})}.\qquad(4)$$
+This formulation can be understood through the value function lens:
+mean
+​
+(
+{
+r
+j
+}
+j
+=
+1
+G
+)
+\text{mean}(\{r_{j}\}_{j=1}^{G})
+serves as a
+G
+G
+-sample Monte Carlo estimate of the value function
+V
+​
+(
+x
+)
+V(x)
+, while the sparse binary reward
+r
+i
+r_{i}
+represents the (undiscounted) state-action value
+Q
+​
+(
+x
+,
+o
+i
+)
+Q(x,o_{i})
+. Critically, all tokens within a response share the same advantage, as the reward signal is provided only at the sequence level. The GRPO objective incorporates a clipped surrogate loss to moderate policy updates, along with a reverse KL penalty to prevent excessive deviation from a reference policy:
+$$\mathcal{L}_{\text{GRPO}}(\theta)=\mathbb{E}_{\begin{subarray}{c}x\sim\mathcal{S}\\ o_{1},\ldots,o_{G}\sim\pi_{\theta}(\cdot|x)\end{subarray}}\Bigg[\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|o_{i}|}\sum_{n=1}^{|o_{i}|}\min\left(\rho_{i}^{n}A_{i},\,\text{clip}\left(\rho_{i}^{n},1-\varepsilon,1+\varepsilon\right)A_{i}\right)-\beta\,D_{\text{KL}}\big[\pi_{\theta}(\cdot|x)\,\|\,\pi_{\text{ref}}(\cdot|x)\big]\Bigg]\qquad(5)$$
+where
+$\rho_{i}^{n}=\frac{\pi_{\theta}(o_{i}^{n}|x,o_{i}^{<n})}{\pi_{\theta_{\text{old}}}(o_{i}^{n}|x,o_{i}^{<n})}$
+is the importance ratio,
+π
+θ
+old
+\pi_{\theta_{\text{old}}}
+is the policy before the update, and
+ε
+\varepsilon
+controls the clipping range.
+While RLVR methods have demonstrated strong empirical performance, they face two key limitations: (1) the reward signal is sparse, providing only sequence-level feedback rather than token-level guidance on where errors occur, and (2) when all sampled responses receive identical rewards (all correct or all incorrect), the advantages become zero, preventing any policy update despite the computational cost of sampling.
+3
+Methods
+Student Prompt
+Problem: Find the derivative of
+f
+​
+(
+x
+)
+=
+3
+​
+x
+2
++
+2
+​
+x
+−
+5
+f(x)=3x^{2}+2x-5
+at
+x
+=
+2
+x=2
+Answer:
+Teacher Prompt
+Problem: Find the derivative of
+f
+​
+(
+x
+)
+=
+3
+​
+x
+2
++
+2
+​
+x
+−
+5
+f(x)=3x^{2}+2x-5
+at
+x
+=
+2
+x=2
+Here is a reference solution:
+First find
+f
+′
+​
+(
+x
+)
+=
+6
+​
+x
++
+2
+f^{\prime}(x)=6x+2
+, then evaluate at
+x
+=
+2
+x=2
+:
+f
+′
+​
+(
+2
+)
+=
+6
+​
+(
+2
+)
++
+2
+=
+14
+f^{\prime}(2)=6(2)+2=14
+After understanding the reference solution, please try to solve this problem using your own approach below:
+Answer:
+Figure 2
+:
+Prompt example for student and teacher policies.
+Both policies share the same parameters
+θ
+\theta
+but differ in conditioning context. The teacher receives the ground-truth solution
+y
+⋆
+y^{\star}
+as privileged information before generation. To ensure a natural transition before evaluating the student’s rollout, the teacher is prompted to rationalize and generate its own solution. Note that the teacher does not actually generate tokens—rationalization is done implicitly through a single forward pass.
+3.1
+Learning from Verifiable Reasoning Dataset
+We consider a dataset of problem-solution pairs
+𝒮
+=
+{
+(
+x
+i
+,
+y
+i
+⋆
+)
+}
+i
+=
+1
+N
+,
+\mathcal{S}=\{(x_{i},y_{i}^{\star})\}_{i=1}^{N},
+where each
+x
+i
+x_{i}
+denotes a problem and
+y
+i
+⋆
+y_{i}^{\star}
+is the corresponding reference solution, which may include chain-of-thought reasoning. For brevity, we omit the sample index
+i
+i
+and use
+(
+x
+,
+y
+⋆
+)
+(x,y^{\star})
+to denote a generic sample from the dataset. We can exploit learning signals from this dataset in different ways. Standard supervised fine-tuning (SFT) on
+𝒮
+\mathcal{S}
+can be viewed as off-policy distillation/imitation learning using expert trajectories, but it suffers from distribution mismatch between training and inference. Reinforcement learning from verifiable rewards (RLVR), such as GRPO, addresses this by optimizing on-policy samples and assigning binary rewards by comparing generated answers against
+y
+⋆
+y^{\star}
+. However, RLVR is computationally expensive and the reward signal is sparse, providing the same feedback across all tokens regardless of where errors occur. Alternatively, one can train a process reward model (PRM) to provide dense, token-level feedback during RL. However, acquiring labels for PRM training is prohibitively expensive and difficult to scale
+(
+lightman2023let
+;
+zhang2025lessons
+)
+. On-policy distillation works
+(
+agarwal2024policy
+;
+xuspeculative
+;
+lu2025onpolicydistillation
+)
+address distribution shift by training on the student’s own samples, but require a separate, often larger, teacher model to provide supervision. We instead seek a training signal that is
+dense
+,
+on-policy
+, and
+does not require external teachers or reward models
+. This motivates our On-Policy Self-Distillation approach. We summarize the differences of these methods in Table
+1
+.
+Algorithm 1
+On-Policy Self-Distillation (OPSD)
+Require: Reasoning dataset $\mathcal{S}=\{(x_{i},y_{i}^{\star})\}_{i=1}^{N}$; language model $p_{\theta}$; divergence $D$ (e.g., $\mathrm{JSD}_{\beta}$)
+1: Let $p_{S}(\cdot\mid x)$ and $p_{T}(\cdot\mid x,y^{\star})$ be the same model $p_{\theta}$ under different conditioning.
+2: while not converged do
+3:     Sample a minibatch $\mathcal{B}\subset\mathcal{S}$
+4:     for all $(x,y^{\star})\in\mathcal{B}$ do
+5:         Sample on-policy response $\hat{y}\sim p_{S}(\cdot\mid x)$
+6:         Compute the token-wise divergence along the student rollout: $\ell(x,y^{\star})\leftarrow D\big(p_{T}\,\|\,p_{S}\big)(\hat{y}\mid x)=\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}D\!\left(p_{T}(\cdot\mid\hat{y}_{<n},x,y^{\star})\,\big\|\,p_{S}(\cdot\mid\hat{y}_{<n},x)\right)$
+7:     end for
+8:     Calculate loss $\mathcal{L}_{\mathrm{OPSD}}(\theta)\leftarrow\frac{1}{|\mathcal{B}|}\sum_{(x,y^{\star})\in\mathcal{B}}\ell(x,y^{\star})$ and update $\theta$
+9: end while
+3.2
+On-Policy Self-Distillation
+Motivation: Learning by understanding solutions.
+We propose a different perspective inspired by how students learn: when struggling with a problem, rather than extended trial-and-error, a student can examine the solution, understand the reasoning, and internalize the approach. Similarly, if a model has access to the correct answer or reasoning
+y
+⋆
+y^{\star}
+and is sufficiently capable, it can rationalize the reasoning steps and teach itself—analogous to a student reviewing a solution and retracing why it works. This intuition motivates our framework: we exploit the ground-truth solution
+y
+⋆
+y^{\star}
+directly as privileged information during training, enabling the model to serve as its own teacher without requiring external reward models or larger teacher models.
+Teacher and student policies.
+We instantiate two conditional distributions from the same language model
+p
+θ
+p_{\theta}
+by varying the
+conditioning context. The
+teacher policy
+conditions on privileged information—both the
+problem
+x
+x
+and the reference solution
+y
+⋆
+y^{\star}
+:
+p
+T
+(
+⋅
+∣
+x
+,
+y
+⋆
+)
+≜
+p
+θ
+(
+⋅
+∣
+x
+,
+y
+⋆
+)
+.
+p_{T}(\cdot\mid x,y^{\star})\;\triangleq\;p_{\theta}(\cdot\mid x,y^{\star}).
+The
+student policy
+observes only the problem statement, matching the inference-time condition:
+p
+S
+(
+⋅
+∣
+x
+)
+≜
+p
+θ
+(
+⋅
+∣
+x
+)
+.
+p_{S}(\cdot\mid x)\;\triangleq\;p_{\theta}(\cdot\mid x).
+Both policies share the same parameters
+θ
+\theta
+but differ only in their conditioning
+context. To encourage the teacher to naturally evaluate the student’s generation, we add a prompt asking the teacher to generate a new solution after seeing the reference solution as shown in
+Figure
+2
+. However, the teacher does not generate tokens; it only performs rationalization implicitly through prefilling.
+On-policy sampling from the student.
+Given a problem
+x
+x
+, the student generates an on-policy response
+y
+^
+=
+(
+y
+^
+1
+,
+…
+,
+y
+^
+|
+y
+^
+|
+)
+∼
+p
+S
+(
+⋅
+∣
+x
+)
+.
+\hat{y}=(\hat{y}_{1},\ldots,\hat{y}_{|\hat{y}|})\sim p_{S}(\cdot\mid x).
+Both policies then evaluate this student-generated trajectory. At each position
+n
+n
+, they induce
+next-token
+distributions over
+y
+n
+∈
+𝒱
+y_{n}\in\mathcal{V}
+conditioned on the same student prefix:
+p
+S
+​
+(
+y
+n
+∣
+x
+,
+y
+^
+<
+n
+)
+,
+p
+T
+​
+(
+y
+n
+∣
+x
+,
+y
+⋆
+,
+y
+^
+<
+n
+)
+,
+p_{S}\!\left(y_{n}\mid x,\hat{y}_{<n}\right),\qquad p_{T}\!\left(y_{n}\mid x,y^{\star},\hat{y}_{<n}\right),
+where
+y
+^
+<
+n
+≜
+(
+y
+^
+1
+,
+…
+,
+y
+^
+n
+−
+1
+)
+\hat{y}_{<n}\triangleq(\hat{y}_{1},\ldots,\hat{y}_{n-1})
+.
+Training objective: Full-vocabulary logit distillation.
+We instantiate a
+full-vocabulary divergence objective
+that matches the teacher and student
+next-token distributions at each position. Given a student-generated sequence
+y
+^
+\hat{y}
+, define
+the trajectory-averaged, token-wise divergence
+$$D\bigl(p_{T}\,\|\,p_{S}\bigr)(\hat{y}\mid x)\triangleq\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}D\Bigl(p_{T}\!\left(\cdot\mid x,y^{\star},\hat{y}_{<n}\right)\;\big\|\;p_{S}\!\left(\cdot\mid x,\hat{y}_{<n}\right)\Bigr),\qquad(6)$$
+where
+p
+S
+(
+⋅
+∣
+x
+,
+y
+^
+<
+n
+)
+p_{S}(\cdot\mid x,\hat{y}_{<n})
+and
+p
+T
+(
+⋅
+∣
+x
+,
+y
+⋆
+,
+y
+^
+<
+n
+)
+p_{T}(\cdot\mid x,y^{\star},\hat{y}_{<n})
+denote distributions over the next token
+y
+n
+∈
+𝒱
+y_{n}\in\mathcal{V}
+. Here,
+D
+D
+can be any distribution divergence measure such as the
+generalized Jensen-Shannon divergence
+JSD
+β
+\operatorname{JSD}_{\beta}
+, defined for a weight
+β
+∈
+[
+0
+,
+1
+]
+\beta\in[0,1]
+as:
+$$\operatorname{JSD}_{\beta}(p_{T}\|p_{S})=\beta\,D_{\text{KL}}(p_{T}\|m)+(1-\beta)\,D_{\text{KL}}(p_{S}\|m),\qquad(7)$$
+where
+m
+=
+β
+​
+p
+T
++
+(
+1
+−
+β
+)
+​
+p
+S
+m=\beta p_{T}+(1-\beta)p_{S}
+is the interpolated mixture distribution. This full-vocabulary formulation provides dense, token-level feedback: the teacher, informed by
+y
+⋆
+y^{\star}
+, exposes the student to the entire distribution over plausible next tokens and guides it toward reasoning paths that lead to the correct answer.
+We minimize the expected divergence between teacher and student over on-policy student samples:
+$$\mathcal{L}(\theta)=\mathbb{E}_{(x,y^{\star})\sim\mathcal{S}}\left[\mathbb{E}_{\hat{y}\sim p_{S}(\cdot\mid x)}\left[D\bigl(p_{T}\,\|\,p_{S}\bigr)(\hat{y}\mid x)\right]\right].\qquad(8)$$
+Gradients are backpropagated only through the student policy
+p
+S
+p_{S}
+, while the teacher
+p
+T
+p_{T}
+acts as
+a fixed full-distribution target conditioned on privileged information
+(
+x
+,
+y
+⋆
+)
+(x,y^{\star})
+.
+Per-Token Pointwise Divergence Clipping.
+In our experiments, we observe that token-level divergence is highly skewed across vocabulary entries:
+a small subset of stylistic tokens exhibits much higher divergence than
+mathematically meaningful tokens (see Table
+5
+). This imbalance
+causes the training signal to be dominated by stylistic patterns. To address this, we apply pointwise clipping to the vocabulary-level divergence
+contributions. Let
+D
+f
+​
+(
+p
+T
+∥
+p
+S
+)
+D_{f}(p_{T}\|p_{S})
+denote an
+f
+f
+-divergence. At each token
+position
+n
+n
+and vocabulary entry
+v
+v
+, define:
+$$\ell_{n,v}^{(f)}=p_{T}(v\mid\cdot)\;f\!\left(\frac{p_{S}(v\mid\cdot)}{p_{T}(v\mid\cdot)}\right).$$
+We compute the clipped divergence:
+$$D_{\mathrm{clip}}^{(f)}(p_{T}\|p_{S})=\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}\sum_{v\in\mathcal{V}}\min\bigl(\ell_{n,v}^{(f)},\tau\bigr),$$
+where $\tau$ is the clipping threshold that caps the contribution of each vocabulary entry at each position.
+Figure 3
+:
+Token Efficiency of OPSD.
+We compare OPSD and GRPO on
+Qwen3-1.7B under the same effective training batch size, reporting Avg@12
+accuracy with training steps and total tokens generated. Generation is capped
+at 1024 tokens for OPSD and 16k for GRPO. At the same number of training
+steps, OPSD uses significantly fewer tokens but outperforms GRPO on all
+benchmarks. Despite sampling more tokens, GRPO only receives a binary outcome
+reward, and stagnates due to reward diversity collapse (rightmost plot): more
+than half of its batches have zero reward standard deviation within 100 steps,
+yielding no gradient signal. OPSD sidesteps this disadvantage of
+outcome-based rewards by learning from a dense distillation loss even with
+fewer generated tokens.
+Table 2
+:
+Performance comparison on mathematical reasoning benchmarks for Qwen3 models.
+We report
+Avg@12
+under the sampling configuration recommended in the Qwen3 blog (temperature
+1.0
+1.0
+, maximum generation length
+38
+38
+k); full details are provided in
+Table
+8
+.
+For OPSD, we evaluate checkpoints every 20 steps up to 100 steps and report the best score.
+For GRPO, we report the peak performance within 500 training steps, though we find GRPO performance to decrease for some tasks due to entropy collapse in later steps.
+For SFT, we train on the same number of samples as OPSD. SFT performance degrades because fine-tuning on concise reasoning solutions reduces generation length at test time, whereas OPSD transforms these solutions into a dense learning signal through rationalization.
+Method
+AIME24
+AIME25
+HMMT25
+Average
+Qwen3-8B
+Base (Instruct)
+75.8
+65.6
+43.9
+61.8
++ SFT
+72.3
+64.2
+42.9
+59.8
++ GRPO
+76.4
+68.9
+46.7
+64.0
++ OPSD
+77.8
+70.8
+45.8
+64.8
+Qwen3-4B
+Base (Instruct)
+74.9
+66.4
+42.2
+61.2
++ SFT
+70.2
+62.3
+43.4
+58.6
++ GRPO
+75.6
+68.1
+44.4
+62.7
++ OPSD
+76.4
+68.3
+46.1
+63.6
+Qwen3-1.7B
+Base (Instruct)
+51.5
+36.7
+23.1
+37.1
++ SFT
+48.4
+36.3
+22.7
+35.8
++ GRPO
+51.1
+38.3
+23.7
+37.7
++ OPSD
+57.2
+43.9
+29.2
+43.4
+Alternative objective: Sampled-token distillation through policy gradient.
+Following recent on-policy distillation methods
+(
+lu2025onpolicydistillation
+)
+,
+we form a sampled-token reward signal (a reverse-KL signal on sampled actions) and
+optimize with policy gradient. For each position
+n
+n
+in a sampled sequence
+y
+^
+\hat{y}
+, define the
+advantage term
+$$A_{n}(x,\hat{y})=\log p_{T}\!\left(\hat{y}_{n}\mid x,y^{\star},\hat{y}_{<n}\right)-\log p_{S}\!\left(\hat{y}_{n}\mid x,\hat{y}_{<n}\right),$$
+and optimize the policy-gradient-style objective
+$$\mathcal{L}(\theta)=-\mathbb{E}_{(x,y^{\star})\sim\mathcal{S}}\biggl[\mathbb{E}_{\hat{y}\sim p_{S}(\cdot\mid x)}\biggl[\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}A_{n}(x,\hat{y})\,\log p_{S}\!\left(\hat{y}_{n}\mid x,\hat{y}_{<n}\right)\biggr]\biggr].\qquad(9)$$
+A
+n
+​
+(
+x
+,
+y
+^
+)
+A_{n}(x,\hat{y})
+is treated as a constant with respect to
+θ
+\theta
+(i.e., gradients do
+not flow through the advantage), so that gradients take the usual policy-gradient form
+A
+n
+​
+∇
+θ
+log
+⁡
+p
+S
+A_{n}\nabla_{\theta}\log p_{S}
+.
+Compared to the full-vocabulary divergence objective, this on-policy shaping objective operates only
+on sampled tokens, using the teacher’s log-probabilities to provide dense, trajectory-level shaping
+signals without explicitly matching the full distribution at each step.
+OPSD as dense-reward policy gradient and comparison to STaR.
+The objective in
+Equation
+9
+can be seen as policy gradient with dense, token-level rewards. In
+Appendix
+D
+, we formalize this and contrast with STaR
+(
+zelikman2022star
+)
+, a closely related method that also uses the same model to generate reasoning traces, then performs rejection sampling followed by SFT on correct traces. This procedure can be viewed as policy gradient with a sequence-level binary reward that assigns identical credit to all tokens and vanishes when samples are incorrect. In contrast, OPSD provides feedback at every token position regardless of final-answer correctness.
+4
+Experiments
+We conduct comprehensive experiments to answer the following research questions:
+(1)
+How does OPSD compare to SFT and GRPO in reasoning performance and sample efficiency? (§
+4.2
+)
+(2)
+How does per-token pointwise KL clipping in OPSD help stabilize training? (§
+4.3.3
+)
+(3)
+What is the effect of generation style and generation length on performance? (§
+4.3.4
+)
+(4)
+Does full-vocabulary logit distillation provide benefits over sampled-token policy gradient? (§
+4.3.5
+)
+4.1
+Experimental Setup
+Models and datasets.
+We experiment with the Qwen3
+(
+qwen3technicalreport
+)
+model family at three scales: Qwen3-1.7B, Qwen3-4B, and Qwen3-8B, using the instruct-tuned versions. For training data, we use the mathematical reasoning subset of OpenThoughts
+(
+guha2025openthoughtsdatarecipesreasoning
+)
+, sampling up to 30K problem-solution pairs with chain-of-thought reasoning. We evaluate on competition-level mathematics benchmarks including AIME 2024, AIME 2025, and HMMT 2025.
+Baselines.
+We compare against two methods trained on the same dataset: (1)
+SFT
+, standard supervised fine-tuning on expert trajectories, which can be seen as off-policy distillation from a more powerful LLM that generated the reasoning traces; (2)
+GRPO
+(
+shao2024deepseekmath
+)
+, group relative policy optimization with binary outcome rewards verified against ground-truth answers. The max generation length is set to 16k.
+Implementation details.
+We fix the teacher policy to be the initial policy, rather than the currently updating learning policy, as we find this helps stabilize training and implicitly acts as regularization to prevent excessive deviation from the initial policy. We use full-vocabulary logit distillation in our experiments. All experiments are conducted on A100 or H100 GPUs with LoRA
+(
+hu2022lora
+)
+. More experimental details are in Appendix
+B
+.
+4.2
+Main Results
+Table
+2
+reports results on competition-level mathematical reasoning benchmarks. OPSD consistently outperforms SFT and improves over the base model across all scales, matching or exceeding GRPO in average accuracy at every scale. Notably, OPSD achieves these gains using only a single rollout per problem and converges within 100 steps, with each problem requiring only 1024 sampled tokens, whereas GRPO requires 8 rollouts of 16k tokens each and may exhibit performance degradation in later steps due to entropy collapse: most reward standard deviations within a group are zero on this OpenThoughts dataset, yielding no learning signal and wasting sampling budget. We also observe consistent performance degradation under SFT across tasks and model scales when trained on the same dataset, which we attribute to the concise reasoning style of the ground-truth solutions, which reduces reasoning length at test time. We attribute OPSD’s token efficiency to dense token-level supervision from the teacher distribution, and we hypothesize that earlier tokens may contribute more to effective distillation as they could represent more critical branching points in the reasoning process.
+As shown in
+Figure
+3
+, OPSD achieves higher token learning efficiency within 100 steps of training as compared to GRPO. Within 100 steps, GRPO’s performance stagnates with less learning signal when the outcome rewards within a sampling group are all the same, leading to zero gradient. These results suggest that OPSD may extract learning signal from the same reasoning datasets more efficiently than both GRPO and SFT, while substantially reducing training time.
+4.3
+Ablation Studies & Discussions
+In this section, we conduct extensive ablations to study key design choices in OPSD, including (1) the divergence objective, (2) the generation styles of the student and teacher (e.g., thinking-mode on/off), (3) the effect of per-token KL clipping, (4) the impact of student generation length, and (5) a comparison between full-vocabulary logit distillation and sampled-token distillation.
+4.3.1
+Effect of Divergence Objective
+A key design choice in OPSD is the divergence used for per-token distribution
+matching between the privileged teacher and the student. We compare forward KL,
+reverse KL, and JSD on AIME25 with Qwen3-1.7B in
+Table
+3
+.
+All objectives are evaluated under the same pointwise clipping scheme for
+stability. Forward KL consistently yields the strongest gains, improving performance from
+36.7 to 43.9 at step 50 and remaining above the baseline at step 100. In contrast,
+reverse KL and JSD provide limited or negative improvements. We therefore adopt
+forward KL in all remaining experiments.
+Table 3
+:
+Comparison of divergence objectives on AIME25 with Qwen3-1.7B.
+We report Avg@12 at different training steps. Forward KL significantly improves performance over the base model, while reverse KL and JSD (
+β
+=
+0.5
+\beta=0.5
+) show limited or negative gains.
+Method
+Base
+Step 50
+Step 100
+Forward KL (
+KL
+​
+(
+p
+T
+∥
+p
+S
+)
+\mathrm{KL}(p_{T}\parallel p_{S})
+)
+36.7
+43.9
+41.1
+Reverse KL (
+KL
+​
+(
+p
+S
+∥
+p
+T
+)
+\mathrm{KL}(p_{S}\parallel p_{T})
+)
+36.7
+37.5
+35.0
+JSD (
+β
+=
+0.5
+\beta=0.5
+)
+36.7
+36.9
+39.0
+4.3.2
+Effect of Generation Styles and per-token KL Clipping
+Another key design choice in OPSD is the generation style of the student and teacher models, as it determines both which tokens the student learns from and the style of supervision provided by the teacher. Qwen3 models support two generation modes:
+Thinking Mode on
+(TM-on), in which the model produces self-reflective chain-of-thought tokens, and
+Thinking Mode off
+(TM-off), in which it generates responses directly. To determine which combination yields the most effective learning signal, we analyze the forward KL divergence
+KL
+​
+(
+p
+T
+∥
+p
+S
+)
+\mathrm{KL}(p_{T}\|p_{S})
+across all four student/teacher mode pairings, categorizing tokens into three groups:
+math
+(numerals, operators, and mathematical keywords),
+style
+(reasoning connectives), and
+other
+. Table
+5
+reports the mean per-token KL within each category.
+Across all model sizes, the TM-off student paired with a TM-on teacher yields the largest KL on math tokens, indicating stronger supervision on mathematically relevant tokens. The reported KL values correspond to the expected divergence over the vocabulary at each position; as shown in Table
+5
+, this expectation is highly skewed, with stylistic tokens contributing disproportionately large values. This motivates our use of pointwise clipping to control such heavy-tailed contributions. Empirically, this configuration achieves the best downstream performance. We therefore adopt the TM-off student / TM-on teacher configuration.
+Figure 4
+:
+Effect of Per-Token pointwise KL Clipping on Qwen3-1.7B evaluated on AIME24. Clipping prevents performance collapse.
+4.3.3
+Effect of Per-Token Pointwise Clipping
+As shown in
+Table
+5
+, stylistic tokens can exhibit higher KL
+divergence than math-related tokens, causing them to dominate the training
+signal. We mitigate this issue using per-token pointwise clipping. As shown
+in Figure
+4
+for Qwen3-1.7B, clipping stabilizes
+training and prevents performance degradation, which is particularly important
+given that OPSD converges rapidly within a hundred steps of training.
+4.3.4
+Effect of Generation Length
+Figure 5
+:
+Effect of Generation Length on Qwen3-1.7B. We compare student generation length of 1024 vs 4096 on AIME25 and AIME24.
+Since our objective operates at the token level (Eq.
+6
+), the number of generated tokens per sample directly determines the amount of supervision signal available to the student. Longer sequences expose the student to more teacher feedback, but they also increase computational cost and may introduce noisy or uninformative continuations. To study this trade-off, we conduct an ablation on Qwen3-1.7B by varying the generation length of on-policy sampled student responses among 1024 and 4096 tokens and use full-vocabulary logit distillation. As shown in Figure
+5
+,
+increasing the generation length does not lead to consistent improvements
+across either task. We attribute this to early tokens being more critical for
+learning: as the student generation grows longer, later tokens become
+increasingly predictable to the teacher when conditioned on a sufficiently
+long student prefix, so smaller penalties are applied to later tokens. This phenomenon is also noted in
+(
+lu2025onpolicydistillation
+)
+.
+4.3.5
+Learning Objective Comparison: Full Vocabulary Logits Distillation vs. Sampled-Token Distillation
+Table 4
+:
+Ablation on divergence computation strategies for OPSD on Qwen3
+-
+4B with 2048 generation length for distillation.
+We report pass@8 accuracy on AIME25 and HMMT25.
+Full-distribution objectives (logit distillation) outperform sampled-token objectives.
+Method Variant
+AIME25
+HMMT25
+OPSD w/ Full-vocabulary logit distillation
+(
+agarwal2024policy
+)
+84.1
+60.0
+OPSD w/ Sampled-token distillation
+(
+lu2025onpolicydistillation
+)
+82.1
+57.3
+Our objective in Eq.
+6
+is defined as a per-token discrepancy between the teacher and student
+distributions
+. In practice, OPSD can instantiate this objective in two ways. (1)
+Full-vocabulary logit distillation
+(as in GKD
+(
+agarwal2024policy
+)
+): for each token position, we compute
+D
+​
+(
+p
+T
+∥
+p
+S
+)
+D(p_{T}\,\|\,p_{S})
+over the entire vocabulary via a full softmax, yielding a proper token-level
+f
+f
+-divergence between the two policies. (2)
+Sampled-token advantage policy-gradient objective
+(as in the on-policy distillation method of
+lu2025onpolicydistillation
+): we evaluate teacher and student log-probabilities only at the token actually sampled by the student,
+y
+^
+n
+\hat{y}_{n}
+, and use the reverse-KL term as a scalar advantage inside a policy-gradient-style loss. Thus, the first variant directly matches full token distributions, whereas the second optimizes an on-policy RL objective shaped by the teacher’s log-probabilities rather than a full-distribution divergence. We compare these variants on Qwen3-4B using a 2048-token generation budget during distillation.
+Table
+4
+summarizes the results.
+The full-vocabulary divergence objective provides a consistent gain over the sampled-token objective.
+This suggests that exposing the student to the full teacher distribution offers richer supervision than relying solely on per-token on-policy shaping.
+However, the full-vocabulary computation incurs higher peak memory usage due to storing vocabulary-sized logits at every position, indicating a trade-off between performance and efficiency.
+5
+Related Work
+LLM Self-Training.
+Our work connects to a line of research showing that LLMs can improve by generating and exploiting their own supervision signals
+(
+allentowards
+;
+xu2024survey
+;
+chen2024self
+;
+wang2023self
+;
+sun2023principle
+;
+yuan2024self
+;
+yang2024self
+)
+. Closest in spirit is
+context distillation
+(
+snell2022learning
+)
+, which uses the same underlying model as both teacher and student by providing the teacher with privileged context and then fine-tuning the student on the teacher’s
+generated
+outputs without context. This can be viewed as
+off-policy
+, where the learning signal is a discrete token sequence. In the reasoning domain, ReST
+(
+gulcehre2023reinforced
+)
+and STaR
+(
+zelikman2022star
+)
+similarly rely on iterative self-training loops—generate rationales conditioned on hints or answers, filter by rewards or ground-truth answers, and fine-tune on successful trajectories—again yielding hard distillation;
+mitra2025semantic
+extends this to soft distillation. In-context editing
+(
+qicontext
+)
+samples on-policy from the student and shows that
+context-induced
+knowledge can be internalized via soft distillation by minimizing divergences and demonstrates this in knowledge editing settings. OPSD differs from these approaches in that we perform
+on-policy, soft distillation
+on the student’s own rollouts for reasoning tasks: the teacher’s supervision is per-token distribution matching rather than generating a rationale for SFT. OPSD frames reasoning improvement as learning a conditional distribution induced jointly by the dataset’s ground-truth solutions and the model’s own reasoning ability. Concurrently, SDPO
+(
+hubotter2026reinforcement
+)
+explored a similar algorithm with environment feedback as privileged information, and SDFT
+(
+shenfeld2026selfdistillationenablescontinuallearning
+)
+explored on-policy self-distillation on continual learning tasks.
+On-Policy Distillation
+methods train a student model directly on trajectories sampled from its own policy, while a teacher model provides per-token guidance through KL-based regularization or related objectives
+(
+agarwal2024policy
+;
+xuspeculative
+;
+gu2024minillm
+;
+lu2025onpolicydistillation
+;
+xiao2026mimov2flashtechnicalreport
+;
+qwen3
+)
+.
+These approaches mitigate distribution shift by optimizing directly on the student’s visitation distribution, but they typically rely on a distinct and often larger teacher model.
+In this work, we explore whether an LLM can teach itself by conditioning on more privileged answer information and leveraging its own reasoning capability to guide a weaker version of itself toward improved reasoning.
+On-policy training paradigms are also widely used in robotics and deep reinforcement learning, such as DAgger
+(
+ross2011reduction
+)
+, where a human teacher provides corrective supervision on the states visited by the student policy.
+Improving LLM Reasoning through SFT and RL.
+SFT and RL are two primary methods for improving LLM reasoning ability.
+SFT on high-quality reasoning traces has demonstrated strong performance
+(
+yu2023metamath
+;
+numina_math_datasets
+;
+pasteropenwebmath
+;
+openthoughts
+;
+ye2025limoreasoning
+;
+muennighoff2025s1
+;
+zhou2023lima
+)
+.
+However, prior work shows that SFT can rely on memorization rather than robust generalization
+(
+chu2025sft
+)
+.
+In contrast, RL, which optimizes directly for outcome-based objectives, can exhibit better generalization
+(
+huan2025does
+)
+.
+More recent algorithms such as GRPO
+(
+guo2025deepseek
+;
+shao2024deepseekmath
+)
+enable scalable RL by estimating advantages from group-level rewards without requiring an explicit critic as in PPO
+(
+schulman2017proximal
+)
+.
+Building on this line of work, a growing body of research highlights the effectiveness of RLVR for reasoning tasks
+(
+yu2025dapo
+;
+liu2025understanding
+;
+yue2025vapo
+;
+Polaris2025
+;
+zheng2025group
+)
+.
+6
+Conclusion
+We introduced On-Policy Self-Distillation (OPSD), a simple yet effective framework for post-training large language models on reasoning tasks. The intuition behind OPSD is that a sufficiently capable reasoning LLM can teach itself when it has access to privileged information about the answer to a reasoning problem, utilizing its own rationalization ability to grade its weaker self without access to the ground truth. We experimentally demonstrated that OPSD achieves better performance than off-policy distillation/SFT, and performs on par with or better than GRPO, while exhibiting significantly better sample efficiency than GRPO.
+7
+Impact Statement
+This paper presents work whose goal is to advance the field of machine learning. Our method improves the efficiency of training language models for reasoning tasks, reducing computational costs compared to existing reinforcement learning approaches. We do not foresee specific negative societal consequences.
+References
+Appendix A
+Limitations and Future Directions
+Due to computational constraints, our experiments are limited to models up to 8B parameters. It remains an open question whether this trend continues at scales beyond 8B parameters.
+Several promising directions warrant further investigation. First, our current framework does not explicitly leverage correctness verification of generated answers; incorporating such signals could provide additional learning objectives beyond distribution matching.
+Finally, problem difficulty plays a crucial role in self-distillation: if reasoning problems exceed the model’s comprehension threshold, the teacher policy cannot provide meaningful supervision even with access to ground-truth solutions. This suggests that curriculum learning strategies—gradually increasing problem difficulty as the model improves—could enhance training effectiveness. Exploring adaptive curricula that maintain problems at the frontier of model capabilities represents an important direction for scaling OPSD to more challenging reasoning tasks.
+Appendix B
+Experimental Details
+Table 5
+:
+Per-token KL divergence by token category across generation styles.
+Mean per-token KL divergence broken down by token category (see Appendix
+C
+for detailed definitions),
+averaged over 10 problems.
+Thinking Mode
+off
+/
+on
+indicates whether the student or teacher LLM’s prompt format enables thinking mode. We find that the KL signal on math-related tokens is highest when the student’s thinking mode is off and the teacher’s thinking mode is on, so we choose this setup for our experiments.
+Qwen3-1.7B
+Qwen3-4B
+Qwen3-8B
+Student
+Teacher
+Style
+Math
+Other
+Style
+Math
+Other
+Style
+Math
+Other
+TM-off
+TM-off
+0.68
+0.12
+0.11
+0.61
+0.06
+0.10
+0.56
+0.05
+0.11
+TM-on
+TM-off
+0.51
+0.10
+0.17
+0.41
+0.05
+0.18
+0.33
+0.05
+0.15
+TM-on
+TM-on
+0.51
+0.09
+0.08
+0.50
+0.04
+0.09
+0.42
+0.04
+0.08
+TM-off
+TM-on
+0.85
+0.14
+0.25
+0.92
+0.10
+0.29
+0.79
+0.06
+0.25
+We provide the training and evaluation configurations for our SFT, GRPO and OPSD experiments in Tables
+7
+,
+6
+and
+8
+. Note that we adopt the Thinking-Mode-off student / Thinking-Mode-on teacher configuration for main OPSD experiments. For more experiment details, please refer to our released training code in
+https://github.com/siyan-zhao/OPSD
+. We did not tune the clipping parameter
+$\tau$
+; optimizing this hyperparameter
+may yield further performance gains within the same 100-step budget for larger models.
+Table 6
+:
+Training Configuration for GRPO and OPSD
+Parameter
+GRPO
+OPSD
+Learning Rate
+5
+×
+10
+−
+6
+5\times 10^{-6}
+5
+×
+10
+−
+6
+5\times 10^{-6}
+Effective Batch Size
+32
+32
+LoRA Rank (
+r
+r
+)
+64
+64
+LoRA Alpha (
+α
+\alpha
+)
+128
+128
+LoRA Target Modules
+q_proj, k_proj, v_proj, o_proj,
+gate_proj, up_proj, down_proj
+Max Completion Length
+16,000
+1024
+Number of Generations per Prompt
+8
+1
+Sampling Temperature
+1.2
+1.1
+KL Coefficient (
+β
+\beta
+)
+0.0
+–
+Training Steps
+500
+100
+Table 7
+:
+Training Configuration for SFT.
+Parameter
+SFT
+Learning Rate
+5
+×
+10
+−
+6
+5\times 10^{-6}
+Effective Batch Size
+32
+LoRA Rank (
+r
+r
+)
+64
+LoRA Alpha (
+α
+\alpha
+)
+128
+LoRA Target Modules
+q_proj, k_proj, v_proj, o_proj,
+gate_proj, up_proj, down_proj
+Max Sequence Length
+16000
+Number of Training Steps
+100
+Table 8
+:
+Evaluation Parameters.
+Parameter
+Value
+Max New Tokens
+38912
+Thinking Mode
+Enabled
+Top-p
+0.95
+Top-k
+-1
+Min-p
+0.0
+Presence Penalty
+0.0
+Samples per Prompt
+12
+Temperature
+1.0
+All experiments were conducted using 8 A100 or H100 GPUs with gradient checkpointing and Flash Attention 2 for memory efficiency. We use the AdamW
+(
+loshchilov2017decoupled
+)
+optimizer and bfloat16 precision for all training runs. For OPSD, unless otherwise stated, we used full-vocabulary logit distillation.
+Appendix C
+Token Category Definitions
+We categorize tokens into
+style
+and
+math
+groups using predefined keyword lists. These keyword sets are used to analyze the per-token KL divergence of stylistic tokens and mathematical-knowledge tokens, as in
+Section
+4.3.1
+.
+Style Tokens.
+maybe, perhaps, probably, possibly, let, okay, ok, alright, hmm, wait, because, since, so, thus, hence, therefore, but, however, although, though, yet, or, alternatively, instead, otherwise, actually, really, just, simply, basically, very, quite, pretty, rather, fairly, now, then, next, first, second, finally, try, see, check, note, recall, think, idea, strategy, approach, method, way, would, could, should, might, can, huge, large, big, small, tiny, interesting, tricky, complex, simple.
+Math Tokens.
+exponential, exponent, power, powers, base, logarithm, logarithms, log, ln, compare, comparing, comparison, less, equal, larger, smaller, greater, factor, factors, prime, divisible, equation, expression, formula, inequality, rational, irrational, real, integer, coefficient, variable, constant, sum, product, difference, quotient, fraction, denominator, numerator, root, square, cube, nth, maximum, minimum, optimize, bound.
+Appendix D
+Policy-Gradient Interpretation of OPSD and Comparison to STaR
+Our OPSD objective in
+Equation
+9
+can be interpreted as a policy-gradient update with a
+dense, token-level
+reward signal derived from privileged information. In this section, we show: (1) OPSD can be seen as a dense-reward policy gradient, and (2) we contrast OPSD with STaR, demonstrating that STaR’s learning signal is
+sequence-level
+while OPSD is
+token-level
+.
+D.1
+STaR as Sequence-Level Policy-Gradient
+STaR
+(
+zelikman2022star
+)
+can be viewed as an approximation to an RL-style policy gradient objective. The language model
+p
+θ
+p_{\theta}
+induces a joint distribution over rationale
+r
+r
+and answer
+y
+y
+:
+$$p_{\theta}(r,y\mid x)=p_{\theta}(r\mid x)\,p_{\theta}(y\mid x,r),$$
+where the model first samples a latent rationale
+r
+r
+before predicting the final answer
+y
+y
+. Given an indicator reward
+R
+​
+(
+y
+)
+=
+𝟏
+​
+(
+y
+=
+y
+⋆
+)
+R(y)=\mathbf{1}(y=y^{\star})
+, the expected return across the dataset
+𝒮
+=
+{
+(
+x
+i
+,
+y
+i
+⋆
+)
+}
+i
+=
+1
+N
+\mathcal{S}=\{(x_{i},y_{i}^{\star})\}_{i=1}^{N}
+is
+$$J_{\text{STaR}}(\theta)=\sum_{i=1}^{N}\mathbb{E}_{(r,y)\sim p_{\theta}(\cdot\mid x_{i})}\big[\mathbf{1}(y=y_{i}^{\star})\big]. \tag{10}$$
+Applying the log-derivative trick yields a policy gradient:
+$$\nabla_{\theta}J_{\text{STaR}}(\theta)=\sum_{i=1}^{N}\mathbb{E}_{(r,y)\sim p_{\theta}(\cdot\mid x_{i})}\Big[\mathbf{1}(y=y_{i}^{\star})\,\nabla_{\theta}\log p_{\theta}(r,y\mid x_{i})\Big]. \tag{11}$$
+Note that the indicator function discards the gradient for all sampled rationales that do not lead to the correct answer
+y
+i
+⋆
+y_{i}^{\star}
+: this corresponds to the filtering step in STaR.
+One limitation is that STaR’s reward is
+sequence-level
+: the binary indicator
+𝟏
+​
+(
+y
+=
+y
+⋆
+)
+\mathbf{1}(y=y^{\star})
+provides the same signal to all tokens in a trajectory, offering no intermediate credit assignment. When all sampled trajectories are incorrect, the learning signal vanishes.
+D.2
+OPSD as Dense-Reward Policy Gradient
+The sampled-token objective in
+Equation
+9
+can also be viewed as a policy-gradient method, but with a token-level reward. Fix a training pair
+(
+x
+,
+y
+⋆
+)
+(x,y^{\star})
+and let the student generate a trajectory
+y
+^
+∼
+p
+S
+(
+⋅
+∣
+x
+)
+\hat{y}\sim p_{S}(\cdot\mid x)
+. At each position
+n
+n
+, define the per-token reward:
+$$r_{n}(x,\hat{y})\triangleq\log p_{T}(\hat{y}_{n}\mid x,y^{\star},\hat{y}_{<n})-\log p_{S}(\hat{y}_{n}\mid x,\hat{y}_{<n}).$$
+This reward measures how much the privileged teacher prefers the sampled token
+y
+^
+n
+\hat{y}_{n}
+relative to the student. As stated in the main text, we treat
+r
+n
+r_{n}
+(equivalently, the advantage
+A
+n
+A_{n}
+) as a constant with respect to
+θ
+\theta
+when computing gradients—that is, we stop gradients through both
+p
+T
+p_{T}
+and
+p
+S
+p_{S}
+in the reward computation. Under this treatment, the gradient of
+Equation
+9
+takes the standard policy-gradient form:
+$$\nabla_{\theta}\mathcal{L}(\theta)=-\mathbb{E}_{(x,y^{\star})\sim\mathcal{S}}\left[\mathbb{E}_{\hat{y}\sim p_{S}(\cdot\mid x)}\left[\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}r_{n}(x,\hat{y})\,\nabla_{\theta}\log p_{S}(\hat{y}_{n}\mid x,\hat{y}_{<n})\right]\right],$$
+which corresponds to maximizing the expected per-token reward along on-policy student rollouts:
+$$J_{\mathrm{OPSD}}(\theta)=\mathbb{E}_{(x,y^{\star})\sim\mathcal{S}}\left[\mathbb{E}_{\hat{y}\sim p_{S}(\cdot\mid x)}\left[\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}r_{n}(x,\hat{y})\right]\right].$$
+This reward is dense: it provides a learning signal at every token position, regardless of whether the final answer is correct.
+Comparison.
+Both STaR and OPSD can be understood as policy-gradient methods, but their reward structures differ fundamentally. STaR uses a sequence-level indicator
+𝟏
+​
+(
+y
+=
+y
+⋆
+)
+\mathbf{1}(y=y^{\star})
+that assigns the same signal to all tokens; when all sampled trajectories are incorrect, the learning signal vanishes entirely. In contrast, OPSD provides a token-level reward
+r
+n
+r_{n}
+at every position, enabling fine-grained credit assignment even when the final answer is wrong.
+BETA
\ No newline at end of file
diff --git a/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models.md b/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models.md
new file mode 100644
index 0000000000000000000000000000000000000000..f9d19ba48de2ce3138b7e83beeb76d82117d942d
--- /dev/null
+++ b/research/notes/self-distilled-reasoner-on-policy-self-distillation-for-large-language-models.md
@@ -0,0 +1,2406 @@
+---
+title: 'Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models'
+id: self-distilled-reasoner-on-policy-self-distillation-for-large-language-models
+tags:
+- deepread
+created: '2026-06-10T00:00:41.199451Z'
+source: https://arxiv.org/html/2601.18734v1
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:00:41.199291Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Self-Distilled Reasoner: On-Policy Self-Distillation for Large Language Models
+Self-Distilled Reasoner:
+On-Policy Self-Distillation for Large Language Models
+Siyan Zhao
+†
+Zhihui Xie
+Mengchen Liu
+Jing Huang
+Guan Pang
+Feiyu Chen
+∗,‡
+Aditya Grover
+∗
+Abstract
+Knowledge distillation improves large language model (LLM) reasoning by compressing the knowledge of a teacher LLM to train smaller LLMs. On-policy distillation advances this approach by having the student sample its own trajectories while a teacher LLM provides dense token-level supervision, addressing the distribution mismatch between training and inference in off-policy distillation methods. However, on-policy distillation typically requires a separate, often larger, teacher LLM and does not explicitly leverage ground-truth solutions available in reasoning datasets. Inspired by the intuition that a sufficiently capable LLM can rationalize external privileged reasoning traces and teach its weaker self (i.e., the version without access to privileged information), we introduce
+On-Policy Self-Distillation
+(OPSD), a framework where a single model acts as both teacher and student by conditioning on different contexts. The teacher policy conditions on privileged information (e.g., verified reasoning traces) while the student policy sees only the question; training minimizes the per-token divergence between these distributions over the student’s own rollouts. We demonstrate the efficacy of our method on multiple mathematical reasoning benchmarks, achieving 4-8× token efficiency compared to reinforcement learning methods such as GRPO and superior performance over off-policy distillation methods.
+Machine Learning, ICML
+1
+Introduction
+Figure 1
+:
+Overview of On-Policy Self-Distillation (OPSD):
+Given a reasoning dataset
+𝒮
+=
+{
+(
+x
+i
+,
+y
+i
+⋆
+)
+}
+i
+=
+1
+N
+\mathcal{S}=\{(x_{i},y_{i}^{\star})\}_{i=1}^{N}
+, we instantiate two policies from the same LLM: a
+student policy
+p
+S
+(
+⋅
+∣
+x
+)
+p_{S}(\cdot\mid x)
+and a
+teacher policy
+p
+T
+(
+⋅
+∣
+x
+,
+y
+⋆
+)
+p_{T}(\cdot\mid x,y^{\star})
+. The student generates an on-policy response
+y
+^
+∼
+p
+S
+(
+⋅
+∣
+x
+)
+\hat{y}\sim p_{S}(\cdot\mid x)
+. Both policies then evaluate this trajectory to produce next-token distributions
+p
+S
+(
+⋅
+∣
+x
+,
+y
+^
+<
+n
+)
+p_{S}(\cdot\mid x,\hat{y}_{<n})
+and
+p
+T
+(
+⋅
+∣
+x
+,
+y
+⋆
+,
+y
+^
+<
+n
+)
+p_{T}(\cdot\mid x,y^{\star},\hat{y}_{<n})
+at each step
+n
+n
+. The learning objective minimizes the per-token divergence
+D
+​
+(
+p
+T
+∥
+p
+S
+)
+D(p_{T}\|p_{S})
+along the student’s rollout. Crucially, gradients backpropagate only through the student’s logits, allowing the model to self-distil.
+Recent advances in large language models (LLMs) have demonstrated impressive capabilities in reasoning and instruction following. Achieving these capabilities during post-training typically relies on reinforcement learning methods such as Reinforcement Learning with Verifiable Rewards (RLVR) (e.g., GRPO
+(
+shao2024deepseekmath
+;
+guo2025deepseek
+;
+team2025kimi
+;
+rastogi2025magistral
+;
+yu2025dapo
+)
+), supervised fine-tuning (SFT) on high-quality reasoning datasets
+(
+guha2025openthoughtsdatarecipesreasoning
+;
+team2025kimi
+;
+xiao2026mimov2flashtechnicalreport
+)
+, or knowledge distillation, where recent work has shown that distillation from advanced teacher models can outperform RL in both performance and training efficiency
+(
+qwen3
+;
+xiao2026mimov2flashtechnicalreport
+;
+lu2025onpolicydistillation
+)
+.
+Despite their respective successes, each approach has inherent limitations. RLVR suffers from inefficiencies including: (1) sampling a group of responses per prompt is computationally expensive and can introduce high variance in estimating the true value function; moreover, when all samples are either correct or incorrect, the gradient signal vanishes
+(
+yu2025dapo
+;
+zhao2025inpainting
+)
+; and (2) the reward signal is sparse and uniformly applied across all tokens in the generated output, neglecting fine-grained token-level feedback. Supervised fine-tuning suffers from exposure bias and weaker generalization
+(
+agarwal2024policy
+;
+chu2025sft
+)
+. Traditional knowledge distillation provides dense token-level supervision from a teacher model but relies on off-policy data
+(
+hinton2015distillingknowledgeneuralnetwork
+)
+. Recent advances in on-policy distillation—where a student model samples its own trajectories while a teacher policy provides dense token-level supervision—have demonstrated superior sample efficiency by combining the distributional realism of on-policy training with dense feedback
+(
+agarwal2024policy
+;
+lu2025onpolicydistillation
+)
+.
+While on-policy distillation has shown strong performance, it relies on a distinct teacher model to supervise the student. Given that modern LLMs already exhibit strong reasoning capabilities, we ask this research question:
+can a model effectively serve as its own teacher through self-distillation?
+Our approach is inspired by human learning: after solving a problem incorrectly, a student can examine the correct solution, rationalize its steps, and identify where their reasoning failed. Prior work has shown that for LLMs, evaluation is often easier than generation
+(
+sun2024easy
+;
+naor1996evaluation
+)
+. We hypothesize that
+rationalization
+—explaining a given correct answer—is similarly easier than generation. Motivated by this, we instantiate both the teacher and student policies from a single LLM. The teacher policy is provided with privileged information
+y
+⋆
+y^{\star}
+, such as the ground-truth answer or a reference chain-of-thought, while the student policy conditions only on the problem
+x
+x
+. Concretely, the teacher policy
+p
+T
+(
+⋅
+∣
+x
+,
+y
+⋆
+)
+p_{T}(\cdot\mid x,y^{\star})
+conditions on both the problem and the privileged answer, whereas the student policy
+p
+S
+(
+⋅
+∣
+x
+)
+p_{S}(\cdot\mid x)
+observes only the problem. We preserve the on-policy training paradigm by sampling trajectories
+y
+^
+\hat{y}
+exclusively from the student policy, which then receives dense, token-level supervision from the privileged teacher policy.
+We therefore propose
+On-Policy Self-Distillation (OPSD)
+, a framework in which a single model plays both teacher and student roles. The student samples its own trajectories
+y
+^
+∼
+p
+S
+(
+⋅
+∣
+x
+)
+\hat{y}\sim p_{S}(\cdot\mid x)
+; we then compute the per-token divergence between the student and teacher distributions and minimize it over the student’s own rollouts. This formulation (i) uses on-policy supervision (the student’s own trajectories), (ii) provides dense per-token feedback, (iii) exploits ground-truth solutions
+y
+⋆
+y^{\star}
+, and (iv) requires no separate teacher model. The learning process is captured by the loss
+$$\mathcal{L}_{\mathrm{OPSD}}(\theta)=\mathbb{E}_{(x,y^{\star})\sim\mathcal{S}}\;\mathbb{E}_{\hat{y}\sim p_{S}(\cdot\mid x)}\sum_{n=1}^{|\hat{y}|}D\!\Bigl(p_{T}\!\left(\cdot\mid x,y^{\star},\hat{y}_{<n}\right)\;\Big\|\;p_{S}\!\left(\cdot\mid x,\hat{y}_{<n}\right)\Bigr). \tag{1}$$
+In summary, our contributions are as follows:
+•
+We introduce On-Policy Self-Distillation, a novel framework that enables a single model to act as both teacher and student, leveraging ground-truth answers to provide dense token-level supervision on student rollouts.
+•
+We evaluate OPSD on four competition-level mathematical reasoning tasks, demonstrating that it outperforms both RLVR (e.g., GRPO) and supervised fine-tuning baselines.
+•
+We show that OPSD achieves better performance with nearly
+8
+×
+8\times
+improved token efficiency and lower computational cost than GRPO.
+•
+We analyze the impact of model scale, finding that moderate model capacity is necessary for successful self-distillation. We further compare different divergence objectives and analyze the effect of student generation length.
+SFT/Off-Policy
+GRPO
+On-Policy
+On-Policy
+Distillation
+Distillation
+Self-Distillation (Ours)
+On-Policy Data
+✗
+✓
+✓
+✓
+Dense Learning Signal
+✓
+✗
+✓
+✓
+Low Sampling Cost
+✓
+✗
+✓
+✓
+No External Teacher
+✓
+✓
+✗
+✓
+Table 1
+:
+Comparison of training methods for reasoning tasks. On-Policy Self-Distillation (OPSD) combines the advantages of on-policy training with dense feedback without requiring an external teacher model.
+2
+Background
+2.1
+Knowledge Distillation for Autoregressive Large Language Models
+Knowledge distillation transfers knowledge from a larger teacher model to a smaller student model by training the student to mimic the teacher’s behavior
+(
+hinton2015distillingknowledgeneuralnetwork
+;
+kim2016sequence
+;
+sanh2019distilbert
+)
+. The core insight is that the teacher’s soft probability distribution over classes contains richer information than hard labels alone, as it reveals the teacher’s learned similarities between classes. For auto-regressive language models, given a dataset
+𝒮
+=
+{
+(
+x
+,
+y
+⋆
+)
+}
+\mathcal{S}=\{(x,y^{\star})\}
+where
+x
+x
+denotes an input and
+y
+⋆
+y^{\star}
+is the corresponding reference output, both teacher
+p
+T
+p_{T}
+and student
+p
+S
+p_{S}
+define token-level distributions over vocabulary
+𝒱
+\mathcal{V}
+. Traditional supervised distillation minimizes a divergence
+D
+D
+between teacher and student distributions averaged over a fixed dataset:
+$$\mathcal{L}_{\text{Supervised Distillation}}(\theta)=\mathbb{E}_{(x,y)\sim\mathcal{S}}\big[D(p_{T}\|p_{S})(y|x)\big], \tag{2}$$
+where
+$D(p_{T}\|p_{S})(y|x)=\frac{1}{|y|}\sum_{n=1}^{|y|}D\big(p_{T}(\cdot|y_{<n},x)\,\|\,p_{S}(\cdot|y_{<n},x)\big)$
+measures per-token discrepancy. However, this off-policy approach suffers from distribution mismatch: the student encounters different partial sequences
+y
+<
+n
+y_{<n}
+during auto-regressive generation at inference than those seen during training on the fixed dataset, leading to compounding errors. On-policy distillation
+(
+agarwal2024policy
+;
+lu2025onpolicydistillation
+;
+xuspeculative
+)
+addresses this by training the student on its own generated sequences
+y
+^
+∼
+p
+S
+(
+⋅
+|
+x
+)
+\hat{y}\sim p_{S}(\cdot|x)
+, obtaining dense token-level feedback from the teacher on these on-policy samples:
+$$\mathcal{L}_{\text{On-Policy Distillation}}(\theta)=\mathbb{E}_{x\sim\mathcal{S}}\Big[\mathbb{E}_{\hat{y}\sim p_{S}(\cdot|x)}\big[D(p_{T}\|p_{S})(\hat{y}|x)\big]\Big]. \tag{3}$$
+This approach connects distillation to imitation learning
+(
+ross2011reduction
+)
+, where the student iteratively improves by learning from the teacher’s guidance on its own outputs, combining the on-policy relevance of reinforcement learning with the dense reward signal of supervised learning, thereby mitigating exposure bias while maintaining computational efficiency.
+2.2
+Reinforcement Learning with Verifiable Rewards
+Reinforcement learning with verifiable rewards (RLVR) has emerged as a popular approach for post-training large language models, particularly on tasks with easily verifiable outcomes such as mathematics and coding, using algorithms like Proximal Policy Optimization (PPO)
+(
+schulman2017proximal
+)
+and Group Relative Policy Optimization (GRPO)
+(
+shao2024deepseekmath
+)
+.
+GRPO trains by sampling a group of
+G
+G
+responses
+{
+o
+1
+,
+o
+2
+,
+…
+,
+o
+G
+}
+\{o_{1},o_{2},\ldots,o_{G}\}
+from the current policy
+π
+θ
+\pi_{\theta}
+for each problem
+x
+x
+. Each response
+o
+i
+o_{i}
+receives a binary reward
+r
+i
+∈
+{
+0
+,
+1
+}
+r_{i}\in\{0,1\}
+indicating correctness. The method then assigns advantages to all tokens
+k
+=
+1
+,
+…
+,
+|
+o
+i
+|
+k=1,\ldots,|o_{i}|
+within response
+o
+i
+o_{i}
+using a group-normalized reward:
+$$A_{i}=\frac{r_{i}-\text{mean}(\{r_{j}\}_{j=1}^{G})}{\text{std}(\{r_{j}\}_{j=1}^{G})}. \tag{4}$$
+This formulation can be understood through the value function lens:
+mean
+​
+(
+{
+r
+j
+}
+j
+=
+1
+G
+)
+\text{mean}(\{r_{j}\}_{j=1}^{G})
+serves as a
+G
+G
+-sample Monte Carlo estimate of the value function
+V
+​
+(
+x
+)
+V(x)
+, while the sparse binary reward
+r
+i
+r_{i}
+represents the (undiscounted) state-action value
+Q
+​
+(
+x
+,
+o
+i
+)
+Q(x,o_{i})
+. Critically, all tokens within a response share the same advantage, as the reward signal is provided only at the sequence level. The GRPO objective incorporates a clipped surrogate loss to moderate policy updates, along with a reverse KL penalty to prevent excessive deviation from a reference policy:
+$$\begin{split}
+\mathcal{L}_{\text{GRPO}}(\theta)=\mathbb{E}_{\begin{subarray}{c}x\sim\mathcal{S}\\ o_{1},\ldots,o_{G}\sim\pi_{\theta}(\cdot|x)\end{subarray}}\Bigg[&\frac{1}{G}\sum_{i=1}^{G}\frac{1}{|o_{i}|}\sum_{n=1}^{|o_{i}|}\min\left(\rho_{i}^{n}A_{i},\,\text{clip}\left(\rho_{i}^{n},1-\varepsilon,1+\varepsilon\right)A_{i}\right)\\
+&-\beta D_{\text{KL}}\big[\pi_{\theta}(\cdot|x)\,\|\,\pi_{\text{ref}}(\cdot|x)\big]\Bigg]
+\end{split} \tag{5}$$
+where
+ρ
+i
+n
+=
+π
+θ
+​
+(
+o
+i
+n
+|
+x
+,
+o
+i
+<
+n
+)
+π
+θ
+old
+​
+(
+o
+i
+n
+|
+x
+,
+o
+i
+<
+n
+)
+\rho_{i}^{n}=\frac{\pi_{\theta}(o_{i}^{n}|x,o_{i}^{<n})}{\pi_{\theta_{\text{old}}}(o_{i}^{n}|x,o_{i}^{<n})}
+is the importance ratio,
+π
+θ
+old
+\pi_{\theta_{\text{old}}}
+is the policy before the update, and
+ε
+\varepsilon
+controls the clipping range.
+While RLVR methods have demonstrated strong empirical performance, they face two key limitations: (1) the reward signal is sparse, providing only sequence-level feedback rather than token-level guidance on where errors occur, and (2) when all sampled responses receive identical rewards (all correct or all incorrect), the advantages become zero, preventing any policy update despite the computational cost of sampling.
+3
+Methods
+Student Prompt
+Problem: Find the derivative of
+f
+​
+(
+x
+)
+=
+3
+​
+x
+2
++
+2
+​
+x
+−
+5
+f(x)=3x^{2}+2x-5
+at
+x
+=
+2
+x=2
+Answer:
+Teacher Prompt
+Problem: Find the derivative of
+f
+​
+(
+x
+)
+=
+3
+​
+x
+2
++
+2
+​
+x
+−
+5
+f(x)=3x^{2}+2x-5
+at
+x
+=
+2
+x=2
+Here is a reference solution:
+First find
+f
+′
+​
+(
+x
+)
+=
+6
+​
+x
++
+2
+f^{\prime}(x)=6x+2
+, then evaluate at
+x
+=
+2
+x=2
+:
+f
+′
+​
+(
+2
+)
+=
+6
+​
+(
+2
+)
++
+2
+=
+14
+f^{\prime}(2)=6(2)+2=14
+After understanding the reference solution, please try to solve this problem using your own approach below:
+Answer:
+Figure 2
+:
+Prompt example for student and teacher policies.
+Both policies share the same parameters
+θ
+\theta
+but differ in conditioning context. The teacher receives the ground-truth solution
+y
+⋆
+y^{\star}
+as privileged information before generation. To ensure a natural transition before evaluating the student’s rollout, the teacher is prompted to rationalize and generate its own solution.
+3.1
+Learning from Verifiable Reasoning Dataset
+We consider a dataset of problem-solution pairs
+𝒮
+=
+{
+(
+x
+i
+,
+y
+i
+⋆
+)
+}
+i
+=
+1
+N
+,
+\mathcal{S}=\{(x_{i},y_{i}^{\star})\}_{i=1}^{N},
+where each
+x
+i
+x_{i}
+denotes a problem and
+y
+i
+⋆
+y_{i}^{\star}
+is the corresponding reference solution, which may include chain-of-thought reasoning. For brevity, we omit the sample index
+i
+i
+and use
+(
+x
+,
+y
+⋆
+)
+(x,y^{\star})
+to denote a generic sample from the dataset. We can exploit learning signals from this dataset in different ways. Standard supervised fine-tuning (SFT) on
+𝒮
+\mathcal{S}
+can be viewed as off-policy distillation/imitation learning using expert trajectories, but it suffers from distribution mismatch between training and inference. Reinforcement learning from verifiable rewards (RLVR), such as GRPO, addresses this by optimizing on-policy samples and assigning binary rewards by comparing generated answers against
+y
+⋆
+y^{\star}
+. However, RLVR is computationally expensive and the reward signal is sparse, providing the same feedback across all tokens regardless of where errors occur. Alternatively, one can train a process reward model (PRM) to provide dense, token-level feedback during RL. However, acquiring labels for PRM training is prohibitively expensive and difficult to scale
+(
+lightman2023let
+;
+zhang2025lessons
+)
+. On-policy distillation works
+(
+agarwal2024policy
+;
+xuspeculative
+;
+lu2025onpolicydistillation
+)
+address distribution shift by training on the student’s own samples, but require a separate, often larger, teacher model to provide supervision. We instead seek a training signal that is
+dense
+,
+on-policy
+, and
+does not require external teachers or reward models
+. This motivates our On-Policy Self-Distillation approach. We summarize the differences of these methods in Table
+1
+.
+Algorithm 1: On-Policy Self-Distillation (OPSD)
+Require: Reasoning dataset $\mathcal{S}=\{(x_{i},y_{i}^{\star})\}_{i=1}^{N}$; language model $p_{\theta}$; divergence $D$ (e.g., $\mathrm{JSD}_{\beta}$)
+1: Define student policy $p_{S}(\cdot\mid x):=p_{\theta}(\cdot\mid x)$
+2: Define teacher policy $p_{T}(\cdot\mid x,y^{\star}):=p_{\theta}(\cdot\mid x,y^{\star})$ (comment: same parameters; different conditioning)
+3: While not converged do
+4: Sample a minibatch $\mathcal{B}\subset\mathcal{S}$
+5: For all $(x,y^{\star})\in\mathcal{B}$ do
+6: Sample on-policy response $\hat{y}\sim p_{S}(\cdot\mid x)$
+7: Compute the token-wise divergence along the student rollout: $\ell(x,y^{\star})\leftarrow D\big(p_{T}\,\|\,p_{S}\big)(\hat{y}\mid x)=\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}D\!\left(p_{T}(\cdot\mid\hat{y}_{<n},x,y^{\star})\,\big\|\,p_{S}(\cdot\mid\hat{y}_{<n},x)\right)$
+8: End for
+9: Batch loss $\mathcal{L}_{\mathrm{OPSD}}(\theta)\leftarrow\frac{1}{|\mathcal{B}|}\sum_{(x,y^{\star})\in\mathcal{B}}\ell(x,y^{\star})$
+10: Update $\theta\leftarrow\theta-\eta\,\nabla_{\theta}\mathcal{L}_{\mathrm{OPSD}}(\theta)$
+11: End while
+12: Return trained parameters $\theta$ for inference-time policy $p_{S}(\cdot\mid x)$
+3.2
+On-Policy Self-Distillation
+Motivation: Learning by understanding solutions.
+We propose a different perspective inspired by how students learn: when struggling with a problem, rather than extended trial-and-error, a student can examine the solution, understand the reasoning, and internalize the approach. Similarly, if a model has access to the correct answer or reasoning $y^{\star}$ and is sufficiently capable, it can rationalize the reasoning steps and teach itself—analogous to a student reviewing a solution and retracing why it works. This intuition motivates our framework: we exploit the ground-truth solution $y^{\star}$ directly as privileged information during training, enabling the model to serve as its own teacher without requiring external reward models or larger teacher models.
+Teacher and student policies.
+We instantiate two conditional distributions from the same language model $p_{\theta}$ by varying the conditioning context. The teacher policy conditions on privileged information—both the problem $x$ and the reference solution $y^{\star}$:
+$$p_{T}(\cdot\mid x,y^{\star})\;\triangleq\;p_{\theta}(\cdot\mid x,y^{\star}).$$
+The student policy observes only the problem statement, matching the inference-time condition:
+$$p_{S}(\cdot\mid x)\;\triangleq\;p_{\theta}(\cdot\mid x).$$
+Critically, both policies share the same parameters $\theta$ but differ only in their conditioning context. The teacher has access to information unavailable at test time, allowing it to provide informed guidance. To encourage the teacher to naturally evaluate the student’s generation, we add a prompt asking the teacher to generate a new solution after rationalization, as shown in Figure 2.
+On-policy sampling from the student.
+Given a problem $x$, the student generates an on-policy response
+$$\hat{y}=(\hat{y}_{1},\ldots,\hat{y}_{|\hat{y}|})\sim p_{S}(\cdot\mid x).$$
+Both policies then evaluate this student-generated trajectory. At each position $n$, they induce next-token distributions over $y_{n}\in\mathcal{V}$ conditioned on the same student prefix:
+$$p_{S}\!\left(y_{n}\mid x,\hat{y}_{<n}\right),\qquad p_{T}\!\left(y_{n}\mid x,y^{\star},\hat{y}_{<n}\right),$$
+where $\hat{y}_{<n}\triangleq(\hat{y}_{1},\ldots,\hat{y}_{n-1})$.
+Training objective: Full-vocabulary divergence.
+We instantiate a full-vocabulary divergence objective that matches the teacher and student next-token distributions at each position. Given a student-generated sequence $\hat{y}$, define the trajectory-averaged, token-wise divergence
+$$D\bigl(p_{T}\,\|\,p_{S}\bigr)(\hat{y}\mid x)\triangleq\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}D\Bigl(p_{T}\!\left(\cdot\mid x,y^{\star},\hat{y}_{<n}\right)\,\Big\|\,p_{S}\!\left(\cdot\mid x,\hat{y}_{<n}\right)\Bigr),\tag{6}$$
+where $p_{S}(\cdot\mid x,\hat{y}_{<n})$ and $p_{T}(\cdot\mid x,y^{\star},\hat{y}_{<n})$ denote distributions over the next token $y_{n}\in\mathcal{V}$. Here, $D$ can be any distribution divergence measure such as the generalized Jensen-Shannon divergence $\operatorname{JSD}_{\beta}$, defined for a weight $\beta\in[0,1]$ as:
+$$\operatorname{JSD}_{\beta}(p_{T}\|p_{S})=\beta D_{KL}(p_{T}\|m)+(1-\beta)D_{KL}(p_{S}\|m),\tag{7}$$
+where $m=\beta p_{T}+(1-\beta)p_{S}$ is the interpolated mixture distribution. This full-vocabulary formulation provides dense, token-level feedback: the teacher, informed by $y^{\star}$, exposes the student to the entire distribution over plausible next tokens and guides it toward reasoning paths that lead to the correct answer.
+We minimize the expected divergence between teacher and student over on-policy student samples:
+$$\mathcal{L}(\theta)=\mathbb{E}_{(x,y^{\star})\sim\mathcal{S}}\left[\mathbb{E}_{\hat{y}\sim p_{S}(\cdot\mid x)}\left[D\bigl(p_{T}\,\|\,p_{S}\bigr)(\hat{y}\mid x)\right]\right].\tag{8}$$
+Gradients are backpropagated only through the student policy $p_{S}$, while the teacher $p_{T}$ acts as a fixed full-distribution target conditioned on privileged information $(x,y^{\star})$.
+Alternative objective: Sampled-token distillation through policy gradient.
+Alternatively, following recent on-policy distillation methods (lu2025onpolicydistillation), we form a sampled-token shaping signal (equivalently, a reverse-KL signal on sampled actions) and optimize with policy gradient. For each position $n$ in a sampled sequence $\hat{y}$, define the advantage term
+$$A_{n}(x,\hat{y})=\log p_{T}\!\left(\hat{y}_{n}\mid x,y^{\star},\hat{y}_{<n}\right)-\log p_{S}\!\left(\hat{y}_{n}\mid x,\hat{y}_{<n}\right),$$
+and optimize the policy-gradient-style objective
+$$\mathcal{L}(\theta)=-\mathbb{E}_{(x,y^{\star})\sim\mathcal{S}}\left[\mathbb{E}_{\hat{y}\sim p_{S}(\cdot\mid x)}\left[\frac{1}{|\hat{y}|}\sum_{n=1}^{|\hat{y}|}A_{n}(x,\hat{y})\,\log p_{S}\!\left(\hat{y}_{n}\mid x,\hat{y}_{<n}\right)\right]\right].\tag{9}$$
+In practice, $A_{n}(x,\hat{y})$ is treated as a constant with respect to $\theta$ (i.e., gradients do not flow through the advantage), so that gradients take the usual policy-gradient form $A_{n}\nabla_{\theta}\log p_{S}$.
+Compared to the full-vocabulary divergence objective, this on-policy shaping objective operates only on sampled tokens, using the teacher’s log-probabilities to provide dense, trajectory-level shaping signals without explicitly matching the full distribution at each step.
+Table 2
+:
+Performance comparison across mathematical reasoning benchmarks for Qwen3 models from 1.7B to 8B. We report average@16 using suggested sampling parameters from the Qwen3 blog with temperature of 1.2 and generation length of 38k, with detailed parameter in
+Table
+5
+.
+Method
+AIME24
+AIME25
+HMMT25
+AMO-Bench
+Average
+Qwen3-8B
+Base (Instruct)
+75.2
+68.3
+43.1
+13.4
+50.0
++ SFT
+76.3
+66.2
+44.7
+12.9
+50.0
++ GRPO
+76.7
+68.7
+45.0
+14.8
+51.3
++ OPSD
+77.5
+69.8
+47.1
+14.3
+52.2
+Qwen3-4B
+Base (Instruct)
+74.6
+65.8
+40.3
+12.4
+48.3
++ SFT
+75.2
+66.3
+44.4
+12.5
+49.6
++ GRPO
+75.6
+67.1
+42.7
+12.8
+49.6
++ OPSD
+76.0
+66.9
+45.8
+13.5
+50.6
+Qwen3-1.7B
+Base (Instruct)
+50.2
+35.2
+25.4
+4.3
+28.8
++ SFT
+48.3
+36.3
+23.3
+3.9
+28.0
++ GRPO
+52.1
+38.3
+26.7
+4.5
+30.5
++ OPSD
+51.4
+39.5
+25.8
+5.0
+30.4
+4
+Experiments
+In this section, we conduct comprehensive experiments to answer the following research questions:
+(1)
+How does OPSD compare to SFT and GRPO in terms of mathematical reasoning performance and what’s the improved sample efficiency? (§
+4.2
+)
+(2)
+How does OPSD scale across different model sizes, does self-distillation require more powerful model ability? (§
+4.3.1
+)
+(3)
+What is the effect of generation length on training performance and sample efficiency? (§
+4.3.2
+)
+(4)
+Does computing divergence over the full vocabulary logits provide benefits compared to computing it only over sampled tokens and optimizing through policy gradient? (§
+4.3.3
+)
+4.1
+Experimental Setup
+Models and datasets.
+We experiment with the Qwen3
+(
+qwen3technicalreport
+)
+model family at three scales: Qwen3-1.7B, Qwen3-4B, and Qwen3-8B, using the instruct-tuned versions. For training data, we use the mathematical reasoning subset of OpenThoughts
+(
+guha2025openthoughtsdatarecipesreasoning
+)
+, sampling up to 30K problem-solution pairs with chain-of-thought reasoning. We evaluate on competition-level mathematics benchmarks including AIME 2024, AIME 2025, HMMT 2025 and Amo-Bench
+(
+an2025amo
+)
+.
+Baselines.
+We compare against two methods trained on the same dataset: (1)
+SFT
+, standard supervised fine-tuning on expert trajectories, which can be seen as off-policy distillation from a more powerful LLM that generated the reasoning traces; (2)
+GRPO
+(
+shao2024deepseekmath
+)
+, group relative policy optimization with binary outcome rewards verified against ground-truth answers.
+Implementation details.
+For GRPO, we sample 8 responses per problem. For OPSD, we sample 1 response per problem. We use the Adam optimizer with a learning rate of 1e-5, warmup ratio of 0.1, and cosine learning rate decay. For the divergence measure in Eq. 6, we use $\operatorname{JSD}_{\beta=0.5}$. Importantly, we fix the teacher policy to be the initial policy, rather than the currently updating learning policy, as we find this helps stabilize training and implicitly acts as regularization to prevent excessive deviation from the initial policy. All experiments are conducted on 8×A100 GPUs with LoRA
+(
+hu2022lora
+)
+. More experimental details are in Appendix
+8.1
+.
+4.2
+Main Results
+Figure 3
+:
+Token Efficiency of OPSD.
+We compare OPSD and GRPO on Qwen3-4B under the same effective training batch size, reporting average@16 performance as a function of gradient update steps and total generated tokens. Both methods are trained with the same effective batch size in terms of sampled generations per update, but differ in generation length: each generation is capped at 2048 tokens for OPSD and 16384 tokens for GRPO. OPSD achieves comparable or better performance with substantially fewer generated tokens, resulting in lower sampling cost and reduced training time. In this experiment, OPSD can be 4-8$\times$ more token-efficient than GRPO.
+Table
+2
+reports results on competition-level mathematical reasoning benchmarks.
+OPSD consistently outperforms SFT and improves over the base model across scales; it matches or exceeds GRPO at 4B/8B, and is comparable at 1.7B. Notably, OPSD accomplishes these gains using only a single rollout per problem, whereas GRPO requires 8 rollouts, demonstrating improved sample efficiency.
+Superior Token Efficiency from Dense Teacher Feedback.
+In addition to improved accuracy, OPSD is significantly more token-efficient than GRPO.
+Figure
+3
+compares the two methods under the same effective training batch size on Qwen3-4B.
+While GRPO relies on 8 rollouts with long generation budgets of 16k, OPSD achieves higher performance using substantially fewer generated tokens of 2k and needs only 1 rollout per prompt.
+This efficiency stems from dense token-level supervision from the teacher distribution, reducing sampling cost and training time without sacrificing performance. We hypothesize that the early tokens are more important for distillation than the later tokens, as the earlier tokens can represent more important branching points.
+Figure 4
+:
+Pass@K performance averaged across four mathematical reasoning benchmarks for Qwen3-4B. We study the effect of the generation length of on-policy sampled student responses in OPSD, comparing 1024, 2048, and 4096 tokens. Longer generations provide more teacher signals. Increasing the generation length from 1k to 2k and 4k consistently improves pass@K, with both 2k and 4k substantially outperforming the 1k setting.
+4.3
+Discussions
+4.3.1
+Effect of Model Scale
+Our method relies on the teacher policy’s ability to rationalize reference solutions when conditioned on privileged information. Under a fixed dataset, this capability depends on sufficient model capacity and is expected to scale with model size. We therefore hypothesize that OPSD becomes increasingly effective as models grow more capable of leveraging privileged context. To evaluate this, we apply OPSD to the Qwen3 family at three scales: 1.7B, 4B, and 8B parameters. As shown in
+Table
+2
+, OPSD provides limited gains over GRPO at the 1.7B scale (although it still improves over the base model and SFT at 1.7B), while yielding progressively larger improvements at the 4B and 8B scales, consistent with our hypothesis.
+4.3.2
+Effect of Generation Length
+Since our objective operates at the token level (Eq.
+6
+), the number of generated tokens per sample directly determines the amount of supervision signal available to the student. Longer sequences expose the student to more teacher feedback, but they also increase computational cost and may introduce noisy or uninformative continuations.
+To study this trade-off, we conduct an ablation on Qwen3-4B by varying the generation length of on-policy sampled student responses among 1024, 2048, and 4096 tokens and use full-vocabulary logit distillation. As shown in Figure
+4
+, increasing the generation length leads to clear improvements in pass@K performance. In particular, both the 2048-token and 4096-token settings significantly outperform the 1024-token baseline, indicating that longer generations provide more effective reasoning supervision.
+Table 3
+:
+Ablation on divergence computation strategies for OPSD on Qwen3-4B with 2048 generation length for distillation.
+We report pass@8 accuracy on AIME25 and HMMT25.
+Full-distribution objectives (logit distillation) outperform sampled-token objectives.
+Method Variant
+AIME25
+HMMT25
+OPSD w/ Full-vocabulary logit distillation
+(
+agarwal2024policy
+)
+84.1
+60.0
+OPSD w/ Sampled-token distillation
+(
+lu2025onpolicydistillation
+)
+82.1
+57.3
+4.3.3
+Learning Objective Comparison: Full Vocabulary Logits Distillation vs. Sampled-Token Distillation
+Our objective in Eq. 6 is defined as a per-token discrepancy between the teacher and student distributions. In practice, OPSD can instantiate this objective in two ways. (1) Full-vocabulary logit distillation (as in GKD (agarwal2024policy)): for each token position, we compute $D(p_{T}\,\|\,p_{S})$ over the entire vocabulary via a full softmax, yielding a proper token-level $f$-divergence between the two policies. (2) Sampled-token advantage policy-gradient objective (as in the on-policy distillation method of lu2025onpolicydistillation): we evaluate teacher and student log-probabilities only at the token actually sampled by the student, $\hat{y}_{n}$, and use the reverse-KL term as a scalar advantage inside a policy-gradient-style loss. Thus, the first variant directly matches full token distributions, whereas the second optimizes an on-policy RL objective shaped by the teacher’s log-probabilities rather than a full-distribution divergence. We compare these variants on Qwen3-4B using a 2048-token generation budget during distillation.
+Table
+3
+summarizes the results.
+The full-vocabulary divergence objective provides a consistent gain over the sampled-token objective, improving AIME25 from 82.1% to 84.1% and HMMT25 from 57.3% to 60.0%.
+This suggests that exposing the student to the full teacher distribution offers richer supervision than relying solely on per-token on-policy shaping.
+However, the full-vocabulary computation incurs higher peak memory usage due to storing vocabulary-sized logits at every position, indicating a trade-off between performance and efficiency.
+5
+Related Work
+On-Policy Distillation
+methods train a student model directly on trajectories sampled from its own policy, while a teacher model provides per-token guidance through KL-based regularization or related objectives
+(
+agarwal2024policy
+;
+xuspeculative
+;
+gu2024minillm
+;
+lu2025onpolicydistillation
+;
+xiao2026mimov2flashtechnicalreport
+;
+qwen3
+)
+.
+These approaches mitigate distribution shift by optimizing directly on the student’s visitation distribution, but they typically rely on a distinct and often larger teacher model.
+In this work, we explore whether an LLM can teach itself by conditioning on more privileged answer information and leveraging its own reasoning capability to guide a weaker version of itself toward improved reasoning.
+On-policy training paradigms are also widely used in robotics and deep reinforcement learning, such as DAgger
+(
+ross2011reduction
+)
+, where a human teacher provides corrective supervision on the states visited by the student policy.
+Improving LLM Reasoning through SFT and RL.
+SFT and RL are two primary methods for improving LLM reasoning ability.
+SFT on high-quality reasoning traces has demonstrated strong performance
+(
+yu2023metamath
+;
+numina_math_datasets
+;
+pasteropenwebmath
+;
+openthoughts
+)
+, and smaller, carefully curated datasets can outperform larger but noisier collections
+(
+ye2025limoreasoning
+;
+muennighoff2025s1
+;
+zhou2023lima
+)
+.
+However, prior work shows that SFT-based reasoning often relies on memorization rather than robust generalization
+(
+chu2025sft
+)
+.
+In contrast, RL-based approaches that optimize directly for outcome-based objectives can exhibit stronger transfer to novel problems
+(
+huan2025does
+)
+.
+More recent algorithms such as GRPO
+(
+guo2025deepseek
+;
+shao2024deepseekmath
+)
+enable scalable RL by estimating advantages from group-level rewards without requiring an explicit critic as in PPO
+(
+schulman2017proximal
+)
+.
+Building on this line of work, a growing body of research highlights the effectiveness of reinforcement learning with verifiable rewards (RLVR) for reasoning tasks
+(
+yu2025dapo
+;
+liu2025understanding
+;
+yue2025vapo
+;
+Polaris2025
+;
+zheng2025group
+)
+.
+LLM Self-Training.
+Our work is related to a growing body of research demonstrating that LLMs can improve by generating and exploiting their own supervision signals
+(
+allentowards
+;
+xu2024survey
+;
+chen2024self
+)
+. Self-Instruct
+(
+wang2023self
+)
+and Self-Align
+(
+sun2023principle
+)
+demonstrate that large language models can bootstrap instruction-following and alignment with minimal human supervision by leveraging small sets of human-written seeds—either instructions or principles—to generate synthetic training data. Context distillation
+(
+snell2022learning
+)
+shows that models can internalize the benefits of privileged context tokens (e.g., instructions or scratchpads) by training a student to reproduce the same outputs without access to such context at inference time through SFT. Recent work on in-context editing
+(
+qicontext
+)
+demonstrates that models can learn new knowledge by optimizing toward self-induced contextual distributions rather than one-hot targets for knowledge editing. In the reasoning domain, ReST
+(
+gulcehre2023reinforced
+)
+and STaR
+(
+zelikman2022star
+)
+improve performance through iterative loops of rationale generation, filtering based on rewards or ground-truth answers, and fine-tuning on successful samples. An LLM can also be used as a judge to generate RL rewards
+(
+yuan2024self
+)
+for itself. While aligned with this self-training paradigm, OPSD introduces a distinct approach: we perform on-policy, token-level self-distillation where the model learns from its own outputs conditioned on privileged access to ground-truth solutions. This transforms reasoning improvement into learning a conditional distribution induced by both the dataset’s ground-truth answers and the model’s own understanding of how to reach them.
+6
+Conclusion
+We introduced On-Policy Self-Distillation (OPSD), a simple yet effective framework for post-training large language models on reasoning tasks. The intuition behind OPSD is that a sufficiently capable reasoning LLM can teach itself when it has access to privileged information about the answer to a reasoning problem, utilizing its own rationalization ability to grade its weaker self without access to the ground truth. We experimentally demonstrated that OPSD achieves better performance than off-policy distillation/SFT, and performs on par with or better than GRPO, while exhibiting significantly better sample efficiency than GRPO. Our ablation studies reveal that sufficiently large language models are required for successful self-distillation, and that generating more tokens during the online sampling phase and using full-vocabulary logit distillation lead to improved learning.
+7
+Limitations and Future Directions
+Due to computational constraints, our experiments are limited to models up to 8B parameters. While we observe that larger models benefit more from OPSD—consistent with our hypothesis that self-rationalization requires sufficient model capacity—it remains an open question whether this trend continues at scales beyond 8B parameters, such as 70B or larger frontier models.
+Several promising directions warrant further investigation. First, our current framework does not explicitly leverage correctness verification of generated answers; incorporating such signals could provide additional learning objectives beyond distribution matching.
+Finally, problem difficulty plays a crucial role in self-distillation: if reasoning problems exceed the model’s comprehension threshold, the teacher policy cannot provide meaningful supervision even with access to ground-truth solutions. This suggests that curriculum learning strategies—gradually increasing problem difficulty as the model improves—could enhance training effectiveness. Exploring adaptive curricula that maintain problems at the frontier of model capabilities represents an important direction for scaling OPSD to more challenging reasoning tasks.
+References
+8
+Appendix
+8.1
+Experimental Details
+We provide the training and evaluation configurations for our SFT, GRPO and OPSD experiments in Tables 4, 6 and 5. Both GRPO and OPSD methods use the same base hyperparameters where applicable to ensure fair comparison.
+Table 4
+:
+Training Configuration for SFT.
+Parameter
+SFT
+Learning Rate
+$2\times 10^{-5}$
+Batch Size (per device)
+2
+Gradient Accumulation Steps
+4
+Effective Batch Size
+64
+LoRA Rank (
+r
+r
+)
+64
+LoRA Alpha (
+α
+\alpha
+)
+128
+LoRA Target Modules
+q_proj, k_proj, v_proj, o_proj,
+gate_proj, up_proj, down_proj
+Max Sequence Length
+16000
+Number of Training Epochs
+4
+Training Dataset Size
+30k
+Table 5
+:
+Evaluation Parameters.
+Parameter
+Value
+Max New Tokens
+38912
+Thinking Mode
+Enabled
+Top-p
+0.95
+Top-k
+-1
+Min-p
+0.0
+Presence Penalty
+0.0
+Samples per Prompt
+16
+Table 6
+:
+Training Configuration for GRPO and OPSD
+Parameter
+GRPO
+OPSD
+Learning Rate
+$2\times 10^{-5}$
+$2\times 10^{-5}$
+Batch Size (per device)
+1
+1
+Gradient Accumulation Steps
+4
+4
+Effective Batch Size
+32
+32
+LoRA Rank (
+r
+r
+)
+64
+64
+LoRA Alpha (
+α
+\alpha
+)
+128
+128
+LoRA Target Modules
+q_proj, k_proj, v_proj, o_proj,
+gate_proj, up_proj, down_proj
+Max Completion Length
+16000
+2048
+Number of Generations per Prompt
+8
+1
+Temperature
+1.2
+1.2
+KL Coefficient (
+β
+\beta
+)
+0.0
+–
+All experiments were conducted using 8 A100 GPUs with gradient checkpointing and Flash Attention 2 for memory efficiency. We use the AdamW
+(
+loshchilov2017decoupled
+)
+optimizer and bfloat16 precision for all training runs. For OPSD, unless otherwise stated, we used full-vocabulary logit distillation.
\ No newline at end of file
diff --git a/research/notes/streaming-diloco-with-overlapping-communication-towards-a-distributed-free-lunch.md b/research/notes/streaming-diloco-with-overlapping-communication-towards-a-distributed-free-lunch.md
new file mode 100644
index 0000000000000000000000000000000000000000..4353dbcce52d9b93857cb9241cd19c3862ca6198
--- /dev/null
+++ b/research/notes/streaming-diloco-with-overlapping-communication-towards-a-distributed-free-lunch.md
@@ -0,0 +1,4997 @@
+---
+title: 'Streaming DiLoCo with overlapping communication: Towards a Distributed Free
+  Lunch'
+id: streaming-diloco-with-overlapping-communication-towards-a-distributed-free-lunch
+tags:
+- deepread
+created: '2026-06-10T00:30:47.025739Z'
+source: https://arxiv.org/html/2501.18512
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:47.025585Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Streaming DiLoCo with overlapping communication: Towards a Distributed Free Lunch
+Corresponding author: douillard@google.com
+Streaming DiLoCo with overlapping communication:
+Towards a Distributed Free Lunch
+Arthur Douillard
+Equal core contributions
+Google DeepMind
+Yanislav Donchev
+Equal core contributions
+Google DeepMind
+Keith Rush
+Google Research
+Satyen Kale
+Currently at Apple.
+Google Research
+Zachary Charles
+Google Research
+Zachary Garrett
+Google Research
+Gabriel Teston
+Google
+Dave Lacey
+Google DeepMind
+Ross McIlroy
+Google DeepMind
+Jiajun Shen
+Google DeepMind
+Alexandre Ramé
+Google DeepMind
+Arthur Szlam
+Google DeepMind
+Marc’Aurelio Ranzato
+Google DeepMind
+Paul Barham
+Google DeepMind
+Abstract
+Training of large language models (LLMs) is typically distributed across a large number of accelerators to reduce training time. Since internal states and parameter gradients need to be exchanged at each and every single gradient step, all devices need to be co-located using low-latency high-bandwidth communication links to support the required high volume of exchanged bits. Recently, distributed algorithms like DiLoCo
+(Douillard et al.,
+2024a
+)
+have relaxed such co-location constraint: accelerators can be grouped into “workers”, where synchronizations between workers only occur infrequently. This in turn means that workers can afford being connected by lower bandwidth communication links without affecting learning quality. However, in these methods, communication across workers still requires the same peak bandwidth as before, as the synchronizations require all parameters to be exchanged across all workers. In this paper, we improve DiLoCo in three ways. First, we synchronize only subsets of parameters in sequence, rather than all at once, which greatly reduces peak bandwidth. Second, we allow workers to continue training while synchronizing, which decreases wall clock time. Third, we quantize the data exchanged by workers, which further reduces bandwidth across workers. By properly combining these modifications, we show experimentally that we can distribute training of billion-scale parameters and reach similar quality as before, but reducing required bandwidth by two orders of magnitude.
+keywords:
+large-scale, language modeling, distributed learning
+1
+Introduction
+Scaling deep learning has led to significant leaps in capability
+(Team et al.,
+2024
+; OpenAI et al.,
+2024
+; Grattafiori et al.,
+2024
+)
+. Although neural architectures have evolved over the past decade, the standard approach to optimization remains essentially unchanged from the days of Alexnet
+(Krizhevsky et al.,
+2012
+)
+. Practitioners use minibatch stochastic gradient descent, with backpropagation through the model’s layers to compute the gradients. As in Alexnet, which already combined two hardware accelerators for parallel training, models are trained with multiple hardware accelerators.
+However, modern training runs, for example for large language models (LLM), may use tens of thousands of accelerators, and this number increases year after year. Building and maintaining a data-center that can co-locate that many accelerators is expensive and leads to increasingly complex engineering challenges. Beyond the physical infrastructure, orchestrating the passage of gradients, parameters and intermediate states between these devices at each optimization step, while keeping all devices fully utilized is technically challenging from a software engineering perspective. Furthermore, the more devices that are used for each synchronous training step, the more chances there are that one of them fails, risking halting training, or introducing subtle numerical issues.
+Recent publications
+(Douillard et al.,
+2024a
+; Jaghouar et al.,
+2024a
+)
+, building on work by
+McMahan et al. (
+2017
+)
+,
+have demonstrated that the co-location requirements of all accelerators can be loosened. These methods allow highly-performant training when accelerators are grouped into several “workers” with fast bandwidth intra-worker but with slow bandwidth inter-workers. The basic approach is to allow each worker to continue training for many minibatches, independently of the other workers; and then synchronize the parameters of the workers after a set number of these “inner” steps. The synchronization, in its simplest form
+(McMahan et al.,
+2017
+)
+, is to average the parameters of the workers
+(Wortsman et al.,
+2022
+)
+; but more sophisticated methods
+(Huo et al.,
+2020
+; Reddi et al.,
+2021
+)
+use the workers’ parameters to form a pseudo-gradient to update the shared parameters. The details of the formulation of the weight synchronization is important for machine-learning efficiency; see Section
+2.1
+.
+However, in these approaches, the synchronization typically requires an all-reduce operation which fully synchronizes the model parameters on some step. This all-reduce results in two main issues: 1) a large peak bandwidth, and 2) a blocking of the workers while they wait to receive updated weights. In this work, dubbed
+Streaming DiLoCo
+, we propose three modifications to these approaches to practically reduce the peak bandwidth and mitigate worker-blocking without loss of learning efficiency:
+Contribution 1:
+Synchronization
+.
+We synchronize subsets of parameters on a schedule, rather than all parameters at once. This contribution reduces the peak required bandwidth.
+Contribution 2:
+Overlapping
+.
+We overlap worker computation and communication of synchronizations. This contribution increases the tolerated latency of communication.
+Contribution 3:
+Quantization
+.
+We compress the outer gradients to four bits per parameters without loss of performance. This contribution reduces the total amount of bits exchanged.
+We show experimentally that our model, Streaming DiLoCo, is strictly superior to the original DiLoCo
+(Douillard et al.,
+2024a
+)
+, and achieves similar performance to the bandwidth-costly data-parallelism. Since we attain the same quality at negligible bandwidth, we consider our approach as an important stepping stone towards a form of
+distributed
+free lunch
+.
+2
+Model
+For all algorithms, we denote the model parameters as $\theta$. We use the superscript notation $\theta^{(t)}$ to indicate the parameters at a given step $t$, and the subscript notation $\theta_{m}$ to denote a particular shard of the DiLoCo replica. For example, $\theta^{(t)}_{m}$ indicates the parameters of DiLoCo replica $m$ at step $t$. If no subscript is used, the parameters are replicated across DiLoCo replicas. Note that it is possible for parameters to not be replicated and yet to be of the same value.
+Algorithm 1
+FedOpt / DiLoCo
+1: $M$ replicas
+2: Synchronization frequency $H$
+3: Model replicas $\{\theta^{(t-1)}_{1},\dots,\theta^{(t-1)}_{M}\}$
+4: Data shards $\{\mathcal{D}_{1},\dots,\mathcal{D}_{M}\}$
+5: Optimizers InnerOpt and OuterOpt
+6: parallel for replica $m=1\ldots M$ do
+7: for step $t=1\ldots T$ do
+8: $x\sim\mathcal{D}_{m}$
+9: $\mathcal{L}\leftarrow f(x,\theta_{m}^{(t-1)})$
+10: $\theta_{m}^{(t)}\leftarrow\texttt{InnerOpt}(\theta_{m}^{(t-1)},\nabla_{\mathcal{L}})$
+11:
+12: if $t \bmod H == 0$ then
+13: $\Delta^{(t)}_{m}\leftarrow\theta^{(t-H)}_{m}-\theta_{m}^{(t)}$
+14: $\Delta^{(t)}\leftarrow\texttt{async-send}[\frac{1}{M}\sum_{m=1}^{M}(\Delta^{(t)}_{m})]$
+15: $\texttt{block-receive}[\Delta^{(t)}]$
+16: $\theta_{m}^{(t)}\leftarrow\texttt{OuterOpt}(\theta_{m}^{(t-H)},\Delta^{(t)})$
+17: end parallel for
+2.1
+Context: FedOpt and DiLoCo
+FedOpt (Reddi et al., 2021) is a generic framework to perform federated learning with a bi-level optimization. $M$ local replicas perform $H$ steps of inner independent optimizations on a different subset of the data (L3 to L5 in Algorithm 1). Every $H$ steps, each replica computes an outer gradient $\Delta_{m}^{t}=\theta_{m}^{(t-H)}-\theta_{m}^{(t)}$ (L7), a delta in the parameters space, and communicates it to all other replicas. This communication can be performed through a central parameter server or through direct communication of each worker to the others (e.g. with a ring all-reduce), and results in each worker obtaining $\Delta^{t}=\frac{1}{M}\sum_{m=1}^{M}\Delta^{t}_{m}$ (L7-9). This outer gradient is applied on a set of outer parameters, the previously synchronized parameters $\theta_{m}^{(t-H)}$, with an outer optimizer (L10). The full algorithm is shown in Algorithm 1.
+The costly communication between non-colocated devices happens during the averaging of outer gradients, in L8-9 of Algorithm 1. It is as costly as in Data-Parallel, but instead of being executed at every step, it is done every $H$ (e.g. one hundred) steps, thus amortizing the communication cost.
+Figure 1: Streaming DiLoCo: each replica trains independently for dozens of inner optimization steps, and then synchronizes a single fragment during outer optimization. In this figure, there are $M=4$ replicas with $p=\{1,2,3\}$ fragments. Each fragment can be made of several transformer layers. Note that this figure only showcases the streaming partial updates (subsection 2.2) and not the quantized communication overlapping (subsections 2.3 and 2.4).
+DiLoCo is a successful instantiation of FedOpt applied to language models where the inner optimizer is Adam
+(Kingma and Ba,
+2014
+)
+and the outer optimizer is SGD with Nesterov momentum
+(Sutskever et al.,
+2013
+)
+.
+In this work, which focuses on distributed optimization, and unlike in the federated learning literature (as discussed in FedOpt), the workers aren’t sampled; but instead all workers will be present at each step.
+2.2
+Streaming partial updates
+Figure 2
+:
+Streaming pattern
+: sequential (left) and strided (right). Colors denote the fragment. A different fragment is synchronized each time.
+Instead of communicating the full outer gradient vector ($\Delta_{m}^{(t)},\forall m\in\{1,\dots,M\}$) every $H$ steps, we propose to share only a fragment $p$ of it ($\Delta_{m,p}^{(t)}$) more frequently, as highlighted in Figure 1. There is a huge possible space of choices for these fragments and specification of “more frequently” [Footnote 1: For example, an extreme version might be to send a constant bitstream of random choices (according to some optimization-useful distribution) of parameters.]; here we consider the simple partition of our network into $P$ fragments made of several transformer blocks. Specifically, we study two fragment patterns, as shown in Figure 2: 1) sequential where each fragment comprises consecutive transformer blocks and 2) strided where each fragment is composed of interleaved transformer blocks. We will demonstrate in subsection 3.3 that the algorithm is robust to the particular choice of fragments. Since the strided version offers in practice slightly better compute utilization (less time spent communicating instead of computing), we will use it as the default choice in our experiments. As we increase model scale, the fragment definition (e.g., how many transformer blocks comprise a fragment) is maintained, which means that larger models have more fragments.
+The resulting algorithm is shown in Algorithm 2 (and contrasted with the original version shown in Algorithm 1), where only a fragment $p$ of the replica $m$ is shared. We denote a fragment with a new subscript, thus $\theta^{(t)}_{m,p}$ denotes the parameters of fragment $p$ of the replica $m$ at step time $t$.
+The Streaming DiLoCo’s inner optimization (L3-5 of Algorithm 2) is identical to DiLoCo’s (L3-5 of Algorithm 1). However, the outer optimization (L12) is done per fragment. If a fragment $p$ satisfies the condition $t+t_{p} \bmod H=0$, where $t_{p}$ is a fragment-dependent time offset, then it is synchronized. In this way, each fragment will always do $H$ steps before being synchronized, but overall the model is synchronizing some fragment more frequently than every $H$ steps. For example, with $H=100$ and $P=2$ fragments, the first fragment will be synchronized at step $t=100$, $t=200$, … ($t_{p=1}=0$); the second fragment will be synchronized at step $t=150$, $t=250$, … ($t_{p=2}=50$). While in practice, given an equal $H$, streaming DiLoCo communicates more often than DiLoCo, the peak communication is reduced by a factor of $|p|/L$ with $|p|$ the size of a fragment in layers and $L$ the total number of layers.
+Algorithm 2
+Streaming DiLoCo
+1: $M$ replicas
+2: Number of inner steps $H$
+3: Fragments $p\in\{1,\dots,P\}$ with their respective synchronization offset $t_{p}$
+4: Model replicas $\{\theta^{(t)}_{1},\dots,\theta^{(t)}_{M}\}$
+5: Inner overlap delay $\tau<H$
+6: Data shards $\{\mathcal{D}_{1},\dots,\mathcal{D}_{M}\}$
+7: Optimizers InnerOpt and OuterOpt
+8: parallel for replica $m=1\ldots M$ do
+9: for step $t=1\ldots T$ do
+10: $x\sim\mathcal{D}_{m}$
+11: $\mathcal{L}\leftarrow f(x,\theta_{m}^{(t-1)})$
+12: $\theta_{m}^{(t)}\leftarrow\texttt{InnerOpt}(\theta_{m}^{(t-1)},\nabla_{\mathcal{L}})$
+13:
+14: if $\exists p$ s.t. $t-t_{p} \bmod H == 0$ then
+15: $\Delta^{(t)}_{m,p}\leftarrow\theta^{(t-H)}_{m,p}-\theta_{m,p}$
+16: $\Delta^{(t)}_{p}\leftarrow\texttt{async-send}[\frac{1}{M}\sum_{m=1}^{M}(\Delta^{(t)}_{m,p})]$
+17:
+18: if $\exists p$ s.t. $t-t_{p}-\tau \bmod H == 0$ then
+19: $\texttt{block-receive}[\Delta^{(t-\tau)}_{p}]$
+20: $\tilde{\theta}^{(t)}_{m,p}\leftarrow\texttt{OuterOpt}(\theta^{(t-\tau-H)}_{m,p},\Delta^{(t-\tau)}_{p})$
+21: $\theta_{m,p}^{(t)}\leftarrow\alpha\theta^{(t)}_{m,p}+(1-\alpha)\tilde{\theta}^{(t)}_{m,p}$
+22: end parallel for
+2.3
+Overlapping communication with computation
+To further maximize the time spent doing computation vs. communication, we propose to overlap the communication of the outer gradient fragment with the inner optimization computation; the overlap happens with a strictly positive $\tau$ in Algorithm 2, lines 10-12. At the beginning of outer step $t+1$, instead of waiting for the communication of the fragment (block-receive), we immediately start the new round of optimization. After $\tau-1$ inner steps (L3-5), we block-wait for the exchanged fragment (L10), apply the outer optimizer on the previously synchronized fragment ($\theta^{(t-\tau-H)}_{m,p^{(t-\tau)}}$), and merge it with the currently optimized fragment with a mixing factor $\alpha$. $\alpha=1$ is equivalent to no communication between replicas, $\alpha=0$ discards any updates done in the first $\tau$ steps on the fragment $p$, and $\alpha=0.5$ does a uniform average between the local fragment parameters and the globally shared one.
+2.4
+Low-precision outer gradients
+The previous methods, streaming and overlapping communication with computation, reduce peak bandwidth and wall-clock time, respectively. To reduce total amount of bits exchanged, we use lower-precision in the outer gradients exchanged by workers (while still using FP32 for computing gradients), up to a float with 4 bits (1 sign bit, 3 exponent bits, and 0 mantissa bit) called
+E3M0
+(Agrawal et al.,
+2024
+)
+. Across a wide variety of experiments, we found no sign of performance regression when employing such low precision numbers during communication, even at the billion scale. This compression is applied when sending each replica’s unreduced outer gradients to minimize the amount of bits communicated (L8 of
+Algorithm 2
+), but once received by a replica, importantly, the accumulation is done in FP32 for stability.
+2.5
+Discussion on the memory overhead
+In an SPMD [Footnote 2: https://en.wikipedia.org/wiki/Single_program,_multiple_data] model, the memory overhead of the Data-Parallel baseline is the parameters ($1\times$) + Adam state ($2\times$). (Streaming or not) DiLoCo’s memory overhead is the parameters ($1\times$), the Adam state ($2\times$), the outer global parameters ($1\times$), and the outer Nesterov state ($1\times$).
+Thus, our method requires $66\%$ ($5/3$) more memory compared to Data-Parallel. However, in the case of Streaming DiLoCo, only a subset of the outer parameters and outer optimizer state is needed at a given time. Therefore, this overhead can be alleviated by offloading the additional bits onto CPU memory (Beaumont et al., 2022). The memory overhead to hold in HBM at any point in time is the size of a fragment $|p|$ times two, for the outer parameters and outer optimizer state.
+For a 100 billion parameter model for instance, with $|p|=$ three layers and with a total of $108$ layers, that amounts to a $2\%$ increase of memory (additional 20 GB to 1,117 GB [Footnote 3: The size in GB of the parameters & inner Adam optimizer state is the number of parameters $\times$ 3 $\times$ 4 (FP32). The size in GB of the additional fragment and its outer Nesterov optimizer state is the number of parameters $\times$ 2 $\times$ 4 $\times\frac{3}{108}$.]). This extra memory is used when there are no activations or gradients in live memory, and thus should fit in HBM without any problem.
+Furthermore, the communication schedule is deterministic and known prior to training. Thus, we can start the transfer from RAM to HBM of a fragment (and its associated outer optimizer state) while finishing the previous (inner) gradients passes. Given that only a small subset is required at a given time, the memory transfer cost is negligible. With an H100 with PCIe [Footnote 4: https://www.nvidia.com/en-gb/data-center/h100/], characterized by 2 TB/s of bandwidth speed, and without any sharding, this transfer is done in less than 10 milliseconds.
+3
+Experiments
+We run experiments demonstrating the compute utilization benefits of our approach in a bandwidth and compute simulation in Section
+3.1
+. In Section
+3.2
+we show in practice the model learning outcomes. Finally, in Section
+3.3
+we show results with variations of the three main contributions of the paper, ablating their relative importance.
+3.1
+Compute utilization
+Figure 3
+:
+Simulation of a schedule interleaving forward passes (in
+blue
+), backward passes w.r.t. activations and parameters (resp. in
+light
+and
+dark green
+), and (outer) gradient reduction (in
+purple
+).
+(a)
+1B parameters model.
+(b)
+10B parameters model
+(c)
+100B parameters model
+Figure 4
+:
+Compute Utilization
+simulated across a range of bandwidth. A compute utilization of 0.8 means 80% of the time is spent in computation, and 20% in communication. Our best method reaches a compute utilization of 95% for models 1B, 10B, and 100B with a bandwidth roughly constant between 1 and 5 Gbit/s. Data-Parallel on the other hand requires 100, 200, and 300Gbit/s.
+To highlight the impact of our contributions in a controlled setting, we built a simulator to estimate the compute utilization of each method: how much time is spent doing computation vs. communication. The simulation is a DAG with four different types of nodes as seen in Figure 3: forward in blue, backward w.r.t. activations and parameters in green, (outer or not, for resp. DiLoCo and Data-Parallel) gradients reduction in purple. Each node represents a single layer. Therefore, the total number of nodes, for a single step, is $4\times L-1$ (because we don’t need the backward w.r.t. activations of the first layer). The overall training is represented by a DAG made of such nodes. We use this simulator to estimate the compute utilization of each method: how much time is spent doing computation vs. communication. Ideally this number is close to 1.0: No time is spent waiting for communication. It is more useful than just reporting the reduction in data transferred because our overlapping method (subsection 2.3) reduces the latency while keeping the amount of data exchanged constant.
+In
+Figure 3
+, we report:
+•
+Data-Parallel
+: the baseline which communicates gradients of the full model at every step;
+•
+DiLoCo
+: which communicates outer gradients of the full model once in a while (in this example, every
+$H=2$
+steps);
+•
+Streaming DiLoCo
+: which communicates outer gradients only for a subset of the model (here the fragment size is a single layer and there are two fragments) every
+$H=2$
+steps;
+•
+Streaming DiLoCo with overlapping communication and computation
+: This is similar to the above but gradients sent across workers are only needed after
+$\tau$
+steps (in this example
+$\tau=1$
+).
+The simulated compute utilization (CU) depends on some factors, listed as columns in
+Table 4
+. For the model scales 10B and 100B, we estimate step time (pure compute) based on the flops profile, a reasonable MFU (
+$40\%$
+), and hardware theoretical flops per seconds. We simulate training of each method, across three scales (1B, 10B, and 100B) under various bandwidth profiles
+Figure 4
+.
+We make several observations: 1) Streaming DiLoCo (in
+green
+) improves the CU of DiLoCo (in
+orange
+) despite exchanging as much data, because it reduces the latency by splitting the communication of the outer gradients across fragments. 2) only overlapping communication with computation can reach full
+$100\%$
+compute utilization. 3) the required bandwidth can become
+lower
+as the model scale gets
+larger
+when overlapping communication with computation, because longer compute step time (forward & backward) will provide more time to perform the synchronization across workers.
+The last point may seem counter-intuitive at first glance, but is the main advantage of our method, exploiting the
+square-cube law of distributed training
+(Ryabinin et al.,
+2023
+)
+where computation scales worse than communication (
+$O(n^{3})$
+vs
+$O(n^{2})$
+for a square matrix
+$n\times n$
+). We provide in the appendix, in
+Figure 15
+, the simulated compute utilization for a 100B model across various compute step time.
+Remark
+.
+Of course this is only a simulation of what we expect to happen in practice. Such simulation is not perfect because for instance we consider only the bandwidth between datacenters and not the local bandwidth between devices. We believe however, that this is still a useful tool
+(footnote 5: https://en.wikipedia.org/wiki/Bonini%27s_paradox)
+to estimate device utilization.
+(a)
+Evaluation loss on C4
+(b)
+HellaSwag accuracy
+Figure 5
+:
+Scaling
+models from 35M (1.49e17 flops) to 4B parameters (2e21 flops) on C4.
+3.2
+LLM Scaling Experiments
+We perform our experiments with a Chinchilla architecture
+(Hoffmann et al.,
+2022
+)
+. Following
+Wortsman et al. (
+2023
+)
+and
+Jaghouar et al. (
+2024a
+)
+, we use QKNorm
+(Henry et al.,
+2020
+)
+and a Z-loss
+(Chowdhery et al.,
+2023
+)
+with a factor of
+$1\mathrm{e}{-}4$
+to stabilize training. We report in
+Table 2
+the architecture hyperparameters and token budget at each scale. Unlike recommended in Post-Local SGD
+(Lin et al.,
+2020
+)
+, we train all our models from scratch. The main hyperparameter of DiLoCo is its outer learning rate; we tuned it to be optimal at small scale at
+$0.4$
+, and kept it fixed across all scales. Likewise, for simplicity, and to show that Streaming DiLoCo is a drop-in replacement for DiLoCo, we used the same outer learning rate, without further hyperparameter tuning.
+Unless mentioned otherwise, we use the C4 dataset
+(Raffel et al.,
+2020
+)
+and train models from 35 million to 4 billion parameters, all with a sequence length of
+$1{,}024$
+. Each scale is trained with the chinchilla-optimal number of steps. We use 2 DiLoCo replicas, each of them performing FSDP
+(Zhao et al.,
+2023
+)
+across their respective closely located devices.
+For training we use a modified version of the NanoDO codebase
+(Liu et al.,
+2024b
+)
+that uses DrJax
+(Rush et al.,
+2024
+)
+to parallelize inner steps across replicas. The inner optimization is done with an annotated variant of
+jax.vmap
+for the optimization step, with parameters having an extra leading axis for the DiLoCo replicas. The outer optimization is implemented with an all-reduce, without any central parameter server.
+Method
+Token Budget
+Hours spent w/
+$+\infty$
+Gbits/s
+Hours spent w/ 1 Gbits/s
+Terabytes exchanged
+Eval Loss
+$\downarrow$
+HellaSwag
+$\uparrow$
+Piqa
+$\uparrow$
+Arc Easy
+$\uparrow$
+Data-Parallel
+25B
+0.67
+109
+441
+2.67
+42.09
+67.35
+40.42
+100B
+2.7
+438
+1,767
+2.52
+49.78
+69.15
+44.03
+250B
+6.75
+1097
+4,418
+2.45
+53.86
+70.45
+44.21
+Streaming DiLoCo
+with overlapped FP4 communication
+25B
+0.67
+0.88
+1.10
+2.66
+42.08
+67.46
+38.42
+100B
+2.7
+3.5
+4.42
+2.51
+49.98
+69.96
+44.03
+250B
+6.75
+8.75
+11.05
+2.45
+54.24
+71.38
+41.92
+Table 1
+:
+Overtraining
+Data-Parallel and our method on Dolma with a 1 billion parameter model. The latter performs slightly better despite exchanging in total
+$400\times$
+fewer bits, reducing the peak bandwidth by
+$8\times$
+, and with a significantly relaxed training communication latency constraint: communication is allowed to take as long as a full compute step.
+3.2.1
+Scaling
+We perform scaling experiments on C4, with models ranging from 35 million parameters to 1 billion parameters, all with a sequence length of
+$1{,}024$
+. For Data-Parallel and Streaming DiLoCo with
+$H=100$
+, we also provide results on a 4 billion parameter model. At each scale, we use the Chinchilla-optimal
+(Hoffmann et al.,
+2022
+)
+number of steps. We highlight in
+Figure 5
+the evaluation loss (lower is better) and HellaSwag
+(Zellers et al.,
+2019
+)
+accuracy (higher is better).
+First, we observe in
+Figure 5
+that Data-Parallel (in
+blue
+), DiLoCo with
+$H=30$
+inner steps (in
+orange
+), and Streaming DiLoCo with
+$H=30$
+(
+green
+) all perform similarly across both loss (at 1B parameters, respectively 2.49, 2.49, and 2.48) and accuracy metrics (resp. 46.6%, 46.5%, and 46.6%). Streaming DiLoCo with more inner steps
+$H=100$
+(in
+red
+) has slightly worse performance initially but uses significantly less bandwidth, and the loss improves proportionally better as we scale: the scaling law slope is -0.13149 for Data-Parallel versus -0.13539 for Streaming DiLoCo. We report in the appendix
+Table 5
+all metrics, and include two more downstream tasks: Piqa
+(Bisk et al.,
+2020
+)
+and Arc-Easy
+(Clark et al.,
+2018
+)
+. Moreover
+Table 6
+considers an increased number of DiLoCo replicas.
+3.2.2
+Overtraining on Dolma
+The previous experiments were performed on C4 dataset using the chinchilla-optimal number of tokens. Using a 1 billion parameter model, this yields a token budget of 25 billion. However, language models are now usually
+overtrained
+(Gadre et al.,
+2024
+)
+. Therefore we perform a comparison of a Data-Parallel baseline vs our full model (streaming DiLoCo with overlapped FP4 communication) on the Dolma dataset
+(Soldaini et al.,
+2024
+)
+with a 1 billion parameter model and with a token budget of 25, 100, and 250 billion tokens (resp. 1.9e20, 7.6e20, and 1.9e21 flops) using a sequence length of
+$2{,}048$
+. In that larger, more realistic setting, we set the number of inner steps between synchronization to
+$H=100$
+to further minimize communication.
+We report results in
+Table 1
+, and note that both our method and the baseline perform similarly w.r.t. loss and accuracy on downstream tasks (HellaSwag, Piqa, Arc-Easy). In addition to being neutral in terms of ML performance: 1) the amount of bits exchanged between non-colocated devices over the course of training is
+$400\times$
+higher for Data-Parallel; 2) the peak bandwidth (amount of bits exchanged at a given moment) is reduced by
+$\frac{\text{num layers} = 24}{\text{fragment size} = 3} = 8\times$
+; and 3) while Data-Parallel ideally hopes for a 0 second latency when communicating, our overlapping scheme allows us a latency as long as a full forward/backward pass, which is several seconds at large scale. For those reasons, we believe our work is a step towards a truly “
+distributed free lunch
+”.
+3.3
+Ablations
+To ablate the importance of each component of Streaming DiLoCo, we perform all our ablations on a model of size 500 million parameters using the C4 dataset with the chinchilla-optimal number of steps and a token budget of 11 billion.
+We split our ablations section in three parts, corresponding to the three improvements brought in this paper: namely 1) Streaming synchronization in section
+3.3.1
+, 2) overlapping communication with computation in section
+3.3.2
+, and 3) finally quantized communication in section
+3.3.3
+.
+3.3.1
+Ablating the streaming synchronization
+In this ablation section, we consider different settings for streaming DiLoCo presented in
+subsection 2.2
+.
+Number of synced layers per fragment.
+We ablate in
+Figure 6
+the fragment size,
+i.e.
+, how many transformer blocks are included in a fragment. Based on this analysis, we choose a fixed fragment size of 3 layers, striking a desirable trade-off between ML performance and reduction of peak bandwidth (for which the smaller the fragments the better). We also consider whether to have a
+sequential
+or
+strided
+pattern (see illustration in
+Figure 2
+for a reference). We choose the latter for several reasons: 1) ML performance is slightly better for the fragment size we consider, 2) deeper networks, with a small fragment size (e.g. 3), should benefit more from striding by spreading out up-to-date synchronized layers across the full depth of the network. Finally, 3) it slightly improves the compute utilization (see
+Figure 7
+) by allowing better overlapping schedule, as clearly seen in
+Figure 14
+in the appendix.
+(a)
+C4 eval loss
+(b)
+Peak bandwidth reduction
+Figure 6
+:
+The fragment’s size
+will determine the peak bandwidth but also the learning dynamics. We choose in practice 3 layers per fragment across all model scales.
+Figure 7
+:
+Compute utilization profile
+of sequential vs strided pattern for a 100 billion parameters model.
+Comparison to FedPart.
+Streaming DiLoCo bears similarity with concurrent work dubbed FedPart
+(Wang et al.,
+2024
+)
+, where a subset of the model is also exchanged at each round. However, FedPart argues that non-shared layers should be frozen during inner optimization. We believe this is rather flops-inefficient: For an 18 layer model, with 3 layers per fragment, 15 layers (83%) are frozen at any point in time despite doing forward/backward computation. We ran a comparison of Streaming DiLoCo with and without the frozen pattern proposed by FedPart, reaching respectively on the C4 eval loss
+$3.2145$
+and
+$2.6749$
+. Freezing the
+$18-3=15$
+layers that won’t be synchronized at the given round therefore results in a 20% increase of the evaluation loss. These results confirm our intuition that while freezing layers may help merging, this incurs a significant flop inefficiency, which might not be acceptable in training-compute bound settings (which are typical in current large scale training of LLMs).
+3.3.2
+Ablating the communication overlap
+In this ablation section, we investigate how to overlap communication with computation, see
+subsection 2.3
+for reference.
+Overlapping.
+We first vary the number of inner steps,
+$\tau$,
+we use to overlap communication with computation (see
+subsection 2.3
+).
+Figure 8
+shows results varying
+$\tau$
+from 1 to 20, with
+$\alpha=0$
+(discarding any intermediary inner updates) and
+$\alpha=0.5$
+(uniform merging). We can see that the degradation in evaluation loss is negligible up to an overlap of 10 inner steps (
+$<0.2\%$
+).
+By checking the compute utilization of a 100B model in
+Figure 9
+, we observe little gain in compute time after an overlap of 5 inner steps. Therefore, we advise practitioners to limit the overlap to 5 inner steps. In our main experiments, we used 1 step for simplicity.
+Figure 8
+:
+Varying
+the number of overlapped inner steps
+$\tau$
+for
+$\alpha=\{0,0.5\}$
+. A larger
+$\tau$
+requires a significantly lower bandwidth, see also
+Figure 9
+.
+Figure 9
+:
+Estimated compute utilization
+for a 100B model when increasing
+$\tau$
+, the number of inner steps which overlap with communication.
+Overlapping with some slack between workers.
+If workers use heterogeneous device types (e.g. TPUv5e vs TPUv6e) or are just placed in different environments, their execution speed might vary and it would be inefficient to force them to synchronize at the same optimization step.
+In this case, we could grant workers some slack. For instance, in a 2 DiLoCo replicas setting, both workers could send their respective outer gradients at the same step (but not necessarily at the same time) and they could receive the update at a different step
+(Liu et al.,
+2024a
+)
+. We accomplish this by simply using a different
+$\tau$
+per worker (
+$\tau_{1}$
+and
+$\tau_{2}$
+) as shown in lines 7-9 in
+Algorithm 2
+. We show in
+Figure 10
+the evaluation loss when
+$\tau_{1}=1$
+and varying
+$\tau_{2}$
+. Similarly to
+Figure 8
+, the loss degradation is limited under a delay of up to 5 inner steps. This result suggests that Streaming DiLoCo is rather robust and could support training of large models on several heterogeneous workers.
+Figure 10
+:
+Varying
+the number of overlapped inner steps
+$\tau_{2}$
+for the second worker while keeping
+$\tau_{1}=1$
+. For all data points,
+$\alpha=0.5$
+. Training is very robust for values of
+$\tau_{2}$
+less than 5.
+3.3.3
+Ablating the quantized communication
+Finally, in this section, we consider various schemes to quantize our communication, as proposed in
+subsection 2.4
+.
+Compressing outer gradients.
+We ablate in
+Figure 11
+two ways of compressing the outer gradients: either by setting to zero some values (FedDropout
+(Wen et al.,
+2022
+)
+, Dare
+(Yu et al.,
+2024
+)
+, and Top-K selection) or by lowering the precision. In all cases, we accumulate the outer update in float32. We report both the evaluation loss on C4 and the accuracy on HellaSwag. Interestingly, lowering the precision, from float32 to float4 does not affect the performance, while setting some values to zero is significantly worse, particularly when zero-ing out at random. We also considered Ties-Merging’s pruning method
+(Yadav et al.,
+2023
+)
+but preliminary experiments showed it also underperformed; however this approach might become advantageous with larger number of replicas
+$M$
+.
+(a)
+C4 evaluation loss
+(b)
+HellaSwag accuracy
+Figure 11
+:
+Compressing the outer gradients
+with either value dropping (FedDropout, Dare) or using lower-precision floating point numbers.
+4
+Related Works
+Model merging.
+Model merging, a subfield within the broader study of linear mode connectivity, explores the potential of linearly interpolating between parameters of multiple models to synthesize a unified model that inherits the strengths of its constituents
+(Matena and Raffel,
+2021
+; Wortsman et al.,
+2021
+)
+. A key finding in this domain is the existence of low-loss pathways within the parameter space
+(Frankle et al.,
+2020
+; Neyshabur et al.,
+2020
+)
+that connect independently trained models, effectively circumventing the anticipated loss barriers. For instance,
+Wortsman et al. (
+2022
+)
+demonstrated that averaging the parameters of models fine-tuned from a common pre-trained initialization on diverse tasks
+(Ramé et al.,
+2023a
+)
+or with varying hyperparameters
+(Wortsman et al.,
+2022
+)
+yields a performant merged model. This approach, initially demonstrated in computer vision, has been successfully extended to natural language processing
+(Li et al.,
+2022
+)
+, reinforcement learning with human feedback
+(Ramé et al.,
+2023b
+)
+, noisy label learning
+(Rebuffi et al.,
+2022
+)
+, and out-of-distribution generalization
+(Ramé et al.,
+2023c
+)
+. Recent research has further investigated alternative strategies for mitigating loss barriers in model merging, including techniques based on parameter space transformations and other model surgery methods aiming to resolve merging conflicts
+(Jordan et al.,
+2023
+; Stoica et al.,
+2023
+; Jin et al.,
+2023
+; Ainsworth et al.,
+2023
+; Yadav et al.,
+2023
+; Yu et al.,
+2024
+)
+.
+Federated learning / local SGD.
+While model merging proposes to combine several models once, FedAvg
+(McMahan et al.,
+2017
+)
+and Local SGD
+(Stich,
+2019
+)
+do it multiple times with the goal of reducing bandwidth requirements: they operate by performing local training (typically via SGD) across workers for some number of steps before doing some kind of synchronization of worker parameters, or aggregation of parameters across workers. In their original forms, both FedAvg and Local SGD simply averaged the parameters across workers. As shown by
+Reddi et al. (
+2021
+)
+, the synchronization is more effective when each worker calculates a “model delta”, and these are aggregated over workers to produce a pseudo-gradient
+(Reddi et al.,
+2021
+; Ilharco et al.,
+2022
+)
+or
+outer gradient
+, which is then fed to a first-order optimizer. This yields a bi-level optimization scheme with inner optimizers and an outer optimizer, referred to as FedOpt by
+Reddi et al. (
+2021
+)
+, who propose using SGD as the inner optimizer and adaptive methods like Adam
+(Kingma and Ba,
+2014
+)
+as the outer optimizer in resource-constrained FL settings.
+Distributed training for LLMs.
+The increased requirements of training large language models (LLMs) hastened the need for distributed methods, for both inference
+(Borzunov et al.,
+2023
+)
+and training
+(Presser,
+2020
+; Diskin et al.,
+2021
+; Ryabinin et al.,
+2021
+)
+. More recently, DiLoCo
+(Douillard et al.,
+2024a
+)
+proposed a particular instantiation of FedOpt
+(Reddi et al.,
+2021
+)
+with AdamW
+(Loshchilov and Hutter,
+2019
+)
+as inner optimizer and Nesterov
+(Sutskever et al.,
+2013
+)
+as outer optimizer
+(Huo et al.,
+2020
+)
+. This simple formulation proved to be effective for distributed training with LLMs, where the number of replicas is small (<100) and without replica sampling, closer to cross-silo federated learning
+(Kairouz et al.,
+2021
+)
+. The FedOpt algorithm was also shown to be effective in training LLMs in settings that looked more like cross-device federated learning
+(Charles et al.,
+2024
+)
+. The empirical success of DiLoCo has been reproduced multiple times
+(Jaghouar et al.,
+2024b
+; Sani et al.,
+2024b
+)
+and has been successfully scaled up to 10 billion parameter models
+(Jaghouar et al.,
+2024a
+)
+. Related, a simple change on how the outer Nesterov accumulates outer gradients proved to handle well asynchronicity between workers of different speeds
+(Liu et al.,
+2024a
+)
+. DiLoCo adds a new axis of parallelism to distributed training
+(Shoeybi et al.,
+2020
+)
+, and is compatible
+(Jaghouar et al.,
+2024a
+)
+with other existing axes like FSDP
+(Zhao et al.,
+2023
+)
+, or even another level of federated learning
+(Sani et al.,
+2024a
+)
+.
+Partial communication.
+Communicating a subset of the network is often used in federated learning to provide
+personalized
+models per user, see FedPer
+(Arivazhagan et al.,
+2019
+)
+. DiPaCo
+(Douillard et al.,
+2024b
+)
+recently proposed a distributed mixture-of-experts where subsets of the model are synchronized with subsets of the replicas, according to a sharing pattern that is optimized with an Expectation-Maximization style of algorithm during training. WASH
+(Fournier et al.,
+2024
+)
+and later Sparta
+(Baioumy and Cheema,
+2025
+)
+propose to frequently exchange a random subset of the neurons. Finally FedPart
+(Wang et al.,
+2024
+)
+, developed at the same time as Streaming DiLoCo, also proposes to share per-layer fragments. However, they argue that for a given communication round, non-shared fragments should not undergo inner optimization, a strategy which we show slows down convergence. Note that all partial communication methods can be seen as a form of
+structured
+(outer) gradients compression.
+Gradient compression.
+Data-Parallel (with gradients) and Federated learning (with outer gradients) often share similar methods to compress the communication
+(Lin et al.,
+2018
+)
+: from randomly dropping values
+(Wen et al.,
+2022
+)
+, to combining multiple compression schemes (e.g. dropping, top-k, low-precision)
+(Wang et al.,
+2023
+)
+, to using low-rank compression
+(Vogels et al.,
+2019
+; Zhao et al.,
+2024
+)
+, or, more recently, to keeping only the fast-moving components with DCT while communicating via an all-gather collective instead of an all-reduce
+(Peng et al.,
+2024
+)
+.
+5
+Conclusion and Future Work
+In this paper, we introduced three improvements over DiLoCo: we synchronize only a subset of the parameters at a time, we overlap the communication of this synchronization over several computation steps, and we compress the outer gradients to be communicated to low precision with only four bits. All these innovations combined lead to training with similar ML performance as classical Data-Parallel training, while using
+$400\times$
+less bandwidth, reducing the peak bandwidth compared to DiLoCo’s bursts of communication, and allowing communication to have an ideal non-zero latency by overlapping it with computation.
+In sum, we can reach a similar compute utilization as the widely used Data-Parallel using two orders of magnitude less Gbit/s bandwidth, while performing comparably in terms of training loss and downstream evaluation accuracies as Data-Parallel. For those reasons, we claim that this work is a first step towards what we call a
+distributed
+free lunch, paving the way for a new way to train distributed networks with reduced bandwidth and yet without trading off model quality.
+Next.
+In our view, the ubiquity of co-located Data-Parallel training is likely due to the hardware lottery
+(Hooker,
+2020
+)
+, when “
+a research idea wins because it is suited to the available software and hardware and not because the idea is superior to alternative research directions
+”. Data-Parallel training has been extensively studied, tuned, and scaled
+(Kaplan et al.,
+2020
+)
+, and it is hard to beat the wisdom-of-the-crowd of thousands of researchers. In contrast, the federated learning literature has mainly studied smaller scale models, primarily due to its focus on edge devices. There are huge opportunities for bringing the ideas from the federated learning literature to the new world of large scale training for LLMs. A critical next work is to study how new distributed methods like ours should be tuned and scaled across multiple axes (e.g. model size, overtraining factor, number of replicas). In particular, how to scale efficiently the number of DiLoCo replicas given an equivalent token budget is most needed.
+More generally, reducing the communication problem to a minor obstacle allows new classes of co-designed architectures and training paradigms (for example
+Douillard et al. (
+2024b
+)
+) maximizing available compute
+(Sutton,
+2019
+)
+: we hope to see the training of modular constellations of small models loosely connected
+(Dean,
+2021
+)
+across heterogeneous devices, using compute arbitrage spread world-wide.
+Acknowledgements
+We would like to thank Alban Rrustemi, Jeff Dean, Michael Isard, Sebastian Borgeaud, Rohan Anil, Koray Kavukcuoglu, and Raia Hadsell for their feedback and leadership support; Andrei Rusu, Adhiguna Kuncoro, Lucio Dery, Rachita Chhaparia, Zohar Yahav, Qixuan Feng, Zack Nado, Nova Fallen, Nicole Mitchell, Sean Augenstein, and Stephen Roller for their helpful advice; finally Alberto Magni, Juliana Vincente Franco, Joel Wee, James Lotte, Matthew Johnson, and Blake Hechtman for unblocking us through so many engineering hurdles along our journey.
+\nobibliography
+*
+References
+Agrawal et al. (2024)
+Aditya Agrawal, Matthew Hedlund, and Blake Hechtman.
+exmy: A data type and technique for arbitrary bit precision
+quantization.
+arXiv preprint library
+, 2024.
+URL
+https://arxiv.org/abs/2405.13938
+.
+Ainsworth et al. (2023)
+Samuel K. Ainsworth, Jonathan Hayase, and Siddhartha Srinivasa.
+Git re-basin: Merging models modulo permutation symmetries.
+arXiv preprint library
+, 2023.
+URL
+https://arxiv.org/abs/2209.04836
+.
+Arivazhagan et al. (2019)
+Manoj Ghuhan Arivazhagan, Vinay Aggarwal, Aaditya Kumar Singh, and Sunav
+Choudhary.
+Federated learning with personalization layers, 2019.
+URL
+https://arxiv.org/abs/1912.00818
+.
+Baioumy and Cheema (2025)
+Mohamed Baioumy and Alex Cheema.
+Sparta.
+blog.exolabs.net
+, 2025.
+URL
+https://blog.exolabs.net/day-12/
+.
+Beaumont et al. (2022)
+Olivier Beaumont, Lionel Eyraud-Dubois, Alena Shilova, and Xunyi Zhao.
+Weight offloading strategies.
+HAL repository
+, 2022.
+URL
+https://inria.hal.science/hal-03580767/
+.
+Bisk et al. (2020)
+Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, and Yejin Choi.
+Piqa: Reasoning about physical commonsense in natural language.
+Proceedings of the AAAI Conference on Artificial Intelligence
+(AAAI)
+, 2020.
+URL
+https://arxiv.org/abs/1911.11641
+.
+Borzunov et al. (2023)
+Alexander Borzunov, Dmitry Baranchuk, Tim Dettmers, Max Ryabinin, Younes
+Belkada, Artem Chumachenko, Pavel Samygin, and Colin Raffel.
+Petals: Collaborative inference and fine-tuning of large models.
+Proceedings of the Annual Meeting of the Association for
+Computational Linguistics (ACL Short Papers)
+, 2023.
+URL
+https://arxiv.org/abs/2209.01188
+.
+Charles et al. (2024)
+Zachary Charles, Nicole Mitchell, Krishna Pillutla, Michael Reneer, and Zachary
+Garrett.
+Towards federated foundation models: Scalable dataset pipelines for
+group-structured learning.
+Advances in Neural Information Processing Systems
+, 2024.
+URL
+https://arxiv.org/abs/2307.09619
+.
+Chowdhery et al. (2023)
+Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra,
+Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian
+Gehrmann, Parker Schuh, Kensen Shi, Sasha Tsvyashchenko, Joshua Maynez,
+Abhishek Rao, Parker Barnes, Yi Tay, Noam Shazeer, Vinodkumar Prabhakaran,
+Emily Reif, Nan Du, Ben Hutchinson, Reiner Pope, James Bradbury, Jacob
+Austin, Michael Isard, Guy Gur-Ari, Pengcheng Yin, Toju Duke, Anselm
+Levskaya, Sanjay Ghemawat, Sunipa Dev, Henryk Michalewski, Xavier Garcia,
+Vedant Misra, Kevin Robinson, Liam Fedus, Denny Zhou, Daphne Ippolito, David
+Luan, Hyeontaek Lim, Barret Zoph, Alexander Spiridonov, Ryan Sepassi, David
+Dohan, Shivani Agrawal, Mark Omernick, Andrew M. Dai,
+Thanumalayan Sankaranarayana Pillai, Marie Pellat, Aitor Lewkowycz, Erica
+Moreira, Rewon Child, Oleksandr Polozov, Katherine Lee, Zongwei Zhou, Xuezhi
+Wang, Brennan Saeta, Mark Diaz, Orhan Firat, Michele Catasta, Jason Wei,
+Kathy Meier-Hellstern, Douglas Eck, Jeff Dean, Slav Petrov, and Noah Fiedel.
+Palm: Scaling language modeling with pathways.
+Journal of Machine Learning Research
+, 2023.
+URL
+http://jmlr.org/papers/v24/22-1144.html
+.
+Clark et al. (2018)
+Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa
+Schoenick, and Oyvind Tafjord.
+Think you have solved question answering? try arc, the ai2 reasoning
+challenge, 2018.
+URL
+https://arxiv.org/abs/1803.05457v1
+.
+Dean (2021)
+Jeff Dean.
+Pathways: A next-generation ai architecture.
+Google’s blog
+, 2021.
+URL
+https://blog.google/technology/ai/introducing-pathways-next-generation-ai-architecture/
+.
+DeepSeek-AI et al. (2024)
+DeepSeek-AI, Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda
+Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, Damai Dai, Daya
+Guo, Dejian Yang, Deli Chen, Dongjie Ji, Erhang Li, Fangyun Lin, Fucong Dai,
+Fuli Luo, Guangbo Hao, Guanting Chen, Guowei Li, H. Zhang, Han Bao, Hanwei
+Xu, Haocheng Wang, Haowei Zhang, Honghui Ding, Huajian Xin, Huazuo Gao, Hui
+Li, Hui Qu, J. L. Cai, Jian Liang, Jianzhong Guo, Jiaqi Ni, Jiashi Li, Jiawei
+Wang, Jin Chen, Jingchang Chen, Jingyang Yuan, Junjie Qiu, Junlong Li,
+Junxiao Song, Kai Dong, Kai Hu, Kaige Gao, Kang Guan, Kexin Huang, Kuai Yu,
+Lean Wang, Lecong Zhang, Lei Xu, Leyi Xia, Liang Zhao, Litong Wang, Liyue
+Zhang, Meng Li, Miaojun Wang, Mingchuan Zhang, Minghua Zhang, Minghui Tang,
+Mingming Li, Ning Tian, Panpan Huang, Peiyi Wang, Peng Zhang, Qiancheng Wang,
+Qihao Zhu, Qinyu Chen, Qiushi Du, R. J. Chen, R. L. Jin, Ruiqi Ge, Ruisong
+Zhang, Ruizhe Pan, Runji Wang, Runxin Xu, Ruoyu Zhang, Ruyi Chen, S. S. Li,
+Shanghao Lu, Shangyan Zhou, Shanhuang Chen, Shaoqing Wu, Shengfeng Ye,
+Shengfeng Ye, Shirong Ma, Shiyu Wang, Shuang Zhou, Shuiping Yu, Shunfeng
+Zhou, Shuting Pan, T. Wang, Tao Yun, Tian Pei, Tianyu Sun, W. L. Xiao,
+Wangding Zeng, Wanjia Zhao, Wei An, Wen Liu, Wenfeng Liang, Wenjun Gao,
+Wenqin Yu, Wentao Zhang, X. Q. Li, Xiangyue Jin, Xianzu Wang, Xiao Bi,
+Xiaodong Liu, Xiaohan Wang, Xiaojin Shen, Xiaokang Chen, Xiaokang Zhang,
+Xiaosha Chen, Xiaotao Nie, Xiaowen Sun, Xiaoxiang Wang, Xin Cheng, Xin Liu,
+Xin Xie, Xingchao Liu, Xingkai Yu, Xinnan Song, Xinxia Shan, Xinyi Zhou,
+Xinyu Yang, Xinyuan Li, Xuecheng Su, Xuheng Lin, Y. K. Li, Y. Q. Wang, Y. X.
+Wei, Y. X. Zhu, Yang Zhang, Yanhong Xu, Yanhong Xu, Yanping Huang, Yao Li,
+Yao Zhao, Yaofeng Sun, Yaohui Li, Yaohui Wang, Yi Yu, Yi Zheng, Yichao Zhang,
+Yifan Shi, Yiliang Xiong, Ying He, Ying Tang, Yishi Piao, Yisong Wang, Yixuan
+Tan, Yiyang Ma, Yiyuan Liu, Yongqiang Guo, Yu Wu, Yuan Ou, Yuchen Zhu, Yuduan
+Wang, Yue Gong, Yuheng Zou, Yujia He, Yukun Zha, Yunfan Xiong, Yunxian Ma,
+Yuting Yan, Yuxiang Luo, Yuxiang You, Yuxuan Liu, Yuyang Zhou, Z. F. Wu,
+Z. Z. Ren, Zehui Ren, Zhangli Sha, Zhe Fu, Zhean Xu, Zhen Huang, Zhen Zhang,
+Zhenda Xie, Zhengyan Zhang, Zhewen Hao, Zhibin Gou, Zhicheng Ma, Zhigang Yan,
+Zhihong Shao, Zhipeng Xu, Zhiyu Wu, Zhongyu Zhang, Zhuoshu Li, Zihui Gu,
+Zijia Zhu, Zijun Liu, Zilin Li, Ziwei Xie, Ziyang Song, Ziyi Gao, and Zizheng
+Pan.
+Deepseek-v3 technical report.
+arXiv preprint library
+, 2024.
+URL
+https://arxiv.org/abs/2412.19437
+.
+Diskin et al. (2021)
+Michael Diskin, Alexey Bukhtiyarov, Max Ryabinin, Lucile Saulnier, Quentin
+Lhoest, Anton Sinitsin, Dmitry Popov, Dmitry Pyrkin, Maxim Kashirin,
+Alexander Borzunov, Albert Villanova del Moral, Denis Mazur, Ilia Kobelev,
+Yacine Jernite, Thomas Wolf, and Gennady Pekhimenko.
+Distributed deep learning in open collaborations.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2021.
+URL
+https://arxiv.org/abs/2106.10207
+.
+Douillard et al. (2024a)
+Arthur Douillard, Qixuan Feng, Andrei A. Rusu, Rachita Chhaparia, Yani Donchev,
+Adhiguna Kuncoro, Marc’Aurelio Ranzato, Arthur Szlam, and Jiajun Shen.
+DiLoCo: Distributed low-communication training of language models.
+International Conference on Machine Learning (ICML) Workshop
+,
+2024a.
+URL
+https://arxiv.org/abs/2311.08105
+.
+Douillard et al. (2024b)
+Arthur Douillard, Qixuan Feng, Andrei A. Rusu, Adhiguna Kuncoro, Yani Donchev,
+Rachita Chhaparia, Ionel Gog, Marc’Aurelio Ranzato, Jiajun Shen, and Arthur
+Szlam.
+Dipaco: Distributed path composition.
+arXiv preprint library
+, 2024b.
+URL
+https://arxiv.org/abs/2403.10616
+.
+Fournier et al. (2024)
+Louis Fournier, Adel Nabli, Masih Aminbeidokhti, Marco Pedersoli, Eugene
+Belilovsky, and Edouard Oyallon.
+Wash: Train your ensemble with communication-efficient weight
+shuffling, then average.
+Advances in Neural Information Processing Systems (NeurIPS)
+Workshop
+, 2024.
+URL
+https://arxiv.org/abs/2405.17517
+.
+Frankle et al. (2020)
+Jonathan Frankle, Gintare Karolina Dziugaite, Daniel M. Roy, and Michael
+Carbin.
+Linear mode connectivity and the lottery ticket hypothesis.
+International Conference on Machine Learning (ICML)
+, 2020.
+URL
+https://arxiv.org/abs/1912.05671
+.
+Gadre et al. (2024)
+Samir Yitzhak Gadre, Georgios Smyrnis, Vaishaal Shankar, Suchin Gururangan,
+Mitchell Wortsman, Rulin Shao, Jean Mercat, Alex Fang, Jeffrey Li, Sedrick
+Keh, Rui Xin, Marianna Nezhurina, Igor Vasiljevic, Jenia Jitsev, Luca
+Soldaini, Alexandros G. Dimakis, Gabriel Ilharco, Pang Wei Koh, Shuran Song,
+Thomas Kollar, Yair Carmon, Achal Dave, Reinhard Heckel, Niklas Muennighoff,
+and Ludwig Schmidt.
+Language models scale reliably with over-training and on downstream
+tasks.
+arXiv preprint library
+, 2024.
+URL
+https://arxiv.org/abs/2403.08540
+.
+Grattafiori et al. (2024)
+Aaron Grattafiori, Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek
+Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Alex
+Vaughan, Amy Yang, Angela Fan, Anirudh Goyal, Anthony Hartshorn, Aobo Yang,
+Archi Mitra, Archie Sravankumar, Artem Korenev, Arthur Hinsvark, Arun Rao,
+Aston Zhang, Aurelien Rodriguez, Austen Gregerson, Ava Spataru, Baptiste
+Roziere, Bethany Biron, Binh Tang, Bobbie Chern, Charlotte Caucheteux, Chaya
+Nayak, Chloe Bi, Chris Marra, Chris McConnell, Christian Keller, Christophe
+Touret, Chunyang Wu, Corinne Wong, Cristian Canton Ferrer, Cyrus Nikolaidis,
+Damien Allonsius, Daniel Song, Danielle Pintz, Danny Livshits, Danny Wyatt,
+David Esiobu, Dhruv Choudhary, Dhruv Mahajan, Diego Garcia-Olano, Diego
+Perino, Dieuwke Hupkes, Egor Lakomkin, Ehab AlBadawy, Elina Lobanova, Emily
+Dinan, Eric Michael Smith, Filip Radenovic, Francisco Guzmán, Frank Zhang,
+Gabriel Synnaeve, Gabrielle Lee, Georgia Lewis Anderson, Govind Thattai,
+Graeme Nail, Gregoire Mialon, Guan Pang, Guillem Cucurell, Hailey Nguyen,
+Hannah Korevaar, Hu Xu, Hugo Touvron, Iliyan Zarov, Imanol Arrieta Ibarra,
+Isabel Kloumann, Ishan Misra, Ivan Evtimov, Jack Zhang, Jade Copet, Jaewon
+Lee, Jan Geffert, Jana Vranes, Jason Park, Jay Mahadeokar, Jeet Shah, Jelmer
+van der Linde, Jennifer Billock, Jenny Hong, Jenya Lee, Jeremy Fu, Jianfeng
+Chi, Jianyu Huang, Jiawen Liu, Jie Wang, Jiecao Yu, Joanna Bitton, Joe
+Spisak, Jongsoo Park, Joseph Rocca, Joshua Johnstun, Joshua Saxe, Junteng
+Jia, Kalyan Vasuden Alwala, Karthik Prasad, Kartikeya Upasani, Kate Plawiak,
+Ke Li, Kenneth Heafield, Kevin Stone, Khalid El-Arini, Krithika Iyer, Kshitiz
+Malik, Kuenley Chiu, Kunal Bhalla, Kushal Lakhotia, Lauren Rantala-Yeary,
+Laurens van der Maaten, Lawrence Chen, Liang Tan, Liz Jenkins, Louis Martin,
+Lovish Madaan, Lubo Malo, Lukas Blecher, Lukas Landzaat, Luke de Oliveira,
+Madeline Muzzi, Mahesh Pasupuleti, Mannat Singh, Manohar Paluri, Marcin
+Kardas, Maria Tsimpoukelli, Mathew Oldham, Mathieu Rita, Maya Pavlova,
+Melanie Kambadur, Mike Lewis, Min Si, Mitesh Kumar Singh, Mona Hassan, Naman
+Goyal, Narjes Torabi, Nikolay Bashlykov, Nikolay Bogoychev, Niladri
+Chatterji, Ning Zhang, Olivier Duchenne, Onur Çelebi, Patrick Alrassy,
+Pengchuan Zhang, Pengwei Li, Petar Vasic, Peter Weng, Prajjwal Bhargava,
+Pratik Dubal, Praveen Krishnan, Punit Singh Koura, Puxin Xu, Qing He,
+Qingxiao Dong, Ragavan Srinivasan, Raj Ganapathy, Ramon Calderer,
+Ricardo Silveira Cabral, Robert Stojnic, Roberta Raileanu, Rohan Maheswari,
+Rohit Girdhar, Rohit Patel, Romain Sauvestre, Ronnie Polidoro, Roshan
+Sumbaly, Ross Taylor, Ruan Silva, Rui Hou, Rui Wang, Saghar Hosseini, Sahana
+Chennabasappa, Sanjay Singh, Sean Bell, Seohyun Sonia Kim, Sergey Edunov,
+Shaoliang Nie, Sharan Narang, Sharath Raparthy, Sheng Shen, Shengye Wan,
+Shruti Bhosale, Shun Zhang, Simon Vandenhende, Soumya Batra, Spencer Whitman,
+Sten Sootla, Stephane Collot, Suchin Gururangan, Sydney Borodinsky, Tamar
+Herman, Tara Fowler, Tarek Sheasha, Thomas Georgiou, Thomas Scialom, Tobias
+Speckbacher, Todor Mihaylov, Tong Xiao, Ujjwal Karn, Vedanuj Goswami, Vibhor
+Gupta, Vignesh Ramanathan, Viktor Kerkez, Vincent Gonguet, Virginie Do, Vish
+Vogeti, Vítor Albiero, Vladan Petrovic, Weiwei Chu, Wenhan Xiong, Wenyin Fu,
+Whitney Meers, Xavier Martinet, Xiaodong Wang, Xiaofang Wang, Xiaoqing Ellen
+Tan, Xide Xia, Xinfeng Xie, Xuchao Jia, Xuewei Wang, Yaelle Goldschlag,
+Yashesh Gaur, Yasmine Babaei, Yi Wen, Yiwen Song, Yuchen Zhang, Yue Li,
+Yuning Mao, Zacharie Delpierre Coudert, Zheng Yan, Zhengxing Chen, Zoe
+Papakipos, Aaditya Singh, Aayushi Srivastava, Abha Jain, Adam Kelsey, Adam
+Shajnfeld, Adithya Gangidi, Adolfo Victoria, Ahuva Goldstand, Ajay Menon,
+Ajay Sharma, Alex Boesenberg, Alexei Baevski, Allie Feinstein, Amanda Kallet,
+Amit Sangani, Amos Teo, Anam Yunus, Andrei Lupu, Andres Alvarado, Andrew
+Caples, Andrew Gu, Andrew Ho, Andrew Poulton, Andrew Ryan, Ankit Ramchandani,
+Annie Dong, Annie Franco, Anuj Goyal, Aparajita Saraf, Arkabandhu Chowdhury,
+Ashley Gabriel, Ashwin Bharambe, Assaf Eisenman, Azadeh Yazdan, Beau James,
+Ben Maurer, Benjamin Leonhardi, Bernie Huang, Beth Loyd, Beto De Paola,
+Bhargavi Paranjape, Bing Liu, Bo Wu, Boyu Ni, Braden Hancock, Bram Wasti,
+Brandon Spence, Brani Stojkovic, Brian Gamido, Britt Montalvo, Carl Parker,
+Carly Burton, Catalina Mejia, Ce Liu, Changhan Wang, Changkyu Kim, Chao Zhou,
+Chester Hu, Ching-Hsiang Chu, Chris Cai, Chris Tindal, Christoph
+Feichtenhofer, Cynthia Gao, Damon Civin, Dana Beaty, Daniel Kreymer, Daniel
+Li, David Adkins, David Xu, Davide Testuggine, Delia David, Devi Parikh,
+Diana Liskovich, Didem Foss, Dingkang Wang, Duc Le, Dustin Holland, Edward
+Dowling, Eissa Jamil, Elaine Montgomery, Eleonora Presani, Emily Hahn, Emily
+Wood, Eric-Tuan Le, Erik Brinkman, Esteban Arcaute, Evan Dunbar, Evan
+Smothers, Fei Sun, Felix Kreuk, Feng Tian, Filippos Kokkinos, Firat Ozgenel,
+Francesco Caggioni, Frank Kanayet, Frank Seide, Gabriela Medina Florez,
+Gabriella Schwarz, Gada Badeer, Georgia Swee, Gil Halpern, Grant Herman,
+Grigory Sizov, Guangyi Zhang, Guna Lakshminarayanan, Hakan Inan, Hamid
+Shojanazeri, Han Zou, Hannah Wang, Hanwen Zha, Haroun Habeeb, Harrison
+Rudolph, Helen Suk, Henry Aspegren, Hunter Goldman, Hongyuan Zhan, Ibrahim
+Damlaj, Igor Molybog, Igor Tufanov, Ilias Leontiadis, Irina-Elena Veliche,
+Itai Gat, Jake Weissman, James Geboski, James Kohli, Janice Lam, Japhet
+Asher, Jean-Baptiste Gaya, Jeff Marcus, Jeff Tang, Jennifer Chan, Jenny Zhen,
+Jeremy Reizenstein, Jeremy Teboul, Jessica Zhong, Jian Jin, Jingyi Yang, Joe
+Cummings, Jon Carvill, Jon Shepard, Jonathan McPhie, Jonathan Torres, Josh
+Ginsburg, Junjie Wang, Kai Wu, Kam Hou U, Karan Saxena, Kartikay Khandelwal,
+Katayoun Zand, Kathy Matosich, Kaushik Veeraraghavan, Kelly Michelena, Keqian
+Li, Kiran Jagadeesh, Kun Huang, Kunal Chawla, Kyle Huang, Lailin Chen,
+Lakshya Garg, Lavender A, Leandro Silva, Lee Bell, Lei Zhang, Liangpeng Guo,
+Licheng Yu, Liron Moshkovich, Luca Wehrstedt, Madian Khabsa, Manav Avalani,
+Manish Bhatt, Martynas Mankus, Matan Hasson, Matthew Lennie, Matthias Reso,
+Maxim Groshev, Maxim Naumov, Maya Lathi, Meghan Keneally, Miao Liu,
+Michael L. Seltzer, Michal Valko, Michelle Restrepo, Mihir Patel, Mik
+Vyatskov, Mikayel Samvelyan, Mike Clark, Mike Macey, Mike Wang, Miquel Jubert
+Hermoso, Mo Metanat, Mohammad Rastegari, Munish Bansal, Nandhini Santhanam,
+Natascha Parks, Natasha White, Navyata Bawa, Nayan Singhal, Nick Egebo,
+Nicolas Usunier, Nikhil Mehta, Nikolay Pavlovich Laptev, Ning Dong, Norman
+Cheng, Oleg Chernoguz, Olivia Hart, Omkar Salpekar, Ozlem Kalinli, Parkin
+Kent, Parth Parekh, Paul Saab, Pavan Balaji, Pedro Rittner, Philip Bontrager,
+Pierre Roux, Piotr Dollar, Polina Zvyagina, Prashant Ratanchandani, Pritish
+Yuvraj, Qian Liang, Rachad Alao, Rachel Rodriguez, Rafi Ayub, Raghotham
+Murthy, Raghu Nayani, Rahul Mitra, Rangaprabhu Parthasarathy, Raymond Li,
+Rebekkah Hogan, Robin Battey, Rocky Wang, Russ Howes, Ruty Rinott, Sachin
+Mehta, Sachin Siby, Sai Jayesh Bondu, Samyak Datta, Sara Chugh, Sara Hunt,
+Sargun Dhillon, Sasha Sidorov, Satadru Pan, Saurabh Mahajan, Saurabh Verma,
+Seiji Yamamoto, Sharadh Ramaswamy, Shaun Lindsay, Shaun Lindsay, Sheng Feng,
+Shenghao Lin, Shengxin Cindy Zha, Shishir Patil, Shiva Shankar, Shuqiang
+Zhang, Shuqiang Zhang, Sinong Wang, Sneha Agarwal, Soji Sajuyigbe, Soumith
+Chintala, Stephanie Max, Stephen Chen, Steve Kehoe, Steve Satterfield,
+Sudarshan Govindaprasad, Sumit Gupta, Summer Deng, Sungmin Cho, Sunny Virk,
+Suraj Subramanian, Sy Choudhury, Sydney Goldman, Tal Remez, Tamar Glaser,
+Tamara Best, Thilo Koehler, Thomas Robinson, Tianhe Li, Tianjun Zhang, Tim
+Matthews, Timothy Chou, Tzook Shaked, Varun Vontimitta, Victoria Ajayi,
+Victoria Montanez, Vijai Mohan, Vinay Satish Kumar, Vishal Mangla, Vlad
+Ionescu, Vlad Poenaru, Vlad Tiberiu Mihailescu, Vladimir Ivanov, Wei Li,
+Wenchen Wang, Wenwen Jiang, Wes Bouaziz, Will Constable, Xiaocheng Tang,
+Xiaojian Wu, Xiaolan Wang, Xilun Wu, Xinbo Gao, Yaniv Kleinman, Yanjun Chen,
+Ye Hu, Ye Jia, Ye Qi, Yenda Li, Yilin Zhang, Ying Zhang, Yossi Adi, Youngjin
+Nam, Yu Wang, Yu Zhao, Yuchen Hao, Yundi Qian, Yunlu Li, Yuzi He, Zach Rait,
+Zachary DeVito, Zef Rosnbrick, Zhaoduo Wen, Zhenyu Yang, Zhiwei Zhao, and
+Zhiyu Ma.
+The llama 3 herd of models, 2024.
+URL
+https://arxiv.org/abs/2407.21783
+.
+Henry et al. (2020)
+Alex Henry, Prudhvi Raj Dachapally, Shubham Pawar, and Yuxuan Chen.
+Query-key normalization for transformers.
+Proceedings of the Conference on Empirical Methods in Natural
+Language Processing (EMNLP)
+, 2020.
+URL
+https://arxiv.org/abs/2010.04245
+.
+Hoffmann et al. (2022)
+Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor
+Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes
+Welbl, Aidan Clark, Tom Hennigan, Eric Noland, Katie Millican, George van den
+Driessche, Bogdan Damoc, Aurelia Guy, Simon Osindero, Karen Simonyan, Erich
+Elsen, Jack W. Rae, Oriol Vinyals, and Laurent Sifre.
+Training compute-optimal large language models.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2022.
+URL
+https://arxiv.org/abs/2203.15556
+.
+Hooker (2020)
+Sara Hooker.
+The hardware lottery.
+Communications of the ACM
+, 2020.
+URL
+https://arxiv.org/abs/2009.06489
+.
+Huo et al. (2020)
+Zhouyuan Huo, Qian Yang, Bin Gu, Lawrence Carin, and Heng Huang.
+Faster on-device training using new federated momentum algorithm.
+arXiv preprint library
+, 2020.
+URL
+https://arxiv.org/abs/2002.02090
+.
+Ilharco et al. (2022)
+Gabriel Ilharco, Mitchell Wortsman, Samir Yitzhak Gadre, Shuran Song, Hannaneh
+Hajishirzi, Simon Kornblith, Ali Farhadi, and Ludwig Schmidt.
+Patching open-vocabulary models by interpolating weights.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2022.
+Jaghouar et al. (2024a)
+Sami Jaghouar, Jack Min Ong, Manveer Basra, Fares Obeid, Jannik Straube,
+Michael Keiblinger, Elie Bakouch, Lucas Atkins, Maziyar Panahi, Charles
+Goddard, Max Ryabinin, and Johannes Hagemann.
+Intellect-1 technical report.
+arXiv preprint library
+, 2024a.
+URL
+https://arxiv.org/abs/2412.01152
+.
+Jaghouar et al. (2024b)
+Sami Jaghouar, Jack Min Ong, and Johannes Hagemann.
+Opendiloco: An open-source framework for globally distributed
+low-communication training.
+arXiv preprint library
+, 2024b.
+URL
+https://arxiv.org/abs/2407.07852
+.
+Jin et al. (2023)
+Xisen Jin, Xiang Ren, Daniel Preotiuc-Pietro, and Pengxiang Cheng.
+Dataless knowledge fusion by merging weights of language models.
+Proceedings of the International Conference on Learning
+Representations (ICLR)
+, 2023.
+URL
+https://arxiv.org/abs/2212.09849
+.
+Jordan et al. (2023)
+Keller Jordan, Hanie Sedghi, Olga Saukh, Rahim Entezari, and Behnam Neyshabur.
+Repair: Renormalizing permuted activations for interpolation repair.
+arXiv preprint library
+, 2023.
+URL
+https://arxiv.org/abs/2211.08403
+.
+Kairouz et al. (2021)
+Peter Kairouz, H Brendan McMahan, Brendan Avent, Aurélien Bellet, Mehdi
+Bennis, Arjun Nitin Bhagoji, Kallista Bonawitz, Zachary Charles, Graham
+Cormode, Rachel Cummings, et al.
+Advances and open problems in federated learning.
+Foundations and Trends in Machine Learning
+, 2021.
+URL
+https://arxiv.org/abs/1912.04977
+.
+Kaplan et al. (2020)
+Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B. Brown, Benjamin Chess, Rewon
+Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei.
+Scaling laws for neural language models.
+arXiv preprint library
+, 2020.
+URL
+https://arxiv.org/abs/2001.08361
+.
+Kingma and Ba (2014)
+Diederik P. Kingma and Jimmy Ba.
+Adam: A method for stochastic optimization.
+Proceedings of the International Conference on Learning
+Representations (ICLR)
+, 2014.
+URL
+https://arxiv.org/abs/1412.6980
+.
+Krizhevsky et al. (2012)
+Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton.
+Imagenet classification with deep convolutional neural networks.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2012.
+URL
+https://papers.nips.cc/paper_files/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html
+.
+Li et al. (2022)
+Margaret Li, Suchin Gururangan, Tim Dettmers, Mike Lewis, Tim Althoff, Noah A.
+Smith, and Luke Zettlemoyer.
+Branch-train-merge: Embarrassingly parallel training of expert
+language models.
+arXiv preprint library
+, 2022.
+URL
+https://arxiv.org/abs/2208.03306
+.
+Lin et al. (2020)
+Tao Lin, Sebastian U. Stich, Kumar Kshitij Patel, and Martin Jaggi.
+Don’t use large mini-batches, use local sgd.
+Proceedings of the International Conference on Learning
+Representations (ICLR)
+, 2020.
+URL
+https://arxiv.org/abs/1808.07217
+.
+Lin et al. (2018)
+Yujun Lin, Song Han, Huizi Mao, Yu Wang, and William J. Dally.
+Deep gradient compression: Reducing the communication bandwidth for
+distributed training, 2018.
+URL
+https://arxiv.org/abs/1712.01887
+.
+Liu et al. (2024a)
+Bo Liu, Rachita Chhaparia, Arthur Douillard, Satyen Kale, Andrei A. Rusu,
+Jiajun Shen, Arthur Szlam, and Marc’Aurelio Ranzato.
+Asynchronous local-sgd training for language modeling.
+International Conference on Machine Learning (ICML) Workshop
+,
+2024a.
+URL
+https://arxiv.org/abs/2401.09135
+.
+Liu et al. (2024b)
+Peter J. Liu, Roman Novak, Jaehoon Lee, Mitchell Wortsman, Lechao Xiao, Katie
+Everett, Alexander A. Alemi, Mark Kurzeja, Pierre Marcenac, Izzeddin Gur,
+Simon Kornblith, Kelvin Xu, Gamaleldin Elsayed, Ian Fischer, Jeffrey
+Pennington, Ben Adlam, and Jascha Sohl-Dickstein.
+Nanodo: A minimal transformer decoder-only language model
+implementation in JAX, 2024b.
+URL
+http://github.com/google-deepmind/nanodo
+.
+Loshchilov and Hutter (2019)
+Ilya Loshchilov and Frank Hutter.
+Decoupled weight decay regularization.
+Proceedings of the International Conference on Learning
+Representations (ICLR)
+, 2019.
+URL
+https://arxiv.org/abs/1711.05101
+.
+Matena and Raffel (2021)
+Michael Matena and Colin Raffel.
+Merging models with fisher-weighted averaging.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2021.
+URL
+https://arxiv.org/abs/2111.09832
+.
+McMahan et al. (2017)
+H. Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, and
+Blaise Agüera y Arcas.
+Communication-efficient learning of deep networks from decentralized
+data.
+International Conference on Artificial Intelligence and
+Statistics (AISTATS)
+, 2017.
+URL
+https://arxiv.org/abs/1602.05629
+.
+Neyshabur et al. (2020)
+Behnam Neyshabur, Hanie Sedghi, and Chiyuan Zhang.
+What is being transferred in transfer learning?
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2020.
+URL
+https://arxiv.org/abs/2008.11687
+.
+OpenAI et al. (2024)
+OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya,
+Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman,
+Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom,
+Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake
+Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg
+Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles
+Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany
+Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis
+Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben
+Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah
+Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien
+Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien
+Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix,
+Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges,
+Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes,
+Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross,
+Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen
+He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey,
+Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost
+Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang,
+Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan,
+Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar,
+Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim,
+Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz
+Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen
+Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade
+Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin,
+Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim
+Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie
+Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey,
+Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke
+Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel
+Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro
+Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long
+Ouyang, Cullen O’Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley
+Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex
+Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila
+Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael
+Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power,
+Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya
+Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob
+Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani
+Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman,
+Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker,
+Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin,
+Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher,
+Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas
+Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth
+Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe,
+Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright,
+Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann,
+Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave
+Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin
+Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan,
+Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao,
+Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph.
+Gpt-4 technical report, 2024.
+URL
+https://arxiv.org/abs/2303.08774
+.
+Peng et al. (2024)
+Bowen Peng, Jeffrey Quesnelle, and Diederik P. Kingma.
+Demo: Decoupled momentum optimization.
+arXiv preprint library
+, 2024.
+URL
+https://arxiv.org/abs/2411.19870
+.
+Presser (2020)
+Shawn Presser.
+Swarm training, 2020.
+URL
+https://battle.shawwn.com/swarm-training-v01a.pdf
+.
+Raffel et al. (2020)
+Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael
+Matena, Yanqi Zhou, Wei Li, and Peter J. Liu.
+Exploring the limits of transfer learning with a unified text-to-text
+transformer.
+Journal of Machine Learning Research
+, 2020.
+URL
+https://arxiv.org/abs/1910.10683
+.
+Ramé et al. (2023a)
+Alexandre Ramé, Kartik Ahuja, Jianyu Zhang, Matthieu Cord, Léon Bottou, and
+David Lopez-Paz.
+Model ratatouille: Recycling diverse models for out-of-distribution
+generalization.
+International Conference on Machine Learning (ICML)
+,
+2023a.
+URL
+https://arxiv.org/abs/2212.10445
+.
+Ramé et al. (2023b)
+Alexandre Ramé, Guillaume Couairon, Mustafa Shukor, Corentin Dancette,
+Jean-Baptiste Gaya, Laure Soulier, and Matthieu Cord.
+Rewarded soups: towards pareto-optimal alignment by interpolating
+weights fine-tuned on diverse rewards.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2023b.
+URL
+https://arxiv.org/abs/2306.04488
+.
+Ramé et al. (2023c)
+Alexandre Ramé, Matthieu Kirchmeyer, Thibaud Rahier, Alain Rakotomamonjy,
+Patrick Gallinari, and Matthieu Cord.
+Diverse weight averaging for out-of-distribution generalization.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2023c.
+URL
+https://arxiv.org/abs/2205.09739
+.
+Rebuffi et al. (2022)
+Sylvestre-Alvise Rebuffi, Francesco Croce, and Sven Gowal.
+Revisiting adapters with adversarial training.
+Proceedings of the International Conference on Learning
+Representations (ICLR)
+, 2022.
+URL
+https://arxiv.org/abs/2210.04886
+.
+Reddi et al. (2021)
+Sashank Reddi, Zachary Charles, Manzil Zaheer, Zachary Garrett, Keith Rush,
+Jakub Konečný, Sanjiv Kumar, and H. Brendan McMahan.
+Adaptive federated optimization.
+Proceedings of the International Conference on Learning
+Representations (ICLR)
+, 2021.
+URL
+https://arxiv.org/abs/2003.00295
+.
+Rush et al. (2024)
+Keith Rush, Zachary Charles, Zachary Garrett, Sean Augenstein, and Nicole
+Mitchell.
+Drjax: Scalable and differentiable mapreduce primitives in jax.
+International Conference on Machine Learning (ICML) Workshop
+,
+2024.
+URL
+https://arxiv.org/abs/2403.07128
+.
+Ryabinin et al. (2021)
+Max Ryabinin, Eduard Gorbunov, Vsevolod Plokhotnyuk, and Gennady Pekhimenko.
+Moshpit sgd: Communication-efficient decentralized training on
+heterogeneous unreliable devices.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2021.
+URL
+https://arxiv.org/abs/2103.03239
+.
+Ryabinin et al. (2023)
+Max Ryabinin, Tim Dettmers, Michael Diskin, and Alexander Borzunov.
+Swarm parallelism: Training large models can be surprisingly
+communication-efficient.
+International Conference on Machine Learning (ICML)
+, 2023.
+URL
+https://arxiv.org/abs/2301.11913
+.
+Sani et al. (2024a)
+Lorenzo Sani, Alex Iacob, Zeyu Cao, Royson Lee, Bill Marino, Yan Gao, Dongqi
+Cai, Zexi Li, Wanru Zhao, Xinchi Qiu, and Nicholas D. Lane.
+Photon: Federated llm pre-training.
+arXiv preprint library
+, 2024a.
+URL
+https://arxiv.org/abs/2411.02908
+.
+Sani et al. (2024b)
+Lorenzo Sani, Alex Iacob, Zeyu Cao, Bill Marino, Yan Gao, Tomas Paulik, Wanru
+Zhao, William F. Shen, Preslav Aleksandrov, Xinchi Qiu, and Nicholas D. Lane.
+The future of large language model pre-training is federated.
+arXiv preprint library
+, 2024b.
+URL
+https://arxiv.org/abs/2405.10853
+.
+Shazeer et al. (2017)
+Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le,
+Geoffrey Hinton, and Jeff Dean.
+Outrageously large neural networks: The sparsely-gated
+mixture-of-experts layer.
+arXiv preprint library
+, 2017.
+URL
+https://arxiv.org/abs/1701.06538
+.
+Shoeybi et al. (2020)
+Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper,
+and Bryan Catanzaro.
+Megatron-lm: Training multi-billion parameter language models using
+model parallelism.
+arXiv preprint library
+, 2020.
+URL
+https://arxiv.org/abs/1909.08053
+.
+Soldaini et al. (2024)
+Luca Soldaini, Rodney Kinney, Akshita Bhagia, Dustin Schwenk, David Atkinson,
+Russell Authur, Ben Bogin, Khyathi Chandu, Jennifer Dumas, Yanai Elazar,
+Valentin Hofmann, Ananya Harsh Jha, Sachin Kumar, Li Lucy, Xinxi Lyu, Nathan
+Lambert, Ian Magnusson, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik,
+Crystal Nam, Matthew E. Peters, Abhilasha Ravichander, Kyle Richardson,
+Zejiang Shen, Emma Strubell, Nishant Subramani, Oyvind Tafjord, Pete Walsh,
+Luke Zettlemoyer, Noah A. Smith, Hannaneh Hajishirzi, Iz Beltagy, Dirk
+Groeneveld, Jesse Dodge, and Kyle Lo.
+Dolma: an open corpus of three trillion tokens for language model
+pretraining research, 2024.
+URL
+https://arxiv.org/abs/2402.00159
+.
+Stich (2019)
+Sebastian U. Stich.
+Local SGD converges fast and communicates little.
+Proceedings of the International Conference on Learning
+Representations (ICLR)
+, 2019.
+URL
+https://arxiv.org/abs/1805.09767
+.
+Stoica et al. (2023)
+George Stoica, Daniel Bolya, Jakob Bjorner, Taylor Hearn, and Judy Hoffman.
+Zipit! merging models from different tasks without training.
+arXiv preprint library
+, 2023.
+URL
+https://arxiv.org/abs/2305.03053
+.
+Sutskever et al. (2013)
+Ilya Sutskever, James Martens, George Dahl, and Geoffrey Hinton.
+On the importance of initialization and momentum in deep learning.
+International Conference on Machine Learning (ICML)
+, 2013.
+URL
+https://proceedings.mlr.press/v28/sutskever13.html
+.
+Sutton (2019)
+Rich Sutton.
+The bitter lesson.
+incompleteideas.net
+, 2019.
+URL
+http://www.incompleteideas.net/IncIdeas/BitterLesson.html
+.
+Team et al. (2024)
+Gemini Team, Rohan Anil, Sebastian Borgeaud, Jean-Baptiste Alayrac, Jiahui Yu,
+Radu Soricut, Johan Schalkwyk, Andrew M. Dai, Anja Hauth, Katie Millican,
+David Silver, Melvin Johnson, Ioannis Antonoglou, Julian Schrittwieser,
+Amelia Glaese, Jilin Chen, Emily Pitler, Timothy Lillicrap, Angeliki
+Lazaridou, Orhan Firat, James Molloy, Michael Isard, Paul R. Barham, Tom
+Hennigan, Benjamin Lee, Fabio Viola, Malcolm Reynolds, Yuanzhong Xu, Ryan
+Doherty, Eli Collins, Clemens Meyer, Eliza Rutherford, Erica Moreira, Kareem
+Ayoub, Megha Goel, Jack Krawczyk, Cosmo Du, Ed Chi, Heng-Tze Cheng, Eric Ni,
+Purvi Shah, Patrick Kane, Betty Chan, Manaal Faruqui, Aliaksei Severyn,
+Hanzhao Lin, YaGuang Li, Yong Cheng, Abe Ittycheriah, Mahdis Mahdieh, Mia
+Chen, Pei Sun, Dustin Tran, Sumit Bagri, Balaji Lakshminarayanan, Jeremiah
+Liu, Andras Orban, Fabian Güra, Hao Zhou, Xinying Song, Aurelien Boffy,
+Harish Ganapathy, Steven Zheng, HyunJeong Choe, Ágoston Weisz, Tao Zhu,
+Yifeng Lu, Siddharth Gopal, Jarrod Kahn, Maciej Kula, Jeff Pitman, Rushin
+Shah, Emanuel Taropa, Majd Al Merey, Martin Baeuml, Zhifeng Chen, Laurent El
+Shafey, Yujing Zhang, Olcan Sercinoglu, George Tucker, Enrique Piqueras,
+Maxim Krikun, Iain Barr, Nikolay Savinov, Ivo Danihelka, Becca Roelofs,
+Anaïs White, Anders Andreassen, Tamara von Glehn, Lakshman Yagati, Mehran
+Kazemi, Lucas Gonzalez, Misha Khalman, Jakub Sygnowski, Alexandre Frechette,
+Charlotte Smith, Laura Culp, Lev Proleev, Yi Luan, Xi Chen, James Lottes,
+Nathan Schucher, Federico Lebron, Alban Rrustemi, Natalie Clay, Phil Crone,
+Tomas Kocisky, Jeffrey Zhao, Bartek Perz, Dian Yu, Heidi Howard, Adam
+Bloniarz, Jack W. Rae, Han Lu, Laurent Sifre, Marcello Maggioni, Fred
+Alcober, Dan Garrette, Megan Barnes, Shantanu Thakoor, Jacob Austin, Gabriel
+Barth-Maron, William Wong, Rishabh Joshi, Rahma Chaabouni, Deeni Fatiha, Arun
+Ahuja, Gaurav Singh Tomar, Evan Senter, Martin Chadwick, Ilya Kornakov,
+Nithya Attaluri, Iñaki Iturrate, Ruibo Liu, Yunxuan Li, Sarah Cogan, Jeremy
+Chen, Chao Jia, Chenjie Gu, Qiao Zhang, Jordan Grimstad, Ale Jakse Hartman,
+Xavier Garcia, Thanumalayan Sankaranarayana Pillai, Jacob Devlin, Michael
+Laskin, Diego de Las Casas, Dasha Valter, Connie Tao, Lorenzo Blanco,
+Adrià Puigdomènech Badia, David Reitter, Mianna Chen, Jenny Brennan, Clara
+Rivera, Sergey Brin, Shariq Iqbal, Gabriela Surita, Jane Labanowski, Abhi
+Rao, Stephanie Winkler, Emilio Parisotto, Yiming Gu, Kate Olszewska, Ravi
+Addanki, Antoine Miech, Annie Louis, Denis Teplyashin, Geoff Brown, Elliot
+Catt, Jan Balaguer, Jackie Xiang, Pidong Wang, Zoe Ashwood, Anton Briukhov,
+Albert Webson, Sanjay Ganapathy, Smit Sanghavi, Ajay Kannan, Ming-Wei Chang,
+Axel Stjerngren, Josip Djolonga, Yuting Sun, Ankur Bapna, Matthew Aitchison,
+Pedram Pejman, Henryk Michalewski, Tianhe Yu, Cindy Wang, Juliette Love,
+Junwhan Ahn, Dawn Bloxwich, Kehang Han, Peter Humphreys, Thibault Sellam,
+James Bradbury, Varun Godbole, Sina Samangooei, Bogdan Damoc, Alex Kaskasoli,
+Sébastien M. R. Arnold, Vijay Vasudevan, Shubham Agrawal, Jason Riesa,
+Dmitry Lepikhin, Richard Tanburn, Srivatsan Srinivasan, Hyeontaek Lim, Sarah
+Hodkinson, Pranav Shyam, Johan Ferret, Steven Hand, Ankush Garg, Tom Le
+Paine, Jian Li, Yujia Li, Minh Giang, Alexander Neitz, Zaheer Abbas, Sarah
+York, Machel Reid, Elizabeth Cole, Aakanksha Chowdhery, Dipanjan Das,
+Dominika Rogozińska, Vitaliy Nikolaev, Pablo Sprechmann, Zachary Nado, Lukas
+Zilka, Flavien Prost, Luheng He, Marianne Monteiro, Gaurav Mishra, Chris
+Welty, Josh Newlan, Dawei Jia, Miltiadis Allamanis, Clara Huiyi Hu, Raoul
+de Liedekerke, Justin Gilmer, Carl Saroufim, Shruti Rijhwani, Shaobo Hou,
+Disha Shrivastava, Anirudh Baddepudi, Alex Goldin, Adnan Ozturel, Albin
+Cassirer, Yunhan Xu, Daniel Sohn, Devendra Sachan, Reinald Kim Amplayo, Craig
+Swanson, Dessie Petrova, Shashi Narayan, Arthur Guez, Siddhartha Brahma,
+Jessica Landon, Miteyan Patel, Ruizhe Zhao, Kevin Villela, Luyu Wang, Wenhao
+Jia, Matthew Rahtz, Mai Giménez, Legg Yeung, James Keeling, Petko Georgiev,
+Diana Mincu, Boxi Wu, Salem Haykal, Rachel Saputro, Kiran Vodrahalli, James
+Qin, Zeynep Cankara, Abhanshu Sharma, Nick Fernando, Will Hawkins, Behnam
+Neyshabur, Solomon Kim, Adrian Hutter, Priyanka Agrawal, Alex Castro-Ros,
+George van den Driessche, Tao Wang, Fan Yang, Shuo yiin Chang, Paul Komarek,
+Ross McIlroy, Mario Lučić, Guodong Zhang, Wael Farhan, Michael Sharman,
+Paul Natsev, Paul Michel, Yamini Bansal, Siyuan Qiao, Kris Cao, Siamak
+Shakeri, Christina Butterfield, Justin Chung, Paul Kishan Rubenstein, Shivani
+Agrawal, Arthur Mensch, Kedar Soparkar, Karel Lenc, Timothy Chung, Aedan
+Pope, Loren Maggiore, Jackie Kay, Priya Jhakra, Shibo Wang, Joshua Maynez,
+Mary Phuong, Taylor Tobin, Andrea Tacchetti, Maja Trebacz, Kevin Robinson,
+Yash Katariya, Sebastian Riedel, Paige Bailey, Kefan Xiao, Nimesh Ghelani,
+Lora Aroyo, Ambrose Slone, Neil Houlsby, Xuehan Xiong, Zhen Yang, Elena
+Gribovskaya, Jonas Adler, Mateo Wirth, Lisa Lee, Music Li, Thais Kagohara,
+Jay Pavagadhi, Sophie Bridgers, Anna Bortsova, Sanjay Ghemawat, Zafarali
+Ahmed, Tianqi Liu, Richard Powell, Vijay Bolina, Mariko Iinuma, Polina
+Zablotskaia, James Besley, Da-Woon Chung, Timothy Dozat, Ramona Comanescu,
+Xiance Si, Jeremy Greer, Guolong Su, Martin Polacek, Raphaël Lopez Kaufman,
+Simon Tokumine, Hexiang Hu, Elena Buchatskaya, Yingjie Miao, Mohamed
+Elhawaty, Aditya Siddhant, Nenad Tomasev, Jinwei Xing, Christina Greer, Helen
+Miller, Shereen Ashraf, Aurko Roy, Zizhao Zhang, Ada Ma, Angelos Filos, Milos
+Besta, Rory Blevins, Ted Klimenko, Chih-Kuan Yeh, Soravit Changpinyo, Jiaqi
+Mu, Oscar Chang, Mantas Pajarskas, Carrie Muir, Vered Cohen, Charline Le Lan,
+Krishna Haridasan, Amit Marathe, Steven Hansen, Sholto Douglas, Rajkumar
+Samuel, Mingqiu Wang, Sophia Austin, Chang Lan, Jiepu Jiang, Justin Chiu,
+Jaime Alonso Lorenzo, Lars Lowe Sjösund, Sébastien Cevey, Zach Gleicher,
+Thi Avrahami, Anudhyan Boral, Hansa Srinivasan, Vittorio Selo, Rhys May,
+Konstantinos Aisopos, Léonard Hussenot, Livio Baldini Soares, Kate Baumli,
+Michael B. Chang, Adrià Recasens, Ben Caine, Alexander Pritzel, Filip
+Pavetic, Fabio Pardo, Anita Gergely, Justin Frye, Vinay Ramasesh, Dan Horgan,
+Kartikeya Badola, Nora Kassner, Subhrajit Roy, Ethan Dyer, Víctor Campos
+Campos, Alex Tomala, Yunhao Tang, Dalia El Badawy, Elspeth White, Basil
+Mustafa, Oran Lang, Abhishek Jindal, Sharad Vikram, Zhitao Gong, Sergi
+Caelles, Ross Hemsley, Gregory Thornton, Fangxiaoyu Feng, Wojciech Stokowiec,
+Ce Zheng, Phoebe Thacker, Çağlar Ünlü, Zhishuai Zhang, Mohammad Saleh,
+James Svensson, Max Bileschi, Piyush Patil, Ankesh Anand, Roman Ring,
+Katerina Tsihlas, Arpi Vezer, Marco Selvi, Toby Shevlane, Mikel Rodriguez,
+Tom Kwiatkowski, Samira Daruki, Keran Rong, Allan Dafoe, Nicholas FitzGerald,
+Keren Gu-Lemberg, Mina Khan, Lisa Anne Hendricks, Marie Pellat, Vladimir
+Feinberg, James Cobon-Kerr, Tara Sainath, Maribeth Rauh, Sayed Hadi Hashemi,
+Richard Ives, Yana Hasson, Eric Noland, Yuan Cao, Nathan Byrd, Le Hou, Qingze
+Wang, Thibault Sottiaux, Michela Paganini, Jean-Baptiste Lespiau, Alexandre
+Moufarek, Samer Hassan, Kaushik Shivakumar, Joost van Amersfoort, Amol
+Mandhane, Pratik Joshi, Anirudh Goyal, Matthew Tung, Andrew Brock, Hannah
+Sheahan, Vedant Misra, Cheng Li, Nemanja Rakićević, Mostafa Dehghani,
+Fangyu Liu, Sid Mittal, Junhyuk Oh, Seb Noury, Eren Sezener, Fantine Huot,
+Matthew Lamm, Nicola De Cao, Charlie Chen, Sidharth Mudgal, Romina Stella,
+Kevin Brooks, Gautam Vasudevan, Chenxi Liu, Mainak Chain, Nivedita Melinkeri,
+Aaron Cohen, Venus Wang, Kristie Seymore, Sergey Zubkov, Rahul Goel, Summer
+Yue, Sai Krishnakumaran, Brian Albert, Nate Hurley, Motoki Sano, Anhad
+Mohananey, Jonah Joughin, Egor Filonov, Tomasz Kępa, Yomna Eldawy, Jiawern
+Lim, Rahul Rishi, Shirin Badiezadegan, Taylor Bos, Jerry Chang, Sanil Jain,
+Sri Gayatri Sundara Padmanabhan, Subha Puttagunta, Kalpesh Krishna, Leslie
+Baker, Norbert Kalb, Vamsi Bedapudi, Adam Kurzrok, Shuntong Lei, Anthony Yu,
+Oren Litvin, Xiang Zhou, Zhichun Wu, Sam Sobell, Andrea Siciliano, Alan
+Papir, Robby Neale, Jonas Bragagnolo, Tej Toor, Tina Chen, Valentin Anklin,
+Feiran Wang, Richie Feng, Milad Gholami, Kevin Ling, Lijuan Liu, Jules
+Walter, Hamid Moghaddam, Arun Kishore, Jakub Adamek, Tyler Mercado, Jonathan
+Mallinson, Siddhinita Wandekar, Stephen Cagle, Eran Ofek, Guillermo Garrido,
+Clemens Lombriser, Maksim Mukha, Botu Sun, Hafeezul Rahman Mohammad, Josip
+Matak, Yadi Qian, Vikas Peswani, Pawel Janus, Quan Yuan, Leif Schelin, Oana
+David, Ankur Garg, Yifan He, Oleksii Duzhyi, Anton Älgmyr, Timothée Lottaz,
+Qi Li, Vikas Yadav, Luyao Xu, Alex Chinien, Rakesh Shivanna, Aleksandr
+Chuklin, Josie Li, Carrie Spadine, Travis Wolfe, Kareem Mohamed, Subhabrata
+Das, Zihang Dai, Kyle He, Daniel von Dincklage, Shyam Upadhyay, Akanksha
+Maurya, Luyan Chi, Sebastian Krause, Khalid Salama, Pam G Rabinovitch, Pavan
+Kumar Reddy M, Aarush Selvan, Mikhail Dektiarev, Golnaz Ghiasi, Erdem Guven,
+Himanshu Gupta, Boyi Liu, Deepak Sharma, Idan Heimlich Shtacher, Shachi Paul,
+Oscar Akerlund, François-Xavier Aubet, Terry Huang, Chen Zhu, Eric Zhu,
+Elico Teixeira, Matthew Fritze, Francesco Bertolini, Liana-Eleonora
+Marinescu, Martin Bölle, Dominik Paulus, Khyatti Gupta, Tejasi Latkar, Max
+Chang, Jason Sanders, Roopa Wilson, Xuewei Wu, Yi-Xuan Tan, Lam Nguyen Thiet,
+Tulsee Doshi, Sid Lall, Swaroop Mishra, Wanming Chen, Thang Luong, Seth
+Benjamin, Jasmine Lee, Ewa Andrejczuk, Dominik Rabiej, Vipul Ranjan,
+Krzysztof Styrc, Pengcheng Yin, Jon Simon, Malcolm Rose Harriott, Mudit
+Bansal, Alexei Robsky, Geoff Bacon, David Greene, Daniil Mirylenka, Chen
+Zhou, Obaid Sarvana, Abhimanyu Goyal, Samuel Andermatt, Patrick Siegler, Ben
+Horn, Assaf Israel, Francesco Pongetti, Chih-Wei "Louis" Chen, Marco
+Selvatici, Pedro Silva, Kathie Wang, Jackson Tolins, Kelvin Guu, Roey Yogev,
+Xiaochen Cai, Alessandro Agostini, Maulik Shah, Hung Nguyen, Noah Ó
+Donnaile, Sébastien Pereira, Linda Friso, Adam Stambler, Adam Kurzrok,
+Chenkai Kuang, Yan Romanikhin, Mark Geller, ZJ Yan, Kane Jang, Cheng-Chun
+Lee, Wojciech Fica, Eric Malmi, Qijun Tan, Dan Banica, Daniel Balle, Ryan
+Pham, Yanping Huang, Diana Avram, Hongzhi Shi, Jasjot Singh, Chris Hidey,
+Niharika Ahuja, Pranab Saxena, Dan Dooley, Srividya Pranavi Potharaju, Eileen
+O’Neill, Anand Gokulchandran, Ryan Foley, Kai Zhao, Mike Dusenberry, Yuan
+Liu, Pulkit Mehta, Ragha Kotikalapudi, Chalence Safranek-Shrader, Andrew
+Goodman, Joshua Kessinger, Eran Globen, Prateek Kolhar, Chris Gorgolewski,
+Ali Ibrahim, Yang Song, Ali Eichenbaum, Thomas Brovelli, Sahitya Potluri,
+Preethi Lahoti, Cip Baetu, Ali Ghorbani, Charles Chen, Andy Crawford, Shalini
+Pal, Mukund Sridhar, Petru Gurita, Asier Mujika, Igor Petrovski, Pierre-Louis
+Cedoz, Chenmei Li, Shiyuan Chen, Niccolò Dal Santo, Siddharth Goyal, Jitesh
+Punjabi, Karthik Kappaganthu, Chester Kwak, Pallavi LV, Sarmishta Velury,
+Himadri Choudhury, Jamie Hall, Premal Shah, Ricardo Figueira, Matt Thomas,
+Minjie Lu, Ting Zhou, Chintu Kumar, Thomas Jurdi, Sharat Chikkerur, Yenai Ma,
+Adams Yu, Soo Kwak, Victor Ähdel, Sujeevan Rajayogam, Travis Choma, Fei Liu,
+Aditya Barua, Colin Ji, Ji Ho Park, Vincent Hellendoorn, Alex Bailey, Taylan
+Bilal, Huanjie Zhou, Mehrdad Khatir, Charles Sutton, Wojciech Rzadkowski,
+Fiona Macintosh, Konstantin Shagin, Paul Medina, Chen Liang, Jinjing Zhou,
+Pararth Shah, Yingying Bi, Attila Dankovics, Shipra Banga, Sabine Lehmann,
+Marissa Bredesen, Zifan Lin, John Eric Hoffmann, Jonathan Lai, Raynald Chung,
+Kai Yang, Nihal Balani, Arthur Bražinskas, Andrei Sozanschi, Matthew Hayes,
+Héctor Fernández Alcalde, Peter Makarov, Will Chen, Antonio Stella,
+Liselotte Snijders, Michael Mandl, Ante Kärrman, Paweł Nowak, Xinyi Wu,
+Alex Dyck, Krishnan Vaidyanathan, Raghavender R, Jessica Mallet, Mitch
+Rudominer, Eric Johnston, Sushil Mittal, Akhil Udathu, Janara Christensen,
+Vishal Verma, Zach Irving, Andreas Santucci, Gamaleldin Elsayed, Elnaz
+Davoodi, Marin Georgiev, Ian Tenney, Nan Hua, Geoffrey Cideron, Edouard
+Leurent, Mahmoud Alnahlawi, Ionut Georgescu, Nan Wei, Ivy Zheng, Dylan
+Scandinaro, Heinrich Jiang, Jasper Snoek, Mukund Sundararajan, Xuezhi Wang,
+Zack Ontiveros, Itay Karo, Jeremy Cole, Vinu Rajashekhar, Lara Tumeh, Eyal
+Ben-David, Rishub Jain, Jonathan Uesato, Romina Datta, Oskar Bunyan, Shimu
+Wu, John Zhang, Piotr Stanczyk, Ye Zhang, David Steiner, Subhajit Naskar,
+Michael Azzam, Matthew Johnson, Adam Paszke, Chung-Cheng Chiu, Jaume Sanchez
+Elias, Afroz Mohiuddin, Faizan Muhammad, Jin Miao, Andrew Lee, Nino
+Vieillard, Jane Park, Jiageng Zhang, Jeff Stanway, Drew Garmon, Abhijit
+Karmarkar, Zhe Dong, Jong Lee, Aviral Kumar, Luowei Zhou, Jonathan Evens,
+William Isaac, Geoffrey Irving, Edward Loper, Michael Fink, Isha Arkatkar,
+Nanxin Chen, Izhak Shafran, Ivan Petrychenko, Zhe Chen, Johnson Jia, Anselm
+Levskaya, Zhenkai Zhu, Peter Grabowski, Yu Mao, Alberto Magni, Kaisheng Yao,
+Javier Snaider, Norman Casagrande, Evan Palmer, Paul Suganthan, Alfonso
+Castaño, Irene Giannoumis, Wooyeol Kim, Mikołaj Rybiński, Ashwin
+Sreevatsa, Jennifer Prendki, David Soergel, Adrian Goedeckemeyer, Willi
+Gierke, Mohsen Jafari, Meenu Gaba, Jeremy Wiesner, Diana Gage Wright, Yawen
+Wei, Harsha Vashisht, Yana Kulizhskaya, Jay Hoover, Maigo Le, Lu Li, Chimezie
+Iwuanyanwu, Lu Liu, Kevin Ramirez, Andrey Khorlin, Albert Cui, Tian LIN,
+Marcus Wu, Ricardo Aguilar, Keith Pallo, Abhishek Chakladar, Ginger Perng,
+Elena Allica Abellan, Mingyang Zhang, Ishita Dasgupta, Nate Kushman, Ivo
+Penchev, Alena Repina, Xihui Wu, Tom van der Weide, Priya Ponnapalli,
+Caroline Kaplan, Jiri Simsa, Shuangfeng Li, Olivier Dousse, Fan Yang, Jeff
+Piper, Nathan Ie, Rama Pasumarthi, Nathan Lintz, Anitha Vijayakumar, Daniel
+Andor, Pedro Valenzuela, Minnie Lui, Cosmin Paduraru, Daiyi Peng, Katherine
+Lee, Shuyuan Zhang, Somer Greene, Duc Dung Nguyen, Paula Kurylowicz, Cassidy
+Hardin, Lucas Dixon, Lili Janzer, Kiam Choo, Ziqiang Feng, Biao Zhang,
+Achintya Singhal, Dayou Du, Dan McKinnon, Natasha Antropova, Tolga Bolukbasi,
+Orgad Keller, David Reid, Daniel Finchelstein, Maria Abi Raad, Remi Crocker,
+Peter Hawkins, Robert Dadashi, Colin Gaffney, Ken Franko, Anna Bulanova,
+Rémi Leblond, Shirley Chung, Harry Askham, Luis C. Cobo, Kelvin Xu, Felix
+Fischer, Jun Xu, Christina Sorokin, Chris Alberti, Chu-Cheng Lin, Colin
+Evans, Alek Dimitriev, Hannah Forbes, Dylan Banarse, Zora Tung, Mark
+Omernick, Colton Bishop, Rachel Sterneck, Rohan Jain, Jiawei Xia, Ehsan Amid,
+Francesco Piccinno, Xingyu Wang, Praseem Banzal, Daniel J. Mankowitz, Alex
+Polozov, Victoria Krakovna, Sasha Brown, MohammadHossein Bateni, Dennis Duan,
+Vlad Firoiu, Meghana Thotakuri, Tom Natan, Matthieu Geist, Ser tan Girgin,
+Hui Li, Jiayu Ye, Ofir Roval, Reiko Tojo, Michael Kwong, James Lee-Thorp,
+Christopher Yew, Danila Sinopalnikov, Sabela Ramos, John Mellor, Abhishek
+Sharma, Kathy Wu, David Miller, Nicolas Sonnerat, Denis Vnukov, Rory Greig,
+Jennifer Beattie, Emily Caveness, Libin Bai, Julian Eisenschlos, Alex
+Korchemniy, Tomy Tsai, Mimi Jasarevic, Weize Kong, Phuong Dao, Zeyu Zheng,
+Frederick Liu, Fan Yang, Rui Zhu, Tian Huey Teh, Jason Sanmiya, Evgeny
+Gladchenko, Nejc Trdin, Daniel Toyama, Evan Rosen, Sasan Tavakkol, Linting
+Xue, Chen Elkind, Oliver Woodman, John Carpenter, George Papamakarios, Rupert
+Kemp, Sushant Kafle, Tanya Grunina, Rishika Sinha, Alice Talbert, Diane Wu,
+Denese Owusu-Afriyie, Cosmo Du, Chloe Thornton, Jordi Pont-Tuset, Pradyumna
+Narayana, Jing Li, Saaber Fatehi, John Wieting, Omar Ajmeri, Benigno Uria,
+Yeongil Ko, Laura Knight, Amélie Héliou, Ning Niu, Shane Gu, Chenxi Pang,
+Yeqing Li, Nir Levine, Ariel Stolovich, Rebeca Santamaria-Fernandez, Sonam
+Goenka, Wenny Yustalim, Robin Strudel, Ali Elqursh, Charlie Deck, Hyo Lee,
+Zonglin Li, Kyle Levin, Raphael Hoffmann, Dan Holtmann-Rice, Olivier Bachem,
+Sho Arora, Christy Koh, Soheil Hassas Yeganeh, Siim Põder, Mukarram Tariq,
+Yanhua Sun, Lucian Ionita, Mojtaba Seyedhosseini, Pouya Tafti, Zhiyu Liu,
+Anmol Gulati, Jasmine Liu, Xinyu Ye, Bart Chrzaszcz, Lily Wang, Nikhil Sethi,
+Tianrun Li, Ben Brown, Shreya Singh, Wei Fan, Aaron Parisi, Joe Stanton,
+Vinod Koverkathu, Christopher A. Choquette-Choo, Yunjie Li, TJ Lu, Abe
+Ittycheriah, Prakash Shroff, Mani Varadarajan, Sanaz Bahargam, Rob
+Willoughby, David Gaddy, Guillaume Desjardins, Marco Cornero, Brona Robenek,
+Bhavishya Mittal, Ben Albrecht, Ashish Shenoy, Fedor Moiseev, Henrik
+Jacobsson, Alireza Ghaffarkhah, Morgane Rivière, Alanna Walton, Clément
+Crepy, Alicia Parrish, Zongwei Zhou, Clement Farabet, Carey Radebaugh,
+Praveen Srinivasan, Claudia van der Salm, Andreas Fidjeland, Salvatore
+Scellato, Eri Latorre-Chimoto, Hanna Klimczak-Plucińska, David Bridson,
+Dario de Cesare, Tom Hudson, Piermaria Mendolicchio, Lexi Walker, Alex
+Morris, Matthew Mauger, Alexey Guseynov, Alison Reid, Seth Odoom, Lucia
+Loher, Victor Cotruta, Madhavi Yenugula, Dominik Grewe, Anastasia
+Petrushkina, Tom Duerig, Antonio Sanchez, Steve Yadlowsky, Amy Shen, Amir
+Globerson, Lynette Webb, Sahil Dua, Dong Li, Surya Bhupatiraju, Dan Hurt,
+Haroon Qureshi, Ananth Agarwal, Tomer Shani, Matan Eyal, Anuj Khare,
+Shreyas Rammohan Belle, Lei Wang, Chetan Tekur, Mihir Sanjay Kale, Jinliang
+Wei, Ruoxin Sang, Brennan Saeta, Tyler Liechty, Yi Sun, Yao Zhao, Stephan
+Lee, Pandu Nayak, Doug Fritz, Manish Reddy Vuyyuru, John Aslanides, Nidhi
+Vyas, Martin Wicke, Xiao Ma, Evgenii Eltyshev, Nina Martin, Hardie Cate,
+James Manyika, Keyvan Amiri, Yelin Kim, Xi Xiong, Kai Kang, Florian Luisier,
+Nilesh Tripuraneni, David Madras, Mandy Guo, Austin Waters, Oliver Wang,
+Joshua Ainslie, Jason Baldridge, Han Zhang, Garima Pruthi, Jakob Bauer, Feng
+Yang, Riham Mansour, Jason Gelman, Yang Xu, George Polovets, Ji Liu, Honglong
+Cai, Warren Chen, XiangHai Sheng, Emily Xue, Sherjil Ozair, Christof
+Angermueller, Xiaowei Li, Anoop Sinha, Weiren Wang, Julia Wiesinger,
+Emmanouil Koukoumidis, Yuan Tian, Anand Iyer, Madhu Gurumurthy, Mark
+Goldenson, Parashar Shah, MK Blake, Hongkun Yu, Anthony Urbanowicz,
+Jennimaria Palomaki, Chrisantha Fernando, Ken Durden, Harsh Mehta, Nikola
+Momchev, Elahe Rahimtoroghi, Maria Georgaki, Amit Raul, Sebastian Ruder,
+Morgan Redshaw, Jinhyuk Lee, Denny Zhou, Komal Jalan, Dinghua Li, Blake
+Hechtman, Parker Schuh, Milad Nasr, Kieran Milan, Vladimir Mikulik, Juliana
+Franco, Tim Green, Nam Nguyen, Joe Kelley, Aroma Mahendru, Andrea Hu, Joshua
+Howland, Ben Vargas, Jeffrey Hui, Kshitij Bansal, Vikram Rao, Rakesh Ghiya,
+Emma Wang, Ke Ye, Jean Michel Sarr, Melanie Moranski Preston, Madeleine
+Elish, Steve Li, Aakash Kaku, Jigar Gupta, Ice Pasupat, Da-Cheng Juan, Milan
+Someswar, Tejvi M., Xinyun Chen, Aida Amini, Alex Fabrikant, Eric Chu, Xuanyi
+Dong, Amruta Muthal, Senaka Buthpitiya, Sarthak Jauhari, Nan Hua, Urvashi
+Khandelwal, Ayal Hitron, Jie Ren, Larissa Rinaldi, Shahar Drath, Avigail
+Dabush, Nan-Jiang Jiang, Harshal Godhia, Uli Sachs, Anthony Chen, Yicheng
+Fan, Hagai Taitelbaum, Hila Noga, Zhuyun Dai, James Wang, Chen Liang, Jenny
+Hamer, Chun-Sung Ferng, Chenel Elkind, Aviel Atias, Paulina Lee, Vít
+Listík, Mathias Carlen, Jan van de Kerkhof, Marcin Pikus, Krunoslav Zaher,
+Paul Müller, Sasha Zykova, Richard Stefanec, Vitaly Gatsko, Christoph
+Hirnschall, Ashwin Sethi, Xingyu Federico Xu, Chetan Ahuja, Beth Tsai, Anca
+Stefanoiu, Bo Feng, Keshav Dhandhania, Manish Katyal, Akshay Gupta, Atharva
+Parulekar, Divya Pitta, Jing Zhao, Vivaan Bhatia, Yashodha Bhavnani, Omar
+Alhadlaq, Xiaolin Li, Peter Danenberg, Dennis Tu, Alex Pine, Vera Filippova,
+Abhipso Ghosh, Ben Limonchik, Bhargava Urala, Chaitanya Krishna Lanka, Derik
+Clive, Yi Sun, Edward Li, Hao Wu, Kevin Hongtongsak, Ianna Li, Kalind
+Thakkar, Kuanysh Omarov, Kushal Majmundar, Michael Alverson, Michael
+Kucharski, Mohak Patel, Mudit Jain, Maksim Zabelin, Paolo Pelagatti, Rohan
+Kohli, Saurabh Kumar, Joseph Kim, Swetha Sankar, Vineet Shah, Lakshmi
+Ramachandruni, Xiangkai Zeng, Ben Bariach, Laura Weidinger, Tu Vu, Alek
+Andreev, Antoine He, Kevin Hui, Sheleem Kashem, Amar Subramanya, Sissie
+Hsiao, Demis Hassabis, Koray Kavukcuoglu, Adam Sadovsky, Quoc Le, Trevor
+Strohman, Yonghui Wu, Slav Petrov, Jeffrey Dean, and Oriol Vinyals.
+Gemini: A family of highly capable multimodal models, 2024.
+URL
+https://arxiv.org/abs/2312.11805
+.
+Vogels et al. (2019)
+Thijs Vogels, Sai Praneeth Karimireddy, and Martin Jaggi.
+Powersgd: Practical low-rank gradient compression for distributed
+optimization.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2019.
+URL
+https://arxiv.org/abs/1905.13727
+.
+Wang et al. (2024)
+Haolin Wang, Xuefeng Liu, Jianwei Niu, Wenkai Guo, and Shaojie Tang.
+Why go full? elevating federated learning through partial network
+updates.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2024.
+URL
+https://arxiv.org/abs/2410.11559
+.
+Wang et al. (2023)
+Jue Wang, Yucheng Lu, Binhang Yuan, Beidi Chen, Percy Liang, Christopher De Sa,
+Christopher Re, and Ce Zhang.
+Cocktailsgd: fine-tuning foundation models over 500mbps networks.
+International Conference on Machine Learning (ICML)
+, 2023.
+URL
+https://openreview.net/forum?id=w2Vrl0zlzA
+.
+Wen et al. (2022)
+Dingzhu Wen, Ki-Jun Jeon, and Kaibin Huang.
+Federated dropout – a simple approach for enabling federated
+learning on resource constrained devices.
+IEEE Wireless Communications Letters
+, 2022.
+URL
+https://arxiv.org/abs/2109.15258
+.
+Wortsman et al. (2021)
+Mitchell Wortsman, Maxwell Horton, Carlos Guestrin, Ali Farhadi, and Mohammad
+Rastegari.
+Learning neural network subspaces.
+International Conference on Machine Learning (ICML)
+, 2021.
+URL
+https://arxiv.org/abs/2102.10472
+.
+Wortsman et al. (2022)
+Mitchell Wortsman, Gabriel Ilharco, Samir Ya Gadre, Rebecca Roelofs, Raphael
+Gontijo-Lopes, Ari S Morcos, Hongseok Namkoong, Ali Farhadi, Yair Carmon,
+Simon Kornblith, and Ludwig Schmidt.
+Model soups: averaging weights of multiple fine-tuned models improves
+accuracy without increasing inference time.
+International Conference on Machine Learning (ICML)
+, 2022.
+URL
+https://arxiv.org/abs/2203.05482
+.
+Wortsman et al. (2023)
+Mitchell Wortsman, Peter J. Liu, Lechao Xiao, Katie Everett, Alex Alemi, Ben
+Adlam, John D. Co-Reyes, Izzeddin Gur, Abhishek Kumar, Roman Novak, Jeffrey
+Pennington, Jascha Sohl-dickstein, Kelvin Xu, Jaehoon Lee, Justin Gilmer, and
+Simon Kornblith.
+Small-scale proxies for large-scale transformer training
+instabilities.
+arXiv preprint library
+, 2023.
+URL
+https://arxiv.org/abs/2309.14322
+.
+Yadav et al. (2023)
+Prateek Yadav, Derek Tam, Leshem Choshen, Colin Raffel, and Mohit Bansal.
+Ties-merging: Resolving interference when merging models.
+Advances in Neural Information Processing Systems (NeurIPS)
+,
+2023.
+URL
+https://arxiv.org/abs/2306.01708
+.
+Yu et al. (2024)
+Le Yu, Bowen Yu, Haiyang Yu, Fei Huang, and Yongbin Li.
+Language models are super mario: Absorbing abilities from homologous
+models as a free lunch.
+International Conference on Machine Learning (ICML)
+, 2024.
+URL
+https://arxiv.org/abs/2311.03099
+.
+Zellers et al. (2019)
+Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi.
+Hellaswag: Can a machine really finish your sentence?
+Proceedings of the Annual Meeting of the Association for
+Computational Linguistics (ACL Short Papers)
+, 2019.
+URL
+https://arxiv.org/abs/1905.07830
+.
+Zhao et al. (2024)
+Jiawei Zhao, Zhenyu Zhang, Beidi Chen, Zhangyang Wang, Anima Anandkumar, and
+Yuandong Tian.
+Galore: Memory-efficient llm training by gradient low-rank
+projection.
+International Conference on Machine Learning (ICML)
+, 2024.
+URL
+https://arxiv.org/abs/2403.03507
+.
+Zhao et al. (2023)
+Yanli Zhao, Andrew Gu, Rohan Varma, Liang Luo, Chien-Chin Huang, Min Xu, Less
+Wright, Hamid Shojanazeri, Myle Ott, Sam Shleifer, Alban Desmaison, Can
+Balioglu, Pritam Damania, Bernard Nguyen, Geeta Chauhan, Yuchen Hao, Ajit
+Mathews, and Shen Li.
+Pytorch fsdp: Experiences on scaling fully sharded data parallel,
+2023.
+URL
+https://arxiv.org/abs/2304.11277
+.
+Supplementary Materials
+Architecture hyperparameters.
+We detail the architecture across model scales in
+Table 2
+. The token budget per scale is computed from the chinchilla-optimal amount of flops
+(Hoffmann et al.,
+2022
+)
+.
+| Model scale | Hidden dim | Num layers | Num heads | Token budget |
+|---|---|---|---|---|
+| 35M | 2,048 | 6 | 8 | 700M |
+| 100M | 3,072 | 9 | 12 | 1.5B |
+| 200M | 4,096 | 12 | 16 | 3.5B |
+| 300M | 5,120 | 15 | 20 | 6B |
+| 500M | 6,144 | 18 | 24 | 11B |
+| 1B | 8,192 | 24 | 32 | 25B |
+| 4B | 12,288 | 36 | 48 | 83B |
+Table 2
+:
+Architecture hyperparameters
+: we consider models from 35M to 4B with the following hyperparameters and Chinchilla-optimal token budget. For all model scales, the vocabulary size is $32{,}000$.
+Number of replicas.
+We perform our main experiments with 2 replicas for simplicity but showcase in
+Figure 12
+an ablation of DiLoCo vs Streaming DiLoCo where the number of replicas $M$ varies from 2 to 8. Contrary to
+(Douillard et al.,
+2024a
+)
+, we keep the total token budget constant. In
+12(a)
+, we keep the global batch size constant, and thus reduce the local per-replica batch size. In
+12(b)
+, we keep the local per-replica batch size constant, and thus increase the global batch size but also reduce the total number of steps.
+(a)
+Keeping the
+global
+batch size constant
+, and thus decreasing the
+local
+per-replica batch size.
+(b)
+Keeping the
+local
+per-replica batch size constant
+, and thus increasing the
+global
+batch size.
+Figure 12
+:
+Scaling the number of DiLoCo replicas
+$M$ from $M=2$ to $M=4$. For all experiments, the token budget is kept constant.
+Number of inner steps.
+The number of inner steps $H$ has an engineering effect and a learning effect: a larger $H$ means less frequent synchronization and thus less required bandwidth. On the other hand, a too small $H$ produces noisy, small-normed outer gradients, and a too high $H$ will see replicas drifting apart. Therefore, some middle ground needs to be found. We ablate in
+Figure 13
+and find that while Streaming DiLoCo has similar behavior to DiLoCo when $H$ increases, it is more robust to low values of $H$.
+Figure 13
+:
+Varying
+the number of inner steps $H$ for DiLoCo and Streaming DiLoCo while keeping the total number of steps constant. A lower $H$ means more communication rounds to be done.
+Which parameters to evaluate.
+We considered multiple subsets of the parameters to use for evaluation: 1) the arbitrarily chosen first replica ($\theta_{1}$), 2) an average of all replicas ($\frac{1}{M}\sum_{m=1}^{M}\theta_{m}$), or 3) the globally shared outer parameters ($\theta$). Note that the latter is made of fragments that were synchronized at different points in time. We show the performance of each subset in
+Table 3
+: The difference here between these methods is small, but the outer parameters yield slightly better performance.
+| Parameters evaluated | Eval Loss | HellaSwag |
+|---|---|---|
+| First replica | 2.77 | 37.77 |
+| Replicas average | 2.68 | 37.72 |
+| Outer parameters | 2.67 | 37.78 |
+Table 3
+:
+Which parameters to evaluate?
+: Evaluating the outer parameters, where each fragment has been synchronized at a different moment in time, yields better performance than any inner parameters.
+Sequential vs strided patterns.
+The choice of the synchronization pattern (
+Figure 2
+), has a slight impact on the ML performance (
+6(a)
+) but also on the compute utilization (
+Figure 7
+). Indeed, as better seen in
+Figure 14
+, the strided pattern never has multiple early layers to be synchronized together. Therefore, it is easier to overlap their communication with the first few layers’ forward of the next step.
+Figure 14
+:
+Simulation of a schedule interleaving forward passes (in
+blue
+), backward passes w.r.t. activations and weights (resp. in
+light
+and
+dark green
+), and (outer) gradient reduction (in
+purple
+) for Streaming DiLoCo, respectively with a sequential and strided pattern.
+Compute utilization.
+We report in
+Table 4
+the amount of Gbit/s required, per method, to reach a certain level of compute utilization. See
+Figure 4
+for a figure view of this table. For DiLoCo and Streaming DiLoCo (and variants thereof), we use $H=100$ inner steps. For Streaming DiLoCo (and variants thereof), we use a fixed fragment size of 3 layers; therefore, deeper networks have more fragments: for 1B, 10B, and 100B model scales, it is respectively 8, 16, and 36 fragments. Also, respectively per model scale, a fragment is synchronized every 11, 5, and 2 steps. While the synchronization seems to be more frequent for deeper networks, from the perspective of a particular fragment, it is synchronized roughly every $H=100$ steps. To estimate the compute utilization in
+Table 4
+and
+Figure 4
+, the time spent per step doing computation (forward & backward) is critical: we report respectively 0.1s, 0.8s, and 4.9s based on each model scale flops profile, a reasonable amount of chips, and an MFU of $60\%$.
+Gbit/s to reach a compute utilization $\texttt{CU}=$ ?
+
+| Model size | # layers | Step time | Method | 50% | 80% | 90% | 95% | 99% |
+|---|---|---|---|---|---|---|---|---|
+| 1B | 24 | 0.1s | Data-Parallel | 86.8 | 152.6 | 184.2 | 222.3 | 569.0 |
+| 1B | 24 | 0.1s | Vanilla DiLoCo | 1.4 | 6.2 | 13.3 | 23.3 | 86.8 |
+| 1B | 24 | 0.1s | Streaming DiLoCo | 1.4 | 5.2 | 9.1 | 16.0 | 28.1 |
+| 1B | 24 | 0.1s | Streaming DiLoCo w/ overlapped com. | 1.4 | 4.3 | 6.2 | 9.1 | 11.0 |
+| 1B | 24 | 0.1s | Streaming DiLoCo w/ overlapped FP4 com. | 0.4 | 0.9 | 1.7 | 2.0 | 3.0 |
+| 10B | 48 | 0.8s | Data-Parallel | 104.8 | 222.3 | 222.3 | 268.3 | 471.5 |
+| 10B | 48 | 0.8s | Vanilla DiLoCo | 1.7 | 7.5 | 16.0 | 33.9 | 104.8 |
+| 10B | 48 | 0.8s | Streaming DiLoCo | 1.7 | 5.2 | 9.1 | 13.3 | 19.3 |
+| 10B | 48 | 0.8s | Streaming DiLoCo w/ overlapped com. | 1.7 | 3.6 | 5.2 | 6.2 | 7.5 |
+| 10B | 48 | 0.8s | Streaming DiLoCo w/ overlapped FP4 com. | 0.4 | 0.9 | 1.4 | 1.4 | 1.7 |
+| 100B | 108 | 4.9s | Data-Parallel | 184.2 | 323.8 | 390.7 | 390.7 | 471.5 |
+| 100B | 108 | 4.9s | Vanilla DiLoCo | 3.0 | 11.0 | 23.3 | 49.4 | 184.2 |
+| 100B | 108 | 4.9s | Streaming DiLoCo | 2.4 | 6.2 | 9.1 | 11.0 | 19.3 |
+| 100B | 108 | 4.9s | Streaming DiLoCo w/ overlapped com. | 1.7 | 3.6 | 4.3 | 5.2 | 5.2 |
+| 100B | 108 | 4.9s | Streaming DiLoCo w/ overlapped FP4 com. | 0.5 | 0.9 | 1.1 | 1.1 | 1.4 |
+Table 4
+:
+Simulation
+: we estimate the step time (pure compute) of 10B and 100B based on the required flops using
+Kaplan et al. (
+2020
+)
+rule and using an MFU of 60%. For all DiLoCo and Streaming DiLoCo-variants, we use $H=100$. For all Streaming DiLoCo-variants, we use a fragment size of 3 layers.
+Compute utilization with various speeds.
+Varying the time spent per step to do pure computation (forward & backward) affects the compute utilization: e.g. for a fixed bandwidth and thus fixed communication time, longer step time, will improve compute utilization. We report in
+Figure 15
+, simulated compute utilization when using, at 100B model scale, a compute step time of 1 second, 5 seconds, and 10 seconds.
+(a)
+1s step time
+(b)
+5s step time
+(c)
+10s step time
+Figure 15
+:
+Compute Utilization
+for a 100 billion parameter model when the step time (pure compute) is 1 second, 5 seconds, and 10 seconds.
+Compute Utilization on Llama and DeepSeek.
+We estimate in
+Figure 16
+, the compute utilization of our method vs baselines on top of Llama405
+(Grattafiori et al.,
+2024
+)
+and DeepSeek-V3
+(DeepSeek-AI et al.,
+2024
+)
+. For each, we estimate their step time from the respective paper: 26.9 seconds for Llama (first stage of pretraining) and 20.1 seconds for DeepSeek, using the most charitable estimation every time. Notably, for DeepSeek-V3 (
+16(b)
+), only 35 billion parameters are activated per token due to their MoE architecture
+(Shazeer et al.,
+2017
+)
+. However, the total 671 billion parameters are synchronized between replicas, massively increasing the amount of bits to transfer. In that case, in our simulation, our method (in
+red
+) can be close to 100% compute utilization with 4 Gbits per second vs 1 Tbit per second for Data-Parallel.
+(a)
+Llama405B.
+(b)
+DeepSeek-V3 (671B total, 35B activated).
+Figure 16
+:
+Compute Utilization
+simulated across a range of bandwidth for Llama405 and DeepSeek-V3, using step time estimated from respective papers.
+Scaling performance.
+We report in
+Table 5
+, the evaluation loss on C4 and accuracy on HellaSwag
+(Zellers et al.,
+2019
+)
+, Piqa
+(Bisk et al.,
+2020
+)
+, and Arc-Easy
+(Clark et al.,
+2018
+)
+, for four different methods across 6 model scales. See
+subsubsection 3.2.1
+for the initial discussion. Performance across scales is roughly similar among all considered methods, with usually a slight advantage for Data-Parallel. We found in practice that this advantage disappears when doing a more realistic overtraining with a larger token budget in
+subsubsection 3.2.2
+.
+| Model size | Flops | Method | Eval Loss ↓ | HellaSwag ↑ | Piqa ↑ | Arc Easy ↑ |
+|---|---|---|---|---|---|---|
+| 35M | 1.5e17 | Data-Parallel | 3.51 | 24.62 | 57.89 | 29.65 |
+| 35M | 1.5e17 | DiLoCo H=30 | 3.54 | 24.53 | 58.11 | 29.65 |
+| 35M | 1.5e17 | Streaming DiLoCo with overlapped FP4 com., H=30 | 3.53 | 24.46 | 57.67 | 30.53 |
+| 35M | 1.5e17 | Streaming DiLoCo with overlapped FP4 com., H=100 | 3.56 | 24.80 | 57.89 | 29.12 |
+| 100M | 9.4e17 | Data-Parallel | 3.19 | 26.94 | 60.12 | 30.35 |
+| 100M | 9.4e17 | DiLoCo H=30 | 3.21 | 26.59 | 60.50 | 29.12 |
+| 100M | 9.4e17 | Streaming DiLoCo with overlapped FP4 com., H=30 | 3.21 | 26.97 | 59.58 | 31.40 |
+| 100M | 9.4e17 | Streaming DiLoCo with overlapped FP4 com., H=100 | 3.22 | 26.68 | 60.39 | 31.93 |
+| 200M | 4e18 | Data-Parallel | 2.97 | 29.86 | 63.71 | 35.44 |
+| 200M | 4e18 | DiLoCo H=30 | 2.98 | 29.71 | 62.30 | 33.68 |
+| 200M | 4e18 | Streaming DiLoCo with overlapped FP4 com., H=30 | 2.98 | 29.67 | 61.92 | 34.39 |
+| 200M | 4e18 | Streaming DiLoCo with overlapped FP4 com., H=100 | 3.00 | 29.27 | 62.13 | 34.21 |
+| 300M | 1.4e19 | Data-Parallel | 2.80 | 33.46 | 64.69 | 34.91 |
+| 300M | 1.4e19 | DiLoCo H=30 | 2.81 | 33.87 | 64.74 | 34.74 |
+| 300M | 1.4e19 | Streaming DiLoCo with overlapped FP4 com., H=30 | 2.81 | 33.66 | 63.49 | 35.09 |
+| 300M | 1.4e19 | Streaming DiLoCo with overlapped FP4 com., H=100 | 2.83 | 33.00 | 63.71 | 34.39 |
+| 500M | 4.7e19 | Data-Parallel | 2.67 | 38.68 | 66.49 | 37.19 |
+| 500M | 4.7e19 | DiLoCo H=30 | 2.68 | 38.37 | 65.61 | 36.32 |
+| 500M | 4.7e19 | Streaming DiLoCo with overlapped FP4 com., H=30 | 2.67 | 38.10 | 66.21 | 34.91 |
+| 500M | 4.7e19 | Streaming DiLoCo with overlapped FP4 com., H=100 | 2.69 | 37.40 | 65.51 | 34.74 |
+| 1B | 1.9e20 | Data-Parallel | 2.49 | 46.60 | 68.93 | 39.65 |
+| 1B | 1.9e20 | DiLoCo H=30 | 2.49 | 46.56 | 68.82 | 36.84 |
+| 1B | 1.9e20 | Streaming DiLoCo with overlapped FP4 com., H=30 | 2.48 | 46.60 | 69.04 | 39.12 |
+| 1B | 1.9e20 | Streaming DiLoCo with overlapped FP4 com., H=100 | 2.50 | 46.00 | 68.82 | 38.42 |
+| 4B | 2e21 | Data-Parallel | 2.25 | 59.56 | 72.42 | 43.51 |
+| 4B | 2e21 | DiLoCo H=30 | - | - | - | - |
+| 4B | 2e21 | Streaming DiLoCo with overlapped FP4 com., H=30 | - | - | - | - |
+| 4B | 2e21 | Streaming DiLoCo with overlapped FP4 com., H=100 | 2.26 | 59.02 | 72.52 | 43.16 |
+Table 5
+:
+Scaling
+from 35 million parameters to 4 billion parameters using a chinchilla-optimal number of flops/tokens. We train on the C4 dataset, and report the evaluation loss on its validation set.
+Scaling with variable number of replicas.
+Contrary to Data-Parallel, changing the number of replicas for DiLoCo is not mathematically equivalent due to the local training happening independently for each replica. We display in
+Table 6
+, a scaling from 35 million parameters to 1 billion parameters on the C4 dataset of our method, Streaming DiLoCo with overlapped FP4 communication, with different numbers of replicas $M=\{2,4\}$ and different frequencies of synchronization $H=\{30,100\}$. Likewise, in
+Table 7
+, we showcase token budget overtraining at 1 billion parameters on the Dolma dataset.
+Model size
+Flops
+M
+𝑀
+M
+italic_M
+H
+𝐻
+H
+italic_H
+Eval Loss
+↓
+↓
+\downarrow
+↓
+HellaSwag
+↑
+↑
+\uparrow
+↑
+Piqa
+↑
+↑
+\uparrow
+↑
+Arc Easy
+↑
+↑
+\uparrow
+↑
+35M
+1.5e17
+2
+30
+3.53
+24.46
+57.67
+30.53
+4
+30
+3.60
+24.50
+56.09
+28.60
+2
+100
+3.56
+24.80
+57.89
+29.12
+4
+100
+3.64
+24.67
+56.75
+26.84
+100M
+9.4e17
+2
+30
+3.21
+26.97
+59.58
+31.40
+4
+30
+3.25
+26.24
+59.74
+32.63
+2
+100
+3.22
+26.68
+60.39
+31.93
+4
+100
+3.29
+26.54
+60.34
+29.82
+200M
+4e18
+2
+30
+2.98
+29.67
+61.92
+34.39
+4
+30
+3.02
+29.09
+62.89
+35.44
+2
+100
+3.00
+29.27
+62.13
+34.21
+4
+100
+3.05
+28.53
+61.10
+33.51
+300M
+1.4e19
+2
+30
+2.81
+33.66
+63.49
+35.09
+4
+30
+2.84
+32.54
+64.42
+34.74
+2
+100
+2.83
+33.00
+63.71
+34.39
+4
+100
+2.87
+32.02
+64.25
+35.44
+500M
+4.7e19
+2
+30
+2.67
+38.10
+66.21
+34.91
+4
+30
+2.70
+36.95
+65.72
+35.26
+2
+100
+2.69
+37.40
+65.51
+34.74
+4
+100
+2.73
+36.02
+66.27
+35.09
+1B
+1.9e20
+2
+30
+2.48
+46.60
+69.04
+39.12
+4
+30
+2.50
+45.25
+67.95
+39.12
+2
+100
+2.50
+46.00
+68.82
+38.42
+4
+100
+2.53
+44.74
+68.34
+38.25
+Table 6
+:
+Scaling
+from 35 million parameters to 1 billion parameters for Streaming DiLoCo with overlapped FP4 communication, with two different synchronization frequencies
+H
+=
+{
+30
+,
+100
+}
+𝐻
+30
+100
+H=\{30,100\}
+italic_H = { 30 , 100 }
+and number of DiLoCo replicas
+M
+=
+{
+2
+,
+4
+}
+.
+𝑀
+2
+4
+M=\{2,4\}.
+italic_M = { 2 , 4 } .
+Method
+Token Budget
+Terabytes exchanged
+↓
+↓
+\downarrow
+↓
+Eval Loss
+↓
+↓
+\downarrow
+↓
+HellaSwag
+↑
+↑
+\uparrow
+↑
+Piqa
+↑
+↑
+\uparrow
+↑
+Arc Easy
+↑
+↑
+\uparrow
+↑
+Data-Parallel
+25B
+441
+2.67
+42.09
+67.35
+40.42
+100B
+1,767
+2.52
+49.78
+69.15
+44.03
+250B
+4,418
+2.45
+53.86
+70.45
+44.21
+Our method, M=2
+25B
+1.10
+2.66
+42.08
+67.46
+38.42
+100B
+4.42
+2.51
+49.98
+69.96
+44.03
+250B
+11.05
+2.45
+54.24
+71.38
+41.92
+Our method, M=4
+25B
+0.55
+2.73
+38.93
+66.92
+39.64
+100B
+2.21
+2.54
+48.35
+69.42
+40.52
+250B
+5.52
+2.47
+52.20
+70.29
+42.45
+Table 7
+:
+Overtraining
+on the Dolma dataset with a 1-billion-parameter model and increasing token budgets (25B, 100B, and 250B). We report results for our model with both
+M
+=
+2
+𝑀
+2
+M=2
+italic_M = 2
+and
+M
+=
+4
+𝑀
+4
+M=4
+italic_M = 4
+DiLoCo replicas. With twice as many replicas, the global batch size is doubled and half as many steps are done. Training is thus roughly twice as fast, but comes with slightly worse performance. Our method is the final model: Streaming DiLoCo with overlapped FP4 communication.
+Outer gradients’ cosine similarity.
+We observe in
+Figure 17
+the cosine similarity per scale between each replica’s outer gradients for respectively all parameters but the embeddings (
+17(a)
+) and only the embeddings (
+17(b)
+). For both, the cosine similarity starts from slightly correlated (
+≈
+0.1
+absent
+0.1
+\approx 0.1
+≈ 0.1
+), spends most of the training time close to orthogonal (
+≈
+0.0
+absent
+0.0
+\approx 0.0
+≈ 0.0
+), and ends slightly inversely correlated (
+≈
+−
+0.1
+absent
+0.1
+\approx-0.1
+≈ - 0.1
+) as we reach the fluctuation phase. Note also that the larger the model size, the lower the overall cosine similarity.
+We also plot in
+Figure 18
+the cosine similarity per scale and per transformer layer. Notably, the first transformer layer has a significantly higher similarity at every model scale.
+(a)
+All fragments but the embedding
+(b)
+Embedding fragment
+Figure 17
+:
+Cosine similarity between the outer gradients
+across scales.
+Figure 18
+:
+Cosine similarity between the outer gradients
+across scales. Each line is a transformer layer, with darker colors being earlier layers and lighter colors later layers.
\ No newline at end of file
diff --git a/research/notes/swe-benchswe-smith-datasets-at-hugging-face.md b/research/notes/swe-benchswe-smith-datasets-at-hugging-face.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bdd0206956522a9411a348a9eba4c915a0797d9
--- /dev/null
+++ b/research/notes/swe-benchswe-smith-datasets-at-hugging-face.md
@@ -0,0 +1,319 @@
+---
+title: SWE-bench/SWE-smith · Datasets at Hugging Face
+id: swe-benchswe-smith-datasets-at-hugging-face
+tags:
+- deepread
+created: '2026-06-10T00:23:59.131977Z'
+source: https://huggingface.co/datasets/SWE-bench/SWE-smith
+source_domain: huggingface.co
+fetched_at: '2026-06-10T00:23:59.131726Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: unknown
+content_type: unknown
+deprecated: false
+---
+
+SWE-bench/SWE-smith · Datasets at Hugging Face
+Dataset Viewer
+Auto-converted
+to Parquet
+API
+Embed
+Duplicate
+Data Studio
+Subset (1)
+default
+·
+59.1k rows
+default (59.1k rows)
+Split (1)
+train
+·
+59.1k rows
+train (59.1k rows)
+SQL
+Console
+instance_id
+string
+lengths
+26
+79
+patch
+string
+lengths
+199
+223k
+FAIL_TO_PASS
+list
+lengths
+1
+10.9k
+PASS_TO_PASS
+list
+lengths
+0
+22k
+image_name
+string
+classes
+222
+				values
+repo
+string
+classes
+222
+				values
+problem_statement
+string
+lengths
+0
+39.4k
+oauthlib__oauthlib.1fd52536.combine_file__09vlzwgc
+diff --git a/oauthlib/oauth2/rfc6749/utils.py b/oauthlib/oauth2/rfc6749/utils.py
+index 7dc27b3..c5db6ba 100644
+--- a/oauthlib/oauth2/rfc6749/utils.py
++++ b/oauthlib/oauth2/rfc6749/utils.py
+@@ -15,8 +15,8 @@ def list_to_scope(scope):
+     """Convert a list of scopes to a space separated string."""
+     if isinstance(sco...
+[
+  "tests/oauth2/rfc6749/clients/test_backend_application.py::BackendApplicationClientTest::test_parse_token_response",
+  "tests/oauth2/rfc6749/clients/test_base.py::ClientTest::test_add_mac_token",
+  "tests/oauth2/rfc6749/clients/test_legacy_application.py::LegacyApplicationClientTest::test_parse_token_response",
+  "...
+[
+  "tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_token",
+  "tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_verifier",
+  "tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_valid_request",
+  "te...
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+OAuth2 scope handling broken after recent changes
+
+#### Description
+
+The scope handling functions in `oauthlib.oauth2.rfc6749.utils` are producing incorrect results. When converting between scope lists and strings, the order is getting reversed and some edge cases are not handled properly.
+
+#### Steps/Code to Reproduce...
+oauthlib__oauthlib.1fd52536.combine_file__0fceycuu
+diff --git a/oauthlib/oauth1/rfc5849/__init__.py b/oauthlib/oauth1/rfc5849/__init__.py
+index 85e0b90..ac2824e 100644
+--- a/oauthlib/oauth1/rfc5849/__init__.py
++++ b/oauthlib/oauth1/rfc5849/__init__.py
+@@ -86,7 +86,7 @@ class Client:
+ 
+     @classmethod
+     def register_signature_method(cls, method_name, method_callbac...
+[
+  "tests/oauth1/rfc5849/test_client.py::ClientRealmTests::test_client_realm_sign_with_additional_realm",
+  "tests/oauth1/rfc5849/test_client.py::ClientRealmTests::test_client_realm_sign_with_default_realm",
+  "tests/oauth1/rfc5849/test_client.py::ClientConstructorTests::test_convert_to_unicode_resource_owner",
+  "tes...
+[
+  "tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_token",
+  "tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_verifier",
+  "tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_valid_request",
+  "te...
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+OAuth1 Client constructor swaps nonce and timestamp parameters
+
+#### Description
+
+When creating an OAuth1 Client with explicit nonce and timestamp values, the parameters get swapped internally. The nonce value is assigned to timestamp and vice versa.
+
+#### Steps/Code to Reproduce
+
+```python
+from oauthlib.oauth1 import ...
+oauthlib__oauthlib.1fd52536.combine_file__0fukhdzk
+"diff --git a/oauthlib/oauth2/rfc6749/grant_types/implicit.py b/oauthlib/oauth2/rfc6749/grant_types/
+(...TRUNCATED)
+["tests/oauth2/rfc6749/endpoints/test_client_authentication.py::ClientAuthenticationTest::test_clien
+(...TRUNCATED)
+["tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_t
+(...TRUNCATED)
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+oauthlib__oauthlib.1fd52536.combine_file__0hkl0pea
+"diff --git a/oauthlib/openid/connect/core/grant_types/base.py b/oauthlib/openid/connect/core/grant_
+(...TRUNCATED)
+["tests/oauth2/rfc6749/endpoints/test_metadata.py::MetadataEndpointTest::test_openid_oauth2_preconfi
+(...TRUNCATED)
+["tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_t
+(...TRUNCATED)
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+"OpenID Connect grant types broken after recent changes\n\n#### Description\n\nOpenID Connect grant
+(...TRUNCATED)
+oauthlib__oauthlib.1fd52536.combine_file__0mvyid7d
+"diff --git a/oauthlib/oauth2/rfc8628/endpoints/device_authorization.py b/oauthlib/oauth2/rfc8628/en
+(...TRUNCATED)
+["tests/oauth2/rfc8628/endpoints/test_error_responses.py::ErrorResponseTest::test_duplicate_client_i
+(...TRUNCATED)
+["tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_t
+(...TRUNCATED)
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+"DeviceAuthorizationEndpoint constructor parameters incorrectly assigned\n\n#### Description\n\nThe
+(...TRUNCATED)
+oauthlib__oauthlib.1fd52536.combine_file__0q5tya4o
+"diff --git a/oauthlib/oauth2/rfc6749/clients/web_application.py b/oauthlib/oauth2/rfc6749/clients/w
+(...TRUNCATED)
+["tests/oauth2/rfc6749/clients/test_web_application.py::WebApplicationClientTest::test_auth_grant_ur
+(...TRUNCATED)
+["tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_t
+(...TRUNCATED)
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+"WebApplicationClient constructor parameters swapped causing authentication failures\n\n#### Descrip
+(...TRUNCATED)
+oauthlib__oauthlib.1fd52536.combine_file__0qgnxkrq
+"diff --git a/oauthlib/oauth2/rfc6749/endpoints/introspect.py b/oauthlib/oauth2/rfc6749/endpoints/in
+(...TRUNCATED)
+["tests/oauth2/rfc6749/endpoints/test_client_authentication.py::ClientAuthenticationTest::test_basic
+(...TRUNCATED)
+["tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_t
+(...TRUNCATED)
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+"IntrospectEndpoint initialization and response behavior broken\n\n#### Description\n\nThe Introspec
+(...TRUNCATED)
+oauthlib__oauthlib.1fd52536.combine_file__0y673oox
+"diff --git a/oauthlib/oauth2/rfc8628/endpoints/device_authorization.py b/oauthlib/oauth2/rfc8628/en
+(...TRUNCATED)
+["tests/oauth2/rfc8628/endpoints/test_error_responses.py::ErrorResponseTest::test_duplicate_client_i
+(...TRUNCATED)
+["tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_t
+(...TRUNCATED)
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+"Device authorization endpoint returns incorrect response structure\n\n#### Description\n\nThe devic
+(...TRUNCATED)
+oauthlib__oauthlib.1fd52536.combine_file__1bsv3m8l
+"diff --git a/oauthlib/oauth1/rfc5849/signature.py b/oauthlib/oauth1/rfc5849/signature.py\nindex 891
+(...TRUNCATED)
+["tests/oauth1/rfc5849/test_signatures.py::SignatureTests::test_hmac_false_positives","tests/oauth1/
+(...TRUNCATED)
+["tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_t
+(...TRUNCATED)
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+"OAuth1 signature functions produce incorrect signatures with swapped secrets\n\n#### Description\n\
+(...TRUNCATED)
+oauthlib__oauthlib.1fd52536.combine_file__1gnd4ecz
+"diff --git a/oauthlib/oauth2/rfc6749/endpoints/pre_configured.py b/oauthlib/oauth2/rfc6749/endpoint
+(...TRUNCATED)
+["tests/oauth2/rfc6749/endpoints/test_resource_owner_association.py::ResourceOwnerAssociationTest::t
+(...TRUNCATED)
+["tests/oauth1/rfc5849/endpoints/test_access_token.py::AccessTokenEndpointTest::test_check_request_t
+(...TRUNCATED)
+jyangballin/swesmith.x86_64.oauthlib_1776_oauthlib.1fd52536
+swesmith/oauthlib__oauthlib.1fd52536
+"OAuth2 Server grant types incorrectly assigned\n\nWhen using the pre-configured OAuth2 Server, the
+(...TRUNCATED)
+End of preview.
+Expand
+in
+Data Studio
+SWE-smith Dataset
+Code
+•
+Paper
+•
+Site
+[12/14/2025] NOTE: We will no longer actively update this dataset.
+While this dataset is still functional and usable, we recommend you use the `SWE-bench/SWE-smith-[lang]` datasets.
+For better maintainability and ease-of-use, we are maintaining language-specific datasets in lieu of this mono-repo.
+The SWE-smith Dataset is a training dataset of 50137 task instances from 128 GitHub repositories, collected using the SWE-smith toolkit.
+It is the largest dataset to date for training software engineering agents.
+All SWE-smith task instances come with an executable environment.
+To learn more about how to use this dataset to train Language Models for Software Engineering, please refer to the
+documentation
+.
+Copy to bucket
+new
+Use this dataset
+Downloads last month
+18,620
+Number of rows:
+59,136
+Total file size:
+278 MB
+Models trained or fine-tuned on
+SWE-bench/SWE-smith
+Text Generation
+•
+33B
+•
+Updated
+Sep 24, 2025
+•
+449
+•
+4
+Text Generation
+•
+33B
+•
+Updated
+May 12, 2025
+•
+267
+•
+•
+81
+8B
+•
+Updated
+Jul 13, 2025
+•
+264
+8B
+•
+Updated
+Jul 13, 2025
+•
+114
+Text Generation
+•
+8B
+•
+Updated
+Jul 13, 2025
+•
+72
+•
+•
+6
+Text Generation
+•
+33B
+•
+Updated
+Jan 27
+•
+48
+Browse 10 models trained on this dataset
+Collection including
+SWE-bench/SWE-smith
+SWE-smith datasets of task instances for different programming languages
+•
+9 items
+•
+Updated
+Mar 9
+•
+3
+Paper for
+SWE-bench/SWE-smith
+Paper
+•
+2504.21798
+•
+Published
+Apr 30, 2025
+•
+15
\ No newline at end of file
diff --git a/research/notes/swe-smith.md b/research/notes/swe-smith.md
new file mode 100644
index 0000000000000000000000000000000000000000..f87e76199157d976e5039335038086db61c111d2
--- /dev/null
+++ b/research/notes/swe-smith.md
@@ -0,0 +1,97 @@
+---
+title: SWE-smith
+id: swe-smith
+tags:
+- deepread
+created: '2026-06-10T00:23:50.391517Z'
+source: https://swesmith.com
+source_domain: swesmith.com
+fetched_at: '2026-06-10T00:23:50.391296Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: unknown
+content_type: unknown
+deprecated: false
+---
+
+SWE-smith
+SWE-smith
+Scaling Data for Software Engineering Agents
+April 30, 2025
+Creating training data for software engineering agents is difficult. Until now.
+Introducing SWE-smith: Generate 100s to 1000s of task instances for any GitHub repository.
+We've generated 50k+ task instances for 128 popular GitHub repositories, then
+        trained our own LM for
+SWE-agent
+.
+The result?
+SWE-agent-LM-32B
+achieves 40% pass@1 on
+SWE-bench Verified
+.
+Now, we've open-sourced
+everything
+, and we're excited to see what you build with it!
+Check out the tutorial below to generate 100 task instances for
+any
+GitHub repository in 10 minutes.
+Click
+here
+for an extended discussion.
+️🔥 Excited about
+SWE-smith
+? Build with us!
+> Create new bug generation techniques.
+> Expand to non-Python repositories.
+> Train better SWE-agents!
+Read our
+documentation
+or
+code
+for more.
+Authors
+John Yang
+,
+Kilian Lieret
+,
+Carlos E. Jimenez
+,
+Alexander Wettig
+,
+Kabir Khandpur
+,
+Yanzhe Zhang
+,
+Binyuan Hui
+,
+Ofir Press
+,
+Ludwig Schmidt
+,
+Diyi Yang
+Affiliations
+Stanford University
+,
+Stanford SALT Lab
+,
+Princeton Language & Intelligence
+,
+Alibaba Qwen
+Citation
+@misc{yang2025swesmith,
+  title={SWE-smith: Scaling Data for Software Engineering Agents}, 
+  author={John Yang and Kilian Lieret and Carlos E. Jimenez and Alexander Wettig and Kabir Khandpur and Yanzhe Zhang and Binyuan Hui and Ofir Press and Ludwig Schmidt and Diyi Yang},
+  year={2025},
+  eprint={2504.21798},
+  archivePrefix={arXiv},
+  primaryClass={cs.SE},
+  url={https://arxiv.org/abs/2504.21798},
+}
+© 2025
+Our projects
+SWE-bench
+SWE-agent
+Mini-SWE-Agent
+SWE-ReX
+sb-cli
\ No newline at end of file
diff --git a/research/notes/training-software-engineering-agents-and-verifiers-with-swe-gym.md b/research/notes/training-software-engineering-agents-and-verifiers-with-swe-gym.md
new file mode 100644
index 0000000000000000000000000000000000000000..6107569530f3d9ffca6fac05b3ee25a0a751cc29
--- /dev/null
+++ b/research/notes/training-software-engineering-agents-and-verifiers-with-swe-gym.md
@@ -0,0 +1,2776 @@
+---
+title: Training Software Engineering Agents and Verifiers with SWE-Gym
+id: training-software-engineering-agents-and-verifiers-with-swe-gym
+tags:
+- deepread
+created: '2026-06-10T00:23:36.811308Z'
+source: https://arxiv.org/html/2412.21139
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:23:36.810162Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Training Software Engineering Agents and Verifiers with SWE-Gym
+Training Software Engineering Agents and Verifiers with SWE-Gym
+Jiayi Pan
+Xingyao Wang
+Graham Neubig
+Navdeep Jaitly
+Heng Ji
+Alane Suhr
+Yizhe Zhang
+Abstract
+We present SWE-Gym, the first environment for training software engineering (SWE) agents.
+SWE-Gym contains 2,438 real-world task instances, each comprising a Python codebase with an executable runtime environment, unit tests, and a task specified in natural language.
+We use SWE-Gym to train language model based SWE agents, and achieve up to 19% absolute gains in resolution rate on the popular SWE-Bench Verified and Lite test sets.
+We also experiment with inference-time scaling through verifiers trained on agent trajectories sampled from SWE-Gym.
+When combined with our fine-tuned SWE agents, we achieve 32.0% and 26.0% on SWE-Bench Verified and Lite, respectively, reflecting a new state-of-the-art for open-weight SWE agents. To facilitate further research, we publicly release SWE-Gym,
+models, and agent trajectories.
+Machine Learning, ICML
+\addauthor
+gnred
+\NewDocumentCommand
+\heng
+mO
+Heng
+[#1]
+1
+Introduction
+Figure 1
+:
+SWE-Gym enables scalable improvements for software engineering agents.
+Top
+: Scaling the amount of training data shows consistent performance improvements as we obtain more training trajectories, with no signs of saturation at 491 trajectories.
+We use temperature
+t
+=
+0
+𝑡
+0
+t=0
+italic_t = 0
+for evaluation.
+Bottom
+: For inference time scaling, we generate a number of candidate trajectories per task and select the best using a verifier trained on SWE-Gym. This approach demonstrates roughly log-linear gains with the number of sampled solutions.
+t
+=
+0
+𝑡
+0
+t=0
+italic_t = 0
+(excluded from regression) is used as the first hypothesis to be consistent with the top figure; later rollouts use
+t
+=
+0.5
+𝑡
+0.5
+t=0.5
+italic_t = 0.5
+.
+Language models (LMs) have remarkable promise in automating software engineering (SWE) tasks, as most clearly measured by recent progress on benchmarks like SWE-Bench
+(Jimenez et al.,
+2024
+)
+and Commit0
+(Zhao et al.,
+2024
+)
+.
+While LM-based SWE agents have shown significant performance gains through improving agent-computer interfaces
+(Yang et al.,
+2024
+)
+and prompting strategies
+(Wang et al.,
+2024c
+)
+, advances in SWE agents have been limited by a reliance on proprietary models, with limited research to improve the underlying LM itself.
+Unlike other domains where supervised fine-tuning and reinforcement learning have significantly improved LM capabilities, such as chat
+(Ouyang et al.,
+2022
+)
+, math reasoning
+(Shao et al.,
+2024
+; Yuan et al.,
+2024
+)
+, and web navigation
+(Pan et al.,
+2024
+)
+, software engineering currently lacks suitable training environments, and creating environments is uniquely challenging.
+Real-world software engineering requires interaction with an executable runtime that has been prepared with the appropriate software dependencies and reproducible test suites, among other requirements.
+These challenges are reflected in the existing resources (Tab.
+1
+).
+For example, the SWE-Bench
+(Jimenez et al.,
+2024
+)
+training split contains only solutions (git patches that solve the task), missing the step-by-step actions taken by the developer to create each solution, and executable environments and reward signals. R2E
+(Jain et al.,
+2024
+)
+uses synthetic tasks that are very far from real-world problems, while datasets such as APPS
+(Hendrycks et al.,
+2021a
+)
+focus only on isolated tasks rather than realistic repository-level coding problems.
+Table 1
+:
+SWE-Gym is the first publicly available training environment combining real-world SWE tasks from GitHub issues with pre-installed dependencies and executable test verification.
+Repository-level
+: whether each task is situated in a sophisticated repository;
+Executable Environment
+: whether each task instance comes with an executable environment with all relevant dependencies pre-installed;
+Real task
+: whether task instruction is collected from human developers.
+Dataset (split)
+Repository-Level
+Executable Environment
+Real task
+# Instances (total)
+# Instances (train)
+CodeFeedback
+(
+Zheng et al.
+,
+2024b
+)
+✗
+✗
+✓
+66,383
+66,383
+APPS
+(
+Hendrycks et al.
+,
+2021a
+)
+✗
+✓
+✓
+10,000
+5,000
+HumanEval
+(
+Chen et al.
+,
+2021
+)
+✗
+✓
+✓
+164
+0
+MBPP
+(
+Tao et al.
+,
+2024
+)
+✗
+✓
+✓
+974
+374
+R2E
+(
+Jain et al.
+,
+2024
+)
+✓
+✓
+✗
+246
+0
+SWE-Bench (train)
+(
+Jimenez et al.
+,
+2024
+)
+✓
+✗
+✓
+19,008
+19,008
+SWE-Gym Raw
+✓
+✗
+✓
+64,689
+64,689
+SWE-Bench (test)
+(
+Jimenez et al.
+,
+2024
+)
+✓
+✓
+✓
+2,294
+0
+SWE-Gym
+✓
+✓
+✓
+2,438
+2,438
+To bridge this gap, we present SWE-Gym, the
+first training environment
+combining real-world software engineering tasks from GitHub issues with pre-installed dependencies and executable test verification.
+SWE-Gym contains 2,438 Python tasks sourced from 11 popular open-source repositories (Tab.
+2
+), providing useful environments for training LMs as agents and verifiers.
+SWE-Gym supports training state-of-the-art open-weight SWE agents
+.
+Based on the OpenHands
+(Wang et al.,
+2024c
+)
+agent scaffold for general-purpose software development (§
+2
+), we fine-tune a 32B Qwen-2.5 coder model
+(Hui et al.,
+2024b
+)
+using only 491 agent-environment interaction trajectories sampled using SWE-Gym, and achieve substantial absolute improvements of +12.3% (to 15.3%) and +13.6% (to 20.6%) in resolution rate on SWE-Bench Lite and SWE-Bench Verified respectively (§
+4.2
+).
+SWE-Gym is effective across agent scaffolds
+.
+In another agent scaffold based on a specialized workflow (MoatlessTools;
+Örwall
+2024
+; §
+2
+), we experiment with self-improvement, where the LM interacts with SWE-Gym, receives reward from it, and learns to improve itself through rejection sampling fine-tuning.
+This self-improvement boosts performance up to 19.7% on SWE-Bench Lite.
+SWE-Gym supports training verifier models to enable inference-time scaling
+.
+We use test suites included in SWE-Gym to determine whether sampled agent trajectories are successful or not.
+Given these samples, we train a verifier model
+(i.e., an outcome-supervised reward model;  Cobbe et al.,
+2021
+)
+that estimates a trajectory’s probability of success.
+This enables inference-time scaling, where we sample multiple agent trajectories and select the one with the highest estimated reward according to the verifier. This further improves the resolution rate to 32.0% (+11.4% absolute improvement) on SWE-Bench Verified (§
+5.1.1
+; Fig.
+1
+bottom) and 26.0% on SWE-Bench Lite (§
+5.1.2
+), establishing a new state-of-the-art among systems with publicly accessible weights (Tab.
+9
+).
+Our baseline training and inference-time scaling methods on SWE-Gym yield continuously improved results with increasing compute
+(Fig.
+1
+).
+In the training phase, performance scales with the number of sampled trajectories up to our current limit of 491 trajectories, suggesting that performance is currently limited by the compute budget for sampling rather than the number of tasks in SWE-Gym.
+Similarly, using the agent and verifier trained by SWE-Gym, the bottom panel shows that using more compute during inference time steadily improves the performance.
+2
+Related Work
+Agents that solve GitHub issues.
+We focus on software engineering agents designed to automatically resolve GitHub issues within the SWE-Bench framework
+(Jimenez et al.,
+2024
+)
+. These agents take a GitHub issue and its associated code repository as input and generate a valid code modification (i.e., a git diff patch) to address the issue. The correctness of these modifications is verified using a human-written test suite.
+Existing agent designs are categorized by the extent of human priors integrated into their workflows:
+Specialized workflows
+(Xia et al.,
+2024
+; Örwall,
+2024
+; Zhang et al.,
+2024b
+; Chen et al.,
+2024
+)
+involve human-defined stages (e.g., localization, code editing, patch re-ranking), where a LM is iteratively prompted for each stage to produce the final result. This approach reduces the task horizon and minimizes the need for long-term planning. However, specialized workflows require significant human engineering, may not generalize to novel issue types, and can fail if intermediate steps encounter problems.
+In contrast,
+general-purpose prompting
+(
+(Yang et al.,
+2024
+; Wang et al.,
+2024c
+)
+) relies on the LM’s ability to plan over long horizons and generate actions based on a history of interactions without heavily pre-defined workflows. While more flexible, general approaches demand higher capabilities from the underlying LM and can be computationally expensive due to multiple interaction rounds.
+The most successful existing SWE agents are built on proprietary language models like GPT-4 or Claude and utilize specialized workflows to overcome these models’ limitations. This contrasts with other sequential decision-making domains
+(Silver et al.,
+2017
+; Akkaya et al.,
+2019
+)
+, where learning-based approaches, such as reinforcement learning, drive success by enabling systems to learn from interactions and rewards to develop task competence. A key barrier in the SWE agent domain is the lack of appropriate training environments. Our experiments show that SWE-Gym can be used to build strong learning-based agents, accelerating research in this area.
+Environments for training software agents.
+There is no existing dataset suitable for training software engineering agents.
+SWE-Bench
+(Jimenez et al.,
+2024
+)
+is widely used for evaluating software engineering performance, but its training split lacks executable environments and success signals present in the evaluation split, making it useful only for imitation learning approaches.
+HumanEval
+(Chen et al.,
+2021
+)
+is designed for standalone code generation tasks, akin to coding competitions. Therefore, it falls short of addressing the complex challenges inherent in real-world, repository-level software engineering tasks, which involve thousands of files, millions of lines of code, and tasks such as bug fixing, feature development, and system optimization.
+Similarly, R2E
+(Jain et al.,
+2024
+)
+is a small evaluation dataset with 246 instances and, due to its synthetic nature, lacks the realism and complexity of real-world software engineering scenarios.
+Our proposed SWE-Gym instead uses real-world GitHub issues as tasks, and associated executable unit tests for evaluation. This results in realistic and complex task formulations, aligning closely with real-world challenges.
+Post-training: From chatbots and reasoners to agents.
+Post-training, which fine-tunes pre-trained language models using supervised or reinforcement learning, significantly improves model performance across various domains. Techniques like RLHF
+(Ouyang et al.,
+2022
+)
+have become standard for adapting language models into chatbots, improving both performance and alignment
+(Qwen Team,
+2024
+)
+. In math reasoning, datasets such as MATH
+(Hendrycks et al.,
+2021b
+)
+and GSM-8K
+(Cobbe et al.,
+2021
+)
+facilitate the training and evaluation of policy and verifier models
+(Cobbe et al.,
+2021
+; Wang et al.,
+2024a
+)
+.
+Earlier works
+(Wang et al.,
+2024b
+; Chen et al.,
+2023
+; Zeng et al.,
+2023
+; Wu et al.,
+2024
+)
+demonstrate that distilling agent trajectories from stronger models improves weaker models. Recent studies
+(Xi et al.,
+2024
+; Zhai et al.,
+2024
+; Bai et al.,
+2024
+)
+explore self-improving methods, showing that reinforcement learning or rejection sampling fine-tuning guided by reward enables LMs to enhance themselves without more capable teachers.
+However, post-training typically depends on expert demonstration data or training environments with reliable reward signals, which are largely absent in the software engineering domain. This has led to a reliance on prompting-based methods with proprietary language models. Our work addresses this gap with SWE-Gym, a training environment based on real-world software engineering tasks that uses expert-written tests as reward signals. Our experiments demonstrate that SWE-Gym can build strong SWE agents without prompt engineering.
+Category
+Metric
+SWE-Gym
+SWE-Gym Lite
+Size
+# Instances
+2,438 (2,294)
+230 (300)
+# Repos
+11 (12)
+11 (12)
+Issue Text
+Length by Words
+239.8 (195.1)
+186.2 (175.9)
+Codebase
+# Non-test Files
+971.2 (2944.2)
+818.8 (2988.5)
+# Non-test Lines
+340675.0 (363728.4)
+340626.2 (377562.4)
+Gold Patch
+# Lines edited
+69.8 (32.8)
+10.6 (10.1)
+# Files edited
+2.5 (1.7)
+1.0 (1.0)
+# Func. edited
+4.1 (3.0)
+1.4 (1.34)
+Tests
+# Fail to Pass
+10.0 (9.0)
+2.04 (3.5)
+# Total
+760.8 (132.5)
+99.9 (85.2)
+Table 2
+:
+Statistics comparing SWE-Gym with the SWE-Bench test split (in parentheses).
+Except for size metrics, we report the average value across instances.
+Figure 2
+:
+Repository distribution of SWE-Gym instances.
+3
+SWE-Gym Environment
+SWE-Gym comprises 2,438 real-world software engineering tasks sourced from pull requests in 11 popular Python repositories, with pre-configured executable environments and expert-validated test cases, constructed in close alignment with SWE-Bench
+(Jimenez et al.,
+2024
+)
+.
+These repositories are separate from those used in SWE-Bench to avoid contamination.
+These tasks require SWE agents to develop test-passing solutions for real-world GitHub issues using provided codebases and executable environments.
+Such agents must map from natural language descriptions of the issue, as well as the initial state of the repository, to a pull request represented as a git patch.
+We also identify a subset of 230 tasks, SWE-Gym Lite, which contains generally easier and more self-contained tasks that are suitable for rapid prototyping, in alignment with SWE-Bench Lite
+(Jimenez et al.,
+2024
+)
+.
+To support future research in SWE agent development and automatic dataset synthesis, we also release SWE-Gym Raw, a large set of Python GitHub issues without executable environments (64,689 instances spanning 358 Python repositories).
+3.1
+Dataset Construction
+Identify Repositories.
+We first use SEART GitHub search (footnote 1: https://seart-ghs.si.usi.ch/) to filter a list of initial repositories. Unlike SWE-Bench, which focuses on the top 5k most downloaded PyPI libraries
+(Jimenez et al.,
+2024
+)
+, we select Python repositories that were created before July 1, 2022 and have more than 500 stars, with at least 300 lines of code, more than 500 pull requests (PRs) and 100 contributors. This results in 358 repositories.
+Extracting Training Instances from Repositories.
+We use SWE-Bench’s instance extraction script to convert these repositories into task instances, each corresponding to a GitHub issue including the natural language description of the issue, a snapshot of the repository in which the issue was created, and a set of unit tests.
+Over the 358 repositories, we extract 64,689 task instances. We refer to this dataset as SWE-Gym Raw, which is over three times larger than the 19k instances gathered in previous work
+(Jimenez et al.,
+2024
+)
+and includes nearly ten times as many repositories.
+While SWE-Gym Raw instances contain code, issue descriptions, and the solution, they do not contain executable environments or a guarantee that its unit tests are effective in evaluating the correctness of a solution.
+Thus, we focus on 11 repositories with numerous instances and semi-manually create executable environments for them.
+Version Training Instances.
+Associating instances with their respective version numbers (e.g.
+1.2.3
+) and setting up environments version-by-version makes the environment collection process more practical by avoiding redundant setup work.
+We generalize SWE-Bench’s versioning script to support versioning via script execution, and semi-automatically collect versions for each instance based on information available in the repository (e.g.,
+pyproject.toml
+, git tag, etc).
+Setup Executable Environments and Verify Instances.
+Creating executable environments with pre-installed dependencies is crucial for developing software engineering agents, as it mirrors deployment settings and allows for incremental unit test feedback. Configuring dependencies for specific codebase versions is challenging due to the lack of a universal Python package installation method and backward compatibility issues, especially for older GitHub issues. Ignoring these environments could introduce distribution bias, diminishing SWE-Gym’s utility. To address this, we manually configure dependencies for each task instance using relevant configuration files (e.g.,
+requirements.txt
+), CI scripts, or documentation from the repository snapshot at the time of issue creation.
+We then use SWE-Bench’s execution-based validation script to ensure that the gold patch (the human-submitted code diff) passes more unit tests than the original code. This process required approximately 200 human annotation hours (footnote 2: annotations were done by a subset of the authors) and 10,000 CPU core hours. After validation and filtering out failed instances, we obtained 2,438 unit-test-validated instances from 11 repositories. For full reproducibility, we publicly release pre-built Docker images for each instance, totaling 6 TB.
+3.2
+SWE-Gym Lite
+Solving software engineering tasks is computationally intensive, costing usually $1 or more per task with frontier models
+(Wang et al.,
+2024c
+)
+.
+To improve research efficiency via faster agent evaluation,
+Jimenez et al. (
+2024
+)
+introduce SWE-Bench Lite, a canonical subset of 300 instances from SWE-Bench.
+Following the SWE-Bench Lite filtering pipeline (footnote 3: for details on its construction process, see https://www.swebench.com/lite.html), we delineate the SWE-Gym Lite split, comprising 230 instances. Similar to SWE-Bench Lite, this subset excludes tasks that require editing more than one file, tasks with poorly described problem statements, those with excessively complex ground-truth code diffs, and tests focused on error message validation.
+3.3
+Dataset Statistics
+Fig.
+2
+illustrates that the task distribution across repositories exhibits a long-tail pattern. Notably, tasks associated with
+pandas
+comprise nearly one-third of the total, whereas tasks related to
+bokeh
+represent a mere one percent.
+Our analysis suggests that tasks in SWE-Gym are on average harder than those included in SWE-Bench.
+Tab.
+2
+shows that SWE-Gym has statistics similar to SWE-Bench, with several key differences.
+Codebases in SWE-Gym, on average, have relatively fewer files than SWE-Bench, but a similar number of total lines of code.
+However, gold patches in SWE-Gym have significantly more lines and files edited when compared to SWE-Bench’s gold patches.
+Additionally, we find models have consistently lower performance on SWE-Gym compared to SWE-Bench (footnote 4: §B.4 contains details of these experiments).
+Beyond models and scaffolds overfitting to SWE-Bench, the decreased performance on SWE-Gym may also be due to our inclusion of sophisticated repositories like
+pandas
+and
+MONAI
+.
+4
+Training LMs as Agents with SWE-Gym
+We experiment with training language model agents using SWE-Gym.
+We use two agent scaffolds (OpenHands,
+Wang et al.
+2024c
+, §
+4.2
+; Moatless Tools,
+Örwall
+2024
+, §
+4.3
+).
+4.1
+Setting
+Agent Scaffolds.
+Recent LM-based SWE agents comprise a base language model, and a set of tools and prompts this base model has access to.
+This set of tools and prompting strategies is referred to as an agent scaffold, and recent work has developed numerous scaffolds for different purposes (refer to §
+2
+for examples).
+We experiment with two types of agent scaffolds: one for general-purpose prompting (OpenHands CodeAct;
+Wang et al.
+2024c
+) and one for specialized workflows (MoatlessTools;
+Örwall
+2024
+), which
+allows us to
+measure the efficacy of SWE-Gym across diverse deployment settings.
+Policy Improvement Algorithm.
+We use SWE-Gym to improve the underlying LM for a given SWE agent.
+As a baseline, we employ a simple policy improvement algorithm: rejection sampling fine-tuning (a.k.a. filtered behavior cloning), where we fine-tune the base LM on
+success
+trajectories sampled from SWE-Gym.
+Evaluation Metrics.
+We use the standard SWE agent benchmarks SWE-Bench Lite and Verified
+(Jimenez et al.,
+2024
+)
+for evaluation.
+We report (1)
+resolution rate (%)
+, the proportion of resolved task instances, and (2)
+Empty Patch (%)
+, the proportion of trajectories where none of the code in the repository is edited.
+We use OpenHands remote runtime
+(Neubig & Wang,
+2024
+)
+to parallelize evaluation (e.g., execute unit tests).
+Technical Details.
+For base LMs, we use
+Qwen-2.5-Coder-Instruct
+(Hui et al.,
+2024a
+)
+7B, 14B, and 32B. §
+B.2
+contains training run details.
+4.2
+Training General-Purpose Prompting Agents
+Table 3
+:
+Model performance (fine-tuned on 491 SWE-Gym-sampled trajectories) on SWE-Bench
+(Jimenez et al.,
+2024
+)
+using OpenHands
+(Wang et al.,
+2024c
+)
+as agent scaffold. We use
+Qwen-2.5-Coder-Instruct
+as the base model.
+Model
+Empty Patch (%, $\downarrow$)
+Stuck in Loop (%, $\downarrow$)
+Avg. Turn(s)
+Resolve Rate (%, $\uparrow$)
+Size
+zero-shot
+fine-tuned
+$\Delta$
+zero-shot
+fine-tuned
+$\Delta$
+zero-shot
+fine-tuned
+$\Delta$
+zero-shot
+fine-tuned
+$\Delta$
+SWE-Bench Lite (300 instances)
+7B
+40.3
+29.7
+-10.7
+47.0
+31.0
+-16.0
+20.3
+22.2
++1.9
+1.0 ($\pm$ 1.0)
+10.0 ($\pm$ 2.4)
++9.0
+14B
+49.7
+18.1
+-31.6
+31.7
+27.1
+-4.6
+23.2
+21.4
+-1.8
+2.7 ($\pm$ 1.9)
+12.7 ($\pm$ 2.3)
++10.0
+32B
+27.0
+18.1
+-8.9
+16.7
+18.1
++1.5
+15.5
+29.3
++13.9
+3.0 ($\pm$ 1.4)
+15.3 ($\pm$ 2.5)
++12.3
+SWE-Bench Verified (500 instances)
+7B
+45.8
+33.8
+-12.0
+39.6
+21.0
+-18.6
+21.9
+35.3
++13.4
+1.8 ($\pm$ 1.1)
+10.6 ($\pm$ 2.1)
++8.8
+14B
+44.9
+14.5
+-30.4
+32.1
+21.3
+-10.7
+25.5
+30.1
++4.6
+4.0 ($\pm$ 1.6)
+16.4 ($\pm$ 2.0)
++12.4
+32B
+9.5
+13.8
++4.3
+29.4
+23.8
+-5.6
+24.6
+31.6
++7.0
+7.0 ($\pm$ 1.3)
+20.6 ($\pm$ 2.1)
++13.6
+In this section, we use OpenHands (version CodeActAgent 2.1,
+Wang et al.
+2024b
+,
+c
+) as our agent scaffold, which is based on general-purpose ReAct-style prompting
+(Yao et al.,
+2023
+)
+.
+In contrast to specialized-workflows-agents (§
+2
+), it relies on the LM to generate actions and do planning. It equips the base LM with a bash terminal and a file editor. We disable the browser feature of OpenHands in this work.
+Trajectory Collection.
+By rejection sampling, we obtain 491 successful trajectories from SWE-Gym.
+These trajectories are sampled from
+gpt-4o-2024-08-06
+and
+claude-3-5-sonnet-20241022
+with different temperature settings.
+Each successful trajectory, on average, has roughly 19 turns and approximately 19,000 tokens (footnote 5: Tab. 8 contains more statistics of the sampled trajectories).
+Although SWE-Gym offers many more tasks and allows repeated sampling, our 491 trajectories are limited primarily by computational budget.
+Training on SWE-Gym trajectories turns LMs into effective agents to fix issues.
+As shown in Tab.
+3
+, the pre-trained base model achieves resolution rates of 3.0% and 7.0% on SWE-Bench Lite and Verified, respectively. After fine-tuning on 491 trajectories (footnote 6: we use a sampling temperature of 0 unless otherwise specified), it improves by up to 12.3% (3.0% → 15.3%) and 13.6% (7.0% → 20.6%).
+Training reduces stuck-in-loop behavior.
+For agent tasks, open-weight LMs
+often get stuck in loops, where the model perpetually generates the same action for multiple turns, especially when prompted with general-purpose prompts (§
+2
+). Thus, we report
+Stuck in Loop (%)
+, the percentage of trajectories where the agent repeats the same action three times consecutively.
+As shown in Tab.
+3
+, zero-shot pre-trained models often get stuck in loops; even the largest 32B model is trapped in 29.4% of SWE-Bench Verified tasks. Fine-tuning on trajectories from SWE-Gym consistently reduces the stuck-in-loop rate by 4.6–18.6% across both SWE-Bench Lite and Verified tasks, except for the 32B model on SWE-Bench Lite, which increases by 1.5% due to its already low loop rate. This coincides with a decrease in the empty patch rate, likely enabling the agent to perform more code edits.
+Performance scales with model size.
+Rather unsurprisingly, larger base models consistently improve the resolution rate, empty patch rate, and stuck-in-loop rate (Tab.
+3
+).
+Self-improvement remains ineffective.
+In addition to fine-tuning on trajectories sampled from strong teacher models, we also experiment with fine-tuning on trajectories sampled directly from the policy being updated.
+We use the fine-tuned 32B model to sample 6 trajectories per SWE-Gym instance (using temperature $t = 0.5$), obtaining 868 successful trajectories (i.e., on-policy trajectories). We further fine-tune the base 32B model on a mixture of 868 on-policy trajectories and the previously collected 491 off-policy trajectories.
+When evaluating this fine-tuned model on SWE-Bench Lite, we observe the resolution rate drop from 15.3 to 8.7%, suggesting that self-improvement is not yet working. We hypothesize that we could achieve improved results using more advanced policy optimization methods, such as proximal policy optimization (PPO)
+(Schulman et al.,
+2017
+)
+, or with a stronger base model. These directions remain promising avenues for future investigation.
+4.3
+Self-Improvement with Specialized Workflow
+Unlike OpenHands, which offers freedom in long-horizon planning, MoatlessTools constrains the language model’s action space to pre-defined specialized workflows, reducing task horizons.
+Specialized workflows outperform general-purpose prompting for open-weight LMs. In Tab.
+3
+and Tab.
+4
+, the 7B and 32B LM achieve zero-shot resolution rates of 7% and 19% with MoatlessTools, compared to 1.0% and 3.0% with OpenHands on SWE-Bench Lite.
+Given MoatlessTools’ improved zero-shot performance and shorter task horizon, we hypothesize that self-improvement without a strong teacher is achievable using this scaffold and training on SWE-Gym.
+With a limited compute budget, we conduct this experiment with only 7B and 32B models, using LoRA
+(Hu et al.,
+2022
+)
+for the 32B model for improved efficiency.
+We use the 7B model for ablation experiments.
+We use iterative rejection sampling fine-tuning for policy improvement. Each iteration involves (a) performing 30 high-temperature (1.0) rollouts per task on SWE-Gym-Lite and adding successful trajectories to the fine-tuning dataset, and (b) fine-tuning the policy on these filtered trajectories. After two iterations, further improvements are negligible.
+Data Bias Impacts Performance.
+Repeated sampling, as in
+Brown et al. (
+2024
+)
+, shows that task success probability follows a long-tail distribution (Fig.
+6
+), where more samples increase solved instances. While broader task coverage benefits training, it introduces a bias toward easier tasks, making it suboptimal to train on all successful trajectories, as first observed in math reasoning
+Tong et al. (
+2024
+)
+.
+Mitigating Bias with Per-Instance Capping.
+We introduce per-instance capping—a method that limits the maximum number of selected samples per task. As illustrated in Fig.
+6
+, this balances dataset bias and size. A low cap reduces dataset size and performance (§
+5.2
+), while a high cap skews the distribution toward easier tasks. Empirically, a threshold of 2 achieves a good balance, slightly outperforming the full dataset and improving training speed (Tab.
+6
+). We rank trajectories by the number of model response rounds required, preferring fewer.
+Results.
+After two policy improvement iterations (Tab.
+4
+), the 7B model’s resolution rate increased from 7.0% to 9.0% after the first iteration and to 10.0% after the second. In contrast, the 32B model improved from 19.0% to 19.7% after the first iteration with no further gains.
+We attribute the limited gains in the 32B model to the scaffold’s restricted action space and the rejection sampling fine-tuning method.
+Table 4:
+Resolution rate (RR) and empty patch rate (EP) on SWE-Bench Lite with the MoatlessTools scaffold after online rejection sampling fine-tuning (temperature $t = 0$).
+Setting
+7B Model
+32B Model
+EP (%, $\downarrow$)
+RR (%, $\uparrow$)
+EP (%, $\downarrow$)
+RR (%, $\uparrow$)
+Zero-Shot
+56.3%
+7.0%
+24.3%
+19.0%
+Iteration 1
+29.0%
+9.0%
+18.3%
+19.7%
+Iteration 2
+23.3%
+10.0%
+9.7%
+19.7%
+5
+Scaling Agent Performance with SWE-Gym
+We explore two scaling directions enabled by SWE-Gym to enhance agent performance: inference-time scaling (§
+5.1
+) and training-time data scaling (§
+5.2
+).
+5.1
+Inference-Time Scaling with Verifiers
+Trajectories sampled from SWE-Gym can be used not only for training a policy, but also for training a verifier (i.e., reward) model.
+We train an outcome-supervised reward model (ORM)
+(Cobbe et al.,
+2021
+)
+that, given the relevant context of the task execution (including the problem statement, agent trajectory, and current git diff), generates a score that estimates the probability that the agent has solved the problem.
+We experiment with using this model to rerank candidate trajectories sampled from a SWE agent policy, and show that such learned verifiers enable effective inference-time scaling for further performance improvement.
+5.1.1
+Verifier for General-Purpose Prompting
+For OpenHands agents
+(Wang et al.,
+2024b
+,
+c
+)
+with general-purpose prompting (§
+2
+), we train a verifier (ORM) that takes as input the trajectory $\tau = [o_1, a_1, o_2, a_2, \dots, o_n, a_n]$, represented as an interleaved sequence of observations and actions, and generates a scalar reward $r \in [0, 1]$.
+Observations $o_k$ include the task problem statement, command execution output, error messages, etc.; actions $a_k$ can be bash commands or file operations (e.g., edit, view) from the agent.
+Training and Inference.
+We fine-tune 32B
+Qwen2.5-Coder-Instruct
+to label trajectories as successful or unsuccessful using output tokens
+<YES>
+and
+<NO>
+respectively (footnote 7: §B.6 includes the verifier prompt template).
+For training data, we re-use two sets of trajectories we sampled on SWE-Gym for agent training in §
+4.2
+: (1)
+off-policy trajectories
+which contain 443 successful trajectories; (2)
+on-policy trajectories
+which contain 875 successful trajectories sampled from the fine-tuned
+Qwen2.5-Coder-Instruct-32B
+. (Footnote 8: We keep only trajectories within a 32k-token length for training, which may reduce their number compared to Section 4.2.)
+We combine both on-policy and off-policy trajectories, randomly sample the same amount of unsuccessful trajectories from each subset (1,318 each), and combine them as our dataset for verifier training (total 2,636 trajectories).
+We fine-tune the model to predict
+<YES>
+for successful trajectories and
+<NO>
+for unsuccessful ones.
+At inference time, conditioned on the prompt and the agent trajectory $\tau$, we use SGLang (Zheng et al., 2024a) to obtain the log probability of the next token being <YES> ($l_y$) or <NO> ($l_n$).
+We then calculate the reward as the probability of success by normalizing the log probabilities: $r = \exp(l_y) / (\exp(l_y) + \exp(l_n))$.
+Metrics.
+We report two metrics: (1) Pass@$k$, the proportion of tasks with at least one successful solution among $k$ samples, and (2) Best@$k$, the success rate of the highest-reward trajectories selected by the verifier from $k$ samples per task. Pass@$k$ measures solution discovery (an upper bound for Best@$k$); Best@$k$ evaluates verifier accuracy. Mean and variance calculations are detailed in §B.1, following Lightman et al. (2023).
+Figure 3
+:
+Increasing inference-time compute improves performance on SWE-Bench Verified with a learnt verifier.
+Both the agent and the verifier are a
+Qwen2.5-Coder-Instruct-32B
+model fine-tuned on the corresponding dataset (§
+5.1.1
+). OpenHands is used as the agent scaffold.
+Results.
+Fig. 3 shows how Pass@$k$ and Best@$k$ scale with the number of sampled agent trajectories using the fine-tuned 32B model as the agent model. Pass@$k$ demonstrates strong improvement, rising from 20.6 to 37.8% resolution rate as $k$ increases from 1 to 8, and up to 42.8% at $k = 16$.
+The Best@$k$ metric, which relies on our verifier’s ability to select the best trajectory, demonstrates more modest but steady progress, improving from a resolution rate of 20.6@1 to 29.8@8, and up to 32.0@16.
+The gap between Pass@$k$ and Best@$k$, due to the imperfect performance of our trained verifier, indicates there is room for improvement in reward modeling for coding agents.
+Surprisingly, we found that fine-tuning the verifier model using LoRA
+(Hu et al.,
+2022
+)
+(29.8@8) with Unsloth
+(Unsloth Team,
+2024
+)
+performs better than full-parameter fine-tuning (27.2@8), potentially due to regularization. Furthermore, as shown in Fig. 1 (bottom), the Best@$k$ curve exhibits strong linearity on a logarithmic scale, indicating a promising scaling behavior.
+Training data matters for verifier.
+We experiment with variations on the choice of training data for our verifier model.
+Using full-parameter fine-tuning on
+Qwen-2.5-Coder-Instruct-32B
+, we use different mixtures of on- and off-policy trajectories, as well as different distributions of successful and unsuccessful trajectories.
+As shown in Fig.
+8
+, our ablation study demonstrates that the choice of training data can significantly impact verifier performance.
+Training with a mixture of off-policy and on-policy data yields the best results (our default setting), reaching a resolution rate of 27@8.
+In contrast, using only on-policy data from the fine-tuned model shows moderate but limited improvement, while training exclusively on off-policy data from Claude and GPT leads to early performance plateaus around 22% resolution rate.
+Our findings indicate that verifier training benefits most from a diverse dataset combining both off-policy and on-policy examples.
+5.1.2
+Verifier for Specialized Workflow
+Figure 4
+:
+Scaling inference-time compute for MoatlessTools Agents (32B) with learned verifiers on SWE-Bench Lite. Temperature $t = 0.5$.
+For MoatlessTools agents with specialized workflows, given that they do not have a turn-taking action-observation trajectory like OpenHands CodeActAgent,
+
+we prepare verifier inputs through a parsing process adopted from
+Zhang et al. (
+2024a
+)
+, which combines task descriptions, relevant agent context, and generated patches (footnote 9: we provide the prompt template in §B.5).
+We train the verifier to map from this input to a single token indicating task success.
+Following the training procedure described in §
+5.1.1
+, we train 7B and 32B verifiers using on-policy trajectories from the last (2nd) round of sampling, applying LoRA
+(Hu et al.,
+2022
+)
+. To address the easy-data bias in the training dataset, we cap the number of successful trajectories per instance at two and balance the data by subsampling failure cases to match the same number of successful ones.
+Results.
+We evaluate the verifiers by sampling from an agent policy with $k = 8$ at temperature 0.5. As shown in Fig.
+4
+and Fig.
+7
+, these verifiers enable effective scaling across verifier and policy sizes: the 7B verifier improves from 10 to 13.3% resolution rate on SWE-Bench Lite when paired with a 7B policy, while the 32B verifier improves from 19.7 to 26.3% when paired with a 32B policy. The 7B verifier plateaus after $k = 4$ samples when ranking trajectories from both 7B and 32B agents. In contrast, the 32B verifier continues improving even at $k = 8$, suggesting that verifier size significantly affects scaling behavior.
+5.2
+Training-Time Scaling with Data
+We then examine how scaling the amount of training data affects agent performance using 491 sampled trajectories from §
+4.2
+. We simulate three scaling methods through subsampling: (1)
+Scaling trajectories
+, where trajectories are randomly dropped (Fig.
+5
+); (2)
+Scaling unique task instances
+, where only one successful trajectory per task instance is selected (Fig.
+9
+); and (3)
+Scaling repositories
+, which sequentially includes all instances from each repository to assess repository-level diversity.
+Setup.
+Using OpenHands
+(Wang et al.,
+2024c
+)
+and the fine-tuning approach described in §
+4.2
+, we evaluate these scaling approaches on SWE-Bench Verified: scaling the number of trajectories, by subsampling from the full trajectory dataset from §
+4.2
+(at most 491 trajectories); unique instance scaling on these trajectories deduplicated by instance ID (at most 294 trajectories), and repository-based scaling where we sort repositories alphabetically and include all trajectories from each repository in order (e.g., first 25% contains complete trajectories from the first N repositories).
+We compare models trained on 25%, 50%, and 100% of the full dataset for each approach, sampling training subsets using the methods described above for each scaling approach (footnote 10: Tab. 7 contains detailed statistics of these datasets).
+Scaling trends suggest instance and repository diversity is not yet a bottleneck.
+Fig.
+5
+demonstrates substantial scaling behavior, with consistent improvements in resolution rate as the number of training trajectories randomly increases, particularly for the 32B model.
+These results suggest that SWE-Gym’s current size and repository diversity are likely not performance bottlenecks - further improvements could likely be achieved by allocating more computing resources to sampling more training trajectories.
+Figure 5
+:
+Scaling effects of increasing the number of randomly sampled trajectories for training.
+Fig.
+9
+reveals comparable overall performance between different scaling approaches up to where deduplication takes effect. While Random Scaling (No Dedup.) achieves higher final performance, this is likely due to having more trajectories (491 vs 294) rather than better scaling efficiency.
+Among deduplicated approaches, Repository Scaling shows stronger initial performance at 25% data, suggesting that complete repository coverage may provide more coherent learning signals early in training.
+These results suggest that the repository and instance diversity of SWE-Gym is not yet a bottleneck - further improvements could likely be achieved by simply sampling more agent trajectory data for training, regardless of duplication or repository distribution.
+6
+Conclusions, Limitations, and Future Work
+In this paper, we introduce SWE-Gym, the first training environment that addresses critical gaps in enabling scalable learning for software engineering agents. By combining real-world Python tasks with repository-level context, pre-configured execution environments, and test verifications, SWE-Gym will be a foundation for advancing LM agent training research.
+Through extensive experiments, we demonstrate that SWE-Gym enables both agent and verifier models to achieve significant improvements in resolving complex software tasks. Our findings highlight the scalability of these approaches, revealing potential for continuous performance gains with increased compute.
+We see many research directions that we are excited to explore in the future:
+1.
+Automatic Environment Synthesis
+SWE-Gym, while effective, is limited by its environment diversity, including the number of repositories, types of tasks, and programming languages. We view environment synthesis—via automated environment creation, test-case generation, or task generation—as a critical next step.
+2.
+Self-Improvement with Reinforcement Learning
+Despite notable progress, our self-improvement results are modest. Training language model agents with large-scale online reinforcement learning is a promising direction for further improvements.
+3.
+Human-Agent Interaction
+Current SWE settings focus solely on task completion, neglecting human-in-the-loop collaboration, which is essential for real-world software engineering. Methods like user simulation or learning from offline human-agent interaction data might offer ways for developing collaborative agents that align with humans.
+Impact Statement
+This work presents SWE-Gym, an environment for training software engineering agents, with strong empirical results on its effectiveness. We discuss a few important societal implications to consider.
+First, improving automated software engineering capabilities could increase developers’ productivity and accessibility across industries.
+Although current models are primarily research artifacts and not yet production-ready, they can support critical open-source infrastructure and potentially make software development more accessible.
+Secondly, as these agents become more capable, they may impact software engineering jobs and require careful consideration around code ownership, licensing, and attribution.
+Additionally, while we focus on legitimate software engineering tasks, similar techniques could potentially be misused to automate the creation of malicious code.
+We encourage future work to further explore frameworks for responsible deployment of software engineering agents, including considerations around security, safety, and economic impacts.
+Acknowledgments
+We thank John Yang and Ofir Press for helpful discussions, and John Yang for assistance in reproducing data analysis results from SWE-Bench. We thank Modal Labs (footnote 11: https://modal.com/) for the GPU compute support through its Academic Credits Program.
+XW and HJ are partially supported by U.S. DARPA ITM Program No. FA8650-23-C-7316. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies, either expressed or implied, of DARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for governmental purposes notwithstanding any copyright annotation therein.
+References
+Akkaya et al. (2019)
+Akkaya, I., Andrychowicz, M., Chociej, M., Litwin, M., McGrew, B., Petron, A., Paino, A., Plappert, M., Powell, G., Ribas, R., et al.
+Solving rubik’s cube with a robot hand.
+arXiv preprint arXiv:1910.07113
+, 2019.
+Badertdinov et al. (2024)
+Badertdinov, I., Trofimova, M., Anapolskiy, Y., Abramov, S., Zainullina, K., Golubev, A., Polezhaev, S., Litvintseva, D., Karasik, S., Fisin, F., Skvortsov, S., Nekrashevich, M., Shevtsov, A., and Yangel, B.
+Scaling data collection for training software engineering agents.
+Nebius blog
+, 2024.
+Bai et al. (2024)
+Bai, H., Zhou, Y., Cemri, M., Pan, J., Suhr, A., Levine, S., and Kumar, A.
+Digirl: Training in-the-wild device-control agents with autonomous reinforcement learning.
+ArXiv
+, abs/2406.11896, 2024.
+URL
+https://api.semanticscholar.org/CorpusID:270562229
+.
+Brown et al. (2024)
+Brown, B., Juravsky, J., Ehrlich, R., Clark, R., Le, Q. V., Ré, C., and Mirhoseini, A.
+Large language monkeys: Scaling inference compute with repeated sampling.
+ArXiv
+, abs/2407.21787, 2024.
+URL
+https://api.semanticscholar.org/CorpusID:271571035
+.
+Chen et al. (2023)
+Chen, B., Shu, C., Shareghi, E., Collier, N., Narasimhan, K., and Yao, S.
+Fireact: Toward language agent fine-tuning.
+ArXiv
+, abs/2310.05915, 2023.
+URL
+https://api.semanticscholar.org/CorpusID:263829338
+.
+Chen et al. (2024)
+Chen, D., Lin, S., Zeng, M., Zan, D., Wang, J.-G., Cheshkov, A., Sun, J., Yu, H., Dong, G., Aliev, A., Wang, J., Cheng, X., Liang, G., Ma, Y., Bian, P., Xie, T., and Wang, Q.
+Coder: Issue resolving with multi-agent and task graphs.
+CoRR in ArXiv
+, abs/2406.01304, 2024.
+Chen et al. (2021)
+Chen, M., Tworek, J., Jun, H., Yuan, Q., Pondé, H., Kaplan, J., Edwards, H., Burda, Y., Joseph, N., Brockman, G., Ray, A., Puri, R., Krueger, G., Petrov, M., Khlaaf, H., Sastry, G., Mishkin, P., Chan, B., Gray, S., Ryder, N., Pavlov, M., Power, A., Kaiser, L., Bavarian, M., Winter, C., Tillet, P., Such, F. P., Cummings, D. W., Plappert, M., Chantzis, F., Barnes, E., Herbert-Voss, A., Guss, W. H., Nichol, A., Babuschkin, I., Balaji, S., Jain, S., Carr, A., Leike, J., Achiam, J., Misra, V., Morikawa, E., Radford, A., Knight, M. M., Brundage, M., Murati, M., Mayer, K., Welinder, P., McGrew, B., Amodei, D., McCandlish, S., Sutskever, I., and Zaremba, W.
+Evaluating large language models trained on code.
+ArXiv
+, abs/2107.03374, 2021.
+URL
+https://api.semanticscholar.org/CorpusID:235755472
+.
+Cobbe et al. (2021)
+Cobbe, K., Kosaraju, V., Bavarian, M., Chen, M., Jun, H., Kaiser, L., Plappert, M., Tworek, J., Hilton, J., Nakano, R., Hesse, C., and Schulman, J.
+Training verifiers to solve math word problems.
+ArXiv
+, abs/2110.14168, 2021.
+URL
+https://api.semanticscholar.org/CorpusID:239998651
+.
+Golubev et al. (2024)
+Golubev, A., Polezhaev, S., Zainullina, K., Trofimova, M., Badertdinov, I., Anapolskiy, Y., Litvintseva, D., Karasik, S., Fisin, F., Skvortsov, S., Nekrashevich, M., Shevtsov, A., Abramov, S., and Yangel, B.
+Leveraging training and search for better software engineering agents.
+Nebius blog
+, 2024.
+https://nebius.com/blog/posts/training-and-search-for-software-engineering-agents.
+Hendrycks et al. (2021a)
+Hendrycks, D., Basart, S., Kadavath, S., Mazeika, M., Arora, A., Guo, E., Burns, C., Puranik, S., He, H., Song, D., and Steinhardt, J.
+Measuring coding challenge competence with APPS.
+In Vanschoren, J. and Yeung, S. (eds.),
+Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1, NeurIPS Datasets and Benchmarks 2021, December 2021, virtual
+, 2021a.
+Hendrycks et al. (2021b)
+Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D. X., and Steinhardt, J.
+Measuring mathematical problem solving with the math dataset.
+ArXiv
+, abs/2103.03874, 2021b.
+URL
+https://api.semanticscholar.org/CorpusID:232134851
+.
+Hu et al. (2022)
+Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., and Chen, W.
+Lora: Low-rank adaptation of large language models.
+In
+The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event, April 25-29, 2022
+. OpenReview.net, 2022.
+URL
+https://openreview.net/forum?id=nZeVKeeFYf9
+.
+Hui et al. (2024a)
+Hui, B., Yang, J., Cui, Z., Yang, J., Liu, D., Zhang, L., Liu, T., Zhang, J., Yu, B., Dang, K., et al.
+Qwen2.5-coder technical report.
+arXiv preprint arXiv:2409.12186
+, 2024a.
+Hui et al. (2024b)
+Hui, B., Yang, J., Cui, Z., Yang, J., Liu, D., Zhang, L., Liu, T., Zhang, J., Yu, B., Dang, K., et al.
+Qwen2.5-coder technical report.
+arXiv preprint arXiv:2409.12186
+, 2024b.
+Jain et al. (2024)
+Jain, N., Shetty, M., Zhang, T., Han, K., Sen, K., and Stoica, I.
+R2E: turning any github repository into a programming agent environment.
+In
+Forty-first International Conference on Machine Learning, ICML 2024, Vienna, Austria, July 21-27, 2024
+. OpenReview.net, 2024.
+URL
+https://openreview.net/forum?id=kXHgEYFyf3
+.
+Jimenez et al. (2024)
+Jimenez, C. E., Yang, J., Wettig, A., Yao, S., Pei, K., Press, O., and Narasimhan, K. R.
+Swe-bench: Can language models resolve real-world github issues?
+In
+The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024
+. OpenReview.net, 2024.
+URL
+https://openreview.net/forum?id=VTF8yNQM66
+.
+Lightman et al. (2023)
+Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker, B., Lee, T., Leike, J., Schulman, J., Sutskever, I., and Cobbe, K.
+Let’s verify step by step.
+ArXiv
+, abs/2305.20050, 2023.
+URL
+https://api.semanticscholar.org/CorpusID:258987659
+.
+Ma et al. (2024)
+Ma, Y., Cao, R., Cao, Y., Zhang, Y., Chen, J., Liu, Y., Liu, Y., Li, B., Huang, F., and Li, Y.
+Lingma swe-gpt: An open development-process-centric language model for automated software improvement.
+arXiv preprint arXiv:2411.00622
+, 2024.
+Modal (2024)
+Modal.
+Modal: High-performance AI infrastructure.
+https://modal.com/
+, 2024.
+Accessed: 2024-12-18.
+Neubig & Wang (2024)
+Neubig, G. and Wang, X.
+Evaluation of LLMs as Coding Agents on SWE-Bench (at 30x Speed!).
+All Hands AI blog
+, 2024.
+Ouyang et al. (2022)
+Ouyang, L., Wu, J., Jiang, X., Almeida, D., Wainwright, C., Mishkin, P., Zhang, C., Agarwal, S., Slama, K., Ray, A., et al.
+Training language models to follow instructions with human feedback.
+Advances in neural information processing systems
+, 35:27730–27744, 2022.
+Pan et al. (2024)
+Pan, J., Zhang, Y., Tomlin, N., Zhou, Y., Levine, S., and Suhr, A.
+Autonomous evaluation and refinement of digital agents.
+ArXiv
+, abs/2404.06474, 2024.
+URL
+https://api.semanticscholar.org/CorpusID:269009430
+.
+PyTorch Team (2024)
+PyTorch Team.
+torchtune: PyTorch native post-training library.
+https://github.com/pytorch/torchtune
+, 2024.
+Qwen Team (2024)
+Qwen Team.
+Qwen2.5: A party of foundation models, September 2024.
+URL
+https://qwenlm.github.io/blog/qwen2.5/
+.
+Schulman et al. (2017)
+Schulman, J., Wolski, F., Dhariwal, P., Radford, A., and Klimov, O.
+Proximal policy optimization algorithms.
+ArXiv
+, abs/1707.06347, 2017.
+URL
+https://api.semanticscholar.org/CorpusID:28695052
+.
+Shao et al. (2024)
+Shao, Z., Wang, P., Zhu, Q., Xu, R., Song, J., Bi, X., Zhang, H., Zhang, M., Li, Y., Wu, Y., et al.
+Deepseekmath: Pushing the limits of mathematical reasoning in open language models.
+arXiv preprint arXiv:2402.03300
+, 2024.
+Silver et al. (2017)
+Silver, D., Hubert, T., Schrittwieser, J., Antonoglou, I., Lai, M., Guez, A., Lanctot, M., Sifre, L., Kumaran, D., Graepel, T., Lillicrap, T. P., Simonyan, K., and Hassabis, D.
+Mastering chess and shogi by self-play with a general reinforcement learning algorithm.
+ArXiv
+, abs/1712.01815, 2017.
+URL
+https://api.semanticscholar.org/CorpusID:33081038
+.
+Tao et al. (2024)
+Tao, N., Ventresque, A., Nallur, V., and Saber, T.
+Enhancing program synthesis with large language models using many-objective grammar-guided genetic programming.
+Algorithms
+, 17(7):287, 2024.
+doi:
+10.3390/A17070287
+.
+URL
+https://doi.org/10.3390/a17070287
+.
+Tong et al. (2024)
+Tong, Y., Zhang, X., Wang, R., Wu, R. M., and He, J.
+Dart-math: Difficulty-aware rejection tuning for mathematical problem-solving.
+ArXiv
+, abs/2407.13690, 2024.
+URL
+https://api.semanticscholar.org/CorpusID:271270574
+.
+Unsloth Team (2024)
+Unsloth Team.
+Easily finetune and train LLMs. Get faster with unsloth.
+https://unsloth.ai/
+, 2024.
+Wang et al. (2024a)
+Wang, P., Li, L., Shao, Z., Xu, R., Dai, D., Li, Y., Chen, D., Wu, Y., and Sui, Z.
+Math-shepherd: Verify and reinforce LLMs step-by-step without human annotations.
+In Ku, L.-W., Martins, A., and Srikumar, V. (eds.),
+Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
+, pp.  9426–9439, Bangkok, Thailand, August 2024a. Association for Computational Linguistics.
+doi:
+10.18653/v1/2024.acl-long.510
+.
+URL
+https://aclanthology.org/2024.acl-long.510
+.
+Wang et al. (2024b)
+Wang, X., Chen, Y., Yuan, L., Zhang, Y., Li, Y., Peng, H., and Ji, H.
+Executable code actions elicit better LLM agents.
+In
+Forty-first International Conference on Machine Learning, ICML 2024, Vienna, Austria, July 21-27, 2024
+. OpenReview.net, 2024b.
+URL
+https://openreview.net/forum?id=jJ9BoXAfFa
+.
+Wang et al. (2024c)
+Wang, X., Li, B., Song, Y., Xu, F. F., Tang, X., Zhuge, M., Pan, J., Song, Y., Li, B., Singh, J., Tran, H. H., Li, F., Ma, R., Zheng, M., Qian, B., Shao, Y., Muennighoff, N., Zhang, Y., Hui, B., Lin, J., Brennan, R., Peng, H., Ji, H., and Neubig, G.
+OpenHands: An Open Platform for AI Software Developers as Generalist Agents.
+CoRR in ArXiv
+, abs/2407.16741, 2024c.
+Wu et al. (2024)
+Wu, Z., Bai, H., Zhang, A., Gu, J., Vinod Vydiswaran, V., Jaitly, N., and Zhang, Y.
+Divide-or-conquer? which part should you distill your llm?
+ArXiv
+, 2024.
+Xi et al. (2024)
+Xi, Z., Ding, Y., Chen, W., Hong, B., Guo, H., Wang, J., Yang, D., Liao, C., Guo, X., He, W., Gao, S., Chen, L., Zheng, R., Zou, Y., Gui, T., Zhang, Q., Qiu, X., Huang, X., Wu, Z., and Jiang, Y.-G.
+Agentgym: Evolving large language model-based agents across diverse environments.
+ArXiv
+, abs/2406.04151, 2024.
+URL
+https://api.semanticscholar.org/CorpusID:270285866
+.
+Xia et al. (2024)
+Xia, C. S., Deng, Y., Dunn, S., and Zhang, L.
+Agentless: Demystifying llm-based software engineering agents.
+CoRR
+, abs/2407.01489, 2024.
+doi:
+10.48550/ARXIV.2407.01489
+.
+URL
+https://doi.org/10.48550/arXiv.2407.01489
+.
+Yang et al. (2024)
+Yang, J., Jimenez, C. E., Wettig, A., Lieret, K., Yao, S., Narasimhan, K., and Press, O.
+Swe-agent: Agent-computer interfaces enable automated software engineering.
+CoRR
+, abs/2405.15793, 2024.
+doi:
+10.48550/ARXIV.2405.15793
+.
+URL
+https://doi.org/10.48550/arXiv.2405.15793
+.
+Yao et al. (2023)
+Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K. R., and Cao, Y.
+React: Synergizing reasoning and acting in language models.
+In
+The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1-5, 2023
+. OpenReview.net, 2023.
+URL
+https://openreview.net/forum?id=WE_vluYUL-X
+.
+Yuan et al. (2024)
+Yuan, L., Cui, G., Wang, H., Ding, N., Wang, X., Deng, J., Shan, B., Chen, H., Xie, R., Lin, Y., Liu, Z., Zhou, B., Peng, H., Liu, Z., and Sun, M.
+Advancing LLM reasoning generalists with preference trees.
+CoRR
+, abs/2404.02078, 2024.
+doi:
+10.48550/ARXIV.2404.02078
+.
+URL
+https://doi.org/10.48550/arXiv.2404.02078
+.
+Zeng et al. (2023)
+Zeng, A., Liu, M., Lu, R., Wang, B., Liu, X., Dong, Y., and Tang, J.
+Agenttuning: Enabling generalized agent abilities for llms.
+In
+Annual Meeting of the Association for Computational Linguistics
+, 2023.
+URL
+https://api.semanticscholar.org/CorpusID:264306101
+.
+Zhai et al. (2024)
+Zhai, Y., Bai, H., Lin, Z., Pan, J., Tong, S., Zhou, Y., Suhr, A., Xie, S., LeCun, Y., Ma, Y., and Levine, S.
+Fine-tuning large vision-language models as decision-making agents via reinforcement learning.
+ArXiv
+, abs/2405.10292, 2024.
+URL
+https://api.semanticscholar.org/CorpusID:269790773
+.
+Zhang et al. (2024a)
+Zhang, K., Yao, W., Liu, Z., Feng, Y., Liu, Z., Murthy, R., Lan, T., Li, L., Lou, R., Xu, J., Pang, B., Zhou, Y., Heinecke, S., Savarese, S., Wang, H., and Xiong, C.
+Diversity empowers intelligence: Integrating expertise of software engineering agents.
+ArXiv
+, abs/2408.07060, 2024a.
+URL
+https://api.semanticscholar.org/CorpusID:271860093
+.
+Zhang et al. (2024b)
+Zhang, Y., Ruan, H., Fan, Z., and Roychoudhury, A.
+Autocoderover: Autonomous program improvement.
+In
+ISSTA
+, 2024b.
+Zhao et al. (2024)
+Zhao, W., Jiang, N., Lee, C., Chiu, J. T., Cardie, C., Gallé, M., and Rush, A. M.
+Commit0: Library generation from scratch, 2024.
+URL
+https://arxiv.org/abs/2412.01769
+.
+Zheng et al. (2024a)
+Zheng, L., Yin, L., Xie, Z., Sun, C., Huang, J., Yu, C. H., Cao, S., Kozyrakis, C., Stoica, I., Gonzalez, J. E., Barrett, C., and Sheng, Y.
+Sglang: Efficient execution of structured language model programs, 2024a.
+URL
+https://arxiv.org/abs/2312.07104
+.
+Zheng et al. (2024b)
+Zheng, T., Zhang, G., Shen, T., Liu, X., Lin, B. Y., Fu, J., Chen, W., and Yue, X.
+Opencodeinterpreter: Integrating code generation with execution and refinement.
+ArXiv
+, abs/2402.14658, 2024b.
+URL
+https://api.semanticscholar.org/CorpusID:267782452
+.
+Örwall (2024)
+Örwall, A.
+Moatless Tool.
+https://github.com/aorwall/moatless-tools
+, 2024.
+Accessed: 2024-10-22.
+Appendix A
+Comparison with Concurrent Works
+Ma et al. (
+2024
+)
+trains an LM agent, Lingma SWE-GPT, using a method similar to our rejection sampling fine-tuning baseline, with a dataset comparable to our SWE-Gym Raw splits. Without executable unit test feedback, they rely on manually defined heuristics to filter out low-quality trajectories, such as comparing similarity between submitted patches and edit locations with gold patches.
+The model weights are publicly accessible but not the training pipeline or the dataset.
+Most relevant to our work are two consecutive blog posts by
+Golubev et al. (
+2024
+)
+and
+Badertdinov et al. (
+2024
+)
+, who also construct an executable training environment with real-world tasks from GitHub.
+Instead of manual configuration, they employ a general environment setup script and simply discard instances that fail the setup process.
+This approach leads to key differences in dataset size and distribution: while it biases the environment away from tasks with complex dependencies, they successfully collect 6,415 instances, about 1.5 times larger than our dataset.
+In
+Golubev et al. (
+2024
+)
+, they also study training agents and verifiers with the environment.
+Additionally, they explore a lookahead setting where a trained verifier ranks and selects the best next action.
+With a substantially larger collection of agent trajectories (80,036 compared to thousands in our experiments) and a larger model (72B compared to 32B),
+their best system achieves 40% accuracy on SWE-Bench Verified.
+While their dataset and agent trajectories are publicly accessible, the model is not.
+In comparison, with a comparable dataset size, our SWE-Gym has executable feedback, avoids potential dataset bias through manual configuration of environments, while providing comprehensive analysis of agent and verifier training, their scaling behaviors, and positive results on agent self-improvement. Our system achieves competitive results with significantly lower compute and a smaller model size (32B vs 72B). Lastly, we open source all artifacts of the project, including dataset, model weights, agent trajectory data and the training pipeline.
+Model
+SWE-Bench
+Openness
+Name, Model Size
+Lite
+Verified
+Model
+Environment
+Ma et al. (
+2024
+)
+, 72B
+22.0
+30.2
+✓
+✗
+Golubev et al. (
+2024
+)
+Agent and Verifier, 72B
+-
+40.6
+✗
+✓
+Our SWE-Gym Agent and Verifier, 32B
+26.0
+32.0
+✓
+✓
+Table 5
+:
+Comparison of model performance on the SWE-Bench benchmark and whether the model weights and environments are publicly accessible (openness).
+Cap
+# Traj
+Empty Patch ($\%,\downarrow$)
+resolution rate ($\%,\uparrow$)
+0 (Zero-shot)
+0
+56.3
+7.0
+1
+36
+37.3
+9.0
+2
+62
+29.0
+9.7
+3
+82
+43.7
+7.7
+No Cap (All)
+172
+30.7
+9.3
+Table 6:
+Resolution rate and empty patch rate on SWE-Bench Lite with a 7B model trained using different instance capping strategies (Cap).
+Figure 6
+:
+Success distribution over 30 rounds on SWE-Gym Lite with 7B model in zero-shot. The distribution is naturally biased toward easy tasks. Per instance capping reduces this bias but lowers the total trajectory count for training. We set temperature
+$t=1$
+during sampling.
+Figure 7
+:
+Scaling inference-time compute for MoatlessTools Agents (7B and 32B) with their corresponding learned verifiers. Temperature
+$t=0.5$
+.
+Figure 8
+:
+Ablation study for verifier training (§
+5.1.1
+). Performances are evaluated on SWE-Bench Verified.
+Both the agent and the verifier are
+Qwen2.5-Coder-Instruct-32B
+model fine-tuned on the corresponding dataset.
+OpenHands
+(Wang et al.,
+2024c
+)
+is used as the agent scaffold.
+Figure 9
+:
+Comparison of three data sampling approaches using 32B LM:
+scaling trajectories (dedup.), scaling unique task instances, and scaling repositories (§
+5.2
+).
+Appendix B
+Experiment Details
+Original
+Dedup.
+Sorted by Random (Dedup.)
+Sorted by Repo (Dedup.)
+First 25%
+First 50%
+First 25%
+First 50%
+getmoto/moto
+155
+72
+12
+33
+0
+46
+Project-MONAI/MONAI
+95
+53
+17
+25
+53
+53
+pandas-dev/pandas
+70
+61
+14
+30
+0
+0
+python/mypy
+46
+27
+7
+12
+0
+0
+dask/dask
+45
+29
+8
+17
+6
+29
+iterative/dvc
+36
+24
+8
+12
+0
+0
+conan-io/conan
+20
+12
+1
+7
+12
+12
+pydantic/pydantic
+11
+7
+2
+4
+0
+0
+facebookresearch/hydra
+7
+5
+2
+5
+0
+5
+bokeh/bokeh
+3
+2
+1
+1
+2
+2
+modin-project/modin
+3
+2
+1
+1
+0
+0
+Total
+491
+294
+73
+147
+73
+147
+Table 7
+:
+Distribution of success trajectories used in training-time scaling experiments (§
+5.2
+).
+Dedup.
+denotes that the trajectories are deduplicated by randomly selecting ONE success trajectory per instance ID;
+Sorted by random (repo) X% (Dedup.)
+denotes a subset of trajectories taken from the first X% from dedup. instances that are sorted randomly (by repository name).
+Percentiles
+Resolved
+Count
+Mean
+Std
+Min
+Max
+5%
+10%
+25%
+50%
+75%
+90%
+95%
+Num. of Messages
+✗
+5,557.0
+39.2
+31.9
+7.0
+101.0
+9.0
+9.0
+9.0
+29.0
+61.0
+100.0
+101.0
+✓
+491.0
+39.9
+19.9
+13.0
+101.0
+19.0
+21.0
+25.0
+33.0
+47.5
+65.0
+87.0
+Num. of Tokens
+✗
+5,557.0
+17,218.3
+17,761.6
+1,615.0
+167,834.0
+1,833.0
+1,907.0
+2,268.0
+12,305.0
+26,434.0
+41,182.2
+51,780.6
+✓
+491.0
+18,578.5
+11,361.4
+2,560.0
+81,245.0
+5,813.0
+8,357.0
+11,559.5
+15,999.0
+22,040.5
+31,632.0
+39,512.5
+Table 8
+:
+Statistics of SWE-Gym-sampled trajectories. We use the tokenizer from
+Qwen-2.5-Coder-Instruct-7B
+to estimate the number of tokens.
+Agent
+Model
+Model Size
+Training Data
+Resolved
+($\%$)
+SWE-Bench Verified (500 instances)
+RAG
+SWE-Llama
+(Jimenez et al.,
+2024
+)
+7B
+10K instances
+1.4
+RAG
+SWE-Llama
+(Jimenez et al.,
+2024
+)
+13B
+10K instances
+1.2
+Lingma Agent
+(Ma et al.,
+2024
+)
+Lingma SWE-GPT (v0925)
+7B
+90K PRs from 4K repos
+18.2
+Lingma Agent
+(Ma et al.,
+2024
+)
+Lingma SWE-GPT (v0925)
+72B
+90K PRs from 4K repos
+28.8
+OpenHands
+(Wang et al.,
+2024c
+)
+(Ours)
+fine-tuned Qwen2.5-Coder-Instruct
+32B
+491 agent trajectories from 11 repos
+20.6
+OpenHands w/ Verifier
+(Wang et al.,
+2024c
+)
+(Ours)
+fine-tuned Qwen2.5-Coder-Instruct
+32B (Agent & Verifier)
+491 agent trajectories from 11 repos for agent + $1318\times 2$ success/failure agent trajectories for verifier
+32.0
+Table 9
+:
+Performance comparison with SWE-Bench
+(Jimenez et al.,
+2024
+)
+baselines
+with publicly accessible weights
+.
+Data source:
+https://www.swebench.com/
+, Accessed on Dec 21, 2024.
+B.1
+Mean and Variance for Pass@N and Best@N.
+We mostly follow
+(Lightman et al.,
+2023
+)
+for obtaining the mean and variance for the Pass@N and Best@N curves. Given a total of $M$ rounds of rollouts, for $N<M$,
+we calculate the mean and variance across 100 randomly selected sub-samples of size $N$ from the $M$ rollouts. For the OpenHands CodeActAgent inference-time scaling curve at §
+3
+, we exclude this calculation for N=1, as we use a temperature of 0 for the first attempt.
+B.2
+OpenHands Agent Experiments
+During training, we use OpenHands’s remote runtime
+(Neubig & Wang,
+2024
+)
+feature to execute agent trajectories in parallel on SWE-Gym.
+We use
+torchtune
+(PyTorch Team,
+2024
+)
+for full parameter fine-tuning with a learning rate of
+1e-4
+, maximum 5 epochs, global batch size of 8, max context length of
+32768
+. We fine-tuned the 7B, 14B, and 32B variants of the model, and experiments were performed with 2-8x NVIDIA H100 80G GPUs on Modal
+(Modal,
+2024
+)
+.
+The only exception is in the main experiment of §
+5.1.1
+, where we use LoRA
+(Hu et al.,
+2022
+)
+(29.8% $@8$) via Unsloth library
+(Unsloth Team,
+2024
+)
+to train the verifier for max 2 epochs, while the other hyper-parameters stay the same.
+Inference during evaluation is bounded by either 100 interaction turns or the base LM’s 32k context window length, whichever is reached first.
+B.3
+MoatlessTools Agent Experiments
+All MoatlessTools models are trained with a context window of
+10240
+.
+For experiments with the 7B model, we use torchtune to train the policy model with full-finetuning using 4 H100 GPUs. We set batch size to 8, learning rate to
+$2\times 10^{-5}$
+, and train for 5 epochs.
+For the 32B model, we use Unsloth
+(Unsloth Team,
+2024
+)
+with a single H100 GPU for LoRA fine-tuning. We set the number of epochs to 5, batch size to 8, LoRA rank to 64, and learning rate to
+$5\times 10^{-4}$
+. We use the same configuration for verifier training.
+For MoatlessAgent experiments, we serve the agent with FP8 quantization for improved throughput, which we found to have minimal effects on model performance.
+B.4
+Details of OpenHands Trajectory Sampling
+Trajectory Set
+Sampled from Model
+Sampled on Dataset
+Temperature
+Max Turns
+Success trajectories
+$D_{0}$
+gpt-4o-2024-08-06
+SWE-Gym Lite
+0
+30
+19 ($8.26\%$)
+(Cumulative) Total
+$D_{0}$
+$\mathbf{19}$
+$D_{1}\setminus D_{0}$
+gpt-4o-2024-08-06
+SWE-Gym Lite
+0.2
+30
+11 ($4.78\%$)
+gpt-4o-2024-08-06
+SWE-Gym Lite
+0.3
+30
+17 ($7.39\%$)
+gpt-4o-2024-08-06
+SWE-Gym Lite
+0.4
+30
+21 ($9.13\%$)
+gpt-4o-2024-08-06
+SWE-Gym Lite
+0.5
+30
+18 ($7.83\%$)
+gpt-4o-2024-08-06
+SWE-Gym Lite
+0.8
+30
+20 ($8.70\%$)
+(Cumulative) Total
+$D_{1}$
+$\mathbf{106}$
+$D_{2}\setminus D_{1}$
+gpt-4o-2024-08-06
+SWE-Gym Lite
+0
+50
+19 ($8.26\%$)
+claude-3-5-sonnet-20241022
+SWE-Gym Lite
+0
+50
+67 ($29.1\%$)
+gpt-4o-2024-08-06
+SWE-Gym Full
+0
+50
+∗
+111 ($4.55\%$)
+gpt-4o-2024-08-06
+SWE-Gym Full
+1
+50
+188 ($7.71\%$)
+(Cumulative) Total
+$D_{2}$
+$\mathbf{491}$
+*
+Ran into infrastructure-related errors where some instances failed to complete; this number might be an underestimate of the actual number of success trajectories.
+Table 10
+:
+Summary of trajectories sampled from SWE-Gym.
+As detailed in Tab.
+10
+, we collect a few sets of trajectories for fine-tuning experiments.
+We collect dataset $D_{0}$ by sampling
+gpt-4o-2024-08-06
+on SWE-Gym Lite with temperature 0, obtaining 19 trajectories that eventually solve the task (evaluated by unit tests in SWE-Gym).
+We then varied the temperatures (setting
+t={0.2, 0.3, 0.4, 0.5, 0.8}
+) and sample on SWE-Gym Lite. Combining these instances with $D_{0}$, we get 106 trajectories that solve the given problem ($D_{1}$).
+We set the maximum number of turns to be 30 for both $D_{0}$ and $D_{1}$.
+To study the effect of the maximum number of turns, we set the max number of turns to 50 and sample
+gpt-4o-2024-08-06
+(19 resolved out of 230) and
+claude-3-5-sonnet-20241022
+(67 resolved out of 230) with temperature 0 on SWE-Gym Lite, and sample
+gpt-4o-2024-08-06
+(temperature
+t={0, 1}
+) on SWE-Gym full set (in total 299 resolved out of 4876 instances).
+This gives us a total of 106 + 19 + 67 + 299 = 491 success trajectories, which form our final training trajectories $D_{2}$.
+B.5
+MoatlessTools ORM Prompt
+The following is pseudo-code that generates a prompt for the MoatlessTools Verifier (ORM), which is modified from
+(Zhang et al.,
+2024a
+)
+. Unlike
+(Zhang et al.,
+2024a
+)
+, which relies on proprietary models like Claude-3.5-Sonnet for context extraction, we obtain context directly from the agent’s trajectory being evaluated.
+\inputminted[breaklines]{python}{assets/moatless-orm.py}
+B.6
+OpenHands ORM Prompt
+The following is pseudo-code that generates a prompt for the OpenHands Verifier (ORM).
+\inputminted[breaklines]{python}{assets/openhands-orm.py}
+The last assistant message, which contains the judgement, is only provided at training time. At inference time, the trained verifier is responsible for predicting the probability of ‘Yes’ and ‘No’.
\ No newline at end of file
diff --git a/research/notes/understanding-r1-zero-like-training-a-critical-perspective.md b/research/notes/understanding-r1-zero-like-training-a-critical-perspective.md
new file mode 100644
index 0000000000000000000000000000000000000000..963abdcacb851a039989c16762de804ea455624e
--- /dev/null
+++ b/research/notes/understanding-r1-zero-like-training-a-critical-perspective.md
@@ -0,0 +1,3747 @@
+---
+title: 'Understanding R1-Zero-Like Training: A Critical Perspective'
+id: understanding-r1-zero-like-training-a-critical-perspective
+tags:
+- deepread
+created: '2026-06-10T00:30:45.532370Z'
+source: https://arxiv.org/html/2503.20783
+source_domain: arxiv.org
+fetched_at: '2026-06-10T00:30:45.532229Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: institutional
+content_type: paper
+deprecated: false
+---
+
+Understanding R1-Zero-Like Training: A Critical Perspective
+Understanding R1-Zero-Like Training: A Critical Perspective
+Zichen Liu<sup>*†1,2</sup>, Changyu Chen<sup>*1,3</sup>, Wenjun Li<sup>*3</sup>, Penghui Qi<sup>*1,2</sup>, Tianyu Pang<sup>1</sup>, Chao Du<sup>1</sup>, Wee Sun Lee<sup>2</sup>, Min Lin<sup>1</sup>
+<sup>1</sup>Sea AI Lab, <sup>2</sup>National University of Singapore, <sup>3</sup>Singapore Management University
+<sup>*</sup>Core Contributors. <sup>†</sup>Project Lead.
+Abstract
+DeepSeek-R1-Zero has shown that reinforcement learning (RL) at scale can directly enhance the reasoning capabilities of LLMs without supervised fine-tuning. In this work, we critically examine R1-Zero-like training by analyzing its two core components:
+base models
+and
+RL
+. We investigate a wide range of base models, including DeepSeek-V3-Base, to understand how pretraining characteristics influence RL performance. Our analysis reveals that
+DeepSeek-V3-Base already exhibit “Aha moment”
+, while
+Qwen2.5 base models demonstrate strong reasoning capabilities even without prompt templates
+, suggesting potential pretraining biases. Additionally, we identify an optimization bias in Group Relative Policy Optimization (GRPO), which artificially increases response length (especially for incorrect outputs) during training. To address this, we introduce
+Dr. GRPO
+, an unbiased optimization method that improves token efficiency while maintaining reasoning performance. Leveraging these insights, we present a minimalist R1-Zero recipe that achieves
+$43.3\%$
+accuracy on AIME 2024 with a 7B base model, establishing a new state-of-the-art.
+https://github.com/sail-sg/understand-r1-zero
+(Footnote 1: Developed with the LLM RL framework Oat: https://github.com/sail-sg/oat.)
+Figure 1:
+Left
+: Dr. GRPO introduces simple yet significant modifications to address the biases in GRPO
+(Shao et al.,
+2024
+)
+, by removing the length and std normalization terms.
+Right
+: Our unbiased optimizer effectively prevents the model from generating progressively longer incorrect responses, thereby enhancing token efficiency.
+Figure 2:
+Model performance comparison.
+Oat-Zero-7B
+is RL-tuned with our minimalist recipe described in Sec.
+1
+(third paragraph). Please see
+App.
+B
+for more results.
+1
+Introduction
+DeepSeek-R1-Zero
+(Guo et al.,
+2025
+)
+revolutionizes the pipeline of large language model (LLM) post-training by introducing the
+R1-Zero-like training paradigm
+: directly applying RL to base LLMs without relying on supervised fine-tuning (SFT) as a preliminary step. This new paradigm is appealing due to its simplicity and the demonstrated
+RL scaling phenomenon
+: the model reasoning capabilities improve along with a continual increase in model’s response length. This phenomenon is also accompanied by the “Aha moment”, at which the model learns emergent skills such as self-reflections.
+In this paper, we aim to understand R1-Zero-like training by studying two essential components:
+base models
+and
+RL
+. In the first part, we investigate various attributes of base models, with the focus on the
+Qwen2.5
+model family
+(Yang et al.,
+2024a
+;
+b
+)
+, which has been used in recent attempts to reproduce R1-Zero
+(Pan et al.,
+2025
+; Zeng et al.,
+2025
+; Liu et al.,
+2025b
+; Hu et al.,
+2025
+)
+, as well as
+DeepSeek-V3-Base
+(Liu et al.,
+2024
+)
+, from which the real R1-Zero model was RL-tuned. In the second part, we identify the
+bias in optimization of GRPO
+(Shao et al.,
+2024
+)
+, which may lead to progressively longer
+incorrect
+responses. To this end, we propose a simple modification to eliminate the bias, i.e., to get GRPO Done Right (
+Dr. GRPO
+), which leads to
+better token efficiency
+(highlighted in
+Fig.
+1
+).
+Our analysis on base models and RL suggests a
+minimalist recipe
+for R1-Zero-like training: we RL-tune Qwen2.5-Math-7B using the (unbiased) Dr. GRPO algorithm on MATH
+(Hendrycks et al.,
+2021
+)
+level 3-5 questions with the Qwen-Math template, and achieve state-of-the-art performance (
+Fig.
+2
+) with only
+$27$
+hours compute on
+$8\times$
+A100 GPUs. We hope our findings presented in this paper, models released, and the codebase open-sourced could benefit future research in the field.
+As an overview, we summarize the takeaways of this paper below:
+Overview of takeaways
+•
+(
+Sec.
+2.1
+) Template is crucial to make base models
+answer questions
+instead of completing sentences. In addition, all base models already possess math-solving capability prior to RL.
+•
+(
+Sec.
+2.2
+) Intriguingly, Qwen-2.5 base models get an
+immediate
+$\sim 60\%$
+improvement by not using template
+, making us hypothesize that they may pretrain on concatenated question-answer texts when cooking the models.
+•
+(
+Sec.
+2.3
+) Nearly all base models already exhibit the “Aha moment”,
+including DeepSeek-V3-Base
+.
+•
+(
+Sec.
+3.1
+,
+Sec.
+3.2
+) Dr. GRPO effectively fixes GRPO’s bias in optimization, achieving
+better token efficiency
+.
+•
+(
+Sec.
+3.3
+) Model-template
+mismatch
+can destroy reasoning capabilities before RL reconstructs it.
+•
+(
+Sec.
+3.4
+)
+Math pretraining on Llama-3.2-3B
+improves its RL ceiling.
+2
+Analysis on Base Models
+In this section, we scrutinize a wide range of base models, including the Qwen-2.5 family
+(Yang et al.,
+2024a
+;
+b
+)
+, Llama-3.1
+(Grattafiori et al.,
+2024
+)
+and DeepSeek series
+(Liu et al.,
+2024
+; Shao et al.,
+2024
+; Guo et al.,
+2025
+)
+, asking them
+$500$
+questions sampled from the MATH
+(Hendrycks et al.,
+2021
+)
+training set and analyzing their responses.
+2.1
+R1-Zero Trainability: Templates Construct Exploratory Base Policies
+Since training from a base model is a fundamental setting of the R1-Zero-like paradigm, we first investigate whether widely used open-source base models, which are typically trained for sentence completion (i.e., $p_{\theta}(\mathbf{x})$), can have their question-answering capabilities effectively elicited through appropriate templates, thereby functioning as a question-answering base policy $\pi_{\theta}(\cdot|\mathbf{q})$.
+In addition to the
+R1 template
+(
+Template
+1
+) in
+Guo et al. (
+2025
+)
+, we consider the
+Qwen-Math template
+(
+Template
+2
+) used by
+Zeng et al. (
+2025
+)
+, as well as
+No template
+(
+Template
+3
+):
+Template 1
+(
+R1 template
+)
+.
+```text
+A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.\nUser: {question}\nAssistant: <think>
+```
+Template 2
+(
+Qwen-Math template
+)
+.
+```text
+<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n
+```
+Template 3
+(
+No template
+)
+.
+{question}
+Experimental settings
+. We include Qwen2.5-Math-1.5B, Qwen2.5-Math-7B, Qwen2.5-7B, Llama-3.1-8B, DeepSeek-Math-7B and DeepSeek-V3-Base-685B for experiments. For each model, we first apply
+No template
+to get the model responses, then let GPT-4o-mini judge whether the model responses are in an answering format (regardless of quality) or in a sentence-completion pattern. We record the percentage of responses that tend to answer the question as the metric. We then apply both
+R1 template
+and
+Qwen-Math template
+to obtain model responses, and determine the most suitable template for each model based on the metric. Finally, we evaluate the pass@8 accuracy of each model with the corresponding template to assess whether the base policies can explore rewarding trajectories for RL improvement.
+Figure 3:
+Model attributes across three aspects.
+Question-Answering Ability
+: the extent to which a pretrained language model provides a direct answer to a question rather than continuing or expanding upon it;
+Exploration Ability
+: pass@8 measures how well base models explore;
+Self-Reflection
+: counts are obtained through cross-validation between keyword-based detection and LLM-based detection, as detailed in Appendix
+D
+.
+Results
+. The left plot of
+Fig.
+3
+shows how well base models (with or without templates) answer the provided questions. We observe that Llama and DeepSeek models all improve the answering ability by employing the proper template (R1 template). However, Qwen2.5 models work best (with
+$100\%$
+answering rate) when no template is used. This intriguing property motivates further investigation which will be discussed in
+Sec.
+2.2
+. Meanwhile, the lowest answering rate with no template suggests that DeepSeek-V3-Base is a nearly pure base model. This observation motivates us to explore whether a pure base model like DeepSeek-V3-Base demonstrates the Aha moment (
+Sec.
+2.3
+).
+The middle plot of
+Fig.
+3
+shows the pass@8 accuracy of different base models (with template) at different sampling temperatures. This metric can serve as an indicator of base policy’s exploration ability. For example, if a base policy cannot even sample a single trajectory that leads to the correct final answer, it is impossible for RL to improve the policy because there is no reward signal. Our results demonstrate that all tested models are exploratory (thus ready for RL), with Qwen2.5 models performing the best (even surpassing DeepSeek-V3-Base). This might partially explain why most R1-Zero projects
+(Zeng et al.,
+2025
+; Hu et al.,
+2025
+)
+are based on Qwen2.5 models.
+2.2
+Qwen-2.5 Models Unlock the Best Performance When Discarding Template
+We next dig into the intriguing observation (c.f.
+Fig.
+3
+(Left)) that all Qwen2.5 base models readily serve as chat models even without any template. We take a step further to evaluate the reasoning ability of Qwen2.5-Math models on five standard benchmarks: AIME 2024
+(Li et al.,
+2024a
+)
+, AMC
+(Li et al.,
+2024a
+)
+, MATH500
+(Hendrycks et al.,
+2021
+)
+, Minerva Math
+(Lewkowycz et al.,
+2022
+)
+, and OlympiadBench
+(He et al.,
+2024
+)
+. Following common practice, we use greedy decoding and limit the sampling budget to 3000 tokens.
+Base model + Template
+AIME24
+AMC
+MATH500
+Minerva
+OlympiadBench
+Avg.
+Qwen2.5-Math-1.5B
+(4-shot prompting)
+0.0
+20.0
+50.4
+12.1
+15.9
+19.7
+R1 template
+0.0
+9.6
+21.2
+6.6
+2.2
+7.9
+Qwen template
+20.0
+32.5
+33.0
+12.5
+22.8
+24.2
+No template
+16.7
+43.4
+61.8
+15.1
+28.4
+33.1
+Qwen2.5-Math-7B
+(4-shot prompting)
+3.3
+22.5
+61.6
+10.7
+20.9
+23.8
+R1 template
+0.0
+0.0
+0.0
+0.0
+0.1
+0.0
+Qwen template
+16.7
+38.6
+50.6
+9.9
+16.6
+26.5
+No template
+0.2
+45.8
+69.0
+21.3
+34.7
+38.2
+Table 1:
+Qwen2.5-Math models might be pretrained on concatenated question-answer text, resulting in peak performance when
+no template
+is applied.
+As shown in
+Table
+1
+, not using any template can drastically boost the average performance, resulting in an improvement of about
+$60\%$
+compared to the traditional 4-shot prompting. Since Qwen2.5-Math
+(Yang et al.,
+2024b
+)
+uses chat model’s data (question-answer pairs) during the pretraining stage, we hypothesize that they might pretrain on the concatenated text to maximize
+$\log p_{\theta}(\mathbf{q};\mathbf{o})$
+directly. If our hypothesis turns out true, we shall be more careful about using Qwen2.5 models to reproduce DeepSeek-R1-Zero, since the base models are already SFT-like without templates.
+2.3
+Aha Moment Already Appears in Base Models Including DeepSeek-V3-Base
+One of the most inspiring results of DeepSeek-R1-Zero is the emergence of self-reflection behaviors, a.k.a., Aha moment, through pure RL training. A few prior studies
+(Liu et al.,
+2025b
+; Yeo et al.,
+2025
+)
+have suggested that there may not be Aha moment in open-source R1 replications because the base models they use already exhibit self-reflection keywords. However, they have not tested DeepSeek-V3-Base, on which the real R1-Zero model was RL-tuned. We complete this missing piece by hosting DeepSeek-V3-Base-685B ourselves and investigating its responses to the
+$500$
+MATH questions with the R1 template. From the right plot of
+Fig.
+3
+, we can observe that DeepSeek-V3-Base also generates a decent amount of self-reflections, further validating the claims of
+Liu et al. (
+2025b
+)
+. We also show examples in
+App.
+E
+(
+Fig.
+13
+) where DeepSeek-V3-Base generates keywords such as “Aha” and “wait”.
+An additional important question is whether self-reflection behaviors are associated with improved model performance after RL training. To investigate this, we host DeepSeek-R1-Zero and analyze its responses to the same questions from the MATH dataset. Although self-reflection behaviors occur more frequently in R1-Zero, we observe that these behaviors are not positively correlated with higher accuracy. Detailed analysis can be found in
+App.
+F
+.
+3
+Analysis on Reinforcement Learning
+Language model generation can be formulated as a token-level Markov Decision Process (MDP) $\mathcal{M}=(\mathcal{S},\mathcal{A},r,p_{\mathcal{Q}})$.
+At each generation step $t$, the state $s_{t}\in\mathcal{S}$ is the concatenation of the input question and the output response generated so far: $s_{t}=\mathbf{q};\mathbf{o}_{<t}=[q_{1},\dots,q_{M},o_{1},\dots,o_{t-1}]$. The policy $\pi_{\theta}(\cdot|s_{t})$ will select the next token $o_{t}$ from the vocabulary $\mathcal{A}$, resulting in a deterministic transition to the next state $s_{t+1}=s_{t};[o_{t}]$. The generation process starts from sampling an initial state $s_{1}=\mathbf{q}\sim p_{\mathcal{Q}}$ from a set of questions, and stops when the autoregressive policy generates the [eos] token or exhausts the budget.
+Typically, we maximize the entropy-regularized objective
+(Schulman et al.,
+2017a
+)
+:
+$$\mathcal{J}(\pi_{\theta})=\underset{\mathbf{q}\sim p_{\mathcal{Q}}}{\mathbb{E}}\left[\underset{\mathbf{o}\sim\pi_{\theta}(\cdot|\mathbf{q})}{\mathbb{E}}[R(\mathbf{q},\mathbf{o})]-\beta\,\mathbb{D}_{KL}\left[\pi_{\theta}(\cdot|\mathbf{q})\,\|\,\pi_{\text{ref}}(\cdot|\mathbf{q})\right]\right],\tag{1}$$
+where $R(\mathbf{q},\mathbf{o})=\sum_{t=1}^{|\mathbf{o}|}r(s_{t},o_{t})$ is the return (Sutton & Barto, 2018) of the trajectory $\mathbf{q};\mathbf{o}$, and $\pi_{\text{ref}}$ is a reference policy.
+The KL regularization term is usually adopted ($\beta>0$) for reinforcement learning from human feedback (Christiano et al., 2017), where $r$ is a reward model learned from data collected by $\pi_{\text{ref}}$. In this case, regularization helps prevent $\pi_{\theta}$ from deviating too far from the distribution where the reward model is accurate (Jaques et al., 2019; Stiennon et al., 2020).
+However, RL-tuning reasoning models typically employs rule-based verifiers as $r$ (Lambert et al., 2024), eliminating the concerns of distributional shift. This allows us to remove the KL term, which not only saves the memory and computation required by $\pi_{\text{ref}}$ during training, but also potentially leads to better performance for R1-Zero-like training (Hu et al., 2025). We will assume $\beta=0$ throughout this paper.
+Policy optimization algorithms. To optimize $\pi_{\theta}$ with the above objective (Eq. 1 with $\beta=0$), Proximal Policy Optimization (PPO) (Schulman et al., 2017b) maximizes the following surrogate objective:
+$$\begin{split}\mathcal{J}_{PPO}(\pi_{\theta})&=\mathbb{E}_{\mathbf{q}\sim p_{\mathcal{Q}},\,\mathbf{o}\sim\pi_{\theta_{\text{old}}}(\cdot|\mathbf{q})}\\&\sum_{t=1}^{|\mathbf{o}|}\left\{\min\left[\frac{\pi_{\theta}(o_{t}|\mathbf{q},\mathbf{o}_{<t})}{\pi_{\theta_{\text{old}}}(o_{t}|\mathbf{q},\mathbf{o}_{<t})}\hat{A}_{t},\ \text{clip}\left(\frac{\pi_{\theta}(o_{t}|\mathbf{q},\mathbf{o}_{<t})}{\pi_{\theta_{\text{old}}}(o_{t}|\mathbf{q},\mathbf{o}_{<t})},1-\epsilon,1+\epsilon\right)\hat{A}_{t}\right]\right\},\end{split}\tag{2}$$
+where $\pi_{\theta_{\text{old}}}$ is the policy before the update, $\epsilon$ is the clipping hyperparameter, and $\hat{A}_{t}$ is an estimator of the advantage function of the $t$-th token. A standard way to estimate $\hat{A}_{t}$ is to compute the Generalized Advantage Estimation (GAE) (Schulman et al., 2015) with a learned value model $V_{\phi}$. However, in the context of LLM RL-tuning, learning the value model is computationally expensive, so methods that estimate $\hat{A}_{t}$ without $V_{\phi}$ are practically preferred. For example, Shao et al. (2024) proposed GRPO, which first samples a group of responses $\{\mathbf{o}_{1},\dots,\mathbf{o}_{G}\}$ per question and computes their returns $\mathbf{R}=\{R_{1},\dots,R_{G}\}$, then sets the advantage of all tokens from $\mathbf{o}_{i}$ as $\hat{A}_{t}=\frac{R_{i}-\operatorname{mean}(\mathbf{R})}{\operatorname{std}(\mathbf{R})}$.
+3.1
+GRPO Leads to Biased Optimization
+In Deepseek-R1-Zero
+(Guo et al.,
+2025
+)
+, a notable trend is the consistent increase in response length throughout the training process. This is frequently interpreted as an indication of the development of advanced reasoning abilities such as self-reflection. Recent studies
+(Pan et al.,
+2025
+; Zeng et al.,
+2025
+; Hu et al.,
+2025
+)
+have replicated this phenomenon using various algorithms and implementations. However, we argue that
+the observed increase in response length may also be attributed to a bias inherent in the GRPO
+(Shao et al.,
+2024
+)
+objective function
+:
+$$\begin{split}\mathcal{J}_{GRPO}(\pi_{\theta})&=\mathbb{E}_{\mathbf{q}\sim p_{\mathcal{Q}},\,\{\mathbf{o}_{i}\}_{i=1}^{G}\sim\pi_{\theta_{\text{old}}}(\cdot|\mathbf{q})}\\&\frac{1}{G}\sum_{i=1}^{G}{\color{red}\frac{1}{|\mathbf{o}_{i}|}}\sum_{t=1}^{|\mathbf{o}_{i}|}\left\{\min\left[\frac{\pi_{\theta}(o_{i,t}|\mathbf{q},\mathbf{o}_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t}|\mathbf{q},\mathbf{o}_{i,<t})}\hat{A}_{i,t},\ \text{clip}\left(\frac{\pi_{\theta}(o_{i,t}|\mathbf{q},\mathbf{o}_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t}|\mathbf{q},\mathbf{o}_{i,<t})},1-\epsilon,1+\epsilon\right)\hat{A}_{i,t}\right]\right\},\end{split}\tag{3}$$
+where
+$$\hat{A}_{i,t}=\frac{R(\mathbf{q},\mathbf{o}_{i})-\operatorname{mean}(\{R(\mathbf{q},\mathbf{o}_{1}),\dots,R(\mathbf{q},\mathbf{o}_{G})\})}{{\color{red}\operatorname{std}(\{R(\mathbf{q},\mathbf{o}_{1}),\dots,R(\mathbf{q},\mathbf{o}_{G})\})}},$$
+with the return $R(\mathbf{q},\mathbf{o}_{i})$ typically only including the outcome verifiable reward in LLM reasoning (the analysis also applies to process reward cases).
+Compared to the objective function in
+Eq.
+2
+, GRPO introduces two biases (see also
+Fig.
+4
+):
+•
+Response-level length bias: This arises from dividing by $|\mathbf{o}_{i}|$. For positive advantages ($\hat{A}_{i,t}>0$, indicating a correct response), this bias results in greater gradient updates for shorter responses, leading the policy to favor brevity in correct answers. Conversely, for negative advantages ($\hat{A}_{i,t}<0$, indicating an incorrect response), longer responses are penalized less due to their larger $|\mathbf{o}_{i}|$, causing the policy to prefer lengthier responses among incorrect ones.
+•
+Question-level difficulty bias: This is caused by dividing the centered outcome reward by $\operatorname{std}(\{R(\mathbf{q},\mathbf{o}_{1}),\dots,R(\mathbf{q},\mathbf{o}_{G})\})$. Questions with lower standard deviations (e.g., those that are too easy or too hard, with the outcome rewards being almost all 1 or 0) are given higher weights during policy updates. While advantage normalization is a common trick in RL (Andrychowicz et al., 2021), it is typically computed across an entire batch. In contrast, question-level normalization results in varying weights in the objective for different questions, leading to a difficulty bias in optimization.
+Figure 4: Illustration of the biases in GRPO. Note that the effective advantage of GRPO $a_{i,t}$ is equivalent to a reweighted version of the unbiased advantage $\tilde{A}_{i,t}=R(\mathbf{q},\mathbf{o}_{i})-\operatorname{mean}(\mathbf{R})$. The terms $\operatorname{std}(\mathbf{R})$ and $|\mathbf{o}_{i}|$ could bias the optimization by assigning different weights to different questions and responses, as denoted by the sizes of the blue circles and the lengths of the orange arrows. Upward arrows indicate positive advantages, and vice versa.
+Length Bias Also Exists in Open-Source PPO Implementations
+. We also examined several popular open-source implementations of vanilla PPO algorithms for LLM post-training. To our surprise, all of these implementations normalize the loss by response length (see Listing 1 and Table 2), which misaligns with the PPO objective as defined in Eq. 2.
+This formulation-implementation misalignment was present even before the publication of GRPO. We speculate that the misalignment might originate from the
+pretraining stage
+(Shoeybi et al.,
+2019
+)
+, where all tokens are packed into a fixed-length context and normalizing the loss by the context length (i.e., computing
+loss.mean(-1)
+) improves the numerical stability. However, in the
+RL-tuning stage
+, typical implementations
+(von Werra et al.,
+2020
+)
+normalize the loss by the response length, which is
+not
+a constant, introducing an unintended length bias.
+Listing 1:
+Comparison between a typical open-source PPO loss implementation that is biased (red) and our implementation (green).
+MAX_TOKENS
+is a global constant during the entire training (unless budget curriculum is enabled), which specifies the maximum number of generation tokens. Other constants also work with differences in gradient norm.
+```diff
+ def masked_mean(tensor, mask, dim):
+-    return (tensor * mask).sum(axis=dim) / mask.sum(axis=dim)
++    return (tensor * mask).sum(axis=-1) / MAX_TOKENS
+
+ ppo_loss = ...  # compute per-token ppo loss
+ response_mask = ...  # per-token response mask
+ # per-response length normalization (e.g., OpenRLHF)
+ loss_variant1 = masked_mean(ppo_loss, response_mask, dim=-1).mean()
+ # OR per-batch length normalization (e.g., trl, verl)
+ loss_variant2 = masked_mean(ppo_loss, response_mask, dim=None).mean()
+```
+Repository
+Code Link
+Unbiased?
+trl
+(von Werra et al.,
+2020
+)
+PPO Loss
+OpenRLHF
+(Hu et al.,
+2024
+)
+PPO Loss
+verl
+(Sheng et al.,
+2024
+)
+PPO Loss
+SimpleRL-Zero
+(Zeng et al.,
+2025
+)
+PPO Loss
+Open-Reasoner-Zero
+(Hu et al.,
+2025
+)
+PPO Loss
+Table 2:
+Many open-sourced PPO implementations contain length bias.
+3.2
+Dr. GRPO: Group Relative Policy Optimization Done Right
+To avoid the aforementioned optimization bias in GRPO, we propose to simply remove the ${\color{red}\frac{1}{|\mathbf{o}_{i}|}}$ and ${\color{red}\operatorname{std}(\{R(\mathbf{q},\mathbf{o}_{1}),\dots,R(\mathbf{q},\mathbf{o}_{G})\})}$ normalization terms. Meanwhile, to faithfully implement the unbiased optimization objective, we could replace the `mask.sum(axis=dim)` with a constant value (e.g., generation budget) in the `masked_mean` function in Listing 1, as highlighted by the line in green.
+Notably, these simple modifications recover the PPO objective in
+Eq.
+2
+, with the advantage estimated by Monte Carlo return with an unbiased baseline
+(Sutton & Barto,
+2018
+)
+. We give detailed derivations in
+App.
+A
+. We refer to our new optimization algorithm as
+Dr. GRPO
+. We next experimentally validate its effectiveness.
+Experimental settings
+.
+We implement our algorithm using Oat
+(Liu et al.,
+2025a
+)
+, a modular, research-friendly and efficient LLM RL framework. We adopt the Qwen2.5-1.5B base model and the R1 template (
+Template
+1
+) for online RL-tuning. We implement the verification-based reward function using Math-Verify (footnote 2: https://github.com/huggingface/Math-Verify), with the following minimalistic rule:
+$$R(\mathbf{q},\mathbf{o})=\begin{cases}1&\text{if $\mathbf{o}$ contains the correct final answer to $\mathbf{q}$}\\0&\text{otherwise}\end{cases}$$
+We run RL on questions sampled from the MATH
+(Hendrycks et al.,
+2021
+)
+training dataset, and compare the vanilla GRPO with the proposed Dr. GRPO. We evaluate the online model on five benchmarks: AIME2024, AMC, MATH500, Minerva Math and OlympiadBench.
+More experimental details including hyperparameters can be found in
+App.
+G
+.
+Figure 5:
+Comparison of Dr. GRPO and GRPO in terms of training dynamics (Top) and evaluation results (Bottom).
+Results
+. We report various metrics in
+Fig.
+5
+to demonstrate that Dr. GRPO can effectively mitigate the optimization bias and lead to
+better token efficiency
+. In particular, we first note that both GRPO and Dr. GRPO exhibit similar trend to DeepSeek-R1-Zero
+(Guo et al.,
+2025
+)
+, namely their response length increases along with training reward (Plots 1 & 2). However, we observe that GRPO tends to continually generate longer
+responses even when the reward improvement slows down (Plot 2). Although such a phenomenon is often referred to as the “emergence” of long-CoT through RL
+(Zeng et al.,
+2025
+; Hu et al.,
+2025
+)
+, we argue that it is also confounded by the response-level length bias (Sec. 3.1) during optimization (footnote 3: We note that both Zeng et al. (2025) and Hu et al. (2025) employ PPO, which is unbiased by formulation. However, their loss implementations still introduce the length bias; see Listing 1). In contrast, by computing the unbiased policy gradients, Dr. GRPO prevents the response length from growing wildly during training (Plot 2). Moreover, on evaluation benchmarks, the length of incorrect responses is substantially reduced by Dr. GRPO compared to the baseline (Plot 4), suggesting that an unbiased optimizer also
+mitigates overthinking
+(Chen et al.,
+2024
+)
+.
+Figure 6:
+The average benchmark accuracy of different {template, question set} combinations during RL training.
+3.3
+A Duet of Template and Question Set Coverage in RL dynamics
+Recall that the Qwen2.5-Math base models can readily answer questions with high accuracy without any prompt template (
+Sec.
+2.2
+). Based on this intriguing observation, we are interested in how different templates affect the RL training. Furthermore, given the general belief that larger question set coverage leads to better performance
+(Luo et al.,
+2025
+; Hu et al.,
+2025
+)
+, we also study the interaction between different templates and different levels of question coverage.
+Experimental settings
+. Starting from the Qwen2.5-Math-1.5B base model, we apply R1 template, Qwen-Math template and No template respectively to run RL using Dr. GRPO. All experiments are repeated for different question sets that are detailed in
+Table
+3
+.
+Question set
+#
+Description
+ORZ
+57k
+Combining AIME, Numina-Math, Tulu3 MATH; diverse and large amount
+MATH
+12k
+High-school math competition questions
+GSM
+8k
+Simpler grade-school math questions
+ASDiv
+2k
+Basic algebra ($+-\times\div$) questions
+Table 3:
+Different question sets that have different levels of difficulty and coverage.
+Results
+.
+Fig.
+6
+shows the RL curves of different runs, from which we can make several interesting observations:
+1)
+Templates determine the performance of the initial policies, but RL can improve all policies to a comparable performance of
+$\sim 40\%$
+(given a proper question set);
+2)
+When using the R1 template, question sets have a significant impact on the dynamics of RL, with too narrow coverage leading to lower plateau performance. However, when using the Qwen-Math template, the best final performance is attained by RL on GSM-8K, demonstrating that training on much simpler (and o.o.d.) questions can largely improve (nearly double) the test accuracy on harder questions. From these observations, we draw the following insights:
+•
+The Qwen2.5-Math-1.5B base model already possesses strong math-solving capabilities (see the starting point in the right plot of
+Fig.
+6
+).
+Applying templates in fact destroys
+the capability before RL reconstructs it. This implies that we should be more conservative in claiming the huge gains brought about by pure RL.
+•
+When there is a large
+mismatch
+between base models and templates (e.g., R1 template mismatches Qwen2.5-Math-1.5B), the policy improvement mainly comes from RL-tuning, thus requiring question set to have good coverage (left plot of
+Fig.
+6
+).
+Otherwise
+, even a small and completely o.o.d. question set could induce the reasoning ability equally well, by
+reinforcing useful reasoning behaviors instead of infusing new knowledge
+.
+3.4
+Domain-Specific Pretraining Improves RL Ceiling
+Recent successful R1-Zero-like replications of math reasoners mostly employ Qwen2.5 base models as the initial policies
+(Zeng et al.,
+2025
+; Cui et al.,
+2025
+; Hu et al.,
+2025
+)
+, which are already strong math solvers and exhibit self-reflection patterns (
+Sec.
+2.2
+and
+2.3
+).
+In this section we hope to explore the other side:
+can R1-Zero-like training succeed on originally weak (in terms of math reasoning) base models?
+We answer this question affirmatively, with the observation that
+math pretraining would improve the ceiling of RL
+.
+Figure 7:
+Left
+: The average benchmark performance curves of different base models.
+Right
+: The comparison between Dr. GRPO and GRPO with respect to reasoning accuracy (solid lines) and model response length (dashed lines).
+Experimental settings
+. We adopt the Llama-3.2-3B base model as our starting point, and use the unbiased Dr. GRPO algorithm for RL-tuning with the R1 template. We hypothesize that domain-specific pretraining would help RL, hence we adopt the
+Llama-3.2-3B-FineMath
+(footnote 4: https://huggingface.co/HuggingFaceTB/FineMath-Llama-3B)
+, which is continually pretrained on the FineMath dataset
+(Allal et al.,
+2025
+)
+. Moreover, as we hypothesize that Qwen2.5 models are likely to be pretrained on concatenated question-response texts (
+Sec.
+2.2
+), we similarly prepare a concatenated dataset from NuminaMath-1.5
+(Li et al.,
+2024b
+)
+, and continually pretrain Llama-3.2-3B-FineMath for 2 epochs with learning rate 1e-5. We refer to the concatenated continually pretrained model as
+Llama-3.2-3B-NuminaQA
+.
+Results
+. We present the RL curves of different base models in the left plot of
+Fig.
+7
+. We observe that RL can even improve the vanilla Llama base model, but the gain is minimal. After continual pretraining (and concatenated continual pretraining) to embed math domain knowledge, Llama models can show much stronger RL performance, validating our hypothesis. We also revisit the GRPO’s optimization bias with the Llama base model. The right plot of
+Fig.
+7
+compares the model performance and response length trained with GRPO and Dr. GRPO. We can clearly see that GRPO can produce the “double-increase” phenomenon, potentially leading to a
+misperception
+that long-CoT can also emerge on Llama models after math pretraining. Unfortunately, the increase of length might be due to the optimization bias (
+Sec.
+3.1
+), which can be effectively mitigated by the proposed Dr. GRPO (
+Sec.
+3.2
+& right plot of
+Fig.
+7
+).
+4
+Closing Remarks
+We have taken a critical perspective to examine base models used for R1-Zero-like training, as well as algorithms used for RL. Through the analysis, we demystified how pretraining biases influence RL outcomes and how optimization choices, like GRPO, can unintentionally shape model behavior. With the proposed Dr. GRPO, we offer a simple fix that improves token efficiency while preserving reasoning performance. Our results show that scaling RL can be both effective and efficient—sometimes, less really is more.
+References
+Ahmadian et al. (2024)
+Arash Ahmadian, Chris Cremer, Matthias Gallé, Marzieh Fadaee, Julia Kreutzer, Olivier Pietquin, Ahmet Üstün, and Sara Hooker.
+Back to basics: Revisiting reinforce style optimization for learning from human feedback in llms.
+arXiv preprint arXiv:2402.14740
+, 2024.
+Allal et al. (2025)
+Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, Gabriel Martín Blázquez, Guilherme Penedo, Lewis Tunstall, Andrés Marafioti, Hynek Kydlíček, Agustín Piqueres Lajarín, Vaibhav Srivastav, et al.
+Smollm2: When smol goes big–data-centric training of a small language model.
+arXiv preprint arXiv:2502.02737
+, 2025.
+Andrychowicz et al. (2021)
+Marcin Andrychowicz, Anton Raichuk, Piotr Stańczyk, Manu Orsini, Sertan Girgin, Raphaël Marinier, Leonard Hussenot, Matthieu Geist, Olivier Pietquin, Marcin Michalski, et al.
+What matters for on-policy deep actor-critic methods? a large-scale study.
+In
+International conference on learning representations
+, 2021.
+Chen et al. (2024)
+Xingyu Chen, Jiahao Xu, Tian Liang, Zhiwei He, Jianhui Pang, Dian Yu, Linfeng Song, Qiuzhi Liu, Mengfei Zhou, Zhuosheng Zhang, et al.
+Do not think that much for 2+ 3=? on the overthinking of o1-like llms.
+arXiv preprint arXiv:2412.21187
+, 2024.
+Christiano et al. (2017)
+Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei.
+Deep reinforcement learning from human preferences.
+Advances in neural information processing systems
+, 30, 2017.
+Cui et al. (2025)
+Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, et al.
+Process reinforcement through implicit rewards.
+arXiv preprint arXiv:2502.01456
+, 2025.
+Grattafiori et al. (2024)
+Aaron Grattafiori, Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Alex Vaughan, et al.
+The llama 3 herd of models.
+arXiv preprint arXiv:2407.21783
+, 2024.
+Guo et al. (2025)
+Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al.
+Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning.
+arXiv preprint arXiv:2501.12948
+, 2025.
+He et al. (2024)
+Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, et al.
+Olympiadbench: A challenging benchmark for promoting agi with olympiad-level bilingual multimodal scientific problems.
+arXiv preprint arXiv:2402.14008
+, 2024.
+Hendrycks et al. (2021)
+Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt.
+Measuring mathematical problem solving with the math dataset.
+arXiv preprint arXiv:2103.03874
+, 2021.
+Hu et al. (2024)
+Jian Hu, Xibin Wu, Zilin Zhu, Xianyu, Weixun Wang, Dehao Zhang, and Yu Cao.
+Openrlhf: An easy-to-use, scalable and high-performance rlhf framework.
+arXiv preprint arXiv:2405.11143
+, 2024.
+Hu et al. (2025)
+Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, Xiangyu Zhang, and Heung-Yeung Shum.
+Open-reasoner-zero: An open source approach to scaling reinforcement learning on the base model.
+https://github.com/Open-Reasoner-Zero/Open-Reasoner-Zero
+, 2025.
+Jaques et al. (2019)
+Natasha Jaques, Asma Ghandeharioun, Judy Hanwen Shen, Craig Ferguson, Agata Lapedriza, Noah Jones, Shixiang Gu, and Rosalind Picard.
+Way off-policy batch deep reinforcement learning of implicit human preferences in dialog.
+arXiv preprint arXiv:1907.00456
+, 2019.
+Kool et al. (2019)
+Wouter Kool, Herke van Hoof, and Max Welling.
+Buy 4 reinforce samples, get a baseline for free!, 2019.
+Lambert et al. (2024)
+Nathan Lambert, Jacob Morrison, Valentina Pyatkin, Shengyi Huang, Hamish Ivison, Faeze Brahman, Lester James V Miranda, Alisa Liu, Nouha Dziri, Shane Lyu, et al.
+Tülu 3: Pushing frontiers in open language model post-training.
+arXiv preprint arXiv:2411.15124
+, 2024.
+Lewkowycz et al. (2022)
+Aitor Lewkowycz, Anders Andreassen, David Dohan, Ethan Dyer, Henryk Michalewski, Vinay Ramasesh, Ambrose Slone, Cem Anil, Imanol Schlag, Theo Gutman-Solo, et al.
+Solving quantitative reasoning problems with language models.
+Advances in Neural Information Processing Systems
+, 35:3843–3857, 2022.
+Li et al. (2024a)
+Jia Li, Edward Beeching, Lewis Tunstall, Ben Lipkin, Roman Soletskyi, Shengyi Huang, Kashif Rasul, Longhui Yu, Albert Q Jiang, Ziju Shen, et al.
+Numinamath: The largest public dataset in ai4maths with 860k pairs of competition math problems and solutions.
+Hugging Face repository
+, 13:9, 2024a.
+Li et al. (2024b)
+Jia Li, Edward Beeching, Lewis Tunstall, Ben Lipkin, Roman Soletskyi, Shengyi Costa Huang, Kashif Rasul, Longhui Yu, Albert Jiang, Ziju Shen, Zihan Qin, Bin Dong, Li Zhou, Yann Fleureau, Guillaume Lample, and Stanislas Polu.
+Numinamath, 2024b.
+Liu et al. (2024)
+Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, et al.
+Deepseek-v3 technical report.
+arXiv preprint arXiv:2412.19437
+, 2024.
+Liu et al. (2025a)
+Zichen Liu, Changyu Chen, Chao Du, Wee Sun Lee, and Min Lin.
+Oat: A research-friendly framework for llm online alignment.
+https://github.com/sail-sg/oat
+, 2025a.
+Liu et al. (2025b)
+Zichen Liu, Changyu Chen, Wenjun Li, Tianyu Pang, Chao Du, and Min Lin.
+There may not be aha moment in r1-zero-like training — a pilot study.
+https://oatllm.notion.site/oat-zero
+, 2025b.
+Notion Blog.
+Luo et al. (2025)
+Michael Luo, Sijun Tan, Justin Wong, Xiaoxiang Shi, William Y. Tang, Manan Roongta, Colin Cai, Jeffrey Luo, Tianjun Zhang, Li Erran Li, Raluca Ada Popa, and Ion Stoica.
+Deepscaler: Surpassing o1-preview with a 1.5b model by scaling rl.
+https://github.com/agentica-project/deepscaler
+, 2025.
+Pan et al. (2025)
+Jiayi Pan, Junjie Zhang, Xingyao Wang, Lifan Yuan, Hao Peng, and Alane Suhr.
+Tinyzero.
+https://github.com/Jiayi-Pan/TinyZero, 2025.
+Accessed: 2025-01-24.
+Schulman et al. (2015)
+John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel.
+High-dimensional continuous control using generalized advantage estimation.
+arXiv preprint arXiv:1506.02438
+, 2015.
+Schulman et al. (2017a)
+John Schulman, Xi Chen, and Pieter Abbeel.
+Equivalence between policy gradients and soft q-learning.
+arXiv preprint arXiv:1704.06440
+, 2017a.
+Schulman et al. (2017b)
+John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov.
+Proximal policy optimization algorithms.
+arXiv preprint arXiv:1707.06347
+, 2017b.
+Shao et al. (2024)
+Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Y Wu, et al.
+Deepseekmath: Pushing the limits of mathematical reasoning in open language models.
+arXiv preprint arXiv:2402.03300
+, 2024.
+Sheng et al. (2024)
+Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu.
+Hybridflow: A flexible and efficient rlhf framework.
+arXiv preprint arXiv:2409.19256
+, 2024.
+Shoeybi et al. (2019)
+Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro.
+Megatron-lm: Training multi-billion parameter language models using model parallelism.
+arXiv preprint arXiv:1909.08053
+, 2019.
+Stiennon et al. (2020)
+Nisan Stiennon, Long Ouyang, Jeffrey Wu, Daniel Ziegler, Ryan Lowe, Chelsea Voss, Alec Radford, Dario Amodei, and Paul F Christiano.
+Learning to summarize with human feedback.
+Advances in neural information processing systems
+, 33:3008–3021, 2020.
+Sutton & Barto (2018)
+Richard S. Sutton and Andrew G. Barto.
+Reinforcement Learning: An Introduction
+.
+The MIT Press, second edition, 2018.
+von Werra et al. (2020)
+Leandro von Werra, Younes Belkada, Lewis Tunstall, Edward Beeching, Tristan Thrush, Nathan Lambert, Shengyi Huang, Kashif Rasul, and Quentin Gallouédec.
+Trl: Transformer reinforcement learning.
+https://github.com/huggingface/trl
+, 2020.
+Yang et al. (2024a)
+An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, et al.
+Qwen2.5 technical report.
+arXiv preprint arXiv:2412.15115
+, 2024a.
+Yang et al. (2024b)
+An Yang, Beichen Zhang, Binyuan Hui, Bofei Gao, Bowen Yu, Chengpeng Li, Dayiheng Liu, Jianhong Tu, Jingren Zhou, Junyang Lin, et al.
+Qwen2.5-math technical report: Toward mathematical expert model via self-improvement.
+arXiv preprint arXiv:2409.12122
+, 2024b.
+Yeo et al. (2025)
+Edward Yeo, Yuxuan Tong, Morry Niu, Graham Neubig, and Xiang Yue.
+Demystifying long chain-of-thought reasoning in llms.
+arXiv preprint arXiv:2502.03373
+, 2025.
+Zeng et al. (2025)
+Weihao Zeng, Yuzhen Huang, Wei Liu, Keqing He, Qian Liu, Zejun Ma, and Junxian He.
+7b model and 8k examples: Emerging reasoning with reinforcement learning is both effective and efficient.
+https://hkust-nlp.notion.site/simplerl-reason
+, 2025.
+Notion Blog.
+Appendix A
+Policy Gradient Derivations
+In the context of RL for LLM post-training, we typically maximize the value of
+$$\mathcal{J}(\pi_{\theta})=\underset{{\mathbf{q}}\sim p_{\mathcal{Q}}}{\mathbb{E}}\left[\underset{{\mathbf{o}}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}\left[R({\mathbf{q}},{\mathbf{o}})\right]\right],$$
+(4)
+where
+R
+​
+(
+𝐪
+,
+𝐨
+)
+=
+∑
+t
+=
+1
+|
+𝐨
+|
+r
+​
+(
+𝐪
+,
+𝐨
+≤
+t
+)
+R({\mathbf{q}},{\mathbf{o}})=\sum_{t=1}^{|{\mathbf{o}}|}r({\mathbf{q}},{\mathbf{o}}_{\leq t})
+is the return
+(Sutton & Barto,
+2018
+)
+of the trajectory
+𝐪
+;
+𝐨
+{\mathbf{q}};{\mathbf{o}}
+, and
+r
+​
+(
+𝐪
+,
+𝐨
+≤
+t
+)
+r({\mathbf{q}},{\mathbf{o}}_{\leq t})
+represents the token-level reward for
+t
+t
+-th token in response
+𝐨
+{\mathbf{o}}
+.
+The Monte Carlo policy gradient
+(Sutton & Barto,
+2018
+)
+of
+Eq.
+4
+is
+$$\begin{split}\nabla_{\theta}\mathcal{J}(\pi_{\theta})&=\underset{{\mathbf{q}}\sim p_{\mathcal{Q}}}{\mathbb{E}}\left[\underset{{\mathbf{o}}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}\left[\nabla_{\theta}\log\pi_{\theta}({\mathbf{o}}|{\mathbf{q}})R({\mathbf{q}},{\mathbf{o}})\right]\right]\\
+&=\underset{{\mathbf{q}}\sim p_{\mathcal{Q}}}{\mathbb{E}}\left[\underset{{\mathbf{o}}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}\left[\nabla_{\theta}\sum_{t=1}^{|{\mathbf{o}}|}\log\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})R({\mathbf{q}},{\mathbf{o}})\right]\right]\\
+&=\underset{{\mathbf{q}}\sim p_{\mathcal{Q}}}{\mathbb{E}}\left[\underset{{\mathbf{o}}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}\left[\sum_{t=1}^{|{\mathbf{o}}|}\nabla_{\theta}\log\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})\sum_{t^{\prime}=t}^{|{\mathbf{o}}|}r({\mathbf{q}},{\mathbf{o}}_{\leq t^{\prime}})\right]\right]\\
+&=\underset{{\mathbf{q}}\sim p_{\mathcal{Q}}}{\mathbb{E}}\left[\underset{{\mathbf{o}}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}\left[\sum_{t=1}^{|{\mathbf{o}}|}\nabla_{\theta}\log\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})\left(\sum_{t^{\prime}=t}^{|{\mathbf{o}}|}r({\mathbf{q}},{\mathbf{o}}_{\leq t^{\prime}})-B({\mathbf{q}},{\mathbf{o}}_{<t})\right)\right]\right],\end{split}$$
+(5)
+where
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+B({\mathbf{q}},{\mathbf{o}}_{<t})
+is a variance reduction term, which is invariant with respect to
+o
+t
+o_{t}
+so that
+𝔼
+o
+t
+∼
+π
+θ
+(
+⋅
+|
+𝐪
+,
+𝐨
+<
+t
+)
+​
+[
+∇
+θ
+log
+⁡
+π
+θ
+​
+(
+o
+t
+|
+𝐪
+,
+𝐨
+<
+t
+)
+​
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+]
+\displaystyle\underset{o_{t}\sim\pi_{\theta}(\cdot|{\mathbf{q}},{\mathbf{o}}_{<t})}{\mathbb{E}}[\nabla_{\theta}\log\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})B({\mathbf{q}},{\mathbf{o}}_{<t})]
+=
+𝔼
+o
+t
+∼
+π
+θ
+(
+⋅
+|
+𝐪
+,
+𝐨
+<
+t
+)
+​
+[
+∇
+θ
+log
+⁡
+π
+θ
+​
+(
+o
+t
+|
+𝐪
+,
+𝐨
+<
+t
+)
+]
+​
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+\displaystyle=\underset{o_{t}\sim\pi_{\theta}(\cdot|{\mathbf{q}},{\mathbf{o}}_{<t})}{\mathbb{E}}[\nabla_{\theta}\log\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})]B({\mathbf{q}},{\mathbf{o}}_{<t})
+=
+[
+∑
+o
+t
+π
+θ
+​
+(
+o
+t
+|
+𝐪
+,
+𝐨
+<
+t
+)
+​
+∇
+θ
+log
+⁡
+π
+θ
+​
+(
+o
+t
+|
+𝐪
+,
+𝐨
+<
+t
+)
+]
+​
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+\displaystyle=[\sum_{o_{t}}\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})\nabla_{\theta}\log\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})]B({\mathbf{q}},{\mathbf{o}}_{<t})
+=
+[
+∑
+o
+t
+∇
+θ
+π
+θ
+​
+(
+o
+t
+|
+𝐪
+,
+𝐨
+<
+t
+)
+]
+​
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+\displaystyle=[\sum_{o_{t}}\nabla_{\theta}\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})]B({\mathbf{q}},{\mathbf{o}}_{<t})
+=
+[
+∇
+θ
+​
+∑
+o
+t
+π
+θ
+​
+(
+o
+t
+|
+𝐪
+,
+𝐨
+<
+t
+)
+]
+​
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+\displaystyle=[\nabla_{\theta}\sum_{o_{t}}\pi_{\theta}(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})]B({\mathbf{q}},{\mathbf{o}}_{<t})
+$\displaystyle=[\nabla_{\theta}1]\,B({\mathbf{q}},{\mathbf{o}}_{<t})=0.$
+Typically, we set
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+=
+𝔼
+𝐨
+≥
+t
+∼
+π
+θ
+(
+⋅
+|
+𝐪
+,
+𝐨
+<
+t
+)
+​
+[
+∑
+t
+′
+=
+t
+|
+𝐨
+|
+r
+​
+(
+𝐪
+,
+𝐨
+≤
+t
+′
+)
+]
+B({\mathbf{q}},{\mathbf{o}}_{<t})=\underset{{\mathbf{o}}_{\geq t}\sim\pi_{\theta}(\cdot|{\mathbf{q}},{\mathbf{o}}_{<t})}{\mathbb{E}}[\sum_{t^{\prime}=t}^{|{\mathbf{o}}|}r({\mathbf{q}},{\mathbf{o}}_{\leq t^{\prime}})]
+, which is the expected cumulative reward in the future (also known as the value of the current state), and denote
+A
+​
+(
+o
+t
+|
+𝐪
+,
+𝐨
+<
+t
+)
+=
+∑
+t
+′
+=
+t
+|
+𝐨
+|
+r
+​
+(
+𝐪
+,
+𝐨
+≤
+t
+′
+)
+−
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+A(o_{t}|{\mathbf{q}},{\mathbf{o}}_{<t})=\sum_{t^{\prime}=t}^{|{\mathbf{o}}|}r({\mathbf{q}},{\mathbf{o}}_{\leq t^{\prime}})-B({\mathbf{q}},{\mathbf{o}}_{<t})
+as the advantage. In the case of outcome reward,
+∑
+t
+′
+=
+t
+|
+𝐨
+|
+r
+​
+(
+𝐪
+,
+𝐨
+≤
+t
+′
+)
+=
+∑
+t
+=
+1
+|
+𝐨
+|
+r
+​
+(
+𝐪
+,
+𝐨
+≤
+t
+)
+=
+R
+​
+(
+𝐪
+,
+𝐨
+)
+\sum_{t^{\prime}=t}^{|{\mathbf{o}}|}r({\mathbf{q}},{\mathbf{o}}_{\leq t^{\prime}})=\sum_{t=1}^{|{\mathbf{o}}|}r({\mathbf{q}},{\mathbf{o}}_{\leq t})=R({\mathbf{q}},{\mathbf{o}})
+.
+By setting
+B
+​
+(
+𝐪
+,
+𝐨
+<
+t
+)
+=
+mean
+⁡
+(
+{
+R
+​
+(
+𝐪
+,
+𝐨
+1
+)
+,
+…
+,
+R
+​
+(
+𝐪
+,
+𝐨
+G
+)
+}
+)
+B({\mathbf{q}},{\mathbf{o}}_{<t})=\operatorname{mean}({\{R({\mathbf{q}},{\mathbf{o}}_{1}),\dots,R({\mathbf{q}},{\mathbf{o}}_{G})\}})
+, the policy gradient of
+Eq.
+5
+becomes
+$$\nabla_{\theta}\mathcal{J}(\pi_{\theta})=\underset{{\mathbf{q}}\sim p_{\mathcal{Q}}}{\mathbb{E}}\left[\underset{\{{\mathbf{o}}_{i}\}_{i=1}^{G}\sim\pi_{\theta}(\cdot|{\mathbf{q}})}{\mathbb{E}}\left[\frac{1}{G}\sum_{i=1}^{G}\sum_{t=1}^{|{\mathbf{o}}_{i}|}\nabla_{\theta}\log\pi_{\theta}(o_{i,t}|{\mathbf{q}},{\mathbf{o}}_{i,<t})\tilde{A}_{i,t}\right]\right],$$
+(6)
+where
+A
+~
+i
+,
+t
+=
+R
+​
+(
+𝐪
+,
+𝐨
+i
+)
+−
+mean
+⁡
+(
+{
+R
+​
+(
+𝐪
+,
+𝐨
+1
+)
+,
+…
+,
+R
+​
+(
+𝐪
+,
+𝐨
+G
+)
+}
+)
+std
+⁡
+(
+{
+R
+​
+(
+𝐪
+,
+𝐨
+1
+)
+,
+…
+,
+R
+​
+(
+𝐪
+,
+𝐨
+G
+)
+}
+)
+.
+\tilde{A}_{i,t}=\frac{R({\mathbf{q}},{\mathbf{o}}_{i})-\operatorname{mean}({\{R({\mathbf{q}},{\mathbf{o}}_{1}),\dots,R({\mathbf{q}},{\mathbf{o}}_{G})\}})}{{\color[rgb]{1,0,0}\definecolor[named]{pgfstrokecolor}{rgb}{1,0,0}\xcancel{\operatorname{std}({\{R({\mathbf{q}},{\mathbf{o}}_{1}),\dots,R({\mathbf{q}},{\mathbf{o}}_{G})\}})}}}.
+We adopt the PPO
+(Schulman et al.,
+2017b
+)
+objective to compute
+Eq.
+6
+:
+$$\begin{split}\mathcal{J}(\pi_{\theta})&=\mathbb{E}{[{\mathbf{q}}\sim p_{\mathcal{Q}},\{{\mathbf{o}}_{i}\}_{i=1}^{G}\sim\pi_{\theta_{old}}(\cdot|{\mathbf{q}})]}\\
+&\frac{1}{G}\sum_{i=1}^{G}{\color[rgb]{1,0,0}\xcancel{\frac{1}{|{\mathbf{o}}_{i}|}}}\sum_{t=1}^{|{\mathbf{o}}_{i}|}\left\{\min\left[\frac{\pi_{\theta}(o_{i,t}|{\mathbf{q}},{\mathbf{o}}_{i,<t})}{\pi_{\theta_{old}}(o_{i,t}|{\mathbf{q}},{\mathbf{o}}_{i,<t})}\tilde{A}_{i,t},\text{clip}\left(\frac{\pi_{\theta}(o_{i,t}|{\mathbf{q}},{\mathbf{o}}_{i,<t})}{\pi_{\theta_{old}}(o_{i,t}|{\mathbf{q}},{\mathbf{o}}_{i,<t})},1-\epsilon,1+\epsilon\right)\tilde{A}_{i,t}\right]\right\},\end{split}$$
+from which we conclude that both
+std
+\operatorname{std}
+and
+|
+𝐨
+|
+|{\mathbf{o}}|
+should not appear in the RL objective.
+Unbiasedness of
+A
+~
+i
+,
+t
+\tilde{A}_{i,t}
+. We note that
+A
+~
+i
+,
+t
+\tilde{A}_{i,t}
+computed above is equivalent to that of REINFORCE Leave-One-Out (RLOO)
+(Ahmadian et al.,
+2024
+; Kool et al.,
+2019
+)
+up to a scaling factor, which can be subsumed into the learning rate without affecting the RL dynamics. Specifically,
+G
+G
+−
+1
+⋅
+A
+~
+i
+,
+t
+=
+G
+G
+−
+1
+​
+R
+​
+(
+𝐪
+,
+𝐨
+i
+)
+−
+G
+G
+−
+1
+​
+1
+G
+​
+∑
+j
+=
+1
+G
+R
+​
+(
+𝐪
+,
+𝐨
+j
+)
+=
+G
+G
+−
+1
+​
+R
+​
+(
+𝐪
+,
+𝐨
+i
+)
+−
+1
+G
+−
+1
+​
+∑
+j
+=
+1
+,
+j
+≠
+i
+G
+R
+​
+(
+𝐪
+,
+𝐨
+j
+)
+−
+1
+G
+−
+1
+​
+R
+​
+(
+𝐪
+,
+𝐨
+i
+)
+=
+A
+^
+i
+,
+t
+RLOO
+.
+\begin{split}{\color[rgb]{0.6,0,0.4}\definecolor[named]{pgfstrokecolor}{rgb}{0.6,0,0.4}\frac{G}{G-1}}\cdot\tilde{A}_{i,t}&={\color[rgb]{0.6,0,0.4}\definecolor[named]{pgfstrokecolor}{rgb}{0.6,0,0.4}\frac{G}{G-1}}R({\mathbf{q}},{\mathbf{o}}_{i})-{\color[rgb]{0.6,0,0.4}\definecolor[named]{pgfstrokecolor}{rgb}{0.6,0,0.4}\frac{G}{G-1}}\frac{1}{G}\sum_{j=1}^{G}R({\mathbf{q}},{\mathbf{o}}_{j})\\
+&={\color[rgb]{0.6,0,0.4}\definecolor[named]{pgfstrokecolor}{rgb}{0.6,0,0.4}\frac{G}{G-1}}R({\mathbf{q}},{\mathbf{o}}_{i})-\frac{1}{G-1}\sum_{j=1,j\neq i}^{G}R({\mathbf{q}},{\mathbf{o}}_{j})-\frac{1}{G-1}R({\mathbf{q}},{\mathbf{o}}_{i})\\
+&=\hat{A}^{\text{RLOO}}_{i,t}.\end{split}
+Appendix B
+Detailed Benchmark Results
+We show the detailed benchmark results for three scales (1.5B, 3B and 7B) in
+Table
+4
+. We also include the instruct models at the same scale and R1-Distill models for comparison. Note that since we employ the Qwen2.5-Math base models, which have a context length of 4k, we limit the generation budget to 3k for all baselines compared. For models that are trained for a longer context (OpenReasoner-Zero and R1-Distill-Qwen), we also report their performance at 8k generation budget.
+Base model + Method
+AIME24
+AMC
+MATH500
+Minerva
+OlympiadBench
+Avg.
+Qwen2.5-Math-
+1.5B
+20.0
+32.5
+33.0
+12.5
+22.8
+24.2
+Qwen2.5-Math-1.5B*
+16.7
+43.4
+61.8
+15.1
+28.4
+33.1
+Oat-Zero-1.5B
+20.0
+53.0
+74.2
+25.7
+37.6
+42.1
+R1-Distill-Qwen-1.5B @ 3k
+2.5
+21.7
+52.2
+16.3
+17.3
+22.0
+R1-Distill-Qwen-1.5B @ 8k
+20.0
+49.4
+77.4
+25.0
+35.8
+41.5
+Qwen2.5-Math-1.5B-Instruct
+10.0
+48.2
+74.2
+26.5
+40.2
+39.8
+Llama-3.2-
+3B
+0.0
+2.4
+6.4
+6.3
+1.3
+3.3
++ RL w. Dr. GRPO
+3.3
+7.2
+10.0
+11.0
+2.2
+6.8
+Llama-3.2-3B-FineMath
+0.0
+3.6
+18.4
+5.9
+2.2
+6.0
++ RL w. Dr. GRPO
+3.3
+10.8
+38.0
+12.9
+9.0
+14.8
+Llama-3.2-3B-NuminaQA
+0.0
+0.0
+0.6
+0.0
+0.1
+0.14
++ RL w. Dr. GRPO (
+Oat-Zero-3B
+)
+6.7
+18.1
+50.0
+14.3
+14.7
+20.7
+Llama-3.2-3B-Instruct
+6.7
+15.7
+38.8
+11.8
+12.6
+17.1
+Qwen2.5-Math-
+7B
+16.7
+38.6
+50.6
+9.9
+16.6
+26.5
+Qwen2.5-Math-7B*
+20.0
+45.8
+69.0
+21.3
+34.7
+38.2
+SimpleRL-Zero-7B
+26.7
+60.2
+78.2
+27.6
+40.3
+46.6
+PRIME-Zero-7B
+16.7
+62.7
+83.8
+36.0
+40.9
+48.0
+OpenReasoner-Zero-7B @ 3k
+13.3
+47.0
+79.2
+31.6
+44.0
+43.0
+OpenReasoner-Zero-7B @ 8k
+13.3
+54.2
+82.4
+31.6
+47.9
+45.9
+Oat-Zero-7B
+43.3
+62.7
+80.0
+30.1
+41.0
+51.4
+R1-Distill-Qwen-7B @ 3k
+10.0
+26.2
+60.1
+23.0
+23.1
+28.5
+R1-Distill-Qwen-7B @ 8k
+33.3
+68.4
+88.1
+35.9
+47.7
+54.7
+Qwen2.5-Math-7B-Instruct
+16.7
+53.0
+83.6
+29.8
+42.7
+45.1
+Table 4:
+A comparison on benchmark scores.
+Our
+models are RL-tuned by our minimalist recipe (
+Sec.
+1
+). * means we employ the best template (no template) to generate answers, such that the test scores are highest and can faithfully reflect the capabilities of the base models.
+Appendix C
+Extended Empirical Results
+In this section we present two extended empirical results for (1) the ablation of different bias terms in GRPO and (2) statistical significance of Dr. GRPO’s results. We RL-tune the Qwen2.5-1.5B base model on a mixture of 3K diverse math questions drawn from ASDiv, MATH, and AIME (pre-2023).
+Figure 8:
+Ablation results on the two bias terms in GRPO.
+Fig.
+8
+shows the training and evaluation curves for the following variants: Dr. GRPO, GRPO w/o length normalization, GRPO w/o standard deviation (std) normalization and Vanilla GRPO. From the middle subplot, we observe that both Dr. GRPO and the variant without length normalization generate shorter responses compared to the other two. This confirms that the length bias term has a more significant influence on response length–consistent with our expectations.
+In terms of performance, Dr. GRPO and the other ablated variants consistently outperform vanilla GRPO in both training rewards and evaluation accuracy. This indicates that removing bias terms (either length or std) improves policy learning, validating our motivation for Dr. GRPO.
+Figure 9:
+Evaluation results of 3 independent RL runs. The mean curves are drawn in solid lines and the standard deviation is plotted in the shaded areas.
+Fig.
+9
+compares GRPO and Dr. GRPO across three independent runs. We observe that Dr. GRPO consistently demonstrates statistically significant improvements–both in token efficiency and final accuracy–across different random seeds.
+Appendix D
+Keyword-based Detection and LLM-Based Identification of Self-Reflection Behaviors
+We construct a pool of carefully selected keywords and phrases that signal self-reflection behaviors in the LLM’s responses. However, LLM-generated responses often contain hallucinations and off-topic content, leading to the presence of simple, ambiguous keywords that do not necessarily indicate genuine self-reflection. For instance, terms like “wait” and “try again” frequently result in false positive detections. To reduce false positives, we maintain a small, highly selective keyword pool consisting of terms that are strongly indicative of self-reflection. In our experiment, the keyword pool is limited to: recheck, rethink, reassess, reevaluate, re-evaluate, reevaluation, re-examine, reexamine, reconsider, reanalyze, double-check, check again, think again, verify again, and go over the steps.
+Figure 10:
+Count of keyword occurrences out of 40,000 responses (500 questions $\times$ 8 responses per question $\times$ 10 temperatures). The y-axis is in log scale.
+We present the occurrences of various keywords in the responses generated by different models in Figure
+10
+. Interestingly, different model families emphasize different keywords. For instance, phrases such as “check again”, “double-check”, “re-evaluate”, “re-examine”, “recheck”, “reconsider”, and “verify again” appear most frequently in the Qwen2.5 family. In contrast, “re-evaluate”, “re-examine”, and “verify again” do not appear in the responses of the DeepSeek family, while Llama models frequently use the phrase “think again.” We hypothesize that this phenomenon results from differences in the pretraining data, particularly in relation to reasoning and mathematics.
+Although we meticulously select the keyword pool, it may still be insufficient to identify some implicit behaviors of self-reflection that do not contain a specific keyword. Additionally, it can lead to false positives, as illustrated in Case (a) of Figure
+11
+. To address these limitations and more accurately assess the self-reflection capability of base models, we leverage stronger LLMs (
+GPT-4o-mini
+in our experiments) to analyze the responses and determine whether they exhibit explicit self-reflection (e.g., keywords like “recheck” and “reevaluate”) or implicit self-reflection (e.g., more sophisticated patterns that cannot be easily captured through keyword matching). This approach helps distinguish true self-reflection behaviors from superficial or incidental use of related terms.
+Figure 11:
+Case (a)
+: a false positive in keyword-based detection.
+Case (b)
+: a false positive in LLM-based detection.
+While LLM-based detection effectively filters out false positives from keyword-based detection and identifies implicit self-reflection behaviors, it can still misclassify responses, particularly when they are lengthy and complex. For instance, Case (b) in Figure
+11
+shows a false positive in LLM-based detection, where the response is categorized as self-reflection by the LLM but does not actually exhibit self-reflection. This type of error can be filtered out by keyword-based detection. To enhance robustness, we integrate keyword-based and LLM-based detection through cross-validation. The combined detection results, along with the individual results from keyword-based and LLM-based methods, are presented in Figure
+12
+.
+Figure 12:
+Comparison of keyword-based detection, LLM-based detection, and cross detection. Self-reflections are counted at the question level across 500 questions, where a question is marked as having self-reflection if at least one of its eight responses exhibits self-reflection.
+Appendix E
+Examples of Aha Moment in DeepSeek-V3-Base
+Fig.
+13
+shows two examples to demonstrate that the DeepSeek-V3-Base model already exhibits the so-called “aha moment” even before the RL-tuning.
+Figure 13:
+Cases showing that DeepSeek-V3-Base already exhibits the “Aha moment” even before RL tuning.
+Appendix F
+Comparison Between DeepSeek-V3-Base and DeepSeek-R1-Zero
+Figure 14:
+Breakdown of response categories across difficulty levels in the MATH dataset for DeepSeek-V3-Base and DeepSeek-R1-Zero.
+Category
+Base
+R1-Zero
+Unformatted
+880.7
+7870.3
+Correct
+621.3
+4965.4
+Incorrect
+1038.9
+8206.1
+Table 5:
+Average response string lengths across categories for DeepSeek-V3-Base (Base) and DeepSeek-R1-Zero (R1-Zero).
+We analyze DeepSeek-V3-Base and DeepSeek-R1-Zero to understand changes in model behavior during R1-Zero training. In
+Fig.
+14
+, we present the breakdown of response categories across difficulty levels for 500 MATH questions evaluated on both models. The results indicate that most incorrect responses are corrected after RL training, demonstrating substantial performance gains from R1-Zero training. Meanwhile, we find an increase in unformatted responses,
+which aligns with the observation
+in
+Liu et al. (
+2025b
+)
+.
+In
+Table
+5
+, we report the average response lengths across categories. Note that truncated responses would fall into any of the other three categories if a larger context size were used; thus, we exclude them from the table. The results show a substantial increase in response lengths across all categories, including correct responses, consistent with the results in the Fig. 3 of
+Guo et al. (
+2025
+)
+. However, the average length of incorrect responses is notably longer than that of correct responses. We hypothesize this is because more challenging questions generally require longer responses due to increased reasoning complexity, and incorrect responses are more likely to originate from harder questions, resulting in a longer average length.
+Figure 15:
+Accuracy difference between responses with and without self-reflection for each question (responses sampled from DeepSeek-R1-Zero).
+Self-reflection does not necessarily imply higher accuracy.
+To investigate whether self-reflection behaviors are associated with model performance during the inference (acknowledging that self-reflection may improve exploration during training—a potential positive effect outside this section’s scope), we analyze questions that elicit at least one response with self-reflection from DeepSeek-R1-Zero across eight trials. For each question, we sample 100 responses and divide them into two groups: those with self-reflection and those without. We then compute the accuracy difference between these two groups for each question. As shown in
+Fig.
+15
+, the results indicate that nearly half of the responses with self-reflection do not achieve higher accuracy than those without self-reflection, suggesting that self-reflection does not necessarily imply higher inference-stage accuracy for DeepSeek-R1-Zero.
+Appendix G
+Detailed Experimental Settings
+All our experiments are performed on 8 $\times$ A100 GPUs and finished in about one day. We enable the actor-learner collocation supported by Oat
+(Liu et al.,
+2025a
+)
+to optimize the training efficiency. We show the experimental configurations in
+Table
+6
+.
+Parameter
+Value
+Actor
+Maximum response length
+3000 tokens
+Sampling temperature
+1.0
+(top P, top k)
+(1.0, -1)
+Number of responses per question
+8
+Learner
+Optimizer
+AdamW
+Adam parameters ($\beta_{1},\beta_{2}$)
+(0.9, 0.95)
+Weight decay
+0.0
+Gradient norm clipping
+1.0
+Learning rate scheduler
+Constant
+Learning rate
+$1\times 10^{-6}$
+Inner proximal update epoch
+1
+KL loss coefficient
+0.0
+KL penalty coefficient
+0.0
+Policy clipping parameter
+0.2
+Table 6:
+Hyperparameter configurations used in all experiments.
+Appendix H
+Prompts Used for GPT-As-A-Judge
+Prompt for checking the model’s question-answering ability.
+Prompt for Checking Question-Answering Ability
+I will send you a question and a long response generated by an LLM. Your task is to determine whether the output attempts to answer the question or not. The output may sometimes include irrelevant content, hallucinations, or random, off-topic responses.
+Please classify the output into one of the following categories:
+Output Format
+:
+Your response must start with a
+single integer
+(0 or 1), followed by a
+brief explanation
+.
+•
+Return 0:
+→ The output is not trying to answer the question (e.g., irrelevant content, random talking, hallucinations).
+Example output:
+‘0: The response is off-topic and does not address the question.‘
+•
+Return 1:
+→ The output attempts to answer the question, regardless of how complete or accurate the answer is.
+Example output:
+‘1: The response engages with the question, even if the answer is incomplete or incorrect.‘
+Question:
+{question}
+Response:
+{response}
+Prompt for LLM-based detection to determine whether a response contains self-reflection behaviors.
+LLM-based Detection for Self-Reflection
+I will send you a mathematical question along with a detailed response. Your task is to determine whether the response is attempting to answer the question. If the response is off-topic, hallucinated, random talk, or otherwise irrelevant, mark it as
+0
+. Otherwise, assess whether the response exhibits self-reflection.
+Categorization Rules
+:
+1.
+Category 0
+: The response is
+off-topic, nonsensical, incoherent, overly repetitive, or lacks logical reasoning
+.
+•
+Example cases:
+–
+The response does not relate to the question.
+–
+It contains meaningless or hallucinated content.
+–
+It consists of excessive repetition without coherence.
+2.
+Category 1
+: The response
+attempts to answer the question
+but does
+not
+exhibit self-reflection.
+•
+Example cases:
+–
+The response directly solves the problem without revisiting steps.
+–
+No attempt is made to verify the correctness of the answer or explore alternative solutions.
+3.
+Category 2
+: The response
+demonstrates self-reflection
+at any level.
+•
+This may include:
+–
+Explicit self-reflection keywords
+, such as: *recheck, rethink, reassess, reevaluate, re-evaluate, reevaluation, re-examine, reexamine, reconsider, reanalyze, double-check, check again, think again, verify again, go over the steps*, etc.
+–
+Implicit self-reflection behaviors
+, such as revisiting the solution, questioning assumptions, or considering alternative approaches
+without explicit keywords
+.
+•
+If any form of self-reflection is present,
+always categorize it as 2
+, regardless of correctness or answer quality.
+4.
+Category 3
+: The response consists
+solely of Python code for calculations
+without exhibiting self-reflection.
+•
+Example cases:
+–
+The response only provides a Python script to compute the solution
+without any verification, re-evaluation, or alternative considerations
+.
+Output Format
+:
+Your response should first provide a
+very brief explanation
+of your analysis, followed by a
+single category number (0, 1, 2, or 3)
+at the end. You must include the category number at the end of your response.
+Example outputs:
+•
+‘The response is off-topic and does not attempt to answer the question. 0.‘
+•
+‘The response provides a direct solution without self-reflection. 1.‘
+•
+‘The response demonstrates self-reflection. 2.‘
+•
+‘The response consists solely of Python code without any self-reflection. 3.‘
+Question:
+{question}
+Response:
+{response}
\ No newline at end of file
diff --git a/research/notes/verlreadmemd-at-main-verl-projectverl-github.md b/research/notes/verlreadmemd-at-main-verl-projectverl-github.md
new file mode 100644
index 0000000000000000000000000000000000000000..859800725c9a089dd2e657edecc1451965ff92bc
--- /dev/null
+++ b/research/notes/verlreadmemd-at-main-verl-projectverl-github.md
@@ -0,0 +1,657 @@
+---
+title: verl/README.md at main · verl-project/verl · GitHub
+id: verlreadmemd-at-main-verl-projectverl-github
+tags:
+- deepread
+created: '2026-06-10T00:41:02.699860Z'
+source: https://github.com/volcengine/verl/blob/main/README.md
+source_domain: github.com
+fetched_at: '2026-06-10T00:41:02.699705Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: ground_truth
+content_type: code
+deprecated: false
+---
+
+verl/README.md at main · verl-project/verl · GitHub
+Skip to content
+You signed in with another tab or window.
+Reload
+to refresh your session.
+You signed out in another tab or window.
+Reload
+to refresh your session.
+You switched accounts on another tab or window.
+Reload
+to refresh your session.
+Dismiss alert
+verl-project
+/
+verl
+Public
+Notifications
+You must be signed in to change notification settings
+Fork
+4k
+Star
+21.9k
+Files
+Expand file tree
+main
+/
+README.md
+Copy path
+Blame
+More file actions
+Blame
+More file actions
+Latest commit
+History
+History
+History
+309 lines (247 loc) · 37 KB
+main
+/
+README.md
+Top
+File metadata and controls
+Preview
+Code
+Blame
+309 lines (247 loc) · 37 KB
+Raw
+Copy raw file
+Download raw file
+Outline
+Edit and raw actions
+👋 Hi, everyone!
+    verl is an RL training library initiated by
+ByteDance Seed team
+and maintained by the verl community.
+verl: Volcano Engine Reinforcement Learning for LLMs
+verl is a flexible, efficient and production-ready RL training library for large language models (LLMs).
+verl is the open-source version of
+HybridFlow: A Flexible and Efficient RLHF Framework
+paper.
+verl is flexible and easy to use with:
+Easy extension of diverse RL algorithms
+: The hybrid-controller programming model enables flexible representation and efficient execution of complex post-training dataflows. Build RL dataflows such as GRPO, PPO in a few lines of code.
+Seamless integration of existing LLM infra with modular APIs
+: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as FSDP, Megatron-LM, vLLM, SGLang, etc
+Flexible device mapping
+: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
+Ready integration with popular HuggingFace models
+verl is fast with:
+State-of-the-art throughput
+: SOTA LLM training and inference engine integrations and SOTA RL throughput.
+Efficient actor model resharding with 3D-HybridEngine
+: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
+News
+[2026/05]
+uni-agent
+is released: a unified agent framework to build, run, and train LLM agents at scale, built on top of verl.
+[2026/05]
+VeRL-Omni
+is pre-released: a unified RL stack for diffusion and omni-modal model post-training built on top of verl. Read the
+blog post
+for details.
+[2026/05] verl's zero-mismatch HuggingFace rollout
+vexact
+is released: with batch-invariant kernels, shared model definition with FSDP, and out-of-box examples compatible with VeOmni.
+[2026/04] verl's Megatron backend LoRA and router replay support is showcased at
+PyTorch Conference Europe 2026
+.
+[2026/03] verl is presented at NVIDIA GTC26:
+session#1
+,
+session#2
+[2026/01] verl has been migrated to the
+verl-project
+[2026/01] The first verl meetup was successfully held in Shanghai on 01/10, hosted by Volcengine and NVIDIA; the slides have been uploaded to
+verl-data
+.
+[2026/01] The
+recipe
+directory has been migrated to a dedicated repository:
+verl-recipe
+and added as a submodule. See
+#4795
+. It can be used as it was after
+git submodule update --init --recursive recipe
+. Note that
+transfer_queue
+,
+fully_async_policy
+,
+one_step_off_policy
+and
+vla
+are kept under
+verl/experimental
+since they are planned to be merged into the main library. Use them through
+verl.experimental.{module}
+.
+[2025/12]
+Mind Lab
+successfully used
+verl
+and
+Megatron-bridge
+to train GRPO LoRA for a trillion-parameter model on 64 H800 GPUs - see their
+techblog
+.
+[2025/10] verl is presented in the
+PyTorch Conference 2025
+.
+[2025/08] verl is presented in the
+PyTorch Expert Exchange Webinar
+.
+Slides
+available.
+[2025/07] The
+ReTool
+recipe is fully open sourced.
+Blog
+[2025/07] The first verl meetup will be held at ICML Vancouver on July 16th! Please
+join us
+if you are at ICML! (onsite only)
+[2025/06] verl with Megatron backend enables large MoE models such as
+DeepSeek-671B and Qwen3-235B
+.
+[2025/03]
+DAPO
+is the open-sourced SOTA RL algorithm that achieves 50 points on AIME 2024 based on the Qwen2.5-32B pre-trained model, surpassing the previous SOTA achieved by DeepSeek's GRPO (DeepSeek-R1-Zero-Qwen-32B). DAPO's training is fully powered by verl and the reproduction code is available in
+recipe/dapo
+now.
+more...
+[2025/04] [Seed-Thinking-v1.5](https://github.com/ByteDance-Seed/Seed-Thinking-v1.5/blob/main/seed-thinking-v1.5.pdf) tech report is released! Trained with verl, Seed-Thinking-v1.5 achieves 86.7 on AIME 2024, 55.0 on Codeforces and 77.3 on GPQA, demonstrating excellent reasoning abilities in STEM and coding. Beyond reasoning tasks, the method demonstrates notable generalization across diverse domains.
+[2025/07] verl keynote at [AWS AI Hours Singapore](https://pages.awscloud.com/aws-ai-hours-sg.html#agenda) on 7/8, verl & verl-agent project updates at [Agent for SWE meetup](https://lu.ma/e498qhsi) by LF AI & Data Singapore on 7/11.
+[2025/06] verl team will provide latest project updates at [PyTorch Day China](https://www.lfasiallc.com/pytorch-day-china/) on June 7th. Meet our dev team in Beijing!
+[2025/04] [VAPO](https://arxiv.org/pdf/2504.05118) (value-based augmented PPO) paper covers our latest RL method for reasoning models. Trained from Qwen-32B-base model, VAPO achieves 60.4 on AIME 2024, outperforming DAPO-32B.
+[2025/05] [PF-PPO](https://arxiv.org/abs/2409.06957), accepted to ICML 2025, is now supported in verl! PF-PPO enhances policy learning efficiency and robustness by filtering potentially noisy reward signals and reusing high-quality experiences via a replay buffer.
+[2025/04] We will give a tutorial about latest post-training techniques and programming guide for verl at [ICLR 2025 Expo](https://iclr.cc/virtual/2025/calendar?filter_events=Expo+Talk+Panel&filter_rooms=), [SCI-FM workshop](https://open-foundation-model.github.io/) and [LMSys afterparty](https://lu.ma/d23nyynm). Talk materials available [here](https://github.com/eric-haibin-lin/verl-community/tree/main/iclr25).
+[2025/03] verl v0.3.0.post1 is released! See [release note](https://github.com/verl-project/verl/releases/) for details. It achieves [~1.4x speedup](https://tongyx361.github.io/blogs/posts/verl-intro/#/verl-flexible-and-efficient-rl-for-llms) compared to prev versions.
+[2025/05] verl will be presented at [A2M Shanghai](https://a2m.msup.com.cn/home/?aid=4488&city=shanghai) on 5/16 - 5/17.
+[2025/05] verl will be presented at [GOSIM x PyTorch Day 2025](https://paris2025.gosim.org/). See you in Paris!
+[2025/03] We introduced the programming model of verl at the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg) and [verl intro and updates](https://github.com/eric-haibin-lin/verl-community/blob/main/slides/verl-lmsys-meetup.pdf) at the [SGLang-LMSYS Org Meetup](https://lu.ma/ntjrr7ig) in Sunnyvale mid-March.
+[2025/03] We will present verl(HybridFlow) at EuroSys 2025. See you in Rotterdam!
+[2025/02] verl v0.2.0.post2 is released!
+[2025/02] We presented verl in the
+Bytedance/NVIDIA/Anyscale Ray Meetup
+. See you in San Jose!
+[2025/01] [Doubao-1.5-pro](https://team.doubao.com/zh/special/doubao_1_5_pro) is released with SOTA-level performance on LLM & VLM. The RL scaling preview model is trained using verl, reaching OpenAI O1-level performance on math benchmarks (70.0 pass@1 on AIME).
+[2024/12] verl is presented at Ray Forward 2024. Slides available
+here
+[2024/12] The team presented
+Post-training LLMs: From Algorithms to Infrastructure
+at NeurIPS 2024.
+Slides
+and
+video
+available.
+[2024/10] verl is presented at Ray Summit.
+Youtube video
+available.
+[2024/08] HybridFlow (verl) is accepted to EuroSys 2025.
+Key Features
+FSDP
+,
+FSDP2
+and
+Megatron-LM
+for training.
+vLLM
+,
+SGLang
+and
+HF Transformers
+for rollout generation.
+Compatible with Hugging Face Transformers and Modelscope Hub: Qwen3.5, Qwen3, Qwen-2.5, Llama3.1, Gemma2, DeepSeek-LLM, etc
+Supervised fine-tuning.
+Reinforcement learning with
+PPO
+,
+GRPO
+,
+GSPO
+,
+ReMax
+,
+REINFORCE++
+,
+RLOO
+,
+PRIME
+,
+DAPO
+,
+DrGRPO
+,
+KL_Cov & Clip_Cov
+etc.
+Support model-based reward and function-based reward (verifiable reward) for math,
+coding
+, etc
+Support vision-language models (VLMs) and
+multi-modal RL
+with Qwen2.5-vl, Kimi-VL
+Multi-turn with tool calling
+LLM alignment recipes such as
+Self-play preference optimization (SPPO)
+Flash attention 2, sequence packing, sequence parallelism via DeepSpeed Ulysses,
+LoRA
+,
+Liger-kernel
+(
+USE_LIGER=1
+).
+Scales up to 671B models and hundreds of GPUs with
+expert parallelism
+Multi-gpu
+LoRA RL
+support to save memory.
+Experiment tracking with wandb, swanlab, mlflow and tensorboard.
+Hardware Support: Supports NVIDIA, AMD,
+Ascend
+Getting Started
+Documentation
+Quickstart:
+Installation
+Quickstart
+Programming Guide
+&
+Tech Talk
+(in Chinese)
+PPO in verl
+GRPO in verl
+Running a PPO example step-by-step:
+Prepare Data for Post-Training
+Implement Reward Function for Dataset
+PPO Example Architecture
+Config Explanation
+Reproducible algorithm baselines:
+RL performance on coding, math
+Algorithm recipes (
+recipe/
+):
+Optional workflows and baselines live under
+recipe/
+. Each recipe subdirectory includes a small
+REQUIRED_VERL.txt
+file describing the intended
+verl
+install: pinned recipes use a
+tag or fixed git SHA
+; rolling recipes record an explicit
+VERL_COMMIT
+(and related submodule / recipe-folder SHAs) so you can
+pip install verl@git+…@<sha>
+without guessing. See
+recipe/README.md
+for the full index and links.
+For code explanation and advanced usage (extension):
+PPO Trainer and Workers
+PPO Ray Trainer
+Model Engine
+Engine Workers (FSDP / Megatron-LM / Automodel / VeOmni / TorchTitan)
+Advanced Usage and Extension
+Add Models with the FSDP Backend
+Add Models with the Megatron-LM Backend
+Multi-turn Rollout Support
+Search Tool Integration
+Sandbox Fusion Integration
+Extend to Other RL(HF) algorithms
+Ray API design tutorial
+Blogs from the community
+When Reasoning Models Break Tokenization: The Hidden Complexity of Multiturn Training
+verl deployment on AWS SageMaker
+verl x SGLang Multi-turn Code Walkthrough
+Optimizing SGLang Memory Usage in verl
+SGLang, verl, OpenBMB and Tsinghua University: Pioneering End-to-End Multi-Turn RLHF
+Reinforcement Learning from Human Feedback on AMD GPUs with verl and ROCm Integration
+veMLP x verl ：玩转强化学习训练
+使用 verl 进行 GRPO 分布式强化学习训练最佳实践
+HybridFlow verl 原文浅析
+最高提升 20 倍吞吐量！豆包大模型团队发布全新 RLHF 框架，现已开源！
+Performance Tuning Guide
+Performance is essential for on-policy RL algorithms. We have written a detailed
+performance tuning guide
+to help you optimize performance.
+Upgrade to vLLM >= v0.8.2
+verl now supports vLLM>=0.8.2 when using FSDP as the training backend. Please refer to
+this document
+for the installation guide and more information. Please avoid vllm 0.7.x, which contains bugs that may lead to OOMs and unexpected errors.
+Use Latest SGLang
+SGLang is fully supported with verl, and SGLang RL Group is working extensively on building unique features, including multi-turn agentic RL, VLM RLHF, server-based RL, and partial rollout. Please refer to
+this document
+for the installation guide and more information.
+Upgrade to FSDP2
+verl is fully embracing FSDP2! FSDP2 is recommended by the torch distributed team, providing better throughput and memory usage, and is composable with other features (e.g. torch.compile). To enable FSDP2, simply use verl main and set the following options:
+actor_rollout_ref.ref.strategy=fsdp2
+actor_rollout_ref.actor.strategy=fsdp2
+critic.strategy=fsdp2
+Furthermore, FSDP2 cpu offloading is compatible with gradient accumulation. You can turn it on to save memory with
+actor_rollout_ref.actor.fsdp_config.offload_policy=True
+. For more details, see
+#1026
+AMD Support (ROCm Kernel)
+verl runs on AMD ROCm GPUs (MI300X / MI325X / MI355X) with FSDP, FSDP2, and Megatron trainer backends, and vLLM as the validated inference engine (SGLang support is in progress). See the
+AMD ROCm quick-start guide
+for container bring-up, environment verification, and training examples.
+Citation and acknowledgement
+If you find the project helpful, please cite:
+HybridFlow: A Flexible and Efficient RLHF Framework
+A Framework for Training Large Language Models for Code Generation via Proximal Policy Optimization
+@article{sheng2024hybridflow,
+  title   = {HybridFlow: A Flexible and Efficient RLHF Framework},
+  author  = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
+  year    = {2024},
+  journal = {arXiv preprint arXiv: 2409.19256}
+}
+verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The project is adopted and contributed by Bytedance, Anyscale, LMSys.org,
+Alibaba Qwen team
+, Shanghai AI Lab, Tsinghua University, UC Berkeley, UCLA, UIUC, University of Hong Kong, ke.com,
+All Hands AI
+,
+ModelBest
+, JD AI Lab, Microsoft Research,
+StepFun
+, Amazon, LinkedIn, Meituan,
+Camel-AI
+,
+OpenManus
+, Xiaomi, NVIDIA research,
+Baichuan
+,
+RedNote
+,
+SwissAI
+,
+Moonshot AI (Kimi)
+, Baidu, Snowflake, Skywork.ai, JetBrains,
+IceSword Lab
+, and many more.
+Awesome Projects Built with
+verl
+Welcome to register your awesome project built with
+verl
+for other developers' reference!
+TinyZero
+: a reproduction of
+DeepSeek R1 Zero
+recipe for reasoning tasks
+SkyThought
+: RL training for Sky-T1-7B by NovaSky AI team.
+simpleRL-reason
+: SimpleRL-Zoo: Investigating and Taming Zero Reinforcement Learning for Open Base Models in the Wild
+Easy-R1
+:
+Multi-modal
+RL training framework
+RandOpt
+: Neural Thickets: Diverse Task Experts Are Dense Around Pretrained Weights
+OpenManus-RL
+: LLM Agents RL tuning framework for multiple agent environments.
+rllm
+: async RL training with
+verl-pipeline
+RAGEN
+: a general-purpose reasoning
+agent
+training framework
+Search-R1
+: RL with reasoning and
+searching (tool-call)
+interleaved LLMs
+ReSearch
+: Learning to
+Re
+ason with
+Search
+for LLMs via Reinforcement Learning
+Skywork-OR1
+: Skywork open reasoner series
+ToRL
+: Scaling tool-integrated RL
+Absolute Zero Reasoner
+:
+A no human curated data self-play framework for reasoning
+verl-agent
+: A scalable training framework for
+long-horizon LLM/VLM agents
+, along with a new algorithm
+GiGPO
+RL-Factory
+: An easy and efficient RL post-training framework for Agentic Learning
+ReTool
+: ReTool: reinforcement learning for strategic tool use in LLMs. Code release is in progress...
+verl-tool
+: A unified and easy-to-extend tool-agent training framework based on verl
+PRIME
+: Process reinforcement through implicit rewards
+MemAgent
+: MemAgent: Reshaping Long-Context LLM with Multi-Conv RL based Memory Agent
+POLARIS
+: A Post-training recipe for scaling RL on Advanced Reasoning models
+GUI-R1
+:
+GUI-R1
+: A Generalist R1-style Vision-Language Action Model For
+GUI Agents
+DeepRetrieval
+: RL Training of
+Search Agent
+with
+Search/Retrieval Outcome
+Code-R1
+: Reproducing R1 for
+Code
+with Reliable Rewards
+DeepResearcher
+: Scaling deep research via reinforcement learning in real-world environments
+VAGEN
+: Training VLM agents with multi-turn reinforcement learning
+RM-R1
+: RL training of reasoning reward models
+Dr. MAS
+: Stable
+end-to-end RL
+post-training for
+multi-agent LLM systems
+LUFFY
+: Learning to Reason under Off-Policy Guidance
+DeepMath
+: DeepMath-103K data and series models for math reasoning
+PACS
+: Implicit Actor Critic Coupling via a Supervised Learning Framework for RLVR
+Entropy Mechanism of RL
+: The Entropy Mechanism of Reinforcement Learning for Large Language Model Reasoning
+LLaSA-TTS-GRPO
+: TTS fine-tuning with GRPO optimization based on LLASA models
+PF-PPO
+: Policy Filtration for PPO based on the reliability of reward signals for more efficient and robust RLHF.
+RACRO
+: Build multi-modal reasoning models via decoupling it into query-conditioned captioning and text-only reasoning
+Agent Lightning
+: A flexible and extensible framework that enables seamless agent optimization for any existing agent framework.
+VTool-R1
+: VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use.
+Kimina-Prover-RL
+: Training pipeline for formal theorem proving, based on a paradigm inspired by DeepSeek-R1.
+RL-PLUS
+: Countering Capability Boundary Collapse of LLMs in Reinforcement Learning with Hybrid-policy Optimization.
+rStar2-Agent
+: Using reinforcement learning with multi-step tool-calling for math tasks, rStar2-Agent-14B reaches frontier-level math reasoning in just 510 RL training steps
+Vision-SR1
+: Self-Rewarding Vision-Language Model via Reasoning Decomposition
+SimpleVLA-RL
+: SimpleVLA-RL: A Simple yet Effective Vision-Language Action Model for Reinforcement Learning
+Table-R1
+: Table-R1: Inference-Time Scaling for Table Reasoning
+Revisual-R1
+: Revisual-R1: Advancing Multimodal Reasoning From Optimized Cold Start to Staged Reinforcement Learning
+ARES
+: ARES: Multimodal Adaptive Reasoning via Difficulty-Aware Token-Level Entropy Shaping
+Meta-Bandit-LLM
+: Meta-Bandit-LLM: Long-horizon multiturn interactive training for meta-bandit agents
+PokeeResearch
+: PokeeResearch: State-of-the-art 7B DeepResearch Agent that leverages web search and content reading capabilities to answer complex questions using the most up-to-date information available online.
+Search Self-play
+: Pushing the Frontier of Agent Capability without Supervision
+OneThinker
+: All-in-one Reasoning Model for Image and Video
+OpenTinker
+: Democratizing Agentic Reinforcement Learning as a Service
+FlowRL
+: Matching reward distributions via
+flow balance
+for diverse exploration and generalizable reasoning
+Logic-RL
+: a reproduction of DeepSeek R1 Zero on 2K Tiny Logic Puzzle Dataset.
+Seed-Coder
+: RL training of Seed-Coder boosts performance on competitive programming
+all-hands/openhands-lm-32b-v0.1
+: A strong, open coding agent model, trained with
+multi-turn fine-tuning
+s3
+Efficient Yet Effective
+Search Agent Training via RL
+Rec-R1
+: Bridging Generative Large Language Models and Recommendation Systems via Reinforcement Learning
+Explore RL Data Scaling
+: Exploring Data Scaling Trends and Effects in Reinforcement Learning from Human Feedback
+FIRE
+: Flaming-hot initiation with regular execution sampling for large language models
+DQO
+: Enhancing multi-Step reasoning abilities of language models through direct Q-function optimization
+ProRL
+: Prolonged Reinforcement Learning Expands Reasoning Boundaries in Large Language Models
+cognition-engineering
+: Test time scaling drives cognition engineering.
+Trust Region Preference Approximation
+: A simple and stable
+reinforcement learning algorithm
+for LLM reasoning.
+AdaRFT
+: Efficient Reinforcement Finetuning via
+Adaptive Curriculum Learning
+critic-rl
+: LLM critics for code generation
+self-rewarding-reasoning-LLM
+: self-rewarding and correction with
+generative reward models
+DeepEnlighten
+: Reproduce R1 with
+social reasoning
+tasks and analyze key findings
+MetaSpatial
+: Reinforcing
+3D Spatial Reasoning
+in
+VLMs
+for the
+Metaverse
+PURE
+:
+Credit assignment
+is the key to successful reinforcement fine-tuning using
+process reward model
+cognitive-behaviors
+: Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs
+deepscaler
+: iterative context scaling with GRPO
+DAPO
+: the fully open source SOTA RL algorithm that beats DeepSeek-R1-zero-32B
+NoisyRollout
+: Reinforcing Visual Reasoning with Data Augmentation
+SPEAR
+:
+Self-imitation
+with
+Progressive Exploration
+for Agentic Reinforcement Learning (ICLR 2026)
+RuleReasoner
+:
+RuleReasoner:
+Reinforced Rule-based Reasoning via
+Domain-aware Dynamic Sampling
+(ICLR 2026)
+MetaphorStar
+:
+Image Metaphor
+Understanding and Reasoning with End-to-End
+Visual Reinforcement Learning
+DART-GUI
+: a decoupled agentic RL framework for Computer Use Agents, achieving ~2× training speedup and ~5× environment utilization!
+Rethinking OPD
+: Rethinking On-Policy Distillation of Large Language Models: Phenomenology, Mechanism, and Recipe
+Contribution Guide
+See
+contributions guide
+About
+ByteDance Seed Team
+Founded in 2023, ByteDance Seed Team is dedicated to crafting the industry's most advanced AI foundation models. The team aspires to become a world-class research team and make significant contributions to the advancement of science and society. You can get to know Bytedance Seed better through the following channels👇
+We are HIRING! Send us an
+email
+if you are interested in internship/FTE opportunities in RL for agents.
+You can’t perform that action at this time.
\ No newline at end of file
diff --git a/research/notes/welcome-to-verls-documentation-verl-documentation-2.md b/research/notes/welcome-to-verls-documentation-verl-documentation-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..a232ca52a93c8e60291604bc75521d2530282aa3
--- /dev/null
+++ b/research/notes/welcome-to-verls-documentation-verl-documentation-2.md
@@ -0,0 +1,227 @@
+---
+title: Welcome to verl’s documentation! — verl  documentation
+id: welcome-to-verls-documentation-verl-documentation-2
+tags:
+- deepread
+created: '2026-06-10T00:41:00.572353Z'
+source: https://verl.readthedocs.io/en/latest/
+source_domain: verl.readthedocs.io
+fetched_at: '2026-06-10T00:41:00.572215Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: ground_truth
+content_type: docs
+deprecated: false
+---
+
+Welcome to verl’s documentation! — verl  documentation
+Welcome to verl’s documentation!
+View page source
+Welcome to verl’s documentation!
+
+verl is a flexible, efficient and production-ready RL training framework designed for large language models (LLMs) post-training. It is an open source implementation of the
+HybridFlow
+paper.
+verl is flexible and easy to use with:
+Easy extension of diverse RL algorithms
+: The hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex Post-Training dataflows. Allowing users to build RL dataflows in a few lines of code.
+Seamless integration of existing LLM infra with modular APIs
+: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM, vLLM and SGLang. Moreover, users can easily extend to other LLM training and inference frameworks.
+Flexible device mapping and parallelism
+: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
+Ready integration with popular HuggingFace models
+verl is fast with:
+State-of-the-art throughput
+: By seamlessly integrating existing SOTA LLM training and inference frameworks, verl achieves high generation and training throughput.
+Efficient actor model resharding with 3D-HybridEngine
+: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
+Quickstart
+Installation
+Requirements
+Choices of Backend Engines
+Install from docker image
+Install from custom environment
+Install with AMD GPUs - ROCM kernel support
+Quickstart: PPO training on GSM8K dataset
+Introduction
+Dataset Introduction
+Step 1: Prepare the dataset
+Step 2: Download a model for post-training
+Step 3: Perform PPO training with the instruct model
+Multinode Training
+Option 1: Launch Manually
+Option 2: Launch via SkyPilot on Kubernetes or clouds
+Option 3: Launch via Slurm
+Option 4: Launch via dstack
+How to debug?
+Multi-node training on AMD clusters
+Ray Debug Tutorial
+How to debug?
+More Resources
+Agentic RL Training
+Overview
+Server-based Asynchronous Rollout
+Multi-turn Conversations and Tool Calls
+Agent Framework
+Programming guide
+HybridFlow Programming Guide
+Motivation and Design
+Codebase walkthrough (PPO)
+Repository organization
+The Design of
+verl.single_controller
+Preface
+Origin
+A Running Example:
+generate_sequences
+Beyond RL Post-Training: Generalizing
+verl.single_controller
+Data Preparation
+Prepare Data for Post-Training
+Implement Reward Function for Dataset
+Configurations
+Config Explanation
+ppo_trainer.yaml for RL FSDP Backend
+evaluation.yaml
+sft_trainer.yaml for SFT FSDP Backend
+PPO Example
+PPO Example Architecture
+GSM8K Example
+Megatron-FSDP Example
+Multi-Modal Example Architecture
+SkyPilot Examples
+Algorithms
+Proximal Policy Optimization (PPO)
+Group Relative Policy Optimization (GRPO)
+Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)
+Recipe: Self-Play Fine-Tuning (SPIN)
+Recipe: Self-Play Preference Optimization (SPPO)
+Recipe: Entropy Mechanism
+On-Policy RL with Optimal Reward Baseline (OPO)
+Algorithm Baselines
+GPG: Group Policy Gradient
+Rollout Correction
+Mathematical Formulations of Rollout Correction Methods in
+verl
+Optimal Token Baseline (OTB)
+Divergence Proximal Policy Optimization (DPPO)
+On-Policy Distillation (OPD)
+PPO Trainer and Workers
+PPO Ray Trainer
+Model Engine
+Engine Workers
+Automodel Backend
+SGLang Backend
+TensorRT-LLM Backend
+Performance Tuning Guide
+Training DeepSeek 671b
+Verl LLM Best Practices (DAPO + Qwen3-235B)
+Performance Tuning Guide
+Rollout KV Cache Offload via Mooncake-Store
+Upgrading to vLLM >= 0.8
+Hardware Resource Needed for RL
+verl Profiler System
+NVIDIA Nsight Systems profiling in verl
+PyTorch Profiling in verl
+Adding new models
+Add models with the FSDP backend
+Add models with the Megatron-LM backend
+Async Training
+Recipe: One Step Off Policy Async Trainer
+Recipe: Fully Async Policy Trainer
+Recipe: Async On-Policy Knowledge Distillation Trainer
+Low Precision
+FP8 RL in verl
+NVFP4 QAT (Quantization-Aware Training) in verl
+Advanced Features
+Using Checkpoints to Support Fault Tolerance Training
+RoPE Scaling override
+Attention Implementation Override
+RL(HF) algorithms with LoRA Support
+Multi-turn Rollout Support
+Ray API Design Tutorial
+Extend to other RL(HF) algorithms
+Sandbox Fusion Example
+Trace Function Usage Instructions
+SkipManager: Skip everything in the RL pipeline.
+Agent Loop
+Reward Loop
+TransferQueue Data System
+Use Prometheus and Grafana to Monitor Rollout
+Guide to Using MTP in SFT/RL Training and Inference
+Hardware Support
+Multi-Chip Support
+AMD (ROCm) Tutorial
+Ascend (NPU) Tutorial
+API References
+Data interface
+Single Controller interface
+Trainer Interface
+Utilities
+Blog
+verl 0.7 release blog
+FAQ
+Frequently Asked Questions
+Ray related
+Distributed training
+Install related
+Illegal memory access
+Checkpoints
+Triton
+compile_module_from_src
+error
+What is the meaning of train batch size, mini batch size, and micro batch size?
+How to generate ray timeline to analyse performance of a training job?
+How to set proxy only for wandb?
+Missmatch between inference and training sequence (high actor/grad_norm)
+Contributing
+Editing Agent Instructions
+Development Notes
+Sandbox Fusion Tool Integration
+Contribution
+
+verl is free software; you can redistribute it and/or modify it under the terms
+of the Apache License 2.0. We welcome contributions.
+Join us on
+GitHub
+,
+Slack
+and
+Wechat
+for discussions.
+Contributions from the community are welcome! Please check out our
+project roadmap
+and
+good first issues
+to see where you can contribute.
+Code Linting and Formatting
+
+We use pre-commit to help improve code quality. To initialize pre-commit, run:
+pip
+install
+pre-commit
+pre-commit
+install
+To resolve CI errors locally, you can also manually run pre-commit by:
+pre-commit
+run
+Adding CI tests
+
+If possible, please add CI test(s) for your new feature:
+Find the most relevant workflow yml file, which usually corresponds to a
+hydra
+default config (e.g.
+ppo_trainer
+,
+ppo_megatron_trainer
+,
+sft_trainer
+, etc).
+Add related path patterns to the
+paths
+section if not already included.
+Minimize the workload of the test script(s) (see existing scripts for examples).
+We are HIRING! Send us an
+email
+if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment.
\ No newline at end of file
diff --git a/research/notes/welcome-to-verls-documentation-verl-documentation.md b/research/notes/welcome-to-verls-documentation-verl-documentation.md
new file mode 100644
index 0000000000000000000000000000000000000000..9c26db008016c3435cb11c6ed44bc1223c33ea83
--- /dev/null
+++ b/research/notes/welcome-to-verls-documentation-verl-documentation.md
@@ -0,0 +1,227 @@
+---
+title: Welcome to verl’s documentation! — verl  documentation
+id: welcome-to-verls-documentation-verl-documentation
+tags:
+- deepread
+created: '2026-06-10T00:40:49.227131Z'
+source: https://verl.readthedocs.io/en/latest/index.html
+source_domain: verl.readthedocs.io
+fetched_at: '2026-06-10T00:40:49.226989Z'
+fetch_provider: builtin
+status: draft
+type: note
+tier: ground_truth
+content_type: docs
+deprecated: false
+---
+
+Welcome to verl’s documentation! — verl  documentation
+Welcome to verl’s documentation!
+View page source
+Welcome to verl’s documentation!
+
+verl is a flexible, efficient and production-ready RL training framework designed for large language models (LLMs) post-training. It is an open source implementation of the
+HybridFlow
+paper.
+verl is flexible and easy to use with:
+Easy extension of diverse RL algorithms
+: The hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex Post-Training dataflows, allowing users to build RL dataflows in a few lines of code.
+Seamless integration of existing LLM infra with modular APIs
+: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM, vLLM and SGLang. Moreover, users can easily extend to other LLM training and inference frameworks.
+Flexible device mapping and parallelism
+: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
+Ready integration with popular HuggingFace models
+verl is fast with:
+State-of-the-art throughput
+: By seamlessly integrating existing SOTA LLM training and inference frameworks, verl achieves high generation and training throughput.
+Efficient actor model resharding with 3D-HybridEngine
+: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
+Quickstart
+Installation
+Requirements
+Choices of Backend Engines
+Install from docker image
+Install from custom environment
+Install with AMD GPUs - ROCM kernel support
+Quickstart: PPO training on GSM8K dataset
+Introduction
+Dataset Introduction
+Step 1: Prepare the dataset
+Step 2: Download a model for post-training
+Step 3: Perform PPO training with the instruct model
+Multinode Training
+Option 1: Launch Manually
+Option 2: Launch via SkyPilot on Kubernetes or clouds
+Option 3: Launch via Slurm
+Option 4: Launch via dstack
+How to debug?
+Multi-node training on AMD clusters
+Ray Debug Tutorial
+How to debug?
+More Resources
+Agentic RL Training
+Overview
+Server-based Asynchronous Rollout
+Multi-turn Conversations and Tool Calls
+Agent Framework
+Programming guide
+HybridFlow Programming Guide
+Motivation and Design
+Codebase walkthrough (PPO)
+Repository organization
+The Design of
+verl.single_controller
+Preface
+Origin
+A Running Example:
+generate_sequences
+Beyond RL Post-Training: Generalizing
+verl.single_controller
+Data Preparation
+Prepare Data for Post-Training
+Implement Reward Function for Dataset
+Configurations
+Config Explanation
+ppo_trainer.yaml for RL FSDP Backend
+evaluation.yaml
+sft_trainer.yaml for SFT FSDP Backend
+PPO Example
+PPO Example Architecture
+GSM8K Example
+Megatron-FSDP Example
+Multi-Modal Example Architecture
+SkyPilot Examples
+Algorithms
+Proximal Policy Optimization (PPO)
+Group Relative Policy Optimization (GRPO)
+Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)
+Recipe: Self-Play Fine-Tuning (SPIN)
+Recipe: Self-Play Preference Optimization (SPPO)
+Recipe: Entropy Mechanism
+On-Policy RL with Optimal Reward Baseline (OPO)
+Algorithm Baselines
+GPG: Group Policy Gradient
+Rollout Correction
+Mathematical Formulations of Rollout Correction Methods in
+verl
+Optimal Token Baseline (OTB)
+Divergence Proximal Policy Optimization (DPPO)
+On-Policy Distillation (OPD)
+PPO Trainer and Workers
+PPO Ray Trainer
+Model Engine
+Engine Workers
+Automodel Backend
+SGLang Backend
+TensorRT-LLM Backend
+Performance Tuning Guide
+Training DeepSeek 671b
+Verl LLM Best Practices (DAPO + Qwen3-235B)
+Performance Tuning Guide
+Rollout KV Cache Offload via Mooncake-Store
+Upgrading to vLLM >= 0.8
+Hardware Resource Needed for RL
+verl Profiler System
+NVIDIA Nsight Systems profiling in verl
+PyTorch Profiling in verl
+Adding new models
+Add models with the FSDP backend
+Add models with the Megatron-LM backend
+Async Training
+Recipe: One Step Off Policy Async Trainer
+Recipe: Fully Async Policy Trainer
+Recipe: Async On-Policy Knowledge Distillation Trainer
+Low Precision
+FP8 RL in verl
+NVFP4 QAT (Quantization-Aware Training) in verl
+Advanced Features
+Using Checkpoints to Support Fault Tolerance Training
+RoPE Scaling override
+Attention Implementation Override
+RL(HF) algorithms with LoRA Support
+Multi-turn Rollout Support
+Ray API Design Tutorial
+Extend to other RL(HF) algorithms
+Sandbox Fusion Example
+Trace Function Usage Instructions
+SkipManager: Skip everything in the RL pipeline.
+Agent Loop
+Reward Loop
+TransferQueue Data System
+Use Prometheus and Grafana to Monitor Rollout
+Guide to Using MTP in SFT/RL Training and Inference
+Hardware Support
+Multi-Chip Support
+AMD (ROCm) Tutorial
+Ascend (NPU) Tutorial
+API References
+Data interface
+Single Controller interface
+Trainer Interface
+Utilities
+Blog
+verl 0.7 release blog
+FAQ
+Frequently Asked Questions
+Ray related
+Distributed training
+Install related
+Illegal memory access
+Checkpoints
+Triton
+compile_module_from_src
+error
+What is the meaning of train batch size, mini batch size, and micro batch size?
+How to generate ray timeline to analyse performance of a training job?
+How to set proxy only for wandb?
+Mismatch between inference and training sequence (high actor/grad_norm)
+Contributing
+Editing Agent Instructions
+Development Notes
+Sandbox Fusion Tool Integration
+Contribution
+
+verl is free software; you can redistribute it and/or modify it under the terms
+of the Apache License 2.0. We welcome contributions.
+Join us on
+GitHub
+,
+Slack
+and
+Wechat
+for discussions.
+Contributions from the community are welcome! Please check out our
+project roadmap
+and
+good first issues
+to see where you can contribute.
+Code Linting and Formatting
+
+We use pre-commit to help improve code quality. To initialize pre-commit, run:
+pip
+install
+pre-commit
+pre-commit
+install
+To resolve CI errors locally, you can also manually run pre-commit by:
+pre-commit
+run
+Adding CI tests
+
+If possible, please add CI test(s) for your new feature:
+Find the most relevant workflow yml file, which usually corresponds to a
+hydra
+default config (e.g.
+ppo_trainer
+,
+ppo_megatron_trainer
+,
+sft_trainer
+, etc).
+Add related path patterns to the
+paths
+section if not already included.
+Minimize the workload of the test script(s) (see existing scripts for examples).
+We are HIRING! Send us an
+email
+if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment.
\ No newline at end of file