CoCoOne committed on
Commit
353ee9f
·
1 Parent(s): 6798401

Slim Space deployment mirror

Browse files
.dockerignore CHANGED
@@ -1,23 +1,24 @@
1
- .git
2
  .gitignore
 
 
 
 
 
3
  __pycache__/
4
  *.py[cod]
5
  .pytest_cache/
6
  .mypy_cache/
7
  .ruff_cache/
 
8
  .env
9
  .envrc
10
  .venv/
11
  venv/
12
- workspace/*
13
- !workspace/.gitkeep
14
- traces/*
15
- !traces/.gitkeep
16
- api_runs/*
17
- !api_runs/.gitkeep
18
- runtime/
19
- tests/
20
  .codex/
 
21
  .idea/
22
  .vscode/
23
  .DS_Store
 
1
+ .git/
2
  .gitignore
3
+ AGENTS.md
4
+ runtime/
5
+ data/
6
+ inputs/
7
+
8
  __pycache__/
9
  *.py[cod]
10
  .pytest_cache/
11
  .mypy_cache/
12
  .ruff_cache/
13
+
14
  .env
15
  .envrc
16
  .venv/
17
  venv/
18
+ env/
19
+
 
 
 
 
 
 
20
  .codex/
21
+ .agents/
22
  .idea/
23
  .vscode/
24
  .DS_Store
.env.example DELETED
@@ -1,39 +0,0 @@
1
- # Required
2
- API_KEY="your_openai_compatible_key" # API key for your OpenAI-compatible LLM provider.
3
- API_BASE="https://your-openai-compatible-endpoint/v1" # Base URL for the OpenAI-compatible chat-completions endpoint.
4
- MODEL_NAME="gpt-5.5" # Main model used by the agent and WebFetch summarization.
5
- SERPER_KEY="your_serper_key" # https://serper.dev/
6
- JINA_KEY="your_jina_key" # https://jina.ai/
7
- MINERU_TOKEN="your_mineru_token" # https://mineru.net/
8
- HF_TOKEN="your_huggingface_token" # Hugging Face token with dataset write access when collection is enabled.
9
-
10
- # Optional
11
- WORKSPACE_ROOT="./workspace" # Default local workspace root when --workspace-root is not provided.
12
- MAX_LLM_CALL_PER_RUN=100 # Maximum chat-completions calls allowed in one agent run.
13
- MAX_AGENT_ROUNDS=100 # Maximum ReAct loop rounds before forced termination.
14
- MAX_AGENT_RUNTIME_SECONDS=9000 # Maximum wall-clock runtime per agent run.
15
- LLM_TIMEOUT_SECONDS=600 # Timeout for each chat-completions request.
16
- LLM_MAX_OUTPUT_TOKENS=10000 # Maximum output tokens requested from the main model.
17
- MAX_INPUT_TOKENS=320000 # Maximum input-token budget used for runtime token accounting.
18
- LLM_MAX_RETRIES=10 # Maximum retries for transient LLM API failures.
19
- TEMPERATURE=0.6 # Main model sampling temperature.
20
- TOP_P=0.95 # Main model nucleus-sampling top_p.
21
- PRESENCE_PENALTY=1.1 # Main model presence penalty when supported by the provider.
22
- AUTO_COMPACT_TRIGGER_TOKENS="128k" # Context size threshold that triggers automatic memory compaction.
23
- IMAGE_PART_TOKEN_ESTIMATE=1536 # Token estimate used for each runtime image_url content part.
24
- LLM_IMAGE_MAX_EDGE=1568 # Maximum image edge length sent to multimodal LLMs.
25
- LLM_IMAGE_MAX_BYTES=524288 # Maximum compressed image payload size sent to multimodal LLMs.
26
- LLM_IMAGE_JPEG_QUALITY=85 # Initial JPEG quality for runtime image compression.
27
- DEBUG_AGENT=false # Print verbose agent-loop debug logs.
28
- DEBUG_SEARCH=false # Print verbose WebSearch debug logs.
29
- DEBUG_SCHOLAR=false # Print verbose ScholarSearch debug logs.
30
- DEBUG_VISIT=false # Print verbose WebFetch debug logs.
31
- RH_SPACE_RUNS_DIR="/tmp/researchharness_space/runs" # Parent directory for temporary per-chat runs in hosted mode.
32
- RH_SPACE_RETENTION_SECONDS=21600 # Delete inactive hosted runs older than this many seconds.
33
- RH_SPACE_MAX_RUNS=40 # Keep at most this many inactive hosted runs.
34
- RH_SPACE_CLEANUP_INTERVAL_SECONDS=900 # Background cleanup interval for hosted runs.
35
- RH_COLLECTION_ENABLED=true # Automatically collect hosted run traces after each completed run.
36
- RH_COLLECTION_DATASET_REPO="CoCoOne/ResearchHarness-Data" # Hugging Face dataset repo receiving trace PRs.
37
- RH_COLLECTION_BATCH_SIZE=5 # Create one dataset PR after this many collected runs.
38
- RH_COLLECTION_MAX_BUNDLE_BYTES=20971520 # Drop any single trace bundle larger than this many bytes.
39
- RH_ROLE_PROMPT_FILES="" # Optional role prompt files separated by os.pathsep.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -1,230 +1,31 @@
1
- runtime/
2
- # Local agent artifacts
3
  AGENTS.md
4
- workspace/*
5
- !workspace/.gitkeep
6
- api_runs/*
7
- !api_runs/.gitkeep
8
- traces/*
9
- !traces/.gitkeep
10
- /inputs/
11
  data/
12
- benchmarks/**/local_*.py
13
- .idea/
14
- .vscode/
15
- .DS_Store
16
- tests/example_files/pdfs/dummy_document
17
- .codex
18
-
19
 
20
- # Byte-compiled / optimized / DLL files
21
  __pycache__/
22
- *.py[codz]
23
  *$py.class
24
-
25
- # C extensions
26
  *.so
27
 
28
- # Distribution / packaging
29
- .Python
30
- build/
31
- develop-eggs/
32
- dist/
33
- downloads/
34
- eggs/
35
- .eggs/
36
- lib/
37
- lib64/
38
- parts/
39
- sdist/
40
- var/
41
- wheels/
42
- share/python-wheels/
43
- *.egg-info/
44
- .installed.cfg
45
- *.egg
46
- MANIFEST
47
-
48
- # PyInstaller
49
- # Usually these files are written by a python script from a template
50
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
51
- *.manifest
52
- *.spec
53
-
54
- # Installer logs
55
- pip-log.txt
56
- pip-delete-this-directory.txt
57
-
58
- # Unit test / coverage reports
59
- htmlcov/
60
- .tox/
61
- .nox/
62
- .coverage
63
- .coverage.*
64
- .cache
65
- nosetests.xml
66
- coverage.xml
67
- *.cover
68
- *.py.cover
69
- .hypothesis/
70
- .pytest_cache/
71
- cover/
72
-
73
- # Translations
74
- *.mo
75
- *.pot
76
-
77
- # Django stuff:
78
- *.log
79
- local_settings.py
80
- db.sqlite3
81
- db.sqlite3-journal
82
-
83
- # Flask stuff:
84
- instance/
85
- .webassets-cache
86
-
87
- # Scrapy stuff:
88
- .scrapy
89
-
90
- # Sphinx documentation
91
- docs/_build/
92
-
93
- # PyBuilder
94
- .pybuilder/
95
- target/
96
-
97
- # Jupyter Notebook
98
- .ipynb_checkpoints
99
-
100
- # IPython
101
- profile_default/
102
- ipython_config.py
103
-
104
- # pyenv
105
- # For a library or package, you might want to ignore these files since the code is
106
- # intended to run in multiple environments; otherwise, check them in:
107
- # .python-version
108
-
109
- # pipenv
110
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
111
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
112
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
113
- # install all needed dependencies.
114
- #Pipfile.lock
115
-
116
- # UV
117
- # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
118
- # This is especially recommended for binary packages to ensure reproducibility, and is more
119
- # commonly ignored for libraries.
120
- #uv.lock
121
-
122
- # poetry
123
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
124
- # This is especially recommended for binary packages to ensure reproducibility, and is more
125
- # commonly ignored for libraries.
126
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
127
- #poetry.lock
128
- #poetry.toml
129
-
130
- # pdm
131
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
132
- # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
133
- # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
134
- #pdm.lock
135
- #pdm.toml
136
- .pdm-python
137
- .pdm-build/
138
-
139
- # pixi
140
- # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
141
- #pixi.lock
142
- # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
143
- # in the .venv directory. It is recommended not to include this directory in version control.
144
- .pixi
145
-
146
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
147
- __pypackages__/
148
-
149
- # Celery stuff
150
- celerybeat-schedule
151
- celerybeat.pid
152
-
153
- # SageMath parsed files
154
- *.sage.py
155
-
156
- # Environments
157
  .env
158
  .envrc
159
- .venv
160
- env/
161
  venv/
162
- ENV/
163
- env.bak/
164
- venv.bak/
165
-
166
- # Spyder project settings
167
- .spyderproject
168
- .spyproject
169
-
170
- # Rope project settings
171
- .ropeproject
172
-
173
- # mkdocs documentation
174
- /site
175
 
176
- # mypy
177
  .mypy_cache/
178
- .dmypy.json
179
- dmypy.json
180
-
181
- # Pyre type checker
182
- .pyre/
183
-
184
- # pytype static type analyzer
185
- .pytype/
186
-
187
- # Cython debug symbols
188
- cython_debug/
189
-
190
- # PyCharm
191
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
192
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
193
- # and can be added to the global gitignore or merged into this file. For a more nuclear
194
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
195
- #.idea/
196
-
197
- # Abstra
198
- # Abstra is an AI-powered process automation framework.
199
- # Ignore directories containing user credentials, local state, and settings.
200
- # Learn more at https://abstra.io/docs
201
- .abstra/
202
-
203
- # Visual Studio Code
204
- # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
205
- # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
206
- # and can be added to the global gitignore or merged into this file. However, if you prefer,
207
- # you could uncomment the following to ignore the entire vscode folder
208
- # .vscode/
209
-
210
- # Ruff stuff:
211
  .ruff_cache/
 
 
212
 
213
- # PyPI configuration file
214
- .pypirc
215
-
216
- # Cursor
217
- # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
218
- # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
219
- # refer to https://docs.cursor.com/context/ignore-files
220
- .cursorignore
221
- .cursorindexingignore
222
-
223
- # Marimo
224
- marimo/_static/
225
- marimo/_lsp/
226
- __marimo__/
227
 
228
- # Hugging Face Space runtime artifacts
229
- runtime/
230
- /tmp/
 
 
 
 
 
1
  AGENTS.md
2
+ runtime/
 
 
 
 
 
 
3
  data/
4
+ inputs/
 
 
 
 
 
 
5
 
 
6
  __pycache__/
7
+ *.py[cod]
8
  *$py.class
 
 
9
  *.so
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  .env
12
  .envrc
13
+ .venv/
 
14
  venv/
15
+ env/
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ .pytest_cache/
18
  .mypy_cache/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  .ruff_cache/
20
+ .coverage
21
+ htmlcov/
22
 
23
+ build/
24
+ dist/
25
+ *.egg-info/
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ .codex/
28
+ .agents/
29
+ .idea/
30
+ .vscode/
31
+ .DS_Store
README.md CHANGED
@@ -10,17 +10,87 @@ license: mit
10
  short_description: Lightweight harness for tool-using LLM agents.
11
  ---
12
 
13
- # ResearchHarness Space
14
 
15
- This Space runs the ResearchHarness browser frontend as a lightweight hosted agent UI.
16
- It reuses the ResearchHarness tool-calling runtime and keeps the hosted mode intentionally simple:
 
17
 
18
- - Users do not choose a local workspace.
19
- - Each new chat gets an isolated temporary runtime directory.
20
- - Uploaded images are saved under that chat workspace and also passed to the model when supported.
21
- - Agent traces and session state are stored beside the temporary workspace.
22
- - Completed runs are automatically packaged for trajectory collection.
23
- - Old workspaces and traces are cleaned periodically so the Space does not grow without bound.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  ## Required Secrets
26
 
@@ -48,7 +118,6 @@ Configure these as Hugging Face Space secrets before starting the app:
48
  | `RH_COLLECTION_DATASET_REPO` | `CoCoOne/ResearchHarness-Data` | Dataset repo that receives trajectory PRs. |
49
  | `RH_COLLECTION_BATCH_SIZE` | `5` | Create one dataset PR after this many collected runs. |
50
  | `RH_COLLECTION_MAX_BUNDLE_BYTES` | `20971520` | Drop a single run bundle if it exceeds this byte limit. |
51
- | `RH_ROLE_PROMPT_FILES` | empty | Optional `os.pathsep`-separated role prompt files inside the Space image. |
52
  | `PORT` | `7860` | Port used by Hugging Face Docker Spaces. |
53
 
54
  ## Runtime Layout
@@ -82,3 +151,31 @@ python app.py
82
  ```
83
 
84
  Then open `http://127.0.0.1:7860`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  short_description: Lightweight harness for tool-using LLM agents.
11
  ---
12
 
13
+ # ResearchHarness Space Maintenance Notes
14
 
15
+ This repository is the Hugging Face Docker Space deployment for
16
+ [`ResearchHarness`](https://github.com/black-yt/ResearchHarness). It is an online
17
+ app mirror, not the public open-source documentation and not a full source mirror.
18
 
19
+ The public project README, tutorials, benchmark notes, API server documentation,
20
+ and local CLI documentation belong in the main GitHub repository. This Space
21
+ README should stay focused on long-term deployment maintenance: what is copied
22
+ from the main repo, what is intentionally changed for hosted use, and what is
23
+ new in the Space.
24
+
25
+ ## Repository Relationship
26
+
27
+ | Repository | Role |
28
+ | --- | --- |
29
+ | `black-yt/ResearchHarness` | Main open-source runtime, CLI, API server, frontend, docs, tests, and benchmark adapters. |
30
+ | `CoCoOne/ResearchHarness` | Hugging Face Space app that hosts the browser frontend with managed temporary workspaces. |
31
+ | `CoCoOne/ResearchHarness-Data` | Hugging Face dataset receiving collected hosted-run trajectory PRs. |
32
+
33
+ Maintenance rule:
34
+
35
+ - Copy only the runtime/frontend pieces needed by the hosted app.
36
+ - Do not blindly sync the whole main repository into this Space.
37
+ - Space-only deployment logic must not be copied back into the main repo unless
38
+ it is genuinely general-purpose.
39
+ - Public documentation should be updated in the main repo, not duplicated here.
40
+
41
+ ## Copied From The Main Repository
42
+
43
+ These files/directories are copied from the main repo and should be refreshed
44
+ when their corresponding upstream implementation changes:
45
+
46
+ - `agent_base/`: core ReAct runtime, prompts, tool registry, provider
47
+ compatibility, trace/session state, image handling, and compaction logic.
48
+ - `agent_base/tools/`: hosted-safe tool implementations used by the frontend.
49
+ - `frontend/static/`: shared browser UI assets, styles, and client logic.
50
+ - `frontend/local_server.py`: WebSocket streaming frontend server base, with
51
+ Space-specific managed-workspace behavior preserved.
52
+ - `requirements.txt`: Python runtime dependencies needed by the hosted app.
53
+
54
+ When updating these files from the main repo, inspect the diff and preserve the
55
+ Space-specific changes listed below.
56
+
57
+ ## Space-Specific Changes
58
+
59
+ These behaviors are intentional Space-only deltas:
60
+
61
+ - `app.py` is the Hugging Face entrypoint and owns Space startup, cleanup, and
62
+ trajectory collection configuration.
63
+ - Users cannot select arbitrary server folders. Each new chat gets an isolated
64
+ managed run directory under `RH_SPACE_RUNS_DIR`.
65
+ - The runtime layout is always:
66
+ `run_.../agent_workspace/` for agent-visible files and
67
+ `run_.../agent_trace/` for traces and `_session_state.json`.
68
+ - Uploaded images are saved under `agent_workspace/inputs/images/` and are also
69
+ passed to the model as image inputs when supported.
70
+ - The frontend exposes a per-run model dropdown. Current options are `gpt-5.5`
71
+ and `claude-opus-4-7`; the selection must stay local to that run and must not
72
+ mutate global process environment variables.
73
+ - Completed runs are packaged for trajectory collection and submitted as pull
74
+ requests to the configured Hugging Face dataset after the batch threshold is
75
+ reached.
76
+ - Old inactive runs are cleaned periodically so the Space does not grow without
77
+ bound.
78
+
79
+ ## Intentionally Removed From The Space
80
+
81
+ The Space intentionally does not keep the full main-repo surface area:
82
+
83
+ - `run_agent.py`, `run_server.py`, `run_frontend.py`
84
+ - OpenAI-compatible API server code under `api/`
85
+ - benchmark adapters and benchmark documentation under `benchmarks/`
86
+ - long-form tutorials under `docs/`
87
+ - local placeholder directories such as `workspace/`, `api_runs/`, and `traces/`
88
+ - CLI-only console formatting helpers
89
+ - test fixtures and local test suites
90
+ - `.env.example`
91
+
92
+ Removing these files keeps the deployed app small and avoids stale code or
93
+ misleading documentation drifting away from the main repository.
94
 
95
  ## Required Secrets
96
 
 
118
  | `RH_COLLECTION_DATASET_REPO` | `CoCoOne/ResearchHarness-Data` | Dataset repo that receives trajectory PRs. |
119
  | `RH_COLLECTION_BATCH_SIZE` | `5` | Create one dataset PR after this many collected runs. |
120
  | `RH_COLLECTION_MAX_BUNDLE_BYTES` | `20971520` | Drop a single run bundle if it exceeds this byte limit. |
 
121
  | `PORT` | `7860` | Port used by Hugging Face Docker Spaces. |
122
 
123
  ## Runtime Layout
 
151
  ```
152
 
153
  Then open `http://127.0.0.1:7860`.
154
+
155
+ Before pushing Space changes, run at least:
156
+
157
+ ```bash
158
+ python3 -B - <<'PY'
159
+ from pathlib import Path
160
+ import py_compile
161
+
162
+ for path in Path(".").rglob("*.py"):
163
+     if ".git" not in path.parts:
164
+         py_compile.compile(str(path), doraise=True)
165
+ print("syntax ok")
166
+ PY
167
+
168
+ RH_COLLECTION_ENABLED=false python3 -B - <<'PY'
169
+ from fastapi.testclient import TestClient
170
+ import app
171
+
172
+ client = TestClient(app.app)
173
+ response = client.get("/")
174
+ assert response.status_code == 200
175
+ assert "ResearchHarness" in response.text
176
+ print("app ok")
177
+ PY
178
+
179
+ node --check frontend/static/app.js
180
+ git diff --check
181
+ ```
agent_base/console_utils.py DELETED
@@ -1,223 +0,0 @@
1
- import argparse
2
- import json
3
- import os
4
- from pathlib import Path
5
- import shutil
6
- import sys
7
- import unicodedata
8
- from typing import Any, Optional
9
-
10
-
11
- ANSI_RESET = "\033[0m"
12
- ANSI_COLORS = {
13
- "header": "\033[36m",
14
- "assistant": "\033[32m",
15
- "tool": "\033[33m",
16
- "runtime": "\033[34m",
17
- "user": "\033[35m",
18
- "error": "\033[31m",
19
- }
20
-
21
-
22
- def _char_display_width(char: str) -> int:
23
- if unicodedata.combining(char):
24
- return 0
25
- if unicodedata.category(char) in {"Cc", "Cf"}:
26
- return 0
27
- return 2 if unicodedata.east_asian_width(char) in {"F", "W"} else 1
28
-
29
-
30
- def _display_width(text: str) -> int:
31
- return sum(_char_display_width(char) for char in str(text))
32
-
33
-
34
- def _truncate_display(text: str, width: int) -> str:
35
- if _display_width(text) <= width:
36
- return text
37
- suffix = "..."
38
- target = max(0, width - _display_width(suffix))
39
- out = []
40
- used = 0
41
- for char in text:
42
- char_width = _char_display_width(char)
43
- if used + char_width > target:
44
- break
45
- out.append(char)
46
- used += char_width
47
- return "".join(out) + suffix
48
-
49
-
50
- def _pad_display(text: str, width: int) -> str:
51
- return text + " " * max(0, width - _display_width(text))
52
-
53
-
54
- def _last_soft_break(chars: list[str]) -> int:
55
- for index in range(len(chars) - 1, 0, -1):
56
- if chars[index].isspace() and "".join(chars[:index]).strip():
57
- return index
58
- return -1
59
-
60
-
61
- class ConsoleEventPrinter:
62
- def __init__(self, *, model_name: str, workspace_root: Path, prompt: str):
63
- self.model_name = model_name
64
- self.workspace_root = workspace_root
65
- self.prompt = prompt.strip()
66
- self._printed_any = False
67
- self._use_color = (
68
- "NO_COLOR" not in os.environ
69
- and os.environ.get("TERM") != "dumb"
70
- and (sys.stdout.isatty() or bool(os.environ.get("FORCE_COLOR") or os.environ.get("CLICOLOR_FORCE")))
71
- )
72
-
73
- def print_header(self) -> None:
74
- self._print_box(
75
- "ResearchHarness CLI",
76
- f"Model: {self.model_name}\nWorkspace Root: {self.workspace_root}\n\nPrompt:\n{self.prompt}",
77
- "header",
78
- )
79
-
80
- def reset_rounds(self) -> None:
81
- self._printed_any = False
82
-
83
- def _paint(self, text: str, color_key: str) -> str:
84
- if not self._use_color:
85
- return text
86
- return f"{ANSI_COLORS.get(color_key, '')}{text}{ANSI_RESET}"
87
-
88
- def _terminal_width(self) -> int:
89
- return max(60, min(110, shutil.get_terminal_size((100, 20)).columns))
90
-
91
- def _wrap_line(self, line: str, width: int) -> list[str]:
92
- expanded = line.expandtabs(2)
93
- if expanded == "":
94
- return [""]
95
- chunks: list[str] = []
96
- current: list[str] = []
97
- current_width = 0
98
- for char in expanded:
99
- char_width = _char_display_width(char)
100
- if current and current_width + char_width > width:
101
- break_at = _last_soft_break(current)
102
- if break_at > 0:
103
- chunks.append("".join(current[:break_at]).rstrip())
104
- current = list("".join(current[break_at + 1 :]).lstrip())
105
- current_width = _display_width("".join(current))
106
- else:
107
- chunks.append("".join(current))
108
- current = []
109
- current_width = 0
110
- current.append(char)
111
- current_width += char_width
112
- if current:
113
- chunks.append("".join(current))
114
- return chunks or [""]
115
-
116
- def _print_box(self, title: str, body: str, color_key: str = "runtime") -> None:
117
- width = self._terminal_width()
118
- inner_width = width - 4
119
- title_text = f" {_truncate_display(title.strip(), width - 6)} "
120
- top = "+" + title_text + "-" * max(0, width - 2 - _display_width(title_text)) + "+"
121
- bottom = "+" + "-" * (width - 2) + "+"
122
- if self._printed_any:
123
- print()
124
- print(self._paint(top, color_key))
125
- for raw_line in str(body or "").splitlines() or [""]:
126
- for line in self._wrap_line(raw_line, inner_width):
127
- padded = _pad_display(line, inner_width)
128
- print(f"{self._paint('|', color_key)} {padded} {self._paint('|', color_key)}")
129
- print(self._paint(bottom, color_key))
130
- self._printed_any = True
131
-
132
- def _title(self, label: str, turn_index: int) -> str:
133
- return f"{label} | round {turn_index}" if turn_index > 0 else label
134
-
135
- def _format_tool_call(self, tool_name: str, tool_args: Any) -> str:
136
- try:
137
- tool_args_text = json.dumps(tool_args, ensure_ascii=False, indent=2)
138
- except TypeError:
139
- tool_args_text = str(tool_args)
140
- return f"- {tool_name}\n{tool_args_text}"
141
-
142
- def handle_event(self, row: dict[str, Any]) -> None:
143
- role = str(row.get("role", ""))
144
- turn_index = int(row.get("turn_index", 0) or 0)
145
- text = str(row.get("text", ""))
146
- capture_type = str(row.get("capture_type", ""))
147
- tool_names = row.get("tool_names") if isinstance(row.get("tool_names"), list) else []
148
- tool_arguments = row.get("tool_arguments") if isinstance(row.get("tool_arguments"), list) else []
149
- finish_reason = str(row.get("finish_reason", ""))
150
- error = str(row.get("error", ""))
151
-
152
- if capture_type and not text.strip():
153
- return
154
-
155
- if role == "system":
156
- return
157
-
158
- if role == "user":
159
- if turn_index == 0:
160
- return
161
- self._print_box(self._title("Runtime Message", turn_index), text, "user")
162
- return
163
-
164
- if role == "assistant":
165
- lines: list[str] = []
166
- if tool_names:
167
- if text.strip():
168
- lines.append(text)
169
- else:
170
- suffix = f" finish_reason={finish_reason}" if finish_reason else ""
171
- lines.append(f"(no text; native tool-calls only.{suffix})")
172
- lines.append("")
173
- lines.append("Assistant Tool Calls:")
174
- for idx, tool_name in enumerate(tool_names):
175
- tool_args = tool_arguments[idx] if idx < len(tool_arguments) else {}
176
- lines.append(self._format_tool_call(str(tool_name), tool_args))
177
- elif text.strip():
178
- lines.append(text)
179
- else:
180
- suffix = f" finish_reason={finish_reason}" if finish_reason else ""
181
- lines.append(f"(empty assistant output.{suffix})")
182
- if error:
183
- lines.append("")
184
- lines.append(f"Assistant Error: {error}")
185
- self._print_box(self._title("Assistant", turn_index), "\n".join(lines), "error" if error else "assistant")
186
- return
187
-
188
- if role == "tool":
189
- tool_name = str(tool_names[0]) if tool_names else "Tool"
190
- lines = [text]
191
- if error:
192
- lines.extend(["", f"{tool_name} Error: {error}"])
193
- self._print_box(self._title(f"{tool_name} Result", turn_index), "\n".join(lines), "error" if error else "tool")
194
- return
195
-
196
- if role == "runtime":
197
- lines = [text]
198
- if error:
199
- lines.extend(["", f"Runtime Error: {error}"])
200
- self._print_box(self._title("Runtime", turn_index), "\n".join(lines), "error" if error else "runtime")
201
-
202
-
203
- def main(argv: Optional[list[str]] = None) -> int:
204
- parser = argparse.ArgumentParser(description="Show a minimal example of the CLI console event formatter.")
205
- parser.parse_args(argv)
206
- printer = ConsoleEventPrinter(model_name="demo-model", workspace_root=Path("."), prompt="demo question")
207
- printer.print_header()
208
- printer.handle_event(
209
- {
210
- "role": "assistant",
211
- "turn_index": 1,
212
- "text": "",
213
- "tool_names": ["Read"],
214
- "tool_arguments": [{"path": "demo.txt"}],
215
- "termination": "",
216
- "error": "",
217
- }
218
- )
219
- return 0
220
-
221
-
222
- if __name__ == "__main__":
223
- raise SystemExit(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent_base/react_agent.py CHANGED
@@ -1,18 +1,15 @@
1
- import argparse
2
  from contextlib import contextmanager
3
  import json
4
  import os
5
  import re
6
  import signal
7
- import sys
8
  import threading
9
  from pathlib import Path
10
- from typing import Any, Callable, Dict, List, Optional, Sequence, Type
11
 
12
  from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
13
  import tiktoken
14
  from agent_base.base import BaseAgent
15
- from agent_base.console_utils import ConsoleEventPrinter
16
  from agent_base.context_compact import compact_messages, should_compact_messages
17
  from agent_base.model_profiles import resolve_model_profile
18
  from agent_base.provider_compat import apply_sampling_params
@@ -25,16 +22,8 @@ from agent_base.tools.tool_runtime import Bash, TerminalInterrupt, TerminalKill,
25
  from agent_base.tools.tool_user import AskUser
26
  from agent_base.tools.tool_web import ScholarSearch, WebFetch, WebSearch
27
  from agent_base.utils import (
28
- PROJECT_ROOT,
29
- MissingRequiredEnvError,
30
- append_saved_image_paths_to_prompt,
31
  env_flag,
32
- image_input_content_parts,
33
- load_dotenv,
34
- read_role_prompt_files,
35
- require_required_env,
36
  safe_jsonable,
37
- stage_image_file_for_input,
38
  )
39
 
40
  import datetime
@@ -75,6 +64,10 @@ DEFAULT_PRESENCE_PENALTY = 1.1
75
  DEFAULT_LLM_TIMEOUT_SECONDS = 600.0
76
 
77
 
 
 
 
 
78
  class LLMHardTimeoutError(TimeoutError):
79
  pass
80
 
@@ -551,10 +544,10 @@ def image_context_trace_text(result: Any) -> str:
551
  return text
552
 
553
 
554
- def default_llm_config() -> dict:
555
- model_name = os.environ.get("MODEL_NAME", DEFAULT_MODEL_NAME)
556
  return {
557
- "model": model_name,
558
  "api_key": os.environ.get("API_KEY", "EMPTY"),
559
  "api_base": os.environ.get("API_BASE"),
560
  "timeout_seconds": float(os.environ.get("LLM_TIMEOUT_SECONDS", str(DEFAULT_LLM_TIMEOUT_SECONDS))),
@@ -1195,6 +1188,7 @@ class MultiTurnReactAgent(BaseAgent):
1195
  tool_arguments,
1196
  workspace_root=resolved_workspace_root,
1197
  runtime_deadline=runtime_deadline,
 
1198
  )
1199
  except KeyboardInterrupt:
1200
  messages = messages[:tool_turn_message_start]
@@ -1312,142 +1306,3 @@ class MultiTurnReactAgent(BaseAgent):
1312
 
1313
  def custom_call_tool(self, tool_name: str, tool_args: Any, **kwargs):
1314
  return execute_tool_by_name(self.tool_map, tool_name, tool_args, **kwargs)
1315
-
1316
-
1317
- def _path_has_suffix(path: Path, suffix_parts: Sequence[str]) -> bool:
1318
- normalized_parts = tuple(part.casefold() for part in path.parts)
1319
- normalized_suffix = tuple(part.casefold() for part in suffix_parts)
1320
- if len(normalized_parts) < len(normalized_suffix):
1321
- return False
1322
- return normalized_parts[-len(normalized_suffix) :] == normalized_suffix
1323
-
1324
-
1325
- def resolve_agent_class_for_role_prompt_files(role_prompt_files: Sequence[str]) -> Type[MultiTurnReactAgent]:
1326
- for raw_path in role_prompt_files:
1327
- path_text = str(raw_path).strip()
1328
- if not path_text:
1329
- continue
1330
- path = Path(path_text).expanduser().resolve(strict=False)
1331
- if _path_has_suffix(path, ("benchmarks", "ResearchClawBench", "role_prompt.md")):
1332
- from benchmarks.ResearchClawBench.adapter import ResearchClawBenchAgent
1333
-
1334
- return ResearchClawBenchAgent
1335
- return MultiTurnReactAgent
1336
-
1337
-
1338
- def _parse_cli_args(argv: list[str]) -> tuple[str, Optional[str], Optional[str], str, list[str], list[str], Optional[bool]]:
1339
- parser = argparse.ArgumentParser(description="Run the local agent directly from agent_base.react_agent.")
1340
- parser.add_argument("prompt", nargs="*", help="Prompt text.")
1341
- parser.add_argument("--prompt-file", help="Optional UTF-8 text file containing the prompt.")
1342
- parser.add_argument("--trace-dir", help="Optional directory where the run trace JSONL should be created.")
1343
- parser.add_argument(
1344
- "--workspace-root",
1345
- help="Optional workspace root for local file tools, Bash, and TerminalStart.",
1346
- )
1347
- parser.add_argument(
1348
- "--role-prompt-file",
1349
- action="append",
1350
- default=[],
1351
- dest="role_prompt_files",
1352
- metavar="PATH",
1353
- help="Append one role-specific prompt file to the base system prompt. May be passed multiple times.",
1354
- )
1355
- parser.add_argument(
1356
- "--images",
1357
- action="append",
1358
- nargs="+",
1359
- default=[],
1360
- dest="image_paths",
1361
- metavar="PATH",
1362
- help="Attach one or more local image paths to the initial user message.",
1363
- )
1364
- parser.add_argument(
1365
- "--chat",
1366
- action=argparse.BooleanOptionalAction,
1367
- default=None,
1368
- help="Continue asking for follow-up user messages after each final answer. Defaults to on only in an interactive terminal.",
1369
- )
1370
- args = parser.parse_args(argv)
1371
-
1372
- prompt_text = ""
1373
- if args.prompt_file:
1374
- prompt_text = Path(args.prompt_file).read_text(encoding="utf-8").strip()
1375
- elif args.prompt:
1376
- prompt_text = " ".join(args.prompt).strip()
1377
-
1378
- if not prompt_text:
1379
- raise ValueError("A non-empty prompt is required via positional args or --prompt-file.")
1380
- role_prompt = read_role_prompt_files(args.role_prompt_files)
1381
- return (
1382
- prompt_text,
1383
- args.trace_dir,
1384
- args.workspace_root,
1385
- role_prompt,
1386
- list(args.role_prompt_files),
1387
- [path for group in args.image_paths for path in group],
1388
- args.chat,
1389
- )
1390
-
1391
-
1392
- def main(argv: Optional[list[str]] = None) -> int:
1393
- load_dotenv(PROJECT_ROOT / ".env")
1394
- try:
1395
- require_required_env("ResearchHarness agent")
1396
- prompt_text, trace_dir, workspace_root, role_prompt, role_prompt_files, image_paths, chat_arg = _parse_cli_args(argv or sys.argv[1:])
1397
- agent_cls = resolve_agent_class_for_role_prompt_files(role_prompt_files)
1398
- agent = agent_cls(
1399
- llm=default_llm_config(),
1400
- trace_dir=trace_dir,
1401
- role_prompt=role_prompt or None,
1402
- )
1403
- resolved_workspace_root = normalize_workspace_root(workspace_root)
1404
- initial_content_parts: list[dict[str, Any]] = []
1405
- saved_image_paths: list[str] = []
1406
- for image_index, image_path in enumerate(image_paths):
1407
- saved_path, data_url = stage_image_file_for_input(
1408
- image_path,
1409
- workspace_root=resolved_workspace_root,
1410
- image_index=image_index,
1411
- )
1412
- saved_image_paths.append(saved_path)
1413
- initial_content_parts.extend(image_input_content_parts(data_url, saved_path))
1414
- run_prompt = append_saved_image_paths_to_prompt(prompt_text, saved_image_paths)
1415
- printer = ConsoleEventPrinter(
1416
- model_name=agent.model,
1417
- workspace_root=resolved_workspace_root,
1418
- prompt=run_prompt,
1419
- )
1420
- printer.print_header()
1421
- session = agent._run_session(
1422
- run_prompt,
1423
- workspace_root=str(resolved_workspace_root),
1424
- event_callback=printer.handle_event,
1425
- initial_content_parts=initial_content_parts or None,
1426
- )
1427
- chat_enabled = chat_arg if chat_arg is not None else (sys.stdin.isatty() and sys.stdout.isatty())
1428
- messages = session.get("messages", [])
1429
- while chat_enabled:
1430
- try:
1431
- followup = input("\n[ResearchHarness] Follow-up (Ctrl+C to exit): ").strip()
1432
- except (KeyboardInterrupt, EOFError):
1433
- print("\n[ResearchHarness] Chat ended.")
1434
- break
1435
- if not followup:
1436
- continue
1437
- print(f"\n[ResearchHarness] Continuing conversation: {followup}")
1438
- printer.reset_rounds()
1439
- session = agent._run_session(
1440
- followup,
1441
- workspace_root=str(resolved_workspace_root),
1442
- event_callback=printer.handle_event,
1443
- prior_messages=messages,
1444
- )
1445
- messages = session.get("messages", messages)
1446
- return 0
1447
- except (MissingRequiredEnvError, ValueError) as exc:
1448
- print(str(exc), file=sys.stderr)
1449
- return 1
1450
-
1451
-
1452
- if __name__ == "__main__":
1453
- raise SystemExit(main())
 
 
1
  from contextlib import contextmanager
2
  import json
3
  import os
4
  import re
5
  import signal
 
6
  import threading
7
  from pathlib import Path
8
+ from typing import Any, Callable, Dict, List, Optional, Sequence
9
 
10
  from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
11
  import tiktoken
12
  from agent_base.base import BaseAgent
 
13
  from agent_base.context_compact import compact_messages, should_compact_messages
14
  from agent_base.model_profiles import resolve_model_profile
15
  from agent_base.provider_compat import apply_sampling_params
 
22
  from agent_base.tools.tool_user import AskUser
23
  from agent_base.tools.tool_web import ScholarSearch, WebFetch, WebSearch
24
  from agent_base.utils import (
 
 
 
25
  env_flag,
 
 
 
 
26
  safe_jsonable,
 
27
  )
28
 
29
  import datetime
 
64
  DEFAULT_LLM_TIMEOUT_SECONDS = 600.0
65
 
66
 
67
+ def default_model_name() -> str:
68
+ return os.environ.get("MODEL_NAME", DEFAULT_MODEL_NAME).strip() or DEFAULT_MODEL_NAME
69
+
70
+
71
  class LLMHardTimeoutError(TimeoutError):
72
  pass
73
 
 
544
  return text
545
 
546
 
547
+ def default_llm_config(model_name: Optional[str] = None) -> dict:
548
+ selected_model = str(model_name or "").strip() or default_model_name()
549
  return {
550
+ "model": selected_model,
551
  "api_key": os.environ.get("API_KEY", "EMPTY"),
552
  "api_base": os.environ.get("API_BASE"),
553
  "timeout_seconds": float(os.environ.get("LLM_TIMEOUT_SECONDS", str(DEFAULT_LLM_TIMEOUT_SECONDS))),
 
1188
  tool_arguments,
1189
  workspace_root=resolved_workspace_root,
1190
  runtime_deadline=runtime_deadline,
1191
+ model_name=self.model,
1192
  )
1193
  except KeyboardInterrupt:
1194
  messages = messages[:tool_turn_message_start]
 
1306
 
1307
  def custom_call_tool(self, tool_name: str, tool_args: Any, **kwargs):
1308
  return execute_tool_by_name(self.tool_map, tool_name, tool_args, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent_base/tools/README.md DELETED
@@ -1,457 +0,0 @@
1
- # Tools
2
-
3
- This document describes the tool surface exposed to the model. Tool names use PascalCase consistently.
4
-
5
- The current implementation is grouped by category:
6
-
7
- - `agent_base/tools/tool_file.py`
8
- - `agent_base/tools/tool_runtime.py`
9
- - `agent_base/tools/tool_user.py`
10
- - `agent_base/tools/tool_web.py`
11
-
12
- ## Overview
13
-
14
- The current tool set is:
15
-
16
- - `Glob`
17
- - `Grep`
18
- - `Read`
19
- - `ReadPDF`
20
- - `ReadImage`
21
- - `Write`
22
- - `Edit`
23
- - `Bash`
24
- - `WebSearch`
25
- - `ScholarSearch`
26
- - `WebFetch`
27
- - `AskUser`
28
- - `TerminalStart`
29
- - `TerminalWrite`
30
- - `TerminalRead`
31
- - `TerminalInterrupt`
32
- - `TerminalKill`
33
-
34
- ## Tool Matrix
35
-
36
- | Tool | Category | Arguments | Description | Return Shape / Notes |
37
- | --- | --- | --- | --- | --- |
38
- | `Glob` | Local files | `pattern`, `path?`, `include_dirs?`, `max_results?` | Discover files or directories by pathname pattern inside the workspace. | Returns `root`, `match_count`, `truncated`, and `results`. Best for pathname discovery rather than reading content. |
39
- | `Grep` | Local files | `pattern`, `path?`, `glob?`, `case_sensitive?`, `max_results?`, `max_chars?` | Search local text files by content and return matching lines. | Returns search metadata plus matched file paths, line numbers, and line text. Skips obvious binary files, images, and PDFs. |
40
- | `Read` | Local files | `path`, `start_line?`, `end_line?`, `max_chars?` | Read a local text file, optionally by line range. | Returns normalized path, line metadata, truncation status, and `content`. Redirects PDF/image tasks toward `ReadPDF` or `ReadImage`. |
41
- | `ReadPDF` | Local files | `path`, `max_chars?`, `max_image_paths?` | Read a local PDF, extract text, and expose extracted image paths when available. | Returns text content plus `image_paths` and image-count metadata. Depends on [`structai`](https://github.com/black-yt/structai) and `MINERU_TOKEN`. |
42
- | `ReadImage` | Local files | `path` | Read a local image and expose image metadata for runtime multimodal use. | Returns image metadata only. During agent runs, the runtime sends a compressed attachment to the LLM API as an `image_url` content part. |
43
- | `Write` | Local files | `path`, `content`, `overwrite?` | Create a text file or overwrite one when explicitly allowed. | Creates parent directories automatically. Returns an error if the file exists and `overwrite=false`. |
44
- | `Edit` | Local files | `path`, `patch` | Apply a targeted patch to a local text file. | Expects unified-diff / hunk-style input. Context-based matching, not a full `patch(1)` implementation. |
45
- | `Bash` | Runtime | `command`, `timeout?`, `workdir?` | Run one-shot shell commands for deterministic local execution, parsing, and validation. | Returns `stdout` and `stderr`. Primary local execution tool for short Python scripts, `rg`, `find`, `git`, and structured local processing. |
46
- | `WebSearch` | Web | `query` | Perform general web search over one or more complementary queries. | Returns a text summary headed by `## Web Results` with title, link, snippet, and date/source when available. Uses Serper. |
47
- | `ScholarSearch` | Web | `query` | Search academic results such as papers, year, abstract, and citations. | Returns a text summary headed by `## Scholar Results` with title, PDF link, publication info, year, citation count, and abstract. Uses Serper Scholar. |
48
- | `WebFetch` | Web | `url`, `goal` | Fetch a page, extract evidence relevant to a concrete goal, and summarize it. | Uses Jina Reader plus the configured summary model. Returns evidence-focused text rather than raw HTML. |
49
- | `AskUser` | Human interaction | `question`, `context?` | Ask the human user one concise clarification question when essential information cannot be determined from tools or existing instructions. | Writes the question to the interactive terminal and returns the user's answer. If no interactive terminal is available, returns an explicit unavailable message. |
50
- | `TerminalStart` | Runtime | `cwd?`, `shell?`, `rows?`, `cols?` | Start a persistent terminal session. | Returns session metadata such as `session_id`, `pid`, `cwd`, `shell`, `alive`, and `returncode`. |
51
- | `TerminalWrite` | Runtime | `session_id`, `input`, `append_newline?`, `yield_time_ms?`, `max_output_chars?` | Send input to a persistent terminal session and read incremental output. | Best for stateful shells, REPLs, and long-running foreground processes. |
52
- | `TerminalRead` | Runtime | `session_id`, `yield_time_ms?`, `max_output_chars?` | Read unread output from an existing persistent terminal session. | Useful when a process is still running and output arrives over time. |
53
- | `TerminalInterrupt` | Runtime | `session_id`, `max_output_chars?` | Send `Ctrl-C` to the foreground process in a terminal session without destroying the session. | Use when a long-running process must be interrupted but the shell should remain alive. |
54
- | `TerminalKill` | Runtime | `session_id`, `force?` | Terminate a persistent terminal session and release resources. | Final cleanup step for terminal sessions that are no longer needed. |
55
-
56
- ## Glob
57
-
58
- Purpose:
59
-
60
- - Discover local files or directories by glob pattern.
61
- - Good for pathname discovery, not for reading file contents.
62
-
63
- Arguments:
64
-
65
- - `pattern`: string, a `pathlib`-style glob such as `**/*.py`
66
- - `path`: optional string, search root, defaults to the current workspace
67
- - `include_dirs`: optional boolean, defaults to `false`
68
- - `max_results`: optional integer, defaults to `200`
69
-
70
- Returns:
71
-
72
- - `root`
73
- - `pattern`
74
- - `include_dirs`
75
- - `match_count`
76
- - `truncated`
77
- - `results`
78
-
79
- ## Grep
80
-
81
- Purpose:
82
-
83
- - Search local text files by content.
84
- - Return matched file paths, line numbers, and line text.
85
-
86
- Arguments:
87
-
88
- - `pattern`: string, regular expression
89
- - `path`: optional string, file or directory path, defaults to the current workspace
90
- - `glob`: optional string, file filter when scanning a directory, defaults to `**/*`
91
- - `case_sensitive`: optional boolean, defaults to `false`
92
- - `max_results`: optional integer, defaults to `100`
93
- - `max_chars`: optional integer, defaults to `20000`
94
-
95
- Behavior:
96
-
97
- - If `path` is a file, only that file is searched.
98
- - If `path` is a directory, matching text files are searched recursively.
99
- - Images, PDFs, and obviously binary files are skipped.
100
-
101
- Returns:
102
-
103
- - `root`
104
- - `pattern`
105
- - `glob`
106
- - `case_sensitive`
107
- - `files_scanned`
108
- - `match_count`
109
- - `truncated`
110
- - `results`
111
-
112
- ## Read
113
-
114
- Purpose:
115
-
116
- - Read a local text file.
117
- - Support partial line ranges.
118
- - Support long-text truncation.
119
-
120
- Arguments:
121
-
122
- - `path`: string, file path
123
- - `start_line`: optional integer, 1-based start line
124
- - `end_line`: optional integer, 1-based end line
125
- - `max_chars`: optional integer, maximum returned characters, defaults to `20000`
126
-
127
- Behavior:
128
-
129
- - Only text files are handled directly.
130
- - If the input is a PDF, the tool tells the model to use `ReadPDF`.
131
- - If the input is an image, the tool tells the model to use `ReadImage`.
132
-
133
- Returns:
134
-
135
- - `path`
136
- - `source_type: text`
137
- - `start_line`
138
- - `end_line`
139
- - `total_lines`
140
- - `truncated`
141
- - `content`
142
-
143
- ## ReadPDF
144
-
145
- Purpose:
146
-
147
- - Read a local PDF.
148
- - Return extracted text.
149
- - Return extracted local image paths when the PDF parser produces image assets.
150
-
151
- Arguments:
152
-
153
- - `path`: string, PDF path
154
- - `max_chars`: optional integer, maximum returned characters, defaults to `20000`
155
- - `max_image_paths`: optional integer, maximum listed extracted image paths, defaults to `20`
156
-
157
- Behavior:
158
-
159
- - Calls `structai.read_pdf(...)` from [`structai`](https://github.com/black-yt/structai) underneath.
160
- - Uses the returned `text` and `img_paths`.
161
- - Depends on `MINERU_TOKEN`.
162
- - If [`structai`](https://github.com/black-yt/structai) is missing, returns a clear dependency error instead of breaking unrelated file tools.
163
- - For PDF figure tasks, prefer `ReadPDF` first to discover extracted text and extracted image paths, then use `ReadImage` on the actual extracted image file.
164
-
165
- Returns:
166
-
167
- - `path`
168
- - `source_type: pdf`
169
- - `total_lines`
170
- - `truncated`
171
- - `image_count`
172
- - `image_paths_listed`
173
- - `image_paths_truncated`
174
- - `image_paths`
175
- - `content`
176
-
177
- ## ReadImage
178
-
179
- Purpose:
180
-
181
- - Read a local image.
182
- - Return image metadata.
183
- - During a main agent run, pass a compressed image to the LLM API as an `image_url` content part instead of stuffing raw base64 text into ordinary message text.
184
-
185
- Arguments:
186
-
187
- - `path`: string, image path
188
-
189
- Behavior:
190
-
191
- - Uses `PIL.Image.open(...)` underneath.
192
- - The runtime creates a compressed JPEG attachment for the LLM request and sends it as an inline `data:` URL in an `image_url` content part.
193
- - Trace records and direct tool output keep image metadata only, not the full binary payload.
194
-
195
- Returns:
196
-
197
- - `path`
198
- - `source_type`
199
- - `format`
200
- - `mime_type`
201
- - `mode`
202
- - `width`
203
- - `height`
204
- - `byte_count`
205
- - `llm_attachment_format`
206
- - `llm_attachment_width`
207
- - `llm_attachment_height`
208
- - `llm_attachment_byte_count`
209
-
210
- ## Write
211
-
212
- Purpose:
213
-
214
- - Create a text file.
215
- - Overwrite an existing file when explicitly requested.
216
-
217
- Arguments:
218
-
219
- - `path`: string, destination file path
220
- - `content`: string, complete file content
221
- - `overwrite`: optional boolean, defaults to `false`
222
-
223
- Behavior:
224
-
225
- - Parent directories are created automatically.
226
- - If `overwrite=false` and the file already exists, the tool returns an error.
227
-
228
- ## Edit
229
-
230
- Purpose:
231
-
232
- - Apply partial edits to a local text file.
233
- - Best for targeted patches, not full-file rewrites.
234
-
235
- Arguments:
236
-
237
- - `path`: string, destination file path
238
- - `patch`: string, unified-diff / hunk-style patch
239
-
240
- Behavior:
241
-
242
- - Requires explicit hunks such as `@@ -1,2 +1,2 @@`.
243
- - The current implementation matches by surrounding context blocks rather than implementing full `patch(1)` line-number semantics.
244
-
245
- Returns:
246
-
247
- - updated file path on success
248
- - applied hunk count
249
-
250
- ## Bash
251
-
252
- Purpose:
253
-
254
- - Execute one-shot shell commands.
255
- - Handle paths, search, git, conda, and local script orchestration.
256
- - Serve as the primary local execution tool for ad hoc Python scripts, deterministic computation, validation, formatting, and parsing.
257
-
258
- Arguments:
259
-
260
- - `command`: string, shell command to execute
261
- - `timeout`: optional integer, seconds, defaults to `30`
262
- - `workdir`: optional string, working directory
263
-
264
- Behavior:
265
-
266
- - Uses local `bash`.
267
- - Returns both `stdout` and `stderr`.
268
- - Timeout produces an explicit error.
269
- - Short scripts are well suited to a heredoc such as `python3 - <<'PY'`.
270
-
271
- Recommended use cases:
272
-
273
- - pathname and file discovery
274
- - `rg`, `find`, `git`
275
- - local Python or other CLI programs
276
- - deterministic CSV / JSON / text processing
277
- - local computation and validation against absolute paths returned by file tools
278
-
279
- ## WebSearch
280
-
281
- Purpose:
282
-
283
- - General web search.
284
- - Supports passing multiple complementary queries in one call.
285
-
286
- Arguments:
287
-
288
- - `query`: array of strings, at least one query
289
-
290
- Behavior:
291
-
292
- - Calls Serper's Google Search endpoint.
293
- - Reads `SERPER_KEY` at runtime.
294
-
295
- Returns:
296
-
297
- - query summary text
298
- - `## Web Results`
299
- - title, link, snippet, and date/source when available
300
-
301
- ## ScholarSearch
302
-
303
- Purpose:
304
-
305
- - Academic search.
306
- - Return paper title, year, abstract, citation count, and related metadata.
307
-
308
- Arguments:
309
-
310
- - `query`: array of strings, at least one query
311
-
312
- Behavior:
313
-
314
- - Calls Serper's Google Scholar endpoint.
315
- - Reads `SERPER_KEY` at runtime.
316
-
317
- Returns:
318
-
319
- - query summary text
320
- - `## Scholar Results`
321
- - title, PDF link, `publicationInfo`, year, citation count, and abstract
322
-
323
- ## WebFetch
324
-
325
- Purpose:
326
-
327
- - Visit a webpage.
328
- - Extract evidence relevant to a concrete goal.
329
- - Produce a goal-oriented summary.
330
-
331
- Arguments:
332
-
333
- - `url`: string or array of strings, page URL or URLs
334
- - `goal`: string, the specific goal to extract from the page
335
-
336
- Behavior:
337
-
338
- - Fetches page text through Jina Reader first.
339
- - Then calls the configured summary-model endpoint for evidence extraction and summarization.
340
- - Returns a fetch-and-extract result, not raw HTML.
341
-
342
- Dependencies:
343
-
344
- - `JINA_KEY`
345
- - `API_KEY`
346
- - `API_BASE`
347
- - `MODEL_NAME`
348
-
349
- Returns:
350
-
351
- - `The useful information in ...`
352
- - `Evidence in page:`
353
- - `Summary:`
354
-
355
- ## TerminalStart
356
-
357
- Purpose:
358
-
359
- - Start a persistent terminal session.
360
-
361
- Arguments:
362
-
363
- - `cwd`: optional string, working directory
364
- - `shell`: optional string, shell path
365
- - `rows`: optional integer, terminal rows, defaults to `30`
366
- - `cols`: optional integer, terminal columns, defaults to `120`
367
-
368
- Returns:
369
-
370
- - `session_id`
371
- - `pid`
372
- - `cwd`
373
- - `shell`
374
- - `alive`
375
- - `returncode`
376
-
377
- ## TerminalWrite
378
-
379
- Purpose:
380
-
381
- - Send input to an existing terminal session and read output.
382
-
383
- Arguments:
384
-
385
- - `session_id`: string, session id
386
- - `input`: string, text to send
387
- - `append_newline`: optional boolean, defaults to `true`
388
- - `yield_time_ms`: optional integer, defaults to `200`
389
- - `max_output_chars`: optional integer, defaults to `20000`
390
-
391
- ## TerminalRead
392
-
393
- Purpose:
394
-
395
- - Read unread output from an existing terminal session.
396
-
397
- Arguments:
398
-
399
- - `session_id`: string, session id
400
- - `yield_time_ms`: optional integer, defaults to `200`
401
- - `max_output_chars`: optional integer, defaults to `20000`
402
-
403
- ## TerminalInterrupt
404
-
405
- Purpose:
406
-
407
- - Send `Ctrl-C` to the foreground process in a terminal session.
408
- - Keep the session alive.
409
-
410
- Arguments:
411
-
412
- - `session_id`: string, session id
413
- - `max_output_chars`: optional integer, defaults to `20000`
414
-
415
- ## TerminalKill
416
-
417
- Purpose:
418
-
419
- - Terminate a terminal session.
420
- - Release related resources.
421
-
422
- Arguments:
423
-
424
- - `session_id`: string, session id
425
- - `force`: optional boolean, defaults to `false`
426
-
427
- ## AskUser
428
-
429
- Purpose:
430
-
431
- - Ask the human user for essential missing information, preference, or approval.
432
- - Use only when the answer cannot be determined from workspace files, available tools, or existing instructions.
433
-
434
- Arguments:
435
-
436
- - `question`: string, concise question to ask.
437
- - `context`: optional string, brief explanation of why the question is necessary.
438
-
439
- Behavior:
440
-
441
- - Writes the question to the interactive terminal and waits for one user answer.
442
- - Returns an explicit unavailable message instead of blocking when no interactive terminal exists.
443
- - Not available in ResearchClawBench runs.
444
-
445
- ## Suggested Usage
446
-
447
- - Use `Glob` first for pathname discovery.
448
- - Use `Grep` first for local text search.
449
- - Use `Read` for local text files.
450
- - Use `ReadPDF` for local PDFs.
451
- - Use `ReadImage` for local images.
452
- - Use `Edit` for targeted file changes.
453
- - Use `Write` for full-file writes.
454
- - Use `Bash` for one-shot system commands.
455
- - Use `AskUser` only when a human answer is genuinely necessary.
456
- - Use `Terminal*` only when persistent interactive shell state is actually needed.
457
- - Route pure Python analysis through `Bash` rather than introducing a separate Python tool.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent_base/tools/tool_web.py CHANGED
@@ -373,11 +373,12 @@ class WebFetch(ToolBase):
373
  except ValueError as exc:
374
  return f"[WebFetch] {exc}"
375
  runtime_deadline = kwargs.get("runtime_deadline")
 
376
 
377
  start_time = time.time()
378
 
379
  if isinstance(url, str):
380
- response = self.readpage_jina(url, goal, runtime_deadline=runtime_deadline)
381
  elif isinstance(url, list):
382
  response = []
383
  start_time = time.time()
@@ -396,7 +397,12 @@ class WebFetch(ToolBase):
396
  cur_response += "Evidence in page: \n" + "The provided webpage content could not be accessed. Please check the URL or file format." + "\n\n"
397
  cur_response += "Summary: \n" + "The webpage content could not be processed, and therefore, no information is available." + "\n\n"
398
  else:
399
- cur_response = self.readpage_jina(one_url, goal, runtime_deadline=runtime_deadline)
 
 
 
 
 
400
  response.append(cur_response)
401
  response = "\n=======\n".join(response)
402
  else:
@@ -406,11 +412,18 @@ class WebFetch(ToolBase):
406
  print(f"Summary Length {len(response)}")
407
  return response.strip()
408
 
409
- def call_server(self, msgs, max_retries=2, runtime_deadline: Optional[float] = None):
 
 
 
 
 
 
410
  client = self._ensure_summary_client()
411
  if client is None or not self._summary_api_base:
412
  return "[WebFetch] Summary model error: API_BASE is not set."
413
- if not self._summary_model_name:
 
414
  return "[WebFetch] Summary model error: MODEL_NAME is not set."
415
  last_error = "unknown summary-model error"
416
  for attempt in range(max_retries):
@@ -424,12 +437,12 @@ class WebFetch(ToolBase):
424
  else client
425
  )
426
  request_kwargs = {
427
- "model": self._summary_model_name,
428
  "messages": msgs,
429
  }
430
  apply_sampling_params(
431
  request_kwargs,
432
- model_name=self._summary_model_name,
433
  temperature=self._summary_temperature,
434
  top_p=self._summary_top_p,
435
  presence_penalty=self._summary_presence_penalty,
@@ -494,8 +507,21 @@ class WebFetch(ToolBase):
494
  return content
495
  return "[WebFetch] Failed to read page: exhausted retries"
496
 
497
- def readpage_jina(self, url: str, goal: str, runtime_deadline: Optional[float] = None) -> str:
498
- summary_page_func = self.call_server
 
 
 
 
 
 
 
 
 
 
 
 
 
499
  max_retries = int(os.getenv("LLM_MAX_RETRIES", str(DEFAULT_LLM_MAX_RETRIES)))
500
 
501
  content = self.html_readpage_jina(url, runtime_deadline=runtime_deadline)
 
373
  except ValueError as exc:
374
  return f"[WebFetch] {exc}"
375
  runtime_deadline = kwargs.get("runtime_deadline")
376
+ summary_model_name = str(kwargs.get("model_name") or "").strip()
377
 
378
  start_time = time.time()
379
 
380
  if isinstance(url, str):
381
+ response = self.readpage_jina(url, goal, runtime_deadline=runtime_deadline, summary_model_name=summary_model_name)
382
  elif isinstance(url, list):
383
  response = []
384
  start_time = time.time()
 
397
  cur_response += "Evidence in page: \n" + "The provided webpage content could not be accessed. Please check the URL or file format." + "\n\n"
398
  cur_response += "Summary: \n" + "The webpage content could not be processed, and therefore, no information is available." + "\n\n"
399
  else:
400
+ cur_response = self.readpage_jina(
401
+ one_url,
402
+ goal,
403
+ runtime_deadline=runtime_deadline,
404
+ summary_model_name=summary_model_name,
405
+ )
406
  response.append(cur_response)
407
  response = "\n=======\n".join(response)
408
  else:
 
412
  print(f"Summary Length {len(response)}")
413
  return response.strip()
414
 
415
+ def call_server(
416
+ self,
417
+ msgs,
418
+ max_retries=2,
419
+ runtime_deadline: Optional[float] = None,
420
+ model_name: str = "",
421
+ ):
422
  client = self._ensure_summary_client()
423
  if client is None or not self._summary_api_base:
424
  return "[WebFetch] Summary model error: API_BASE is not set."
425
+ summary_model_name = str(model_name or self._summary_model_name or os.environ.get("MODEL_NAME", "")).strip()
426
+ if not summary_model_name:
427
  return "[WebFetch] Summary model error: MODEL_NAME is not set."
428
  last_error = "unknown summary-model error"
429
  for attempt in range(max_retries):
 
437
  else client
438
  )
439
  request_kwargs = {
440
+ "model": summary_model_name,
441
  "messages": msgs,
442
  }
443
  apply_sampling_params(
444
  request_kwargs,
445
+ model_name=summary_model_name,
446
  temperature=self._summary_temperature,
447
  top_p=self._summary_top_p,
448
  presence_penalty=self._summary_presence_penalty,
 
507
  return content
508
  return "[WebFetch] Failed to read page: exhausted retries"
509
 
510
+ def readpage_jina(
511
+ self,
512
+ url: str,
513
+ goal: str,
514
+ runtime_deadline: Optional[float] = None,
515
+ summary_model_name: str = "",
516
+ ) -> str:
517
+ def summary_page_func(messages, max_retries=2, runtime_deadline: Optional[float] = None):
518
+ return self.call_server(
519
+ messages,
520
+ max_retries=max_retries,
521
+ runtime_deadline=runtime_deadline,
522
+ model_name=summary_model_name,
523
+ )
524
+
525
  max_retries = int(os.getenv("LLM_MAX_RETRIES", str(DEFAULT_LLM_MAX_RETRIES)))
526
 
527
  content = self.html_readpage_jina(url, runtime_deadline=runtime_deadline)
agent_base/utils.py CHANGED
@@ -87,21 +87,6 @@ def require_required_env(context: str = "ResearchHarness") -> None:
87
  )
88
 
89
 
90
- def read_role_prompt_files(paths: Iterable[str]) -> str:
91
- blocks: list[str] = []
92
- for raw_path in paths:
93
- path_text = str(raw_path).strip()
94
- if not path_text:
95
- continue
96
- path = Path(path_text).expanduser()
97
- if not path.exists():
98
- raise ValueError(f"Role prompt file does not exist: {path}")
99
- if not path.is_file():
100
- raise ValueError(f"Role prompt path is not a file: {path}")
101
- blocks.append(path.read_text(encoding="utf-8").strip())
102
- return "\n\n".join(block for block in blocks if block.strip())
103
-
104
-
105
  def _safe_image_stem(name: str, fallback: str) -> str:
106
  stem = re.sub(r"[^A-Za-z0-9_.-]+", "_", Path(name).stem).strip("._")
107
  return stem or fallback
 
87
  )
88
 
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def _safe_image_stem(name: str, fallback: str) -> str:
91
  stem = re.sub(r"[^A-Za-z0-9_.-]+", "_", Path(name).stem).strip("._")
92
  return stem or fallback
api/__init__.py DELETED
@@ -1 +0,0 @@
1
- """OpenAI-compatible API helpers for ResearchHarness."""
 
 
api/openai_server.py DELETED
@@ -1,518 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import base64
4
- import binascii
5
- import datetime
6
- import json
7
- import re
8
- import time
9
- from dataclasses import dataclass
10
- from pathlib import Path
11
- from typing import Any, Optional
12
- from uuid import uuid4
13
-
14
- import uvicorn
15
- from fastapi import Body, FastAPI, Request
16
- from fastapi.responses import JSONResponse
17
-
18
- from agent_base.react_agent import (
19
- AVAILABLE_TOOL_MAP,
20
- MultiTurnReactAgent,
21
- assistant_text_content,
22
- default_llm_config,
23
- model_supports_runtime_image_parts,
24
- )
25
- from agent_base.tools.tooling import normalize_workspace_root
26
- from agent_base.utils import append_jsonl, image_input_content_parts, read_role_prompt_files, safe_jsonable
27
-
28
-
29
- DATA_IMAGE_RE = re.compile(r"^data:(image/[A-Za-z0-9.+-]+);base64,(.*)$", re.DOTALL)
30
- IMAGE_EXTENSIONS = {
31
- "image/png": ".png",
32
- "image/jpeg": ".jpg",
33
- "image/jpg": ".jpg",
34
- "image/webp": ".webp",
35
- "image/gif": ".gif",
36
- }
37
- DEFAULT_MAX_IMAGE_BYTES = 25 * 1024 * 1024
38
-
39
- INPUT_WRAPPER_SYSTEM_PROMPT = """You are the ResearchHarness input wrapper.
40
-
41
- Convert the user's OpenAI-compatible chat request into a stable task for a
42
- tool-using ResearchHarness agent.
43
-
44
- Return only a JSON object with these string fields:
45
- - agent_instruction: the task the agent should solve, including all substantive question details.
46
- - output_contract: the final output format or schema requested by the user. If no strict format is requested, say "plain text".
47
- - wrapper_notes: brief notes about images, constraints, or benchmark-specific requirements.
48
-
49
- Rules:
50
- - Do not answer the task.
51
- - Do not remove substantive constraints.
52
- - Keep strict final formatting requirements out of agent_instruction when possible.
53
- - If images are listed, mention their saved paths in agent_instruction.
54
- """
55
-
56
- OUTPUT_WRAPPER_SYSTEM_PROMPT = """You are the ResearchHarness output wrapper.
57
-
58
- Format the ResearchHarness agent result so it satisfies the user's requested
59
- final output contract.
60
-
61
- Rules:
62
- - Return only the final answer requested by the user.
63
- - Do not add markdown fences unless the user explicitly required them.
64
- - Do not solve the task again.
65
- - Do not introduce facts not present in the agent result.
66
- - Make the answer complete and self-contained for a remote user or evaluator.
67
- - The answer may mention workspace files when useful, but it must not depend on
68
- local files as the only carrier of the answer.
69
- - Include the actual answer and any necessary evidence or solution steps in the
70
- returned text.
71
- - If reasoning or evidence is required, summarize it directly in the final
72
- answer according to the requested format.
73
- - If the requested format is JSON, return valid JSON only.
74
- - If the agent result does not contain enough information, produce the best
75
- contract-compliant failure answer instead of inventing evidence.
76
- """
77
-
78
-
79
- class OpenAICompatError(Exception):
80
- def __init__(self, status_code: int, message: str, error_type: str = "invalid_request_error"):
81
- super().__init__(message)
82
- self.status_code = status_code
83
- self.message = message
84
- self.error_type = error_type
85
-
86
-
87
- @dataclass
88
- class ServerConfig:
89
- api_runs_dir: Path
90
- role_prompt: str = ""
91
- host: str = "127.0.0.1"
92
- port: int = 8686
93
- input_wrapper: bool = True
94
- output_wrapper: bool = True
95
-
96
-
97
- @dataclass
98
- class PreparedInput:
99
- wrapper_messages: list[dict[str, str]]
100
- initial_content_parts: list[dict[str, Any]]
101
- image_paths: list[str]
102
-
103
-
104
- def openai_error_response(exc: OpenAICompatError) -> JSONResponse:
105
- return JSONResponse(
106
- status_code=exc.status_code,
107
- content={"error": {"message": exc.message, "type": exc.error_type}},
108
- )
109
-
110
-
111
- def make_chat_completion_response(*, request_id: str, model: str, content: str) -> dict[str, Any]:
112
- return {
113
- "id": request_id,
114
- "object": "chat.completion",
115
- "created": int(time.time()),
116
- "model": model,
117
- "choices": [
118
- {
119
- "index": 0,
120
- "message": {"role": "assistant", "content": content},
121
- "finish_reason": "stop",
122
- }
123
- ],
124
- }
125
-
126
-
127
- def validate_chat_payload(payload: Any) -> dict[str, Any]:
128
- if not isinstance(payload, dict):
129
- raise OpenAICompatError(400, "Request body must be a JSON object.")
130
- if payload.get("stream") is True:
131
- raise OpenAICompatError(400, "Streaming is not supported by this synchronous endpoint.")
132
- try:
133
- n_value = int(payload.get("n", 1) or 1)
134
- except (TypeError, ValueError) as exc:
135
- raise OpenAICompatError(400, "n must be an integer.") from exc
136
- if n_value != 1:
137
- raise OpenAICompatError(400, "Only n=1 is supported.")
138
- model = str(payload.get("model", "")).strip()
139
- if not model:
140
- raise OpenAICompatError(400, "model is required.")
141
- messages = payload.get("messages")
142
- if not isinstance(messages, list) or not messages:
143
- raise OpenAICompatError(400, "messages must be a non-empty list.")
144
- return payload
145
-
146
-
147
- def prepare_openai_input(messages: list[Any], workspace_root: Path) -> PreparedInput:
148
- wrapper_messages: list[dict[str, str]] = []
149
- initial_content_parts: list[dict[str, Any]] = []
150
- image_paths: list[str] = []
151
- image_dir = workspace_root / "inputs" / "images"
152
- image_index = 0
153
-
154
- for message in messages:
155
- if not isinstance(message, dict):
156
- raise OpenAICompatError(400, "Each message must be an object.")
157
- role = str(message.get("role", "")).strip()
158
- if role not in {"system", "user", "assistant"}:
159
- raise OpenAICompatError(400, f"Unsupported message role: {role!r}.")
160
- content = message.get("content", "")
161
- text_parts: list[str] = []
162
- if isinstance(content, str):
163
- text_parts.append(content)
164
- elif isinstance(content, list):
165
- for part in content:
166
- if not isinstance(part, dict):
167
- raise OpenAICompatError(400, "Multimodal content parts must be objects.")
168
- part_type = str(part.get("type", "")).strip()
169
- if part_type == "text":
170
- text_parts.append(str(part.get("text", "")))
171
- elif part_type == "image_url":
172
- image_url = part.get("image_url")
173
- if not isinstance(image_url, dict):
174
- raise OpenAICompatError(400, "image_url content must contain an image_url object.")
175
- url = str(image_url.get("url", "")).strip()
176
- detail = str(image_url.get("detail", "auto") or "auto")
177
- rel_path = save_data_image(
178
- url,
179
- workspace_root=workspace_root,
180
- image_dir=image_dir,
181
- image_index=image_index,
182
- )
183
- image_index += 1
184
- image_paths.append(rel_path)
185
- text_parts.append(f"[image saved at {rel_path}]")
186
- initial_content_parts.extend(image_input_content_parts(url, rel_path, detail=detail))
187
- else:
188
- raise OpenAICompatError(400, f"Unsupported content part type: {part_type!r}.")
189
- else:
190
- raise OpenAICompatError(400, "message content must be a string or a list of content parts.")
191
- wrapper_messages.append({"role": role, "content": "\n".join(part for part in text_parts if part)})
192
-
193
- return PreparedInput(
194
- wrapper_messages=wrapper_messages,
195
- initial_content_parts=initial_content_parts,
196
- image_paths=image_paths,
197
- )
198
-
199
-
200
- def save_data_image(url: str, *, workspace_root: Path, image_dir: Path, image_index: int) -> str:
201
- match = DATA_IMAGE_RE.match(url)
202
- if not match:
203
- raise OpenAICompatError(
204
- 400,
205
- "Only data:image/...;base64,... image_url inputs are supported in the first API version.",
206
- )
207
- mime_type = match.group(1).lower()
208
- extension = IMAGE_EXTENSIONS.get(mime_type)
209
- if extension is None:
210
- raise OpenAICompatError(400, f"Unsupported image MIME type: {mime_type}.")
211
- try:
212
- image_bytes = base64.b64decode(match.group(2), validate=True)
213
- except (binascii.Error, ValueError) as exc:
214
- raise OpenAICompatError(400, "Invalid base64 image data.") from exc
215
- if len(image_bytes) > DEFAULT_MAX_IMAGE_BYTES:
216
- raise OpenAICompatError(400, f"Image exceeds the {DEFAULT_MAX_IMAGE_BYTES} byte limit.")
217
- image_dir.mkdir(parents=True, exist_ok=True)
218
- filename = f"image_{image_index:03d}{extension}"
219
- path = image_dir / filename
220
- path.write_bytes(image_bytes)
221
- return path.relative_to(workspace_root).as_posix()
222
-
223
-
224
- def wrapper_request_payload(*, prepared: PreparedInput, payload: dict[str, Any]) -> dict[str, Any]:
225
- return {
226
- "messages": prepared.wrapper_messages,
227
- "saved_image_paths": prepared.image_paths,
228
- "response_format": safe_jsonable(payload.get("response_format")),
229
- "requested_model_label": str(payload.get("model", "")),
230
- }
231
-
232
-
233
- def build_input_wrapper_messages(*, prepared: PreparedInput, payload: dict[str, Any]) -> list[dict[str, str]]:
234
- return [
235
- {"role": "system", "content": INPUT_WRAPPER_SYSTEM_PROMPT},
236
- {
237
- "role": "user",
238
- "content": json.dumps(wrapper_request_payload(prepared=prepared, payload=payload), ensure_ascii=False, indent=2),
239
- },
240
- ]
241
-
242
-
243
- def build_passthrough_input_plan(*, prepared: PreparedInput, payload: dict[str, Any]) -> dict[str, str]:
244
- conversation = "\n\n".join(
245
- f"{message['role'].upper()}:\n{message['content']}" for message in prepared.wrapper_messages
246
- ).strip()
247
- response_format = payload.get("response_format")
248
- output_contract = "Follow the final answer requirements in the original request."
249
- if response_format is not None:
250
- output_contract += "\nOpenAI response_format request:\n" + json.dumps(
251
- safe_jsonable(response_format),
252
- ensure_ascii=False,
253
- indent=2,
254
- )
255
- return {
256
- "agent_instruction": conversation or "Answer the user's request.",
257
- "output_contract": output_contract,
258
- "wrapper_notes": "Input wrapper disabled; the original normalized conversation was passed through directly.",
259
- }
260
-
261
-
262
- def build_agent_prompt(input_plan: dict[str, Any], prepared: PreparedInput) -> str:
263
- image_block = "\n".join(f"- {path}" for path in prepared.image_paths) if prepared.image_paths else "- none"
264
- return (
265
- "You are solving a user request through ResearchHarness.\n\n"
266
- "Task for the agent:\n"
267
- f"{str(input_plan.get('agent_instruction', '')).strip()}\n\n"
268
- "User-provided images saved in this workspace:\n"
269
- f"{image_block}\n\n"
270
- "The original image content is attached to the initial user message when the backend model supports image parts. "
271
- "The same images are also saved at the paths above so you may call ReadImage when visual inspection is needed.\n\n"
272
- "Do not optimize your tool-use loop for the final output schema. Solve the task completely, then finish with a complete, "
273
- "self-contained internal final text that includes the actual answer, the evidence used, and any concise reasoning needed to understand it. "
274
- "You may mention files you created or inspected, but the internal final text must not depend on local files as the only carrier of the answer.\n\n"
275
- "Final output contract that will be enforced by a formatter after your run:\n"
276
- f"{str(input_plan.get('output_contract', 'plain text')).strip()}\n\n"
277
- "Wrapper notes:\n"
278
- f"{str(input_plan.get('wrapper_notes', '')).strip()}"
279
- )
280
-
281
-
282
- def build_output_wrapper_messages(
283
- *,
284
- prepared: PreparedInput,
285
- payload: dict[str, Any],
286
- input_plan: dict[str, Any],
287
- agent_result_text: str,
288
- ) -> list[dict[str, str]]:
289
- output_payload = {
290
- "original_messages": prepared.wrapper_messages,
291
- "saved_image_paths": prepared.image_paths,
292
- "output_contract": str(input_plan.get("output_contract", "plain text")),
293
- "response_format": safe_jsonable(payload.get("response_format")),
294
- "agent_result_text": agent_result_text,
295
- }
296
- return [
297
- {"role": "system", "content": OUTPUT_WRAPPER_SYSTEM_PROMPT},
298
- {"role": "user", "content": json.dumps(output_payload, ensure_ascii=False, indent=2)},
299
- ]
300
-
301
-
302
- def extract_json_object(text: str) -> dict[str, Any]:
303
- stripped = text.strip()
304
- if stripped.startswith("```"):
305
- stripped = re.sub(r"^```(?:json)?\s*", "", stripped, flags=re.IGNORECASE)
306
- stripped = re.sub(r"\s*```$", "", stripped)
307
- try:
308
- parsed = json.loads(stripped)
309
- except json.JSONDecodeError:
310
- start = stripped.find("{")
311
- end = stripped.rfind("}")
312
- if start < 0 or end <= start:
313
- raise OpenAICompatError(500, "Input wrapper did not return a JSON object.", "server_error") from None
314
- try:
315
- parsed = json.loads(stripped[start : end + 1])
316
- except json.JSONDecodeError as exc:
317
- raise OpenAICompatError(500, f"Input wrapper returned invalid JSON: {exc}", "server_error") from exc
318
- if not isinstance(parsed, dict):
319
- raise OpenAICompatError(500, "Input wrapper JSON must be an object.", "server_error")
320
- if not str(parsed.get("agent_instruction", "")).strip():
321
- raise OpenAICompatError(500, "Input wrapper JSON missing agent_instruction.", "server_error")
322
- if not str(parsed.get("output_contract", "")).strip():
323
- parsed["output_contract"] = "plain text"
324
- parsed.setdefault("wrapper_notes", "")
325
- return parsed
326
-
327
-
328
- def call_wrapper_text(
329
- agent: MultiTurnReactAgent,
330
- messages: list[dict[str, str]],
331
- *,
332
- max_output_tokens: Optional[int] = None,
333
- ) -> str:
334
- response = agent.call_compaction_api(messages, max_output_tokens=max_output_tokens)
335
- if not isinstance(response, dict) or response.get("status") == "error":
336
- error_text = response.get("error", "unknown wrapper error") if isinstance(response, dict) else str(response)
337
- raise OpenAICompatError(500, error_text, "server_error")
338
- text = assistant_text_content(response.get("content")).strip()
339
- if not text:
340
- raise OpenAICompatError(500, "Wrapper returned empty content.", "server_error")
341
- return text
342
-
343
-
344
- def final_max_tokens(payload: dict[str, Any]) -> Optional[int]:
345
- raw_value = payload.get("max_tokens", payload.get("max_completion_tokens"))
346
- if raw_value is None:
347
- return None
348
- try:
349
- value = int(raw_value)
350
- except (TypeError, ValueError) as exc:
351
- raise OpenAICompatError(400, "max_tokens must be an integer.") from exc
352
- if value <= 0:
353
- raise OpenAICompatError(400, "max_tokens must be positive.")
354
- return value
355
-
356
-
357
- def append_api_event(trace_dir: Path, event: str, payload: dict[str, Any]) -> None:
358
- append_jsonl(
359
- trace_dir / "api_trace.jsonl",
360
- {
361
- "timestamp": int(time.time()),
362
- "event": event,
363
- "payload": safe_jsonable(payload),
364
- },
365
- )
366
-
367
-
368
- def run_chat_completion(payload: dict[str, Any], config: ServerConfig) -> dict[str, Any]:
369
- payload = validate_chat_payload(payload)
370
- request_id = "chatcmpl_" + uuid4().hex
371
- run_id = "run_" + datetime.datetime.now().astimezone().strftime("%Y%m%d_%H%M%S") + "_" + uuid4().hex[:8]
372
- run_root = config.api_runs_dir / run_id
373
- agent_workspace = run_root / "agent_workspace"
374
- trace_dir = run_root / "agent_trace"
375
- agent_workspace.mkdir(parents=True, exist_ok=False)
376
- trace_dir.mkdir(parents=True, exist_ok=False)
377
- prepared = prepare_openai_input(payload["messages"], agent_workspace)
378
- llm_config = default_llm_config()
379
- backend_model = str(llm_config.get("model", ""))
380
- if prepared.initial_content_parts and not model_supports_runtime_image_parts(backend_model):
381
- raise OpenAICompatError(
382
- 400,
383
- f"Backend model {backend_model!r} does not support image content parts.",
384
- )
385
-
386
- tool_names = [name for name in AVAILABLE_TOOL_MAP if name != "AskUser"]
387
- agent = MultiTurnReactAgent(
388
- function_list=tool_names,
389
- llm=llm_config,
390
- trace_dir=str(trace_dir),
391
- role_prompt=config.role_prompt or None,
392
- )
393
-
394
- if config.input_wrapper:
395
- input_wrapper_messages = build_input_wrapper_messages(prepared=prepared, payload=payload)
396
- input_wrapper_text = call_wrapper_text(agent, input_wrapper_messages, max_output_tokens=1200)
397
- input_plan = extract_json_object(input_wrapper_text)
398
- append_api_event(
399
- trace_dir,
400
- "input_wrapper",
401
- {
402
- "enabled": True,
403
- "request": input_wrapper_messages,
404
- "response_text": input_wrapper_text,
405
- "input_plan": input_plan,
406
- },
407
- )
408
- else:
409
- input_plan = build_passthrough_input_plan(prepared=prepared, payload=payload)
410
- append_api_event(
411
- trace_dir,
412
- "input_wrapper",
413
- {
414
- "enabled": False,
415
- "input_plan": input_plan,
416
- },
417
- )
418
-
419
- agent_prompt = build_agent_prompt(input_plan, prepared)
420
- session = agent._run_session(
421
- agent_prompt,
422
- workspace_root=str(agent_workspace),
423
- initial_content_parts=prepared.initial_content_parts or None,
424
- )
425
- agent_result_text = str(session.get("result_text", "")).strip()
426
- append_api_event(
427
- trace_dir,
428
- "agent_result",
429
- {
430
- "termination": session.get("termination", ""),
431
- "result_text": agent_result_text,
432
- "trace_path": session.get("trace_path", ""),
433
- },
434
- )
435
-
436
- if config.output_wrapper:
437
- output_wrapper_messages = build_output_wrapper_messages(
438
- prepared=prepared,
439
- payload=payload,
440
- input_plan=input_plan,
441
- agent_result_text=agent_result_text,
442
- )
443
- final_text = call_wrapper_text(agent, output_wrapper_messages, max_output_tokens=final_max_tokens(payload))
444
- append_api_event(
445
- trace_dir,
446
- "output_wrapper",
447
- {
448
- "enabled": True,
449
- "request": output_wrapper_messages,
450
- "response_text": final_text,
451
- },
452
- )
453
- else:
454
- final_text = agent_result_text
455
- append_api_event(
456
- trace_dir,
457
- "output_wrapper",
458
- {
459
- "enabled": False,
460
- "response_text": final_text,
461
- },
462
- )
463
- return make_chat_completion_response(
464
- request_id=request_id,
465
- model=str(payload.get("model", "researchharness")),
466
- content=final_text,
467
- )
468
-
469
-
470
- def create_app(config: ServerConfig) -> FastAPI:
471
- app = FastAPI(title="ResearchHarness OpenAI-Compatible API", version="1.0")
472
-
473
- @app.exception_handler(OpenAICompatError)
474
- async def _handle_openai_compat_error(request: Request, exc: OpenAICompatError) -> JSONResponse:
475
- return openai_error_response(exc)
476
-
477
- @app.get("/v1/health")
478
- async def health() -> dict[str, Any]:
479
- return {
480
- "status": "ok",
481
- "api_runs_dir": str(config.api_runs_dir),
482
- "input_wrapper": config.input_wrapper,
483
- "output_wrapper": config.output_wrapper,
484
- }
485
-
486
- @app.post("/v1/chat/completions")
487
- async def chat_completions(payload: dict[str, Any] = Body(...)) -> dict[str, Any]:
488
- try:
489
- return run_chat_completion(payload, config)
490
- except OpenAICompatError:
491
- raise
492
- except Exception as exc:
493
- raise OpenAICompatError(500, f"ResearchHarness API error: {exc}", "server_error") from exc
494
-
495
- return app
496
-
497
-
498
- def serve(
499
- *,
500
- api_runs_dir: str,
501
- host: str = "127.0.0.1",
502
- port: int = 8686,
503
- role_prompt_files: Optional[list[str]] = None,
504
- input_wrapper: bool = True,
505
- output_wrapper: bool = True,
506
- ) -> None:
507
- root = normalize_workspace_root(api_runs_dir)
508
- role_prompt = read_role_prompt_files(role_prompt_files or [])
509
- config = ServerConfig(
510
- api_runs_dir=root,
511
- role_prompt=role_prompt,
512
- host=host,
513
- port=port,
514
- input_wrapper=input_wrapper,
515
- output_wrapper=output_wrapper,
516
- )
517
- app = create_app(config)
518
- uvicorn.run(app, host=host, port=port)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
api_runs/.gitkeep DELETED
@@ -1 +0,0 @@
1
-
 
 
app.py CHANGED
@@ -7,7 +7,6 @@ from pathlib import Path
7
 
8
  import uvicorn
9
 
10
- from agent_base.utils import read_role_prompt_files
11
  from frontend.local_server import app, configure_frontend
12
 
13
 
@@ -32,18 +31,9 @@ def _bool_env(name: str, default: bool) -> bool:
32
  raise ValueError(f"{name} must be a boolean, got {raw!r}")
33
 
34
 
35
- def _role_prompt_files() -> list[str]:
36
- raw = os.getenv("RH_ROLE_PROMPT_FILES", "").strip()
37
- if not raw:
38
- return []
39
- return [item for item in raw.split(os.pathsep) if item]
40
-
41
-
42
  def configure_space() -> None:
43
  runs_dir = Path(os.getenv("RH_SPACE_RUNS_DIR", "/tmp/researchharness_space/runs")).expanduser()
44
- role_prompt = read_role_prompt_files(_role_prompt_files())
45
  configure_frontend(
46
- role_prompt=role_prompt,
47
  managed_runs_dir=str(runs_dir),
48
  cleanup_retention_seconds=_int_env("RH_SPACE_RETENTION_SECONDS", 6 * 60 * 60),
49
  cleanup_max_runs=_int_env("RH_SPACE_MAX_RUNS", 40),
 
7
 
8
  import uvicorn
9
 
 
10
  from frontend.local_server import app, configure_frontend
11
 
12
 
 
31
  raise ValueError(f"{name} must be a boolean, got {raw!r}")
32
 
33
 
 
 
 
 
 
 
 
34
  def configure_space() -> None:
35
  runs_dir = Path(os.getenv("RH_SPACE_RUNS_DIR", "/tmp/researchharness_space/runs")).expanduser()
 
36
  configure_frontend(
 
37
  managed_runs_dir=str(runs_dir),
38
  cleanup_retention_seconds=_int_env("RH_SPACE_RETENTION_SECONDS", 6 * 60 * 60),
39
  cleanup_max_runs=_int_env("RH_SPACE_MAX_RUNS", 40),
benchmarks/QA/README.md DELETED
@@ -1,102 +0,0 @@
1
- # QA / VQA Benchmarks
2
-
3
- This directory documents the lightweight ResearchHarness contract for
4
- question-answering benchmarks, including plain-text QA and multimodal VQA-style
5
- tasks.
6
-
7
- The recommended integration is the OpenAI-compatible synchronous API server:
8
-
9
- ```bash
10
- python3 /abs/path/to/ResearchHarness/run_server.py \
11
- --api-runs-dir ./api_runs
12
- ```
13
-
14
- For QA/VQA benchmark runs, optionally add this benchmark role overlay:
15
-
16
- ```bash
17
- python3 /abs/path/to/ResearchHarness/run_server.py \
18
- --api-runs-dir ./api_runs \
19
- --role-prompt-file /abs/path/to/ResearchHarness/benchmarks/QA/role_prompt.md
20
- ```
21
-
22
- Each request creates a fresh run directory:
23
-
24
- ```text
25
- ./api_runs/
26
- `-- run_YYYYMMDD_HHMMSS_<random>/
27
- |-- agent_workspace/ # visible to the agent
28
- | `-- inputs/
29
- | `-- images/ # user-provided images, when present
30
- `-- agent_trace/ # server-side trace and session state
31
- |-- api_trace.jsonl
32
- |-- trace_*.jsonl
33
- `-- _session_state.json
34
- ```
35
-
36
- The input and output LLM wrappers are enabled by default:
37
-
38
- - `--input-wrapper` / `--no-input-wrapper` controls the input normalization pass.
39
- - `--output-wrapper` / `--no-output-wrapper` controls the final answer formatting pass.
40
-
41
- Strict-format benchmarks should usually keep both wrappers enabled. To return
42
- the agent's direct final text instead, run:
43
-
44
- ```bash
45
- python3 /abs/path/to/ResearchHarness/run_server.py \
46
- --api-runs-dir ./api_runs \
47
- --no-input-wrapper \
48
- --no-output-wrapper
49
- ```
50
-
51
- External benchmark runners can then use the regular OpenAI SDK with:
52
-
53
- ```python
54
- from openai import OpenAI
55
-
56
- client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
57
-
58
- response = client.chat.completions.create(
59
- model="researchharness",
60
- messages=[{"role": "user", "content": "Answer the question."}],
61
- )
62
-
63
- answer = response.choices[0].message.content
64
- ```
65
-
66
- ## Multimodal Input
67
-
68
- For image benchmarks, send OpenAI-style content parts. The first API version
69
- supports one or more `data:image/...;base64,...` URLs in the same request.
70
-
71
- ```python
72
- response = client.chat.completions.create(
73
- model="researchharness",
74
- messages=[
75
- {
76
- "role": "user",
77
- "content": [
78
- {"type": "text", "text": "What is shown? Return JSON with key answer."},
79
- {"type": "image_url", "image_url": {"url": data_url}},
80
- ],
81
- }
82
- ],
83
- )
84
- ```
85
-
86
- The API saves each submitted image under `agent_workspace/inputs/images/`,
87
- passes the image content to the first ResearchHarness model call when the
88
- backend model supports image parts, and includes each saved path in the
89
- agent-visible text.
90
-
91
- The returned answer should be self-contained for a remote evaluator. Workspace
92
- files may support the run, but the response should not only say to consult
93
- `answer.md`, `report.md`, an image file, or another local artifact.
94
-
95
- ## Scope
96
-
97
- - The endpoint is synchronous and returns one final text answer.
98
- - Each request gets a separate workspace subdirectory.
99
- - The API uses an input wrapper, the ResearchHarness agent, and an output
100
- wrapper so strict benchmark output formats do not destabilize the agent loop.
101
- - Streaming, async run status, artifact download, and remote image fetching are
102
- intentionally out of scope for this minimal QA contract.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/QA/role_prompt.md DELETED
@@ -1,31 +0,0 @@
1
- # Benchmark Role Overlay
2
-
3
- You are running inside ResearchHarness for a QA or VQA benchmark.
4
-
5
- Behavior:
6
- - Solve the user's task directly and carefully.
7
- - Use tools only when they materially improve answer quality.
8
- - If the request includes saved image paths, inspect the image evidence when it
9
- is needed for the answer.
10
- - Do not ask the user follow-up questions.
11
- - Do not stop with a plan. Produce the answer once enough evidence has been
12
- gathered.
13
- - It is acceptable to explain what evidence was used in the agent's internal
14
- final text; a downstream formatter will enforce the benchmark's exact output
15
- contract.
16
- - Assume the remote evaluator only sees the returned text, not your workspace.
17
- - Your final text must be a complete, independent plain-text answer.
18
- - Include the actual answer to the original question.
19
- - Include supporting evidence, calculations, or reasoning steps when they are
20
- needed to make the answer understandable.
21
- - In this benchmark role, do not rely on local workspace files as the answer.
22
- Files such as `answer.md`, `report.md`, images, or other artifacts may support
23
- your work, but the returned text itself must contain the answer a remote
24
- evaluator needs.
25
-
26
- For visual tasks:
27
- - Prefer the attached image content when it is available in the model input.
28
- - Use `ReadImage` on saved image paths when additional visual inspection is
29
- needed or when the prompt explicitly asks you to inspect local image files.
30
- - Do not invent visual details that are not supported by the image or tool
31
- output.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/README.md DELETED
@@ -1,18 +0,0 @@
1
- # Benchmarks
2
-
3
- This folder records benchmark-specific integration contracts that live
4
- **outside** `agent_base` so the core harness stays generic, lightweight, and
5
- fair across different evaluations.
6
-
7
- | Benchmark | Directory | Tracked contract |
8
- | --- | --- | --- |
9
- | ResearchClawBench | `benchmarks/ResearchClawBench/` | `README.md` + `role_prompt.md` + `adapter.py` |
10
- | QA / VQA-style benchmarks | `benchmarks/QA/` | `README.md` + `role_prompt.md` |
11
-
12
- ## Notes
13
-
14
- - `agent_base/` stays focused on the reusable harness runtime.
15
- - Benchmark-specific prompts, adapters, and integration notes should live under
16
- their own benchmark subdirectory.
17
- - Local benchmark helpers may exist for private experimentation, but they do
18
- not define the formal external integration contract.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/ResearchClawBench/README.md DELETED
@@ -1,44 +0,0 @@
1
- # ResearchClawBench
2
-
3
- This directory contains the tracked files needed to document how `ResearchHarness`
4
- should be integrated into `ResearchClawBench`.
5
-
6
- ResearchHarness is intended to serve here as a **general and fair execution
7
- substrate** for tool-using LLM evaluation, while `ResearchClawBench` remains in
8
- charge of task construction, hidden-answer isolation, and scoring.
9
-
10
- ## Recommended `agents.json` Entry
11
-
12
- Use a single direct command that launches the thin top-level ResearchHarness
13
- entrypoint.
14
-
15
- ```json
16
- {
17
- "researchharness": {
18
- "label": "ResearchHarness",
19
- "icon": "H",
20
- "logo": "/static/logos/rh.svg",
21
- "cmd": "python3 /abs/path/to/ResearchHarness/run_agent.py <PROMPT> --workspace-root <WORKSPACE> --role-prompt-file /abs/path/to/ResearchHarness/benchmarks/ResearchClawBench/role_prompt.md --trace-dir <WORKSPACE>"
22
- }
23
- }
24
- ```
25
-
26
- ## Why This Shape
27
-
28
- - `ResearchClawBench` already prepares the workspace, writes `INSTRUCTIONS.md`,
29
- and isolates hidden checklist data.
30
- - `ResearchHarness` should only execute the agent through a stable harness
31
- interface.
32
- - The command stays unchanged. The entrypoint automatically selects the
33
- lightweight adapter in `benchmarks/ResearchClawBench/adapter.py` when this
34
- benchmark role prompt is used.
35
-
36
- ## Notes
37
-
38
- - Replace `/abs/path/to/ResearchHarness/` with the real local checkout path.
39
- - The command should stay one-line and non-interactive.
40
- - The adapter prevents premature termination on long tasks by refusing to accept
41
- plain-text completion before `report/report.md` exists in the workspace.
42
- - The adapter excludes `AskUser`; RCB runs must remain fully non-interactive.
43
- - Any local batch helpers or ad hoc benchmark scripts should remain untracked
44
- and live outside the formal integration contract.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/ResearchClawBench/adapter.py DELETED
@@ -1,93 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from pathlib import Path
4
- from typing import Any, Optional, Sequence
5
-
6
- from agent_base.react_agent import AVAILABLE_TOOL_MAP, MultiTurnReactAgent
7
- from agent_base.tools.tooling import normalize_workspace_root
8
-
9
-
10
- class ResearchClawBenchAgent(MultiTurnReactAgent):
11
- """
12
- Lightweight benchmark adapter for ResearchClawBench.
13
-
14
- The benchmark task is not complete until the run workspace contains the
15
- canonical final report at report/report.md. Pure planning text without that
16
- artifact should not terminate the agent loop.
17
- """
18
-
19
- required_report_relpath = Path("report") / "report.md"
20
- forbidden_tool_names = {"AskUser"}
21
-
22
- def __init__(self, function_list: Optional[Sequence[str]] = None, *args: Any, **kwargs: Any):
23
- if function_list is None:
24
- function_list = [
25
- tool_name
26
- for tool_name in AVAILABLE_TOOL_MAP
27
- if tool_name not in self.forbidden_tool_names
28
- ]
29
- else:
30
- function_list = [str(tool_name).strip() for tool_name in function_list if str(tool_name).strip()]
31
- forbidden = sorted(set(function_list) & self.forbidden_tool_names)
32
- if forbidden:
33
- raise ValueError(f"Tools are not allowed in ResearchClawBench runs: {forbidden}")
34
- super().__init__(function_list=list(function_list), *args, **kwargs)
35
-
36
- def _required_report_path(self, workspace_root: Optional[str]) -> Path:
37
- workspace = Path(normalize_workspace_root(workspace_root))
38
- return workspace / self.required_report_relpath
39
-
40
- def should_accept_plaintext_result(
41
- self,
42
- *,
43
- result_text: str,
44
- workspace_root: Optional[str],
45
- messages: Sequence[dict[str, Any]],
46
- ) -> bool:
47
- if not self._required_report_path(workspace_root).exists():
48
- return False
49
- return super().should_accept_plaintext_result(
50
- result_text=result_text,
51
- workspace_root=workspace_root,
52
- messages=messages,
53
- )
54
-
55
- def rejected_plaintext_result_message(
56
- self,
57
- *,
58
- result_text: str,
59
- workspace_root: Optional[str],
60
- messages: Sequence[dict[str, Any]],
61
- ) -> str:
62
- if not self._required_report_path(workspace_root).exists():
63
- return (
64
- "The previous assistant turn was not accepted as the final result because "
65
- "ResearchClawBench requires report/report.md and that file is still missing. "
66
- "Continue working and use tool calls to produce or verify report/report.md before finishing."
67
- )
68
- return super().rejected_plaintext_result_message(
69
- result_text=result_text,
70
- workspace_root=workspace_root,
71
- messages=messages,
72
- )
73
-
74
- def should_accept_terminal_error(
75
- self,
76
- *,
77
- error_text: str,
78
- workspace_root: Optional[str],
79
- messages: Sequence[dict[str, Any]],
80
- ) -> bool:
81
- return self._required_report_path(workspace_root).exists()
82
-
83
- def accepted_terminal_error_result_text(
84
- self,
85
- *,
86
- error_text: str,
87
- workspace_root: Optional[str],
88
- messages: Sequence[dict[str, Any]],
89
- ) -> str:
90
- return (
91
- "ResearchClawBench completion recovered after a terminal LLM/runtime error because "
92
- "report/report.md already exists and the required final artifact has been produced."
93
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/ResearchClawBench/role_prompt.md DELETED
@@ -1,195 +0,0 @@
1
- # Benchmark Role Overlay
2
-
3
- ## Purpose
4
-
5
- You are running inside a benchmark-style scientific evaluation.
6
-
7
- Your job is not just to produce a plausible report. Your job is to produce a
8
- report whose claims are traceable to concrete artifacts in the workspace and
9
- whose methods match the task's named scientific commitments as closely as the
10
- environment allows.
11
-
12
- This benchmark is non-interactive. Do not use `AskUser` or attempt to ask the
13
- human for clarification. Resolve ambiguity from `INSTRUCTIONS.md`, workspace
14
- files, related work, and available local or web tools.
15
-
16
- ## Method Contract
17
-
18
- - Parse the task into explicit methodological commitments early.
19
- - Before broad exploration, infer the likely target artifact families required by
20
- the task, including:
21
- - primary quantitative answers
22
- - required comparison tables
23
- - expected figure families
24
- - interpretability artifacts
25
- - subgroup or condition-specific outputs
26
- - If the task names a framework, protocol, comparison structure,
27
- interpretability method, simulator, ablation, posterior treatment,
28
- reconciliation step, or validation design, treat that as part of the
29
- contract.
30
- - Do not silently replace an explicitly named method with a looser descriptive
31
- analysis.
32
- - Save a concise contract summary to `outputs/method_contract.json`.
33
- - Save the inferred target artifact inventory to
34
- `outputs/target_artifact_inventory.json`.
35
- - After reading the most relevant related-work papers, refresh both files if the
36
- papers reveal additional named baselines, architectures, figure families,
37
- comparison strata, or interpretability artifacts central to the task.
38
- - Save a concise related-work extraction to `outputs/related_work_contract.json`
39
- whenever related work materially changes the contract or artifact inventory.
40
-
41
- ## Capability Check
42
-
43
- - Before approximating or skipping a named method, check whether the needed
44
- dependency, library, or runtime capability is available.
45
- - Save the result to `outputs/dependency_check.json`.
46
- - If a named method cannot be implemented exactly, state the exact limitation
47
- and the fallback.
48
- - If the task centers on a named model family, simulator, architecture, or
49
- analysis stack, do not quietly swap to a different family just because it is
50
- easier. Either implement a minimally faithful version of the named approach
51
- or make the deviation explicit before proceeding.
52
-
53
- ## Evidence Discipline
54
-
55
- - Every major scientific claim should have at least one explicit supporting
56
- artifact in `outputs/` or `report/images/`.
57
- - Export the exact tables, matrices, or JSON objects used to create each main
58
- figure.
59
- - Add a dedicated validation subsection to the report that separates:
60
- - what was verified directly from workspace data
61
- - what came from related work
62
- - what remains an assumption or limitation
63
- - Answer claim-recovery questions claim-by-claim rather than only with a broad
64
- narrative.
65
- - Save a concise claim recovery table before finalizing the report.
66
- - When the task asks for quantitative constraints, limits, posterior summaries,
67
- calibration values, or uncertainty summaries, save those values explicitly in
68
- the requested variables and units rather than only through a proxy
69
- transformation.
70
- - If the task ultimately asks for a direct constraint on a named target
71
- quantity, prefer deriving and reporting that named quantity itself instead of
72
- stopping at an intermediate proxy axis, surrogate scale, or nearby latent
73
- variable whenever a defensible derivation is possible from workspace data and
74
- related work.
75
- - If posterior samples are a primary input, report canonical distribution
76
- summaries for each primary source, including mean and standard deviation,
77
- unless those statistics are mathematically invalid for the variable.
78
- - If the task names a primary source, cohort, benchmark, or experimental arm,
79
- produce at least one source-specific artifact for it before emphasizing only
80
- combined or aggregated results.
81
- - If the task names a direct target quantity, threshold, or decision criterion,
82
- export a compact result table that answers it directly before presenting
83
- broader supporting analyses.
84
-
85
- ## Related Work Use
86
-
87
- - Read `related_work/` early, but bounded.
88
- - Start with concise or bounded reads when papers are long.
89
- - Extract only task-relevant facts into notes or structured outputs.
90
- - If related work contains validation metrics, methodological caveats,
91
- baselines, or target comparison axes that matter for the task, incorporate
92
- them explicitly.
93
- - Prefer extracting from related work:
94
- - named methods or architectures to reproduce or compare against
95
- - target comparison axes and subgroup splits
96
- - likely main figure families or panel structures
97
- - explicit quantitative targets, thresholds, or calibration outputs
98
-
99
- ## Figure And Comparison Fidelity
100
-
101
- - Prefer claim-driven figures over generic exploratory plots.
102
- - Infer likely figure families and comparison structures from the task and
103
- related work.
104
- - If the task is about projections, calibration, method agreement, subgroup
105
- trends, rankings, level-wise comparisons, or ablations, produce figures that
106
- directly encode those structures.
107
- - Keep the main figure set compact: each main figure should support a specific
108
- target claim.
109
- - If the task's core claim is source-specific, dataset-specific, or
110
- benchmark-specific, include at least one main figure at that same granularity rather
111
- than only a pooled or combined summary figure.
112
- - If the task implies a named figure family such as ablation curves, PR/ROC
113
- curves, parity plots, subgroup heatmaps, saliency maps, architecture
114
- diagrams, or level-wise comparisons, prioritize that family over a generic
115
- substitute.
116
-
117
- ## Group And Condition Preservation
118
-
119
- - If the task names groups, conditions, labs, sexes, environments, shells,
120
- depth levels, or other comparison strata, preserve them in at least one
121
- exported table or figure.
122
- - Do not silently collapse mixed categories if the scientific question depends
123
- on them.
124
- - When subgroup structure matters over time, prefer a subgroup-by-time matrix
125
- and save it.
126
- - If the task is a benchmark or model-comparison study across datasets,
127
- baselines, cohorts, or conditions, export a compact comparison table with the
128
- main metric reported as mean ± standard deviation whenever repeated runs,
129
- folds, or stochastic training are part of the setup.
130
- - For multi-condition or multi-cohort tasks, save at least one artifact at the
131
- per-condition granularity before merging across conditions.
132
-
133
- ## Named Method Fidelity
134
-
135
- - If the task or related work defines a named mechanism, algorithm, or
136
- protocol central to the scientific claim, save a fidelity checklist to
137
- `outputs/method_fidelity_checklist.json`.
138
- - That checklist should capture:
139
- - the exact definition
140
- - assumptions
141
- - invariants
142
- - non-negotiable structural steps
143
- - Use it to verify whether the implemented method actually matches the named
144
- mechanism.
145
- - If you deviate, explain exactly how and why in the report.
146
- - If the task revolves around a named architecture or protocol, capture the key
147
- structural ingredients that distinguish it from nearby alternatives and check
148
- them explicitly.
149
-
150
- ## Small Sweeps And Ablations
151
-
152
- - If the named mechanism exposes a small discrete design variable, such as
153
- levels, layers, stages, shells, bins, or ablation settings, run at least a
154
- small sweep unless it is genuinely impossible from the available workspace.
155
- - If the task names a specific interpretability method such as SHAP,
156
- permutation importance, saliency, or similar, produce at least one artifact
157
- using that named method.
158
- - If the task claims improved interpretability, do not stop at aggregate metric
159
- gains alone; produce at least one explicit interpretability artifact and tie
160
- it back to domain-relevant entities, groups, or substructures named in the
161
- task or related work.
162
- - If the task names multiple groups, labs, cohorts, or environments, prefer an
163
- interpretability artifact that compares them directly instead of a single
164
- pooled explanation.
165
- - If interpretability is central and the chosen model family supports a common
166
- post hoc explanation method, do not stop at native coefficient or impurity
167
- magnitudes alone. Add at least one post hoc explanation artifact such as
168
- SHAP, permutation importance, saliency, attention attribution, or a similarly
169
- standard method for that model family.
170
-
171
- ## Finalization
172
-
173
- - Start `report/report.md` as soon as at least two core result families already
174
- have concrete supporting artifacts in `outputs/` or `report/images/`.
175
- - Prefer an evidence-backed report draft over one more optional script, one
176
- more polish pass, or one more non-essential figure.
177
- - Once the primary quantitative outputs, the main comparison figures, and the
178
- core validation artifacts exist, write `report/report.md` immediately.
179
- - Do not postpone the report in order to chase optional supplementary figures,
180
- extra exploratory analyses, or additional polish that is not required to
181
- support the task's main claims.
182
- - Treat optional supplementary work as lower priority than a complete,
183
- evidence-backed report. If the report can already answer the task directly,
184
- finish the report first and only then consider extras if there is clear
185
- remaining need.
186
- - The final report should be tightly traceable.
187
- - Important numbers should be reproducible from saved artifacts in the
188
- workspace.
189
- - Do not claim exact reproduction if only a rough approximation was achieved.
190
- - Before finalizing, check that the report contains direct answers to the main
191
- requested outputs in the named variables, units, and confidence language of
192
- the task, not only nearby surrogate quantities.
193
- - Before finalizing, verify that every primary entry in
194
- `outputs/target_artifact_inventory.json` is either satisfied by a concrete
195
- saved artifact or explicitly marked as unsatisfied with a reason.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/tutorial_en.md DELETED
@@ -1,531 +0,0 @@
1
- # ResearchHarness Tutorial
2
-
3
- This tutorial explains how to use ResearchHarness from the command line and as
4
- an OpenAI-compatible API service.
5
-
6
- ResearchHarness is a lightweight, general-purpose harness for tool-using LLM
7
- agents. It can be used as:
8
-
9
- - a command-line local agent,
10
- - a fair execution substrate for agent benchmarks,
11
- - an OpenAI-compatible synchronous API backend,
12
- - a personal assistant runtime for files, code, reports, PDFs, images, and web tasks.
13
-
14
- ## 1. Install
15
-
16
- Clone the repository and install dependencies:
17
-
18
- ```bash
19
- python3 -m pip install -r requirements.txt
20
- ```
21
-
22
- Python 3.10+ is recommended.
23
-
24
- ## 2. Configure Environment Variables
25
-
26
- Copy `.env.example` to `.env` and fill in the required values.
27
-
28
- Required variables:
29
-
30
- | Variable | Meaning |
31
- | --- | --- |
32
- | `API_KEY` | API key for your OpenAI-compatible LLM provider. |
33
- | `API_BASE` | Base URL for the OpenAI-compatible chat-completions endpoint. |
34
- | `MODEL_NAME` | Main model used by ResearchHarness. |
35
- | `SERPER_KEY` | Serper key for `WebSearch` and `ScholarSearch`: https://serper.dev/ |
36
- | `JINA_KEY` | Jina key for `WebFetch`: https://jina.ai/ |
37
- | `MINERU_TOKEN` | MinerU token for `ReadPDF`: https://mineru.net/ |
38
-
39
- Optional variables:
40
-
41
- | Variable | Default | Meaning |
42
- | --- | --- | --- |
43
- | `WORKSPACE_ROOT` | `./workspace` | Default workspace root when no explicit workspace is passed. |
44
- | `MAX_LLM_CALL_PER_RUN` | `100` | Maximum LLM calls in one agent run. |
45
- | `MAX_AGENT_ROUNDS` | `100` | Maximum ReAct loop rounds. |
46
- | `MAX_AGENT_RUNTIME_SECONDS` | `9000` | Maximum wall-clock runtime for one agent run. |
47
- | `LLM_TIMEOUT_SECONDS` | `600` | Timeout for each LLM API request. |
48
- | `LLM_MAX_OUTPUT_TOKENS` | `10000` | Requested maximum output tokens. |
49
- | `MAX_INPUT_TOKENS` | `320000` | Input-token budget used by runtime accounting. |
50
- | `LLM_MAX_RETRIES` | `10` | Maximum retries for transient LLM API errors. |
51
- | `TEMPERATURE` | `0.6` | Main model temperature. |
52
- | `TOP_P` | `0.95` | Main model top-p. |
53
- | `PRESENCE_PENALTY` | `1.1` | Main model presence penalty when supported. |
54
- | `AUTO_COMPACT_TRIGGER_TOKENS` | `128k` | Context length threshold for automatic compaction. |
55
- | `IMAGE_PART_TOKEN_ESTIMATE` | `1536` | Token estimate for each image content part. |
56
- | `LLM_IMAGE_MAX_EDGE` | `1568` | Maximum image edge sent to multimodal models. |
57
- | `LLM_IMAGE_MAX_BYTES` | `524288` | Maximum compressed image payload size. |
58
- | `LLM_IMAGE_JPEG_QUALITY` | `85` | Initial JPEG quality for image compression. |
59
- | `DEBUG_AGENT` | `false` | Verbose agent-loop logs. |
60
- | `DEBUG_SEARCH` | `false` | Verbose WebSearch logs. |
61
- | `DEBUG_SCHOLAR` | `false` | Verbose ScholarSearch logs. |
62
- | `DEBUG_VISIT` | `false` | Verbose WebFetch logs. |
63
-
64
- Before real use, run:
65
-
66
- ```bash
67
- python3 tests/test_tool_availability.py
68
- ```
69
-
70
- All tools should pass. Missing service keys, missing dependencies, exhausted
71
- credits, or unavailable external tools should be treated as failures.
72
-
73
- If `WebSearch`, `ScholarSearch`, `WebFetch`, or `ReadPDF` fails with network,
74
- TLS, upload, download, or parsing errors, try disabling VPN/proxy and rerun the
75
- test.
76
-
77
- ## 3. Command-Line Usage
78
-
79
- Run a simple prompt:
80
-
81
- ```bash
82
- python3 run_agent.py "Who proposed the transformer architecture, and in what year was the paper published?"
83
- ```
84
-
85
- Use an explicit workspace:
86
-
87
- ```bash
88
- python3 run_agent.py "Summarize this project." \
89
- --workspace-root ./workspace
90
- ```
91
-
92
- You can replace `./workspace` with any other workspace directory.
93
-
94
- Save traces to a directory:
95
-
96
- ```bash
97
- python3 run_agent.py "Summarize this project." \
98
- --workspace-root ./workspace \
99
- --trace-dir ./traces
100
- ```
101
-
102
- You can replace `./traces` with any other trace directory.
103
-
104
- Without `--trace-dir`, CLI runs do not write a trace file.
105
-
106
- Append a role prompt:
107
-
108
- ```bash
109
- python3 run_agent.py "Answer this QA task." \
110
- --workspace-root ./workspace \
111
- --role-prompt-file benchmarks/QA/role_prompt.md
112
- ```
113
-
114
- Attach one or more local images:
115
-
116
- ```bash
117
- python3 run_agent.py "Read the image and return JSON." \
118
- --workspace-root ./workspace \
119
- --images /path/to/image.png /path/to/second-image.png
120
- ```
121
-
122
- Each image path must exist. ResearchHarness copies images into `./workspace/inputs/images/`,
123
- sends them as initial `image_url` content parts, and adds each saved relative
124
- path to the user text so later rounds can call `ReadImage` on the same files.
125
-
126
- In an interactive terminal, CLI runs continue after a final answer and prompt
127
- for a follow-up. The follow-up run keeps the prior messages, tool results, and
128
- saved image path hints. During a running step, `Ctrl+C` interrupts the current
129
- run at the next safe point and returns to follow-up mode with context preserved.
130
- Press `Ctrl+C` at the follow-up prompt or send EOF to exit. Use `--no-chat` for
131
- strict one-shot behavior, or `--chat` to force follow-up mode.
132
-
133
- For browser-based local use, run `python3 run_frontend.py`. The frontend uses an
134
- existing workspace selected in the page, streams tool steps live, accepts one or
135
- more image attachments, and continues the current conversation after each final
136
- answer until you click **New chat**. While running, the send button becomes
137
- **Stop**; it interrupts at the next safe point and keeps the conversation
138
- context for the next message.
139
-
140
- ### CLI Parameters
141
-
142
- | Parameter | Required | Meaning |
143
- | --- | --- | --- |
144
- | positional `prompt` | yes, unless `--prompt-file` is used | Prompt text. |
145
- | `--prompt-file PATH` | no | Read prompt text from a UTF-8 file. |
146
- | `--workspace-root PATH` | no | Workspace root for local file tools, Bash, and terminal sessions. Created if missing. |
147
- | `--trace-dir PATH` | no | Directory where `trace_*.jsonl` is written. |
148
- | `--role-prompt-file PATH` | no, repeatable | Append role-specific prompt text to the base system prompt. |
149
- | `--images PATH [PATH ...]` | no | Copy one or more local images into `inputs/images/` and attach them to the initial user message. |
150
- | `--chat` / `--no-chat` | no | Enable or disable CLI follow-up mode. Default: enabled only when stdin and stdout are interactive terminals. |
151
-
152
- ## 4. OpenAI-Compatible API Server
153
-
154
- ResearchHarness can serve a synchronous OpenAI-compatible endpoint:
155
-
156
- ```http
157
- POST /v1/chat/completions
158
- ```
159
-
160
- This allows existing OpenAI SDK clients to call ResearchHarness by changing only
161
- `base_url`.
162
-
163
- ### Start the Server
164
-
165
- Default deployment:
166
-
167
- ```bash
168
- python3 run_server.py \
169
- --api-runs-dir ./api_runs \
170
- --host 127.0.0.1 \
171
- --port 8686
172
- ```
173
-
174
- QA/VQA benchmark deployment with a benchmark role overlay:
175
-
176
- ```bash
177
- python3 run_server.py \
178
- --api-runs-dir ./api_runs \
179
- --host 127.0.0.1 \
180
- --port 8686 \
181
- --role-prompt-file benchmarks/QA/role_prompt.md
182
- ```
183
-
184
- ### API Server Parameters
185
-
186
- | Parameter | Required | Default | Meaning |
187
- | --- | --- | --- | --- |
188
- | `--api-runs-dir PATH` | yes | none | Parent directory for API runs. Each request gets one subdirectory. |
189
- | `--host HOST` | no | `127.0.0.1` | Host to bind. |
190
- | `--port PORT` | no | `8686` | Port to bind. |
191
- | `--role-prompt-file PATH` | no, repeatable | none | Append role prompt text to the base ResearchHarness prompt. |
192
- | `--input-wrapper` / `--no-input-wrapper` | no | enabled | Enable or disable the input LLM wrapper. |
193
- | `--output-wrapper` / `--no-output-wrapper` | no | enabled | Enable or disable the output LLM wrapper. |
194
-
195
- ### Wrapper Modes
196
-
197
- Both wrappers are enabled by default.
198
-
199
- Strict-format benchmark mode:
200
-
201
- ```bash
202
- python3 run_server.py \
203
- --api-runs-dir ./api_runs \
204
- --role-prompt-file benchmarks/QA/role_prompt.md \
205
- --input-wrapper \
206
- --output-wrapper
207
- ```
208
-
209
- Direct agent mode:
210
-
211
- ```bash
212
- python3 run_server.py \
213
- --api-runs-dir ./api_runs \
214
- --no-input-wrapper \
215
- --no-output-wrapper
216
- ```
217
-
218
- Simple input plus strict final formatting:
219
-
220
- ```bash
221
- python3 run_server.py \
222
- --api-runs-dir ./api_runs \
223
- --no-input-wrapper \
224
- --output-wrapper
225
- ```
226
-
227
- The input wrapper rewrites the original user request into a stable task for the
228
- agent. The output wrapper formats the agent result to match the user's requested
229
- answer contract. Wrappers must not invent new facts; they only normalize input
230
- and format output.
231
-
232
- The API server is intentionally one request -> one answer. It does not keep a
233
- server-side conversation between HTTP requests. If an application needs API
234
- multi-turn behavior, keep that state in the client and send the needed prior
235
- context in later requests.
236
-
237
- ```mermaid
238
- flowchart LR
239
- U[User Input] --> IW[Input Wrapper LLM]
240
- IW --> A[ResearchHarness Agent]
241
- A --> OW[Output Wrapper LLM]
242
- OW --> O[Output]
243
- ```
244
-
245
- ## 5. API Workspace Layout
246
-
247
- Each API request creates one run directory:
248
-
249
- ```text
250
- ./api_runs/
251
- `-- run_YYYYMMDD_HHMMSS_<random>/
252
- |-- agent_workspace/
253
- | `-- inputs/
254
- | `-- images/
255
- `-- agent_trace/
256
- |-- api_trace.jsonl
257
- |-- trace_*.jsonl
258
- `-- _session_state.json
259
- ```
260
-
261
- Meaning:
262
-
263
- | Path | Meaning |
264
- | --- | --- |
265
- | `run_YYYYMMDD_HHMMSS_<random>/` | Per-request run root. |
266
- | `agent_workspace/` | The only workspace visible to the agent. File tools, Bash, `ls`, and `cat` start here. |
267
- | `agent_workspace/inputs/images/` | User-provided images saved from API requests. |
268
- | `agent_trace/` | API trace, agent trace, and runtime records. |
269
-
270
- For multimodal requests, image inputs are handled in two ways at the same time:
271
- the image content is passed to the backend model as initial multimodal input
272
- when the selected model supports it, and each image is saved under
273
- `agent_workspace/inputs/images/`. Each saved relative path is also included in
274
- the agent-visible text, so later rounds can call `ReadImage` on a stable local
275
- path without repeatedly resending image bytes.
276
-
277
- This layout keeps the agent's tool work (in `agent_workspace/`) separate from server-side trace files (in `agent_trace/`).
278
- In API deployment mode, traces are saved by default: every request writes
279
- `api_trace.jsonl`, `trace_*.jsonl`, and `_session_state.json` under that run's `agent_trace/`
280
- directory.
281
-
282
- ## 6. Text Request with OpenAI SDK
283
-
284
- ```python
285
- from openai import OpenAI
286
-
287
- client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
288
-
289
- response = client.chat.completions.create(
290
- model="researchharness",
291
- messages=[
292
- {"role": "user", "content": "Answer in one sentence: what is 2 + 2?"}
293
- ],
294
- )
295
-
296
- print(response.choices[0].message.content)
297
- ```
298
-
299
- ## 7. Multimodal Request with OpenAI SDK
300
-
301
- The first API version supports one or more `data:image/...;base64,...` image
302
- URLs in the same request. Remote image URLs and local file paths are
303
- intentionally not supported by the API server.
304
-
305
- The example below generates an image in memory and asks for JSON output.
306
-
307
- ```python
308
- import base64
309
- from io import BytesIO
310
-
311
- from PIL import Image, ImageDraw
312
- from openai import OpenAI
313
-
314
- image = Image.new("RGB", (320, 120), "white")
315
- draw = ImageDraw.Draw(image)
316
- draw.text((40, 45), "7 + 5 = ?", fill="black")
317
- buffer = BytesIO()
318
- image.save(buffer, format="PNG")
319
- data_url = "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode("ascii")
320
-
321
- client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
322
-
323
- response = client.chat.completions.create(
324
- model="researchharness",
325
- messages=[
326
- {
327
- "role": "user",
328
- "content": [
329
- {
330
- "type": "text",
331
- "text": (
332
- "The image contains a simple arithmetic expression. "
333
- "Return JSON with exactly two keys: expression and answer."
334
- ),
335
- },
336
- {"type": "image_url", "image_url": {"url": data_url}},
337
- ],
338
- }
339
- ],
340
- )
341
-
342
- print(response.choices[0].message.content)
343
- ```
344
-
345
- Expected answer shape:
346
-
347
- ```json
348
- {"expression":"7 + 5","answer":12}
349
- ```
350
-
351
- ## 8. API Request and Response Contract
352
-
353
- ### `POST /v1/chat/completions`
354
-
355
- Supported request fields:
356
-
357
- | Field | Required | Meaning |
358
- | --- | --- | --- |
359
- | `model` | yes | Client-visible model label. It does not override `MODEL_NAME`; the backend model comes from `.env`. |
360
- | `messages` | yes | OpenAI-style chat messages. |
361
- | `stream` | no | Must be absent or `false`; streaming is not supported. |
362
- | `n` | no | Must be absent or `1`. |
363
- | `max_tokens` | no | Maximum output tokens for the output wrapper. |
364
- | `max_completion_tokens` | no | Alias accepted for output-wrapper max tokens. |
365
- | `response_format` | no | Passed to the wrappers as an output-format hint. |
366
-
367
- Supported message roles:
368
-
369
- | Role | Supported |
370
- | --- | --- |
371
- | `system` | yes |
372
- | `user` | yes |
373
- | `assistant` | yes |
374
- | `tool` | no |
375
-
376
- Supported content forms:
377
-
378
- ```json
379
- {"role": "user", "content": "plain text"}
380
- ```
381
-
382
- ```json
383
- {
384
- "role": "user",
385
- "content": [
386
- {"type": "text", "text": "question"},
387
- {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
388
- ]
389
- }
390
- ```
391
-
392
- Response shape:
393
-
394
- ```json
395
- {
396
- "id": "chatcmpl_...",
397
- "object": "chat.completion",
398
- "created": 1770000000,
399
- "model": "researchharness",
400
- "choices": [
401
- {
402
- "index": 0,
403
- "message": {
404
- "role": "assistant",
405
- "content": "final answer"
406
- },
407
- "finish_reason": "stop"
408
- }
409
- ]
410
- }
411
- ```
412
-
413
- Callers usually only need:
414
-
415
- ```python
416
- response.choices[0].message.content
417
- ```
418
-
419
- ### `GET /v1/health`
420
-
421
- Returns:
422
-
423
- ```json
424
- {
425
- "status": "ok",
426
- "api_runs_dir": "./api_runs",
427
- "input_wrapper": true,
428
- "output_wrapper": true
429
- }
430
- ```
431
-
432
- ## 9. Tool Surface
433
-
434
- ResearchHarness currently includes:
435
-
436
- | Tool | Purpose |
437
- | --- | --- |
438
- | `Glob` | Discover files by pattern. |
439
- | `Grep` | Search text in files. |
440
- | `Read` | Read text files with bounds. |
441
- | `ReadPDF` | Parse PDFs with MinerU/structai. |
442
- | `ReadImage` | Inspect local image files and forward image content to vision-capable models. |
443
- | `Write` | Write files inside the workspace. |
444
- | `Edit` | Patch files inside the workspace. |
445
- | `Bash` | Run shell commands inside the workspace. |
446
- | `WebSearch` | Web search through Serper. |
447
- | `ScholarSearch` | Scholar-style search through Serper. |
448
- | `WebFetch` | Fetch and summarize webpages through Jina and the configured model. |
449
- | `AskUser` | Ask a human for clarification in interactive runs. Disabled by some benchmark adapters. |
450
- | `TerminalStart` / `TerminalWrite` / `TerminalRead` / `TerminalInterrupt` / `TerminalKill` | Persistent terminal sessions. |
451
-
452
- ## 10. Traces and Records
453
-
454
- CLI runs write traces only when `--trace-dir` is provided. Without
455
- `--trace-dir`, CLI runs do not write a trace file.
456
-
457
- API runs write traces under:
458
-
459
- ```text
460
- ./api_runs/run_.../agent_trace/
461
- ```
462
-
463
- Important files:
464
-
465
- | File | Meaning |
466
- | --- | --- |
467
- | `api_trace.jsonl` | Input wrapper, agent result, and output wrapper records. |
468
- | `trace_*.jsonl` | Flat agent runtime trace. |
469
- | `_session_state.json` | Current session state, written next to `trace_*.jsonl` when tracing is enabled. |
470
-
471
- The trace stores tool calls, tool results, LLM call capture payloads, compaction
472
- events, errors, and final termination state.
473
-
474
- ## 11. Benchmark Adapters
475
-
476
- Tracked benchmark contracts live under `benchmarks/`.
477
-
478
- Current tracked adapters:
479
-
480
- | Benchmark | Directory | Notes |
481
- | --- | --- | --- |
482
- | ResearchClawBench | `benchmarks/ResearchClawBench/` | CLI integration with role prompt and adapter. |
483
- | QA / VQA | `benchmarks/QA/` | OpenAI-compatible API integration for text and multimodal QA. |
484
-
485
- Benchmark-specific behavior should stay outside `agent_base/`.
486
-
487
- ## 12. Testing
488
-
489
- Recommended checks:
490
-
491
- ```bash
492
- python3 tests/test_tool_availability.py
493
- python3 tests/test_openai_api_checks.py
494
- python3 tests/test_agent_extension_checks.py
495
- python3 tests/test_edge_case_checks.py
496
- python3 tests/test_toolchain_validation.py
497
- ```
498
-
499
- If using conda:
500
-
501
- ```bash
502
- /home/xwh/miniconda3/bin/conda run -n agent python3 tests/test_openai_api_checks.py
503
- ```
504
-
505
- ## 13. Troubleshooting
506
-
507
- Common issues:
508
-
509
- | Symptom | Likely cause | Action |
510
- | --- | --- | --- |
511
- | Missing required env error | `.env` is incomplete | Fill required variables. |
512
- | Web/PDF tools fail | VPN/proxy/TLS/service issue | Disable VPN/proxy and rerun tool availability tests. |
513
- | Image request returns 400 | Image URL is not a `data:image/...;base64,...` URL | Convert the image to a base64 data URL. |
514
- | Backend model rejects images | Model endpoint is not vision-capable | Use a vision-capable model or send text-only tasks. |
515
- | API request fails with streaming error | `stream=true` was sent | Use synchronous requests only. |
516
- | Unexpected output format | Output wrapper disabled or prompt under-specified | Enable `--output-wrapper` and state the desired format clearly. |
517
-
518
- ## 14. Current Boundaries
519
-
520
- The first API version intentionally does not include:
521
-
522
- - streaming,
523
- - async run status,
524
- - cancellation,
525
- - artifact download endpoints,
526
- - remote image URL downloading,
527
- - user authentication,
528
- - multi-tenant access control.
529
-
530
- These can be added later as separate layers without changing the core harness
531
- loop.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/tutorial_zh.md DELETED
@@ -1,511 +0,0 @@
1
- # ResearchHarness 教程
2
-
3
- 本文介绍如何通过命令行和 OpenAI-compatible API 使用 ResearchHarness。
4
-
5
- ResearchHarness 是一个轻量、通用的 tool-using LLM agent harness。它可以作为:
6
-
7
- - 命令行本地 agent,
8
- - agent benchmark 的公平执行底座,
9
- - OpenAI-compatible 同步 API 后端,
10
- - 面向代码、文件、报告、PDF、图片、网页任务的个人助手运行时。
11
-
12
- ## 1. 安装
13
-
14
- 安装依赖:
15
-
16
- ```bash
17
- python3 -m pip install -r requirements.txt
18
- ```
19
-
20
- 推荐使用 Python 3.10+。
21
-
22
- ## 2. 配置环境变量
23
-
24
- 复制 `.env.example` 为 `.env`,并填写必需变量。
25
-
26
- 必需变量:
27
-
28
- | 变量 | 含义 |
29
- | --- | --- |
30
- | `API_KEY` | OpenAI-compatible LLM 服务的 API key。 |
31
- | `API_BASE` | OpenAI-compatible chat-completions endpoint 的 base URL。 |
32
- | `MODEL_NAME` | ResearchHarness 使用的主模型。 |
33
- | `SERPER_KEY` | `WebSearch` 和 `ScholarSearch` 使用的 Serper key:https://serper.dev/ |
34
- | `JINA_KEY` | `WebFetch` 使用的 Jina key:https://jina.ai/ |
35
- | `MINERU_TOKEN` | `ReadPDF` 使用的 MinerU token:https://mineru.net/ |
36
-
37
- 可选变量:
38
-
39
- | 变量 | 默认值 | 含义 |
40
- | --- | --- | --- |
41
- | `WORKSPACE_ROOT` | `./workspace` | 未显式传入 workspace 时使用的默认 workspace root。 |
42
- | `MAX_LLM_CALL_PER_RUN` | `100` | 单次 agent run 最多允许的 LLM 调用次数。 |
43
- | `MAX_AGENT_ROUNDS` | `100` | ReAct loop 最大轮次。 |
44
- | `MAX_AGENT_RUNTIME_SECONDS` | `9000` | 单次 agent run 的最大运行秒数。 |
45
- | `LLM_TIMEOUT_SECONDS` | `600` | 单次 LLM API 请求超时时间。 |
46
- | `LLM_MAX_OUTPUT_TOKENS` | `10000` | 请求模型输出的最大 token 数。 |
47
- | `MAX_INPUT_TOKENS` | `320000` | runtime token accounting 使用的输入 token 预算。 |
48
- | `LLM_MAX_RETRIES` | `10` | 瞬时 LLM API 错误最大重试次数。 |
49
- | `TEMPERATURE` | `0.6` | 主模型 temperature。 |
50
- | `TOP_P` | `0.95` | 主模型 top-p。 |
51
- | `PRESENCE_PENALTY` | `1.1` | provider 支持时使用的 presence penalty。 |
52
- | `AUTO_COMPACT_TRIGGER_TOKENS` | `128k` | 自动上下文压缩触发阈值。 |
53
- | `IMAGE_PART_TOKEN_ESTIMATE` | `1536` | 每个 image content part 的 token 估计。 |
54
- | `LLM_IMAGE_MAX_EDGE` | `1568` | 发送给多模态模型的图片最大边长。 |
55
- | `LLM_IMAGE_MAX_BYTES` | `524288` | 发送给多模态模型的压缩图片最大字节数。 |
56
- | `LLM_IMAGE_JPEG_QUALITY` | `85` | 图片压缩时的初始 JPEG 质量。 |
57
- | `DEBUG_AGENT` | `false` | 打印 agent loop 详细调试日志。 |
58
- | `DEBUG_SEARCH` | `false` | 打印 WebSearch 调试日志。 |
59
- | `DEBUG_SCHOLAR` | `false` | 打印 ScholarSearch 调试日志。 |
60
- | `DEBUG_VISIT` | `false` | 打印 WebFetch 调试日志。 |
61
-
62
- 正式使用前,先运行:
63
-
64
- ```bash
65
- python3 tests/test_tool_availability.py
66
- ```
67
-
68
- 预期结果是全部工具通过。缺 key、缺依赖、服务额度耗尽、外部工具不可用都应该视为失败,不应 skip。
69
-
70
- 如果 `WebSearch`、`ScholarSearch`、`WebFetch` 或 `ReadPDF` 出现 network、TLS、upload、download、PDF parsing 相关错误,优先尝试关闭 VPN / proxy 后重跑测试。
71
-
72
- ## 3. 命令行使用
73
-
74
- 直接运行一个 prompt:
75
-
76
- ```bash
77
- python3 run_agent.py "Who proposed the transformer architecture, and in what year was the paper published?"
78
- ```
79
-
80
- 指定 workspace:
81
-
82
- ```bash
83
- python3 run_agent.py "Summarize this project." \
84
- --workspace-root ./workspace
85
- ```
86
-
87
- `./workspace` 可以替换为任何其他 workspace 目录。
88
-
89
- 保存 trace:
90
-
91
- ```bash
92
- python3 run_agent.py "Summarize this project." \
93
- --workspace-root ./workspace \
94
- --trace-dir ./traces
95
- ```
96
-
97
- `./traces` 可以替换为任何其他 trace 目录。
98
-
99
- 如果不传 `--trace-dir`,CLI 运行不会写 trace 文件。
100
-
101
- 追加 role prompt:
102
-
103
- ```bash
104
- python3 run_agent.py "Answer this QA task." \
105
- --workspace-root ./workspace \
106
- --role-prompt-file benchmarks/QA/role_prompt.md
107
- ```
108
-
109
- 附加本地图片:
110
-
111
- ```bash
112
- python3 run_agent.py "Read the image and return JSON." \
113
- --workspace-root ./workspace \
114
- --images /path/to/image.png /path/to/second-image.png
115
- ```
116
-
117
- 每个图片路径都必须存在。RH 会把图片复制到 `./workspace/inputs/images/`,
118
- 作为初始 `image_url` content part 传给模型,同时把每个保存后的相对路径写进
119
- 用户文本,让后续轮次可以用 `ReadImage` 重新读取这些图片。
120
-
121
- 在交互式终端中,CLI 会在最终回答后继续等待 follow-up。下一轮会保留之前的
122
- messages、工具结果和图片保存路径提示。运行过程中按 `Ctrl+C` 会在下一个安全点
123
- 中断当前 run,并带着上下文回到 follow-up 模式。在 follow-up 输入处按 `Ctrl+C`
124
- 或发送 EOF 可退出。脚本或 benchmark 如果需要严格的一问一答行为,使用
125
- `--no-chat`;需要强制开启时使用 `--chat`。
126
-
127
- 如果需要浏览器本地界面,运行 `python3 run_frontend.py`。前端使用页面中选择的
128
- 已有 workspace,实时显示工具步骤,支持一张或多张图片附件,并在每次最终回答后
129
- 继续当前对话,直到点击 **New chat**。运行中发送按钮会变成 **Stop**;它会在下一个
130
- 安全点中断,并保留上下文用于下一条消息。
131
-
132
- ### CLI 参数
133
-
134
- | 参数 | 是否必需 | 含义 |
135
- | --- | --- | --- |
136
- | 位置参数 `prompt` | 是,除非使用 `--prompt-file` | prompt 文本。 |
137
- | `--prompt-file PATH` | 否 | 从 UTF-8 文件读取 prompt。 |
138
- | `--workspace-root PATH` | 否 | 本地文件工具、Bash、Terminal 使用的 workspace root;不存在会自动创建。 |
139
- | `--trace-dir PATH` | 否 | 写入 `trace_*.jsonl` 的目录。 |
140
- | `--role-prompt-file PATH` | 否,可重复 | 追加 role-specific prompt 到 base system prompt。 |
141
- | `--images PATH [PATH ...]` | 否 | 把一张或多张本地图片复制到 `inputs/images/` 并附加到初始用户消息。 |
142
- | `--chat` / `--no-chat` | 否 | 开启或关闭 CLI follow-up 模式。默认只在 stdin 和 stdout 都是交互式终端时开启。 |
143
-
144
- ## 4. OpenAI-Compatible API Server
145
-
146
- ResearchHarness 可以部署为同步 OpenAI-compatible endpoint:
147
-
148
- ```http
149
- POST /v1/chat/completions
150
- ```
151
-
152
- 这样,现有 OpenAI SDK 客户端只需要修改 `base_url` 就可以调用 ResearchHarness。
153
-
154
- ### 启动服务
155
-
156
- 默认部署:
157
-
158
- ```bash
159
- python3 run_server.py \
160
- --api-runs-dir ./api_runs \
161
- --host 127.0.0.1 \
162
- --port 8686
163
- ```
164
-
165
- QA/VQA benchmark 部署,可以额外加 benchmark role overlay:
166
-
167
- ```bash
168
- python3 run_server.py \
169
- --api-runs-dir ./api_runs \
170
- --host 127.0.0.1 \
171
- --port 8686 \
172
- --role-prompt-file benchmarks/QA/role_prompt.md
173
- ```
174
-
175
- ### API Server 参数
176
-
177
- | 参数 | 是否必需 | 默认值 | 含义 |
178
- | --- | --- | --- | --- |
179
- | `--api-runs-dir PATH` | 是 | 无 | API runs 的父目录;每个请求会创建一个子目录。 |
180
- | `--host HOST` | 否 | `127.0.0.1` | 服务监听 host。 |
181
- | `--port PORT` | 否 | `8686` | 服务监听端口。 |
182
- | `--role-prompt-file PATH` | 否,可重复 | 无 | 追加 role prompt 到 base ResearchHarness prompt。 |
183
- | `--input-wrapper` / `--no-input-wrapper` | 否 | 开启 | 开启或关闭输入 LLM wrapper。 |
184
- | `--output-wrapper` / `--no-output-wrapper` | 否 | 开启 | 开启或关闭输出 LLM wrapper。 |
185
-
186
- ### Wrapper 模式
187
-
188
- 默认两个 wrapper 都开启。
189
-
190
- 严格格式 benchmark 模式:
191
-
192
- ```bash
193
- python3 run_server.py \
194
- --api-runs-dir ./api_runs \
195
- --role-prompt-file benchmarks/QA/role_prompt.md \
196
- --input-wrapper \
197
- --output-wrapper
198
- ```
199
-
200
- 直接 agent 模式:
201
-
202
- ```bash
203
- python3 run_server.py \
204
- --api-runs-dir ./api_runs \
205
- --no-input-wrapper \
206
- --no-output-wrapper
207
- ```
208
-
209
- 输入简单但最终答案需要严格格式:
210
-
211
- ```bash
212
- python3 run_server.py \
213
- --api-runs-dir ./api_runs \
214
- --no-input-wrapper \
215
- --output-wrapper
216
- ```
217
-
218
- input wrapper 的作用是把原始用户请求整理为适合 agent 稳定执行的任务。output wrapper 的作用是把 agent 的最终结果整理为用户要求的答案格式。wrapper 不应该引入新事实,只做输入规范化和输出格式化。
219
-
220
- API server 有意保持一问一答:每个 HTTP 请求创建一次隔离 run,并返回一个最终
221
- assistant message。服务端不会跨请求保存 conversation state。如果应用需要 API
222
- 多轮对话,应由客户端保存状态,并在后续请求中传入需要的上下文。
223
-
224
- ```mermaid
225
- flowchart LR
226
- U[User Input] --> IW[Input Wrapper LLM]
227
- IW --> A[ResearchHarness Agent]
228
- A --> OW[Output Wrapper LLM]
229
- OW --> O[Output]
230
- ```
231
-
232
- ## 5. API Workspace 结构
233
-
234
- 每个 API 请求会创建一个 run 目录:
235
-
236
- ```text
237
- ./api_runs/
238
- `-- run_YYYYMMDD_HHMMSS_<random>/
239
- |-- agent_workspace/
240
- | `-- inputs/
241
- | `-- images/
242
- `-- agent_trace/
243
- |-- api_trace.jsonl
244
- |-- trace_*.jsonl
245
- `-- _session_state.json
246
- ```
247
-
248
- 含义:
249
-
250
- | 路径 | 含义 |
251
- | --- | --- |
252
- | `run_YYYYMMDD_HHMMSS_<random>/` | 单个请求对应的 run 根目录。 |
253
- | `agent_workspace/` | agent 唯一可见的 workspace;文件工具、Bash、`ls`、`cat` 都从这里开始。 |
254
- | `agent_workspace/inputs/images/` | API 请求中用户提交的图片。 |
255
- | `agent_trace/` | API trace、agent trace 和 runtime 记录。 |
256
-
257
- 对于多模态请求,每张图片会同时走两条路径:当底层模型支持多模态输入时,
258
- 图片内容会作为初始多模态输入直接传给模型;每张图片也会保存到
259
- `agent_workspace/inputs/images/`。每个保存后的相对路径也会写进 agent 可见文本,
260
- 让后续轮次可以用 `ReadImage` 读取稳定的本地路径,而不是反复依赖内联图片字节。
261
-
262
- 这个结构把 agent 可见工作目录和服务端记录目录隔离开。
263
- 在 API 部署模式下,trace 默认保存:每个请求都会在自己的 `agent_trace/`
264
- 目录下写入 `api_trace.jsonl`、`trace_*.jsonl` 和 `_session_state.json`。
265
-
266
- ## 6. 纯文本 OpenAI SDK 请求
267
-
268
- ```python
269
- from openai import OpenAI
270
-
271
- client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
272
-
273
- response = client.chat.completions.create(
274
- model="researchharness",
275
- messages=[
276
- {"role": "user", "content": "Answer in one sentence: what is 2 + 2?"}
277
- ],
278
- )
279
-
280
- print(response.choices[0].message.content)
281
- ```
282
-
283
- ## 7. 多模态 OpenAI SDK 请求
284
-
285
- 第一版 API 支持同一个请求中包含一张或多张 `data:image/...;base64,...` 形式的图片 URL。API server 不支持远程图片 URL,也不支持让外部请求直接传本地文件路径。
286
-
287
- 下面的示例在代码中生成一张图片,并要求返回 JSON。
288
-
289
- ```python
290
- import base64
291
- from io import BytesIO
292
-
293
- from PIL import Image, ImageDraw
294
- from openai import OpenAI
295
-
296
- image = Image.new("RGB", (320, 120), "white")
297
- draw = ImageDraw.Draw(image)
298
- draw.text((40, 45), "7 + 5 = ?", fill="black")
299
- buffer = BytesIO()
300
- image.save(buffer, format="PNG")
301
- data_url = "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode("ascii")
302
-
303
- client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
304
-
305
- response = client.chat.completions.create(
306
- model="researchharness",
307
- messages=[
308
- {
309
- "role": "user",
310
- "content": [
311
- {
312
- "type": "text",
313
- "text": (
314
- "The image contains a simple arithmetic expression. "
315
- "Return JSON with exactly two keys: expression and answer."
316
- ),
317
- },
318
- {"type": "image_url", "image_url": {"url": data_url}},
319
- ],
320
- }
321
- ],
322
- )
323
-
324
- print(response.choices[0].message.content)
325
- ```
326
-
327
- 预期答案形状:
328
-
329
- ```json
330
- {"expression":"7 + 5","answer":12}
331
- ```
332
-
333
- ## 8. API 请求与返回协议
334
-
335
- ### `POST /v1/chat/completions`
336
-
337
- 支持的请求字段:
338
-
339
- | 字段 | 是否必需 | 含义 |
340
- | --- | --- | --- |
341
- | `model` | 是 | 客户端看到的 model label;不会覆盖 `.env` 中的 `MODEL_NAME`。 |
342
- | `messages` | 是 | OpenAI-style chat messages。 |
343
- | `stream` | 否 | 必须不存在或为 `false`;当前不支持 streaming。 |
344
- | `n` | 否 | 必须不存在或为 `1`。 |
345
- | `max_tokens` | 否 | output wrapper 最大输出 token。 |
346
- | `max_completion_tokens` | 否 | output wrapper 最大输出 token 的兼容别名。 |
347
- | `response_format` | 否 | 作为输出格式提示传给 wrapper。 |
348
-
349
- 支持的 message role:
350
-
351
- | Role | 是否支持 |
352
- | --- | --- |
353
- | `system` | 支持 |
354
- | `user` | 支持 |
355
- | `assistant` | 支持 |
356
- | `tool` | 不支持 |
357
-
358
- 支持的 content 形式:
359
-
360
- ```json
361
- {"role": "user", "content": "plain text"}
362
- ```
363
-
364
- ```json
365
- {
366
- "role": "user",
367
- "content": [
368
- {"type": "text", "text": "question"},
369
- {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
370
- ]
371
- }
372
- ```
373
-
374
- 返回结构:
375
-
376
- ```json
377
- {
378
- "id": "chatcmpl_...",
379
- "object": "chat.completion",
380
- "created": 1770000000,
381
- "model": "researchharness",
382
- "choices": [
383
- {
384
- "index": 0,
385
- "message": {
386
- "role": "assistant",
387
- "content": "final answer"
388
- },
389
- "finish_reason": "stop"
390
- }
391
- ]
392
- }
393
- ```
394
-
395
- 调用方通常只需要读取:
396
-
397
- ```python
398
- response.choices[0].message.content
399
- ```
400
-
401
- ### `GET /v1/health`
402
-
403
- 返回:
404
-
405
- ```json
406
- {
407
- "status": "ok",
408
- "api_runs_dir": "./api_runs",
409
- "input_wrapper": true,
410
- "output_wrapper": true
411
- }
412
- ```
413
-
414
- ## 9. 工具能力
415
-
416
- ResearchHarness 当前包含:
417
-
418
- | 工具 | 用途 |
419
- | --- | --- |
420
- | `Glob` | 按模式发现文件。 |
421
- | `Grep` | 在文件中搜索文本。 |
422
- | `Read` | 有边界地读取文本文件。 |
423
- | `ReadPDF` | 通过 MinerU/structai 解析 PDF。 |
424
- | `ReadImage` | 读取本地图片,并把图片内容传给支持 vision 的模型。 |
425
- | `Write` | 在 workspace 内写文件。 |
426
- | `Edit` | 在 workspace 内 patch 文件。 |
427
- | `Bash` | 在 workspace 内执行 shell 命令。 |
428
- | `WebSearch` | 通过 Serper 进行网页搜索。 |
429
- | `ScholarSearch` | 通过 Serper 进行学术搜索。 |
430
- | `WebFetch` | 通过 Jina 和配置模型抓取、总结网页。 |
431
- | `AskUser` | 交互式运行中向用户提问;某些 benchmark adapter 会禁用。 |
432
- | `TerminalStart` / `TerminalWrite` / `TerminalRead` / `TerminalInterrupt` / `TerminalKill` | 持久终端会话。 |
433
-
434
- ## 10. Trace 与记录
435
-
436
- CLI 运行只有在传入 `--trace-dir` 时才会写 trace。如果不传
437
- `--trace-dir`,CLI 运行不会写 trace 文件。
438
-
439
- API 运行时,记录在:
440
-
441
- ```text
442
- ./api_runs/run_.../agent_trace/
443
- ```
444
-
445
- 重要文件:
446
-
447
- | 文件 | 含义 |
448
- | --- | --- |
449
- | `api_trace.jsonl` | input wrapper、agent result、output wrapper 记录。 |
450
- | `trace_*.jsonl` | agent runtime 的 flat trace。 |
451
- | `_session_state.json` | 当前 session state;启用 trace 时和 `trace_*.jsonl` 写在同一目录。 |
452
-
453
- trace 会记录工具调用、工具结果、LLM call capture payload、context compaction、错误和终止状态。
454
-
455
- ## 11. Benchmark Adapter
456
-
457
- tracked benchmark contract 放在 `benchmarks/` 下。
458
-
459
- 当前 tracked adapter:
460
-
461
- | Benchmark | 目录 | 说明 |
462
- | --- | --- | --- |
463
- | ResearchClawBench | `benchmarks/ResearchClawBench/` | CLI 方式接入,包含 role prompt 和 adapter。 |
464
- | QA / VQA | `benchmarks/QA/` | OpenAI-compatible API 方式接入,支持纯文本和多模态 QA。 |
465
-
466
- benchmark-specific 行为应放在 `benchmarks/`,不要塞进 `agent_base/`。
467
-
468
- ## 12. 测试
469
-
470
- 推荐检查:
471
-
472
- ```bash
473
- python3 tests/test_tool_availability.py
474
- python3 tests/test_openai_api_checks.py
475
- python3 tests/test_agent_extension_checks.py
476
- python3 tests/test_edge_case_checks.py
477
- python3 tests/test_toolchain_validation.py
478
- ```
479
-
480
- 如果使用 conda:
481
-
482
- ```bash
483
- /home/xwh/miniconda3/bin/conda run -n agent python3 tests/test_openai_api_checks.py
484
- ```
485
-
486
- ## 13. 排障
487
-
488
- 常见问题:
489
-
490
- | 现象 | 可能原因 | 处理 |
491
- | --- | --- | --- |
492
- | 缺少 required env | `.env` 不完整 | 填写所有必需变量。 |
493
- | Web/PDF 工具失败 | VPN/proxy/TLS/服务问题 | 关闭 VPN/proxy 后重跑工具可用性测试。 |
494
- | 图片请求返回 400 | 图片不是 `data:image/...;base64,...` | 把图片转成 base64 data URL。 |
495
- | 后端模型拒绝图片 | 当前模型 endpoint 不支持 vision | 换用支持 vision 的模型,或改为纯文本任务。 |
496
- | API 报 streaming 错误 | 请求里传了 `stream=true` | 当前只支持同步请求。 |
497
- | 输出格式不符合预期 | output wrapper 关闭,或用户格式要求不明确 | 开启 `--output-wrapper`,并清楚说明输出格式。 |
498
-
499
- ## 14. 当前边界
500
-
501
- 第一版 API 暂不包括:
502
-
503
- - streaming,
504
- - async run status,
505
- - cancellation,
506
- - artifact download endpoint,
507
- - 远程图片 URL 下载,
508
- - 用户认证,
509
- - 多租户访问控制。
510
-
511
- 这些能力以后可以作为外层服务继续扩展,不需要破坏核心 harness loop。
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
frontend/local_server.py CHANGED
@@ -4,7 +4,6 @@ import asyncio
4
  import base64
5
  import datetime as _dt
6
  import json
7
- import os
8
  import re
9
  import shutil
10
  import threading
@@ -16,7 +15,7 @@ from typing import Any
16
  from uuid import uuid4
17
 
18
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
19
- from fastapi.responses import FileResponse, JSONResponse
20
  from fastapi.staticfiles import StaticFiles
21
 
22
  from agent_base.react_agent import MultiTurnReactAgent, default_llm_config
@@ -35,9 +34,6 @@ from agent_base.utils import (
35
  STATIC_DIR = Path(__file__).resolve().parent / "static"
36
  MAX_UPLOAD_IMAGES = 12
37
  MAX_IMAGE_BYTES = 12 * 1024 * 1024
38
- MAX_DIRECTORY_ENTRIES = 800
39
- FRONTEND_ROLE_PROMPT = ""
40
- FRONTEND_TRACE_DIR: str | None = None
41
  FRONTEND_MANAGED_RUNS_DIR: str | None = None
42
  FRONTEND_CLEANUP_RETENTION_SECONDS = 6 * 60 * 60
43
  FRONTEND_CLEANUP_MAX_RUNS = 40
@@ -52,14 +48,12 @@ _ACTIVE_MANAGED_RUNS_LOCK = threading.Lock()
52
  _COLLECTION_LOCK = threading.Lock()
53
  _COLLECTION_CONFIG_WARNED: set[str] = set()
54
 
55
- app = FastAPI(title="ResearchHarness Local UI")
56
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="frontend-static")
57
 
58
 
59
  def configure_frontend(
60
  *,
61
- role_prompt: str = "",
62
- trace_dir: str | None = None,
63
  managed_runs_dir: str | None = None,
64
  cleanup_retention_seconds: int | None = None,
65
  cleanup_max_runs: int | None = None,
@@ -69,11 +63,10 @@ def configure_frontend(
69
  collection_batch_size: int | None = None,
70
  collection_max_bundle_bytes: int | None = None,
71
  ) -> None:
72
- global FRONTEND_ROLE_PROMPT, FRONTEND_TRACE_DIR, FRONTEND_MANAGED_RUNS_DIR
73
  global FRONTEND_CLEANUP_RETENTION_SECONDS, FRONTEND_CLEANUP_MAX_RUNS, FRONTEND_CLEANUP_INTERVAL_SECONDS
74
  global FRONTEND_COLLECTION_ENABLED, FRONTEND_COLLECTION_DATASET_REPO
75
  global FRONTEND_COLLECTION_BATCH_SIZE, FRONTEND_COLLECTION_MAX_BUNDLE_BYTES
76
- FRONTEND_ROLE_PROMPT = str(role_prompt or "").strip()
77
  if collection_enabled is not None:
78
  FRONTEND_COLLECTION_ENABLED = bool(collection_enabled)
79
  if collection_dataset_repo is not None:
@@ -82,32 +75,22 @@ def configure_frontend(
82
  FRONTEND_COLLECTION_BATCH_SIZE = max(1, int(collection_batch_size))
83
  if collection_max_bundle_bytes is not None:
84
  FRONTEND_COLLECTION_MAX_BUNDLE_BYTES = max(1, int(collection_max_bundle_bytes))
85
- if trace_dir:
86
- path = Path(trace_dir).expanduser()
87
- if path.exists() and not path.is_dir():
88
- raise ValueError(f"trace-dir is not a directory: {path}")
89
- path.mkdir(parents=True, exist_ok=True)
90
- FRONTEND_TRACE_DIR = str(path)
91
- else:
92
- FRONTEND_TRACE_DIR = None
93
-
94
- if managed_runs_dir:
95
- path = Path(managed_runs_dir).expanduser()
96
- if path.exists() and not path.is_dir():
97
- raise ValueError(f"managed-runs-dir is not a directory: {path}")
98
- path.mkdir(parents=True, exist_ok=True)
99
- FRONTEND_MANAGED_RUNS_DIR = str(path)
100
- if cleanup_retention_seconds is not None:
101
- FRONTEND_CLEANUP_RETENTION_SECONDS = max(60, int(cleanup_retention_seconds))
102
- if cleanup_max_runs is not None:
103
- FRONTEND_CLEANUP_MAX_RUNS = max(1, int(cleanup_max_runs))
104
- if cleanup_interval_seconds is not None:
105
- FRONTEND_CLEANUP_INTERVAL_SECONDS = max(60, int(cleanup_interval_seconds))
106
- _collection_root()
107
- cleanup_managed_runs_once()
108
- _start_managed_cleanup_thread()
109
- else:
110
- FRONTEND_MANAGED_RUNS_DIR = None
111
 
112
 
113
  class FrontendRunBridge:
@@ -543,26 +526,25 @@ def _run_agent_thread(
543
  prompt: str,
544
  workspace_root: Path,
545
  initial_content_parts: list[dict[str, Any]],
546
- trace_dir: str | None = None,
547
  prior_messages: list[dict[str, Any]] | None = None,
548
  managed_run_root: str = "",
 
549
  ) -> None:
550
  try:
551
  load_dotenv(PROJECT_ROOT / ".env")
552
  require_required_env("ResearchHarness frontend")
553
- effective_trace_dir = trace_dir if trace_dir is not None else FRONTEND_TRACE_DIR
554
  agent = FrontendInteractiveAgent(
555
  bridge=bridge,
556
- llm=default_llm_config(),
557
- trace_dir=effective_trace_dir,
558
- role_prompt=FRONTEND_ROLE_PROMPT or None,
559
  )
560
  bridge.send(
561
  {
562
  "type": "run_started",
563
  "model": agent.model,
564
  "workspace_root": str(workspace_root),
565
- "trace_dir": effective_trace_dir or "",
566
  }
567
  )
568
  result = agent._run_session(
@@ -590,98 +572,6 @@ def _run_agent_thread(
590
  bridge.send({"type": "run_error", "error": str(exc), "traceback": traceback.format_exc()})
591
 
592
 
593
- def _resolve_existing_workspace(raw_path: str) -> Path:
594
- if not str(raw_path or "").strip():
595
- raise ValueError("workspace path is required")
596
- path = Path(raw_path).expanduser()
597
- if not path.is_absolute():
598
- path = (Path.cwd() / path).resolve()
599
- else:
600
- path = path.resolve()
601
- if not path.exists() or not path.is_dir():
602
- raise ValueError(f"workspace must be an existing directory: {path}")
603
- return path
604
-
605
-
606
- def _resolve_directory_browser_path(raw_path: str = "") -> Path:
607
- text = str(raw_path or "").strip()
608
- if text:
609
- path = Path(text).expanduser()
610
- else:
611
- path = Path.home() if Path.home().exists() else PROJECT_ROOT
612
- if not path.is_absolute():
613
- path = (Path.cwd() / path).resolve()
614
- else:
615
- path = path.resolve()
616
- if not path.exists() or not path.is_dir():
617
- raise ValueError(f"directory does not exist: {path}")
618
- return path
619
-
620
-
621
- def _directory_root_choices() -> list[dict[str, str]]:
622
- candidates = [Path.home(), PROJECT_ROOT, PROJECT_ROOT / "workspace", Path.cwd(), Path("/mnt"), Path("/")]
623
- if os.name == "nt":
624
- for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
625
- candidates.append(Path(f"{letter}:\\"))
626
-
627
- seen: set[str] = set()
628
- roots: list[dict[str, str]] = []
629
- for candidate in candidates:
630
- try:
631
- resolved = candidate.expanduser().resolve()
632
- except (OSError, RuntimeError):
633
- continue
634
- if not resolved.exists() or not resolved.is_dir():
635
- continue
636
- key = str(resolved)
637
- if key in seen:
638
- continue
639
- seen.add(key)
640
- label = "Home" if resolved == Path.home().resolve() else (resolved.name or key)
641
- roots.append({"label": label, "path": key})
642
- return roots
643
-
644
-
645
- def _workspace_directory_payload(raw_path: str = "") -> dict[str, Any]:
646
- directory = _resolve_directory_browser_path(raw_path)
647
- entries: list[dict[str, str]] = []
648
- truncated = False
649
- try:
650
- children = sorted(directory.iterdir(), key=lambda item: item.name.casefold())
651
- except PermissionError as exc:
652
- raise ValueError(f"permission denied: {directory}") from exc
653
- except OSError as exc:
654
- raise ValueError(f"cannot read directory {directory}: {exc}") from exc
655
-
656
- for child in children:
657
- if len(entries) >= MAX_DIRECTORY_ENTRIES:
658
- truncated = True
659
- break
660
- try:
661
- if not child.is_dir():
662
- continue
663
- except OSError:
664
- continue
665
- entries.append({"name": child.name or str(child), "path": str(child)})
666
-
667
- parent = directory.parent if directory.parent != directory else None
668
- return {
669
- "path": str(directory),
670
- "parent": str(parent) if parent else "",
671
- "entries": entries,
672
- "truncated": truncated,
673
- "roots": _directory_root_choices(),
674
- }
675
-
676
-
677
- @app.get("/api/workspace-directories")
678
- def workspace_directories(path: str = "") -> JSONResponse:
679
- try:
680
- return JSONResponse(_workspace_directory_payload(path))
681
- except ValueError as exc:
682
- return JSONResponse({"error": str(exc)}, status_code=400)
683
-
684
-
685
  @app.get("/")
686
  def index() -> FileResponse:
687
  return FileResponse(STATIC_DIR / "index.html")
@@ -705,7 +595,7 @@ async def websocket_endpoint(websocket: WebSocket) -> None:
705
 
706
  sender_task = asyncio.create_task(sender())
707
  try:
708
- await websocket.send_json({"type": "ready", "managed_workspace": bool(FRONTEND_MANAGED_RUNS_DIR)})
709
  while True:
710
  message = await websocket.receive_json()
711
  message_type = str(message.get("type", "")).strip()
@@ -719,30 +609,18 @@ async def websocket_endpoint(websocket: WebSocket) -> None:
719
  continue
720
  try:
721
  continue_conversation = bool(message.get("continue_conversation"))
 
722
  prior_messages = None
723
- effective_trace_dir = FRONTEND_TRACE_DIR
724
- if FRONTEND_MANAGED_RUNS_DIR:
725
- if continue_conversation:
726
- if not bridge.conversation_messages or not bridge.managed_workspace_root:
727
- bridge.send({"type": "run_error", "error": "No active conversation is available on the server. Click New chat and start again."})
728
- continue
729
- workspace_root = Path(bridge.managed_workspace_root)
730
- effective_trace_dir = bridge.managed_trace_dir or FRONTEND_TRACE_DIR
731
- prior_messages = bridge.conversation_messages
732
- else:
733
- _release_managed_run(bridge)
734
- workspace_root, effective_trace_dir = _create_managed_run(bridge)
735
  else:
736
- workspace_root = _resolve_existing_workspace(str(message.get("workspace_root", "")))
737
- if continue_conversation:
738
- if not bridge.conversation_messages:
739
- bridge.send({"type": "run_error", "error": "No active conversation is available on the server. Click New chat and start again."})
740
- continue
741
- elif bridge.conversation_workspace_root and bridge.conversation_workspace_root != str(workspace_root):
742
- bridge.send({"type": "run_error", "error": "Workspace changed. Start a new chat before using a different workspace."})
743
- continue
744
- else:
745
- prior_messages = bridge.conversation_messages
746
  image_parts, saved_paths = save_uploaded_images(
747
  workspace_root,
748
  message.get("images", []) if isinstance(message.get("images", []), list) else [],
@@ -768,6 +646,7 @@ async def websocket_endpoint(websocket: WebSocket) -> None:
768
  "trace_dir": effective_trace_dir,
769
  "prior_messages": prior_messages,
770
  "managed_run_root": bridge.managed_run_root,
 
771
  },
772
  daemon=True,
773
  )
 
4
  import base64
5
  import datetime as _dt
6
  import json
 
7
  import re
8
  import shutil
9
  import threading
 
15
  from uuid import uuid4
16
 
17
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
18
+ from fastapi.responses import FileResponse
19
  from fastapi.staticfiles import StaticFiles
20
 
21
  from agent_base.react_agent import MultiTurnReactAgent, default_llm_config
 
34
  STATIC_DIR = Path(__file__).resolve().parent / "static"
35
  MAX_UPLOAD_IMAGES = 12
36
  MAX_IMAGE_BYTES = 12 * 1024 * 1024
 
 
 
37
  FRONTEND_MANAGED_RUNS_DIR: str | None = None
38
  FRONTEND_CLEANUP_RETENTION_SECONDS = 6 * 60 * 60
39
  FRONTEND_CLEANUP_MAX_RUNS = 40
 
48
  _COLLECTION_LOCK = threading.Lock()
49
  _COLLECTION_CONFIG_WARNED: set[str] = set()
50
 
51
+ app = FastAPI(title="ResearchHarness Space UI")
52
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="frontend-static")
53
 
54
 
55
  def configure_frontend(
56
  *,
 
 
57
  managed_runs_dir: str | None = None,
58
  cleanup_retention_seconds: int | None = None,
59
  cleanup_max_runs: int | None = None,
 
63
  collection_batch_size: int | None = None,
64
  collection_max_bundle_bytes: int | None = None,
65
  ) -> None:
66
+ global FRONTEND_MANAGED_RUNS_DIR
67
  global FRONTEND_CLEANUP_RETENTION_SECONDS, FRONTEND_CLEANUP_MAX_RUNS, FRONTEND_CLEANUP_INTERVAL_SECONDS
68
  global FRONTEND_COLLECTION_ENABLED, FRONTEND_COLLECTION_DATASET_REPO
69
  global FRONTEND_COLLECTION_BATCH_SIZE, FRONTEND_COLLECTION_MAX_BUNDLE_BYTES
 
70
  if collection_enabled is not None:
71
  FRONTEND_COLLECTION_ENABLED = bool(collection_enabled)
72
  if collection_dataset_repo is not None:
 
75
  FRONTEND_COLLECTION_BATCH_SIZE = max(1, int(collection_batch_size))
76
  if collection_max_bundle_bytes is not None:
77
  FRONTEND_COLLECTION_MAX_BUNDLE_BYTES = max(1, int(collection_max_bundle_bytes))
78
+ if not managed_runs_dir:
79
+ raise ValueError("managed_runs_dir is required for the Space frontend")
80
+ path = Path(managed_runs_dir).expanduser()
81
+ if path.exists() and not path.is_dir():
82
+ raise ValueError(f"managed-runs-dir is not a directory: {path}")
83
+ path.mkdir(parents=True, exist_ok=True)
84
+ FRONTEND_MANAGED_RUNS_DIR = str(path)
85
+ if cleanup_retention_seconds is not None:
86
+ FRONTEND_CLEANUP_RETENTION_SECONDS = max(60, int(cleanup_retention_seconds))
87
+ if cleanup_max_runs is not None:
88
+ FRONTEND_CLEANUP_MAX_RUNS = max(1, int(cleanup_max_runs))
89
+ if cleanup_interval_seconds is not None:
90
+ FRONTEND_CLEANUP_INTERVAL_SECONDS = max(60, int(cleanup_interval_seconds))
91
+ _collection_root()
92
+ cleanup_managed_runs_once()
93
+ _start_managed_cleanup_thread()
 
 
 
 
 
 
 
 
 
 
94
 
95
 
96
  class FrontendRunBridge:
 
526
  prompt: str,
527
  workspace_root: Path,
528
  initial_content_parts: list[dict[str, Any]],
529
+ trace_dir: str,
530
  prior_messages: list[dict[str, Any]] | None = None,
531
  managed_run_root: str = "",
532
+ model_name: str = "",
533
  ) -> None:
534
  try:
535
  load_dotenv(PROJECT_ROOT / ".env")
536
  require_required_env("ResearchHarness frontend")
 
537
  agent = FrontendInteractiveAgent(
538
  bridge=bridge,
539
+ llm=default_llm_config(model_name=model_name or None),
540
+ trace_dir=trace_dir,
 
541
  )
542
  bridge.send(
543
  {
544
  "type": "run_started",
545
  "model": agent.model,
546
  "workspace_root": str(workspace_root),
547
+ "trace_dir": trace_dir,
548
  }
549
  )
550
  result = agent._run_session(
 
572
  bridge.send({"type": "run_error", "error": str(exc), "traceback": traceback.format_exc()})
573
 
574
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  @app.get("/")
576
  def index() -> FileResponse:
577
  return FileResponse(STATIC_DIR / "index.html")
 
595
 
596
  sender_task = asyncio.create_task(sender())
597
  try:
598
+ await websocket.send_json({"type": "ready", "managed_workspace": True})
599
  while True:
600
  message = await websocket.receive_json()
601
  message_type = str(message.get("type", "")).strip()
 
609
  continue
610
  try:
611
  continue_conversation = bool(message.get("continue_conversation"))
612
+ model_name = str(message.get("model_name", "") or "").strip()
613
  prior_messages = None
614
+ if continue_conversation:
615
+ if not bridge.conversation_messages or not bridge.managed_workspace_root:
616
+ bridge.send({"type": "run_error", "error": "No active conversation is available on the server. Click New chat and start again."})
617
+ continue
618
+ workspace_root = Path(bridge.managed_workspace_root)
619
+ effective_trace_dir = bridge.managed_trace_dir
620
+ prior_messages = bridge.conversation_messages
 
 
 
 
 
621
  else:
622
+ _release_managed_run(bridge)
623
+ workspace_root, effective_trace_dir = _create_managed_run(bridge)
 
 
 
 
 
 
 
 
624
  image_parts, saved_paths = save_uploaded_images(
625
  workspace_root,
626
  message.get("images", []) if isinstance(message.get("images", []), list) else [],
 
646
  "trace_dir": effective_trace_dir,
647
  "prior_messages": prior_messages,
648
  "managed_run_root": bridge.managed_run_root,
649
+ "model_name": model_name,
650
  },
651
  daemon=True,
652
  )
frontend/static/app.css CHANGED
@@ -201,7 +201,8 @@ button {
201
 
202
  .plain,
203
  .send-button,
204
- .icon-button {
 
205
  border: 1px solid var(--border);
206
  border-radius: 999px;
207
  background: var(--panel-strong);
@@ -214,12 +215,24 @@ button {
214
  padding: 8px 12px;
215
  }
216
 
 
 
 
 
 
217
  .plain:hover,
218
- .icon-button:hover {
 
 
219
  border-color: rgba(var(--glow-rgb), 0.38);
220
  transform: translateY(-1px);
221
  }
222
 
 
 
 
 
 
223
  .workspace-strip {
224
  position: sticky;
225
  top: 66px;
@@ -266,13 +279,11 @@ button {
266
  -webkit-overflow-scrolling: touch;
267
  }
268
 
269
- .messages::-webkit-scrollbar,
270
- .workspace-list::-webkit-scrollbar {
271
  width: 10px;
272
  }
273
 
274
- .messages::-webkit-scrollbar-thumb,
275
- .workspace-list::-webkit-scrollbar-thumb {
276
  border: 3px solid transparent;
277
  border-radius: 999px;
278
  background: rgba(var(--glow-rgb), 0.24);
@@ -696,172 +707,6 @@ button:disabled {
696
  text-align: center;
697
  }
698
 
699
- .modal {
700
- position: fixed;
701
- inset: 0;
702
- z-index: 30;
703
- display: grid;
704
- place-items: center;
705
- padding: 18px;
706
- background: rgba(0, 0, 0, 0.24);
707
- backdrop-filter: blur(14px);
708
- }
709
-
710
- .modal.hidden {
711
- display: none;
712
- }
713
-
714
- .modal-card {
715
- display: grid;
716
- grid-template-rows: auto auto auto minmax(0, 1fr) auto;
717
- gap: 12px;
718
- width: min(780px, 100%);
719
- max-height: min(760px, 82vh);
720
- border: 1px solid var(--border);
721
- border-radius: 28px;
722
- background: var(--panel-strong);
723
- box-shadow: 0 24px 88px rgba(0, 0, 0, 0.22);
724
- padding: 18px;
725
- }
726
-
727
- .modal-head,
728
- .modal-path-row,
729
- .modal-actions {
730
- display: flex;
731
- align-items: center;
732
- gap: 12px;
733
- }
734
-
735
- .modal-head {
736
- justify-content: space-between;
737
- }
738
-
739
- .modal-head h2,
740
- .modal-head p {
741
- margin: 0;
742
- }
743
-
744
- .modal-head h2 {
745
- font-size: 1.18rem;
746
- letter-spacing: -0.025em;
747
- }
748
-
749
- .modal-head p,
750
- .modal-actions span {
751
- color: var(--muted);
752
- font-size: 0.86rem;
753
- }
754
-
755
- .modal-path-row {
756
- border: 1px solid var(--border);
757
- border-radius: 18px;
758
- background: var(--hover);
759
- padding: 8px;
760
- }
761
-
762
- .modal-path-row input {
763
- min-width: 0;
764
- flex: 1;
765
- border: 0;
766
- outline: 0;
767
- background: transparent;
768
- color: var(--text);
769
- }
770
-
771
- .workspace-roots {
772
- display: flex;
773
- flex-wrap: wrap;
774
- gap: 8px;
775
- }
776
-
777
- .root-chip {
778
- max-width: 190px;
779
- overflow: hidden;
780
- border: 1px solid var(--border);
781
- border-radius: 999px;
782
- background: var(--panel);
783
- color: var(--text);
784
- font-weight: 800;
785
- padding: 7px 11px;
786
- text-overflow: ellipsis;
787
- white-space: nowrap;
788
- }
789
-
790
- .workspace-list {
791
- display: grid;
792
- align-content: start;
793
- gap: 7px;
794
- min-height: 0;
795
- overflow: auto;
796
- padding-right: 4px;
797
- }
798
-
799
- .dir-row {
800
- display: grid;
801
- grid-template-columns: auto minmax(0, 1fr) auto;
802
- align-items: center;
803
- gap: 10px;
804
- width: 100%;
805
- border: 1px solid var(--border);
806
- border-radius: 18px;
807
- background: var(--panel);
808
- color: var(--text);
809
- padding: 10px 12px;
810
- text-align: left;
811
- }
812
-
813
- .dir-row:hover,
814
- .root-chip:hover {
815
- border-color: rgba(var(--glow-rgb), 0.38);
816
- background: var(--hover);
817
- }
818
-
819
- .dir-icon {
820
- display: grid;
821
- place-items: center;
822
- width: 24px;
823
- height: 24px;
824
- border-radius: 50%;
825
- background: rgba(var(--glow-rgb), 0.1);
826
- font-weight: 900;
827
- }
828
-
829
- .dir-main {
830
- min-width: 0;
831
- }
832
-
833
- .dir-main strong,
834
- .dir-main small {
835
- display: block;
836
- overflow: hidden;
837
- text-overflow: ellipsis;
838
- white-space: nowrap;
839
- }
840
-
841
- .dir-main small {
842
- margin-top: 2px;
843
- color: var(--muted);
844
- font-size: 0.78rem;
845
- }
846
-
847
- .dir-action {
848
- color: var(--muted);
849
- font-size: 0.76rem;
850
- font-weight: 850;
851
- }
852
-
853
- .dir-empty {
854
- border: 1px dashed var(--border);
855
- border-radius: 18px;
856
- padding: 18px;
857
- color: var(--muted);
858
- text-align: center;
859
- }
860
-
861
- .modal-actions {
862
- justify-content: space-between;
863
- }
864
-
865
  #theme-switcher {
866
  position: fixed;
867
  right: 22px;
@@ -984,22 +829,6 @@ button:disabled {
984
  max-width: none;
985
  }
986
 
987
- .modal-card {
988
- max-height: 88vh;
989
- padding: 14px;
990
- }
991
-
992
- .modal-head,
993
- .modal-actions {
994
- align-items: stretch;
995
- flex-direction: column;
996
- }
997
-
998
- .modal-path-row {
999
- align-items: stretch;
1000
- flex-direction: column;
1001
- }
1002
-
1003
  .message,
1004
  .event {
1005
  max-width: 96%;
 
201
 
202
  .plain,
203
  .send-button,
204
+ .icon-button,
205
+ .model-select {
206
  border: 1px solid var(--border);
207
  border-radius: 999px;
208
  background: var(--panel-strong);
 
215
  padding: 8px 12px;
216
  }
217
 
218
+ .model-select {
219
+ min-width: 150px;
220
+ padding: 8px 34px 8px 12px;
221
+ }
222
+
223
  .plain:hover,
224
+ .icon-button:hover,
225
+ .model-select:hover:not(:disabled),
226
+ .model-select:focus-visible {
227
  border-color: rgba(var(--glow-rgb), 0.38);
228
  transform: translateY(-1px);
229
  }
230
 
231
+ .model-select:disabled {
232
+ cursor: not-allowed;
233
+ opacity: 0.58;
234
+ }
235
+
236
  .workspace-strip {
237
  position: sticky;
238
  top: 66px;
 
279
  -webkit-overflow-scrolling: touch;
280
  }
281
 
282
+ .messages::-webkit-scrollbar {
 
283
  width: 10px;
284
  }
285
 
286
+ .messages::-webkit-scrollbar-thumb {
 
287
  border: 3px solid transparent;
288
  border-radius: 999px;
289
  background: rgba(var(--glow-rgb), 0.24);
 
707
  text-align: center;
708
  }
709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710
  #theme-switcher {
711
  position: fixed;
712
  right: 22px;
 
829
  max-width: none;
830
  }
831
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
832
  .message,
833
  .event {
834
  max-width: 96%;
frontend/static/app.js CHANGED
@@ -131,28 +131,16 @@
131
  var images = [];
132
  var COLLAPSED_STEP_HEIGHT = 220;
133
 
134
- var workspaceInput = document.getElementById("workspaceInput");
135
- var workspaceStrip = document.getElementById("workspaceStrip");
136
  var promptInput = document.getElementById("promptInput");
137
  var runBtn = document.getElementById("runBtn");
138
  var newBtn = document.getElementById("newBtn");
139
- var pickWorkspaceBtn = document.getElementById("pickWorkspaceBtn");
140
  var attachBtn = document.getElementById("attachBtn");
141
  var imageInput = document.getElementById("imageInput");
142
  var imagePreview = document.getElementById("imagePreview");
143
  var dropZone = document.getElementById("dropZone");
144
  var timeline = document.getElementById("timeline");
145
  var statusPill = document.getElementById("statusPill");
146
- var workspaceMeta = document.getElementById("workspaceMeta");
147
- var workspaceModal = document.getElementById("workspaceModal");
148
- var workspaceCloseBtn = document.getElementById("workspaceCloseBtn");
149
- var workspacePathInput = document.getElementById("workspacePathInput");
150
- var workspaceGoBtn = document.getElementById("workspaceGoBtn");
151
- var workspaceRoots = document.getElementById("workspaceRoots");
152
- var workspaceList = document.getElementById("workspaceList");
153
- var workspaceUseBtn = document.getElementById("workspaceUseBtn");
154
- var workspacePickerHint = document.getElementById("workspacePickerHint");
155
- var currentWorkspacePath = "";
156
  var defaultPromptPlaceholder = promptInput.getAttribute("placeholder") || "Message ResearchHarness";
157
 
158
  function escapeHtml(value) {
@@ -223,23 +211,20 @@
223
  statusPill.className = "status " + (kind || "idle");
224
  }
225
 
226
- function setWorkspaceSelected(path) {
227
- workspaceInput.value = path;
228
- workspaceMeta.textContent = "Workspace selected: " + path;
229
- }
230
-
231
  function updateComposerMode() {
232
  if (pendingAskId) {
233
  runBtn.disabled = false;
234
  runBtn.classList.remove("is-running");
235
  runBtn.textContent = "Reply";
236
  promptInput.placeholder = defaultPromptPlaceholder;
 
237
  return;
238
  }
239
  runBtn.disabled = running && interrupting;
240
  runBtn.classList.toggle("is-running", running);
241
  runBtn.textContent = running ? (interrupting ? "Stopping" : "Stop") : "Run";
242
  promptInput.placeholder = defaultPromptPlaceholder;
 
243
  }
244
 
245
  function setRunning(active, statusText) {
@@ -254,7 +239,7 @@
254
  timeline.innerHTML = ''
255
  + '<div class="welcome">'
256
  + '<h1>What should the agent do?</h1>'
257
- + '<p>Ask a question, attach images, choose a local workspace, and watch tool calls stream here.</p>'
258
  + '</div>';
259
  }
260
 
@@ -551,7 +536,7 @@
551
  ws.send(JSON.stringify({
552
  type: "start",
553
  prompt: prompt,
554
- workspace_root: workspaceInput.value,
555
  images: sentImages,
556
  continue_conversation: continueConversation
557
  }));
@@ -611,89 +596,6 @@
611
  });
612
  }
613
 
614
- function openWorkspaceModal() {
615
- workspaceModal.classList.remove("hidden");
616
- loadWorkspaceDirectory(workspaceInput.value.trim());
617
- }
618
-
619
- function closeWorkspaceModal() {
620
- workspaceModal.classList.add("hidden");
621
- }
622
-
623
- function setWorkspacePickerBusy(text) {
624
- workspaceList.innerHTML = '<div class="dir-empty">' + escapeHtml(text || "Loading...") + "</div>";
625
- workspacePickerHint.textContent = text || "Loading...";
626
- }
627
-
628
- function renderWorkspaceError(message) {
629
- workspaceList.innerHTML = '<div class="dir-empty error-text">' + escapeHtml(message) + "</div>";
630
- workspacePickerHint.textContent = "Paste a valid existing folder path, then press Go.";
631
- }
632
-
633
- function directoryRow(label, path, actionLabel, onClick) {
634
- var row = document.createElement("button");
635
- row.type = "button";
636
- row.className = "dir-row";
637
- row.innerHTML = ''
638
- + '<span class="dir-icon">&rsaquo;</span>'
639
- + '<span class="dir-main"><strong>' + escapeHtml(label) + '</strong><small>' + escapeHtml(path) + '</small></span>'
640
- + '<span class="dir-action">' + escapeHtml(actionLabel || "Open") + '</span>';
641
- row.addEventListener("click", onClick);
642
- return row;
643
- }
644
-
645
- function renderWorkspacePicker(payload) {
646
- currentWorkspacePath = payload.path || "";
647
- workspacePathInput.value = currentWorkspacePath;
648
- workspaceRoots.innerHTML = "";
649
- (payload.roots || []).forEach(function (root) {
650
- var chip = document.createElement("button");
651
- chip.type = "button";
652
- chip.className = "root-chip";
653
- chip.textContent = root.label || root.path;
654
- chip.title = root.path || "";
655
- chip.addEventListener("click", function () {
656
- loadWorkspaceDirectory(root.path || "");
657
- });
658
- workspaceRoots.appendChild(chip);
659
- });
660
-
661
- workspaceList.innerHTML = "";
662
- if (payload.parent) {
663
- workspaceList.appendChild(directoryRow("..", payload.parent, "Parent", function () {
664
- loadWorkspaceDirectory(payload.parent);
665
- }));
666
- }
667
- (payload.entries || []).forEach(function (entry) {
668
- workspaceList.appendChild(directoryRow(entry.name, entry.path, "Open", function () {
669
- loadWorkspaceDirectory(entry.path);
670
- }));
671
- });
672
- if (!payload.parent && !(payload.entries || []).length) {
673
- workspaceList.innerHTML = '<div class="dir-empty">No readable child folders.</div>';
674
- }
675
- workspacePickerHint.textContent = payload.truncated
676
- ? "Directory list was truncated. Paste a deeper path if needed."
677
- : "Current folder will be used when you click Use this folder.";
678
- }
679
-
680
- async function loadWorkspaceDirectory(path) {
681
- setWorkspacePickerBusy("Loading folders...");
682
- try {
683
- var url = "/api/workspace-directories";
684
- if (path) url += "?path=" + encodeURIComponent(path);
685
- var response = await fetch(url);
686
- var payload = await response.json();
687
- if (!response.ok || payload.error) {
688
- renderWorkspaceError(payload.error || "Cannot open this folder.");
689
- return;
690
- }
691
- renderWorkspacePicker(payload);
692
- } catch (error) {
693
- renderWorkspaceError(String(error));
694
- }
695
- }
696
-
697
  runBtn.addEventListener("click", sendStart);
698
  timeline.addEventListener("scroll", syncTimelineFollowMode);
699
  timeline.addEventListener("wheel", function (event) {
@@ -730,29 +632,6 @@
730
  });
731
  imageInput.addEventListener("change", function (event) { addImageFiles(event.target.files); });
732
 
733
- pickWorkspaceBtn.addEventListener("click", function () {
734
- openWorkspaceModal();
735
- });
736
-
737
- workspaceCloseBtn.addEventListener("click", closeWorkspaceModal);
738
- workspaceModal.addEventListener("click", function (event) {
739
- if (event.target === workspaceModal) closeWorkspaceModal();
740
- });
741
- workspaceGoBtn.addEventListener("click", function () {
742
- loadWorkspaceDirectory(workspacePathInput.value.trim());
743
- });
744
- workspacePathInput.addEventListener("keydown", function (event) {
745
- if (event.key === "Enter") {
746
- event.preventDefault();
747
- loadWorkspaceDirectory(workspacePathInput.value.trim());
748
- }
749
- });
750
- workspaceUseBtn.addEventListener("click", function () {
751
- if (!currentWorkspacePath) return;
752
- setWorkspaceSelected(currentWorkspacePath);
753
- closeWorkspaceModal();
754
- });
755
-
756
  ["dragenter", "dragover"].forEach(function (name) {
757
  dropZone.addEventListener(name, function (event) {
758
  event.preventDefault();
 
131
  var images = [];
132
  var COLLAPSED_STEP_HEIGHT = 220;
133
 
 
 
134
  var promptInput = document.getElementById("promptInput");
135
  var runBtn = document.getElementById("runBtn");
136
  var newBtn = document.getElementById("newBtn");
137
+ var modelSelect = document.getElementById("modelSelect");
138
  var attachBtn = document.getElementById("attachBtn");
139
  var imageInput = document.getElementById("imageInput");
140
  var imagePreview = document.getElementById("imagePreview");
141
  var dropZone = document.getElementById("dropZone");
142
  var timeline = document.getElementById("timeline");
143
  var statusPill = document.getElementById("statusPill");
 
 
 
 
 
 
 
 
 
 
144
  var defaultPromptPlaceholder = promptInput.getAttribute("placeholder") || "Message ResearchHarness";
145
 
146
  function escapeHtml(value) {
 
211
  statusPill.className = "status " + (kind || "idle");
212
  }
213
 
 
 
 
 
 
214
  function updateComposerMode() {
215
  if (pendingAskId) {
216
  runBtn.disabled = false;
217
  runBtn.classList.remove("is-running");
218
  runBtn.textContent = "Reply";
219
  promptInput.placeholder = defaultPromptPlaceholder;
220
+ if (modelSelect) modelSelect.disabled = true;
221
  return;
222
  }
223
  runBtn.disabled = running && interrupting;
224
  runBtn.classList.toggle("is-running", running);
225
  runBtn.textContent = running ? (interrupting ? "Stopping" : "Stop") : "Run";
226
  promptInput.placeholder = defaultPromptPlaceholder;
227
+ if (modelSelect) modelSelect.disabled = running;
228
  }
229
 
230
  function setRunning(active, statusText) {
 
239
  timeline.innerHTML = ''
240
  + '<div class="welcome">'
241
  + '<h1>What should the agent do?</h1>'
242
+ + '<p>Ask a question, attach images, and watch tool calls stream from an isolated temporary workspace.</p>'
243
  + '</div>';
244
  }
245
 
 
536
  ws.send(JSON.stringify({
537
  type: "start",
538
  prompt: prompt,
539
+ model_name: modelSelect ? modelSelect.value : "",
540
  images: sentImages,
541
  continue_conversation: continueConversation
542
  }));
 
596
  });
597
  }
598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  runBtn.addEventListener("click", sendStart);
600
  timeline.addEventListener("scroll", syncTimelineFollowMode);
601
  timeline.addEventListener("wheel", function (event) {
 
632
  });
633
  imageInput.addEventListener("change", function (event) { addImageFiles(event.target.files); });
634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  ["dragenter", "dragover"].forEach(function (name) {
636
  dropZone.addEventListener(name, function (event) {
637
  event.preventDefault();
frontend/static/index.html CHANGED
@@ -19,13 +19,15 @@
19
  </div>
20
  </div>
21
  <div class="top-actions">
22
- <button id="pickWorkspaceBtn" class="plain" type="button" hidden>Open workspace</button>
 
 
 
23
  <button id="newBtn" class="plain" type="button">New chat</button>
24
  </div>
25
  </header>
26
 
27
  <section id="workspaceStrip" class="workspace-strip">
28
- <input id="workspaceInput" type="hidden" value="" />
29
  <span id="workspaceMeta">Managed temporary workspace. Each chat uses an isolated runtime directory.</span>
30
  </section>
31
 
@@ -48,27 +50,6 @@
48
  </footer>
49
  </main>
50
 
51
- <section id="workspaceModal" class="modal hidden" role="dialog" aria-modal="true" aria-labelledby="workspaceModalTitle">
52
- <div class="modal-card">
53
- <header class="modal-head">
54
- <div>
55
- <h2 id="workspaceModalTitle">Open workspace</h2>
56
- <p>Choose an existing local folder. Unicode paths are supported.</p>
57
- </div>
58
- <button id="workspaceCloseBtn" class="plain" type="button" aria-label="Close workspace picker">Close</button>
59
- </header>
60
- <div class="modal-path-row">
61
- <input id="workspacePathInput" type="text" autocomplete="off" placeholder="Paste a folder path..." />
62
- <button id="workspaceGoBtn" class="plain" type="button">Go</button>
63
- </div>
64
- <div id="workspaceRoots" class="workspace-roots"></div>
65
- <div id="workspaceList" class="workspace-list"></div>
66
- <footer class="modal-actions">
67
- <span id="workspacePickerHint">Select a folder to use as the agent workspace.</span>
68
- <button id="workspaceUseBtn" class="send-button" type="button">Use this folder</button>
69
- </footer>
70
- </div>
71
- </section>
72
  <nav class="space-links" aria-label="Project links">
73
  <a href="https://github.com/black-yt/ResearchHarness" target="_blank" rel="noopener noreferrer" title="GitHub">
74
  <svg viewBox="0 0 16 16" aria-hidden="true"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/></svg>
 
19
  </div>
20
  </div>
21
  <div class="top-actions">
22
+ <select id="modelSelect" class="model-select" aria-label="Model">
23
+ <option value="gpt-5.5">gpt-5.5</option>
24
+ <option value="claude-opus-4-7">claude-opus-4-7</option>
25
+ </select>
26
  <button id="newBtn" class="plain" type="button">New chat</button>
27
  </div>
28
  </header>
29
 
30
  <section id="workspaceStrip" class="workspace-strip">
 
31
  <span id="workspaceMeta">Managed temporary workspace. Each chat uses an isolated runtime directory.</span>
32
  </section>
33
 
 
50
  </footer>
51
  </main>
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  <nav class="space-links" aria-label="Project links">
54
  <a href="https://github.com/black-yt/ResearchHarness" target="_blank" rel="noopener noreferrer" title="GitHub">
55
  <svg viewBox="0 0 16 16" aria-hidden="true"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/></svg>
run_agent.py DELETED
@@ -1,7 +0,0 @@
1
- """Thin top-level CLI entrypoint for the ResearchHarness agent."""
2
-
3
- from agent_base.react_agent import main
4
-
5
-
6
- if __name__ == "__main__":
7
- raise SystemExit(main())
 
 
 
 
 
 
 
 
run_frontend.py DELETED
@@ -1,48 +0,0 @@
1
- """Launch the local ResearchHarness browser UI."""
2
-
3
- from __future__ import annotations
4
-
5
- import argparse
6
- import sys
7
- import threading
8
- import webbrowser
9
-
10
- import uvicorn
11
-
12
- from agent_base.utils import read_role_prompt_files
13
- from frontend.local_server import app, configure_frontend
14
-
15
-
16
- def main(argv: list[str] | None = None) -> int:
17
- parser = argparse.ArgumentParser(description="Run the local ResearchHarness frontend.")
18
- parser.add_argument("--host", default="127.0.0.1", help="Host to bind. Default: 127.0.0.1")
19
- parser.add_argument("--port", type=int, default=8765, help="Port to bind. Default: 8765")
20
- parser.add_argument("--no-browser", action="store_true", help="Do not open the browser automatically.")
21
- parser.add_argument("--trace-dir", help="Optional directory where frontend agent traces are written.")
22
- parser.add_argument(
23
- "--role-prompt-file",
24
- action="append",
25
- default=[],
26
- dest="role_prompt_files",
27
- metavar="PATH",
28
- help="Append one role-specific prompt file to the frontend agent. May be passed multiple times.",
29
- )
30
- args = parser.parse_args(argv)
31
-
32
- try:
33
- role_prompt = read_role_prompt_files(args.role_prompt_files)
34
- configure_frontend(role_prompt=role_prompt, trace_dir=args.trace_dir)
35
- except (OSError, ValueError) as exc:
36
- print(str(exc), file=sys.stderr)
37
- return 1
38
-
39
- url = f"http://{args.host}:{args.port}"
40
- if not args.no_browser:
41
- threading.Timer(0.8, lambda: webbrowser.open(url)).start()
42
- print(f"ResearchHarness frontend: {url}")
43
- uvicorn.run(app, host=args.host, port=args.port, reload=False)
44
- return 0
45
-
46
-
47
- if __name__ == "__main__":
48
- raise SystemExit(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
run_server.py DELETED
@@ -1,61 +0,0 @@
1
- """Run ResearchHarness as a minimal OpenAI-compatible API server."""
2
-
3
- from __future__ import annotations
4
-
5
- import argparse
6
- import sys
7
-
8
- from agent_base.utils import PROJECT_ROOT, MissingRequiredEnvError, load_dotenv, require_required_env
9
- from api.openai_server import serve
10
-
11
-
12
- def main(argv: list[str] | None = None) -> int:
13
- parser = argparse.ArgumentParser(description="Serve ResearchHarness through /v1/chat/completions.")
14
- parser.add_argument(
15
- "--api-runs-dir",
16
- required=True,
17
- dest="api_runs_dir",
18
- help="Directory where the server creates one isolated subdirectory per request.",
19
- )
20
- parser.add_argument("--host", default="127.0.0.1", help="Host to bind. Defaults to 127.0.0.1.")
21
- parser.add_argument("--port", type=int, default=8686, help="Port to bind. Defaults to 8686.")
22
- parser.add_argument(
23
- "--role-prompt-file",
24
- action="append",
25
- default=[],
26
- dest="role_prompt_files",
27
- help="Optional role prompt file appended to the base ResearchHarness prompt.",
28
- )
29
- parser.add_argument(
30
- "--input-wrapper",
31
- action=argparse.BooleanOptionalAction,
32
- default=True,
33
- help="Enable or disable the input LLM wrapper. Enabled by default.",
34
- )
35
- parser.add_argument(
36
- "--output-wrapper",
37
- action=argparse.BooleanOptionalAction,
38
- default=True,
39
- help="Enable or disable the output LLM wrapper. Enabled by default.",
40
- )
41
- args = parser.parse_args(argv)
42
-
43
- load_dotenv(PROJECT_ROOT / ".env")
44
- try:
45
- require_required_env("ResearchHarness API server")
46
- serve(
47
- api_runs_dir=args.api_runs_dir,
48
- host=args.host,
49
- port=args.port,
50
- role_prompt_files=list(args.role_prompt_files),
51
- input_wrapper=args.input_wrapper,
52
- output_wrapper=args.output_wrapper,
53
- )
54
- except (MissingRequiredEnvError, ValueError) as exc:
55
- print(str(exc), file=sys.stderr)
56
- return 1
57
- return 0
58
-
59
-
60
- if __name__ == "__main__":
61
- raise SystemExit(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
traces/.gitkeep DELETED
@@ -1 +0,0 @@
1
-
 
 
workspace/.gitkeep DELETED
@@ -1 +0,0 @@
1
-