Spaces:
Running
Running
Slim Space deployment mirror
Browse files
- .dockerignore +10 -9
- .env.example +0 -39
- .gitignore +16 -215
- README.md +107 -10
- agent_base/console_utils.py +0 -223
- agent_base/react_agent.py +9 -154
- agent_base/tools/README.md +0 -457
- agent_base/tools/tool_web.py +34 -8
- agent_base/utils.py +0 -15
- api/__init__.py +0 -1
- api/openai_server.py +0 -518
- api_runs/.gitkeep +0 -1
- app.py +0 -10
- benchmarks/QA/README.md +0 -102
- benchmarks/QA/role_prompt.md +0 -31
- benchmarks/README.md +0 -18
- benchmarks/ResearchClawBench/README.md +0 -44
- benchmarks/ResearchClawBench/adapter.py +0 -93
- benchmarks/ResearchClawBench/role_prompt.md +0 -195
- docs/tutorial_en.md +0 -531
- docs/tutorial_zh.md +0 -511
- frontend/local_server.py +36 -157
- frontend/static/app.css +17 -188
- frontend/static/app.js +5 -126
- frontend/static/index.html +4 -23
- run_agent.py +0 -7
- run_frontend.py +0 -48
- run_server.py +0 -61
- traces/.gitkeep +0 -1
- workspace/.gitkeep +0 -1
.dockerignore
CHANGED
|
@@ -1,23 +1,24 @@
|
|
| 1 |
-
.git
|
| 2 |
.gitignore
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
__pycache__/
|
| 4 |
*.py[cod]
|
| 5 |
.pytest_cache/
|
| 6 |
.mypy_cache/
|
| 7 |
.ruff_cache/
|
|
|
|
| 8 |
.env
|
| 9 |
.envrc
|
| 10 |
.venv/
|
| 11 |
venv/
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
traces/*
|
| 15 |
-
!traces/.gitkeep
|
| 16 |
-
api_runs/*
|
| 17 |
-
!api_runs/.gitkeep
|
| 18 |
-
runtime/
|
| 19 |
-
tests/
|
| 20 |
.codex/
|
|
|
|
| 21 |
.idea/
|
| 22 |
.vscode/
|
| 23 |
.DS_Store
|
|
|
|
| 1 |
+
.git/
|
| 2 |
.gitignore
|
| 3 |
+
AGENTS.md
|
| 4 |
+
runtime/
|
| 5 |
+
data/
|
| 6 |
+
inputs/
|
| 7 |
+
|
| 8 |
__pycache__/
|
| 9 |
*.py[cod]
|
| 10 |
.pytest_cache/
|
| 11 |
.mypy_cache/
|
| 12 |
.ruff_cache/
|
| 13 |
+
|
| 14 |
.env
|
| 15 |
.envrc
|
| 16 |
.venv/
|
| 17 |
venv/
|
| 18 |
+
env/
|
| 19 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
.codex/
|
| 21 |
+
.agents/
|
| 22 |
.idea/
|
| 23 |
.vscode/
|
| 24 |
.DS_Store
|
.env.example
DELETED
|
@@ -1,39 +0,0 @@
|
|
| 1 |
-
# Required
|
| 2 |
-
API_KEY="your_openai_compatible_key" # API key for your OpenAI-compatible LLM provider.
|
| 3 |
-
API_BASE="https://your-openai-compatible-endpoint/v1" # Base URL for the OpenAI-compatible chat-completions endpoint.
|
| 4 |
-
MODEL_NAME="gpt-5.5" # Main model used by the agent and WebFetch summarization.
|
| 5 |
-
SERPER_KEY="your_serper_key" # https://serper.dev/
|
| 6 |
-
JINA_KEY="your_jina_key" # https://jina.ai/
|
| 7 |
-
MINERU_TOKEN="your_mineru_token" # https://mineru.net/
|
| 8 |
-
HF_TOKEN="your_huggingface_token" # Hugging Face token with dataset write access when collection is enabled.
|
| 9 |
-
|
| 10 |
-
# Optional
|
| 11 |
-
WORKSPACE_ROOT="./workspace" # Default local workspace root when --workspace-root is not provided.
|
| 12 |
-
MAX_LLM_CALL_PER_RUN=100 # Maximum chat-completions calls allowed in one agent run.
|
| 13 |
-
MAX_AGENT_ROUNDS=100 # Maximum ReAct loop rounds before forced termination.
|
| 14 |
-
MAX_AGENT_RUNTIME_SECONDS=9000 # Maximum wall-clock runtime per agent run.
|
| 15 |
-
LLM_TIMEOUT_SECONDS=600 # Timeout for each chat-completions request.
|
| 16 |
-
LLM_MAX_OUTPUT_TOKENS=10000 # Maximum output tokens requested from the main model.
|
| 17 |
-
MAX_INPUT_TOKENS=320000 # Maximum input-token budget used for runtime token accounting.
|
| 18 |
-
LLM_MAX_RETRIES=10 # Maximum retries for transient LLM API failures.
|
| 19 |
-
TEMPERATURE=0.6 # Main model sampling temperature.
|
| 20 |
-
TOP_P=0.95 # Main model nucleus-sampling top_p.
|
| 21 |
-
PRESENCE_PENALTY=1.1 # Main model presence penalty when supported by the provider.
|
| 22 |
-
AUTO_COMPACT_TRIGGER_TOKENS="128k" # Context size threshold that triggers automatic memory compaction.
|
| 23 |
-
IMAGE_PART_TOKEN_ESTIMATE=1536 # Token estimate used for each runtime image_url content part.
|
| 24 |
-
LLM_IMAGE_MAX_EDGE=1568 # Maximum image edge length sent to multimodal LLMs.
|
| 25 |
-
LLM_IMAGE_MAX_BYTES=524288 # Maximum compressed image payload size sent to multimodal LLMs.
|
| 26 |
-
LLM_IMAGE_JPEG_QUALITY=85 # Initial JPEG quality for runtime image compression.
|
| 27 |
-
DEBUG_AGENT=false # Print verbose agent-loop debug logs.
|
| 28 |
-
DEBUG_SEARCH=false # Print verbose WebSearch debug logs.
|
| 29 |
-
DEBUG_SCHOLAR=false # Print verbose ScholarSearch debug logs.
|
| 30 |
-
DEBUG_VISIT=false # Print verbose WebFetch debug logs.
|
| 31 |
-
RH_SPACE_RUNS_DIR="/tmp/researchharness_space/runs" # Parent directory for temporary per-chat runs in hosted mode.
|
| 32 |
-
RH_SPACE_RETENTION_SECONDS=21600 # Delete inactive hosted runs older than this many seconds.
|
| 33 |
-
RH_SPACE_MAX_RUNS=40 # Keep at most this many inactive hosted runs.
|
| 34 |
-
RH_SPACE_CLEANUP_INTERVAL_SECONDS=900 # Background cleanup interval for hosted runs.
|
| 35 |
-
RH_COLLECTION_ENABLED=true # Automatically collect hosted run traces after each completed run.
|
| 36 |
-
RH_COLLECTION_DATASET_REPO="CoCoOne/ResearchHarness-Data" # Hugging Face dataset repo receiving trace PRs.
|
| 37 |
-
RH_COLLECTION_BATCH_SIZE=5 # Create one dataset PR after this many collected runs.
|
| 38 |
-
RH_COLLECTION_MAX_BUNDLE_BYTES=20971520 # Drop any single trace bundle larger than this many bytes.
|
| 39 |
-
RH_ROLE_PROMPT_FILES="" # Optional role prompt files separated by os.pathsep.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
CHANGED
|
@@ -1,230 +1,31 @@
|
|
| 1 |
-
runtime/
|
| 2 |
-
# Local agent artifacts
|
| 3 |
AGENTS.md
|
| 4 |
-
|
| 5 |
-
!workspace/.gitkeep
|
| 6 |
-
api_runs/*
|
| 7 |
-
!api_runs/.gitkeep
|
| 8 |
-
traces/*
|
| 9 |
-
!traces/.gitkeep
|
| 10 |
-
/inputs/
|
| 11 |
data/
|
| 12 |
-
|
| 13 |
-
.idea/
|
| 14 |
-
.vscode/
|
| 15 |
-
.DS_Store
|
| 16 |
-
tests/example_files/pdfs/dummy_document
|
| 17 |
-
.codex
|
| 18 |
-
|
| 19 |
|
| 20 |
-
# Byte-compiled / optimized / DLL files
|
| 21 |
__pycache__/
|
| 22 |
-
*.py[codz]
|
| 23 |
*$py.class
|
| 24 |
-
|
| 25 |
-
# C extensions
|
| 26 |
*.so
|
| 27 |
|
| 28 |
-
# Distribution / packaging
|
| 29 |
-
.Python
|
| 30 |
-
build/
|
| 31 |
-
develop-eggs/
|
| 32 |
-
dist/
|
| 33 |
-
downloads/
|
| 34 |
-
eggs/
|
| 35 |
-
.eggs/
|
| 36 |
-
lib/
|
| 37 |
-
lib64/
|
| 38 |
-
parts/
|
| 39 |
-
sdist/
|
| 40 |
-
var/
|
| 41 |
-
wheels/
|
| 42 |
-
share/python-wheels/
|
| 43 |
-
*.egg-info/
|
| 44 |
-
.installed.cfg
|
| 45 |
-
*.egg
|
| 46 |
-
MANIFEST
|
| 47 |
-
|
| 48 |
-
# PyInstaller
|
| 49 |
-
# Usually these files are written by a python script from a template
|
| 50 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 51 |
-
*.manifest
|
| 52 |
-
*.spec
|
| 53 |
-
|
| 54 |
-
# Installer logs
|
| 55 |
-
pip-log.txt
|
| 56 |
-
pip-delete-this-directory.txt
|
| 57 |
-
|
| 58 |
-
# Unit test / coverage reports
|
| 59 |
-
htmlcov/
|
| 60 |
-
.tox/
|
| 61 |
-
.nox/
|
| 62 |
-
.coverage
|
| 63 |
-
.coverage.*
|
| 64 |
-
.cache
|
| 65 |
-
nosetests.xml
|
| 66 |
-
coverage.xml
|
| 67 |
-
*.cover
|
| 68 |
-
*.py.cover
|
| 69 |
-
.hypothesis/
|
| 70 |
-
.pytest_cache/
|
| 71 |
-
cover/
|
| 72 |
-
|
| 73 |
-
# Translations
|
| 74 |
-
*.mo
|
| 75 |
-
*.pot
|
| 76 |
-
|
| 77 |
-
# Django stuff:
|
| 78 |
-
*.log
|
| 79 |
-
local_settings.py
|
| 80 |
-
db.sqlite3
|
| 81 |
-
db.sqlite3-journal
|
| 82 |
-
|
| 83 |
-
# Flask stuff:
|
| 84 |
-
instance/
|
| 85 |
-
.webassets-cache
|
| 86 |
-
|
| 87 |
-
# Scrapy stuff:
|
| 88 |
-
.scrapy
|
| 89 |
-
|
| 90 |
-
# Sphinx documentation
|
| 91 |
-
docs/_build/
|
| 92 |
-
|
| 93 |
-
# PyBuilder
|
| 94 |
-
.pybuilder/
|
| 95 |
-
target/
|
| 96 |
-
|
| 97 |
-
# Jupyter Notebook
|
| 98 |
-
.ipynb_checkpoints
|
| 99 |
-
|
| 100 |
-
# IPython
|
| 101 |
-
profile_default/
|
| 102 |
-
ipython_config.py
|
| 103 |
-
|
| 104 |
-
# pyenv
|
| 105 |
-
# For a library or package, you might want to ignore these files since the code is
|
| 106 |
-
# intended to run in multiple environments; otherwise, check them in:
|
| 107 |
-
# .python-version
|
| 108 |
-
|
| 109 |
-
# pipenv
|
| 110 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 111 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 112 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 113 |
-
# install all needed dependencies.
|
| 114 |
-
#Pipfile.lock
|
| 115 |
-
|
| 116 |
-
# UV
|
| 117 |
-
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 118 |
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 119 |
-
# commonly ignored for libraries.
|
| 120 |
-
#uv.lock
|
| 121 |
-
|
| 122 |
-
# poetry
|
| 123 |
-
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 124 |
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 125 |
-
# commonly ignored for libraries.
|
| 126 |
-
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 127 |
-
#poetry.lock
|
| 128 |
-
#poetry.toml
|
| 129 |
-
|
| 130 |
-
# pdm
|
| 131 |
-
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 132 |
-
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 133 |
-
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 134 |
-
#pdm.lock
|
| 135 |
-
#pdm.toml
|
| 136 |
-
.pdm-python
|
| 137 |
-
.pdm-build/
|
| 138 |
-
|
| 139 |
-
# pixi
|
| 140 |
-
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 141 |
-
#pixi.lock
|
| 142 |
-
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 143 |
-
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 144 |
-
.pixi
|
| 145 |
-
|
| 146 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 147 |
-
__pypackages__/
|
| 148 |
-
|
| 149 |
-
# Celery stuff
|
| 150 |
-
celerybeat-schedule
|
| 151 |
-
celerybeat.pid
|
| 152 |
-
|
| 153 |
-
# SageMath parsed files
|
| 154 |
-
*.sage.py
|
| 155 |
-
|
| 156 |
-
# Environments
|
| 157 |
.env
|
| 158 |
.envrc
|
| 159 |
-
.venv
|
| 160 |
-
env/
|
| 161 |
venv/
|
| 162 |
-
|
| 163 |
-
env.bak/
|
| 164 |
-
venv.bak/
|
| 165 |
-
|
| 166 |
-
# Spyder project settings
|
| 167 |
-
.spyderproject
|
| 168 |
-
.spyproject
|
| 169 |
-
|
| 170 |
-
# Rope project settings
|
| 171 |
-
.ropeproject
|
| 172 |
-
|
| 173 |
-
# mkdocs documentation
|
| 174 |
-
/site
|
| 175 |
|
| 176 |
-
|
| 177 |
.mypy_cache/
|
| 178 |
-
.dmypy.json
|
| 179 |
-
dmypy.json
|
| 180 |
-
|
| 181 |
-
# Pyre type checker
|
| 182 |
-
.pyre/
|
| 183 |
-
|
| 184 |
-
# pytype static type analyzer
|
| 185 |
-
.pytype/
|
| 186 |
-
|
| 187 |
-
# Cython debug symbols
|
| 188 |
-
cython_debug/
|
| 189 |
-
|
| 190 |
-
# PyCharm
|
| 191 |
-
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 192 |
-
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 193 |
-
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 194 |
-
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 195 |
-
#.idea/
|
| 196 |
-
|
| 197 |
-
# Abstra
|
| 198 |
-
# Abstra is an AI-powered process automation framework.
|
| 199 |
-
# Ignore directories containing user credentials, local state, and settings.
|
| 200 |
-
# Learn more at https://abstra.io/docs
|
| 201 |
-
.abstra/
|
| 202 |
-
|
| 203 |
-
# Visual Studio Code
|
| 204 |
-
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 205 |
-
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 206 |
-
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 207 |
-
# you could uncomment the following to ignore the entire vscode folder
|
| 208 |
-
# .vscode/
|
| 209 |
-
|
| 210 |
-
# Ruff stuff:
|
| 211 |
.ruff_cache/
|
|
|
|
|
|
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
# Cursor
|
| 217 |
-
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 218 |
-
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 219 |
-
# refer to https://docs.cursor.com/context/ignore-files
|
| 220 |
-
.cursorignore
|
| 221 |
-
.cursorindexingignore
|
| 222 |
-
|
| 223 |
-
# Marimo
|
| 224 |
-
marimo/_static/
|
| 225 |
-
marimo/_lsp/
|
| 226 |
-
__marimo__/
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
AGENTS.md
|
| 2 |
+
runtime/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
data/
|
| 4 |
+
inputs/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
|
|
|
| 6 |
__pycache__/
|
| 7 |
+
*.py[cod]
|
| 8 |
*$py.class
|
|
|
|
|
|
|
| 9 |
*.so
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
.env
|
| 12 |
.envrc
|
| 13 |
+
.venv/
|
|
|
|
| 14 |
venv/
|
| 15 |
+
env/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
.pytest_cache/
|
| 18 |
.mypy_cache/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
.ruff_cache/
|
| 20 |
+
.coverage
|
| 21 |
+
htmlcov/
|
| 22 |
|
| 23 |
+
build/
|
| 24 |
+
dist/
|
| 25 |
+
*.egg-info/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
.codex/
|
| 28 |
+
.agents/
|
| 29 |
+
.idea/
|
| 30 |
+
.vscode/
|
| 31 |
+
.DS_Store
|
README.md
CHANGED
|
@@ -10,17 +10,87 @@ license: mit
|
|
| 10 |
short_description: Lightweight harness for tool-using LLM agents.
|
| 11 |
---
|
| 12 |
|
| 13 |
-
# ResearchHarness Space
|
| 14 |
|
| 15 |
-
This
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
## Required Secrets
|
| 26 |
|
|
@@ -48,7 +118,6 @@ Configure these as Hugging Face Space secrets before starting the app:
|
|
| 48 |
| `RH_COLLECTION_DATASET_REPO` | `CoCoOne/ResearchHarness-Data` | Dataset repo that receives trajectory PRs. |
|
| 49 |
| `RH_COLLECTION_BATCH_SIZE` | `5` | Create one dataset PR after this many collected runs. |
|
| 50 |
| `RH_COLLECTION_MAX_BUNDLE_BYTES` | `20971520` | Drop a single run bundle if it exceeds this byte limit. |
|
| 51 |
-
| `RH_ROLE_PROMPT_FILES` | empty | Optional `os.pathsep`-separated role prompt files inside the Space image. |
|
| 52 |
| `PORT` | `7860` | Port used by Hugging Face Docker Spaces. |
|
| 53 |
|
| 54 |
## Runtime Layout
|
|
@@ -82,3 +151,31 @@ python app.py
|
|
| 82 |
```
|
| 83 |
|
| 84 |
Then open `http://127.0.0.1:7860`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
short_description: Lightweight harness for tool-using LLM agents.
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# ResearchHarness Space Maintenance Notes
|
| 14 |
|
| 15 |
+
This repository is the Hugging Face Docker Space deployment for
|
| 16 |
+
[`ResearchHarness`](https://github.com/black-yt/ResearchHarness). It is an online
|
| 17 |
+
app mirror, not the public open-source documentation and not a full source mirror.
|
| 18 |
|
| 19 |
+
The public project README, tutorials, benchmark notes, API server documentation,
|
| 20 |
+
and local CLI documentation belong in the main GitHub repository. This Space
|
| 21 |
+
README should stay focused on long-term deployment maintenance: what is copied
|
| 22 |
+
from the main repo, what is intentionally changed for hosted use, and what is
|
| 23 |
+
new in the Space.
|
| 24 |
+
|
| 25 |
+
## Repository Relationship
|
| 26 |
+
|
| 27 |
+
| Repository | Role |
|
| 28 |
+
| --- | --- |
|
| 29 |
+
| `black-yt/ResearchHarness` | Main open-source runtime, CLI, API server, frontend, docs, tests, and benchmark adapters. |
|
| 30 |
+
| `CoCoOne/ResearchHarness` | Hugging Face Space app that hosts the browser frontend with managed temporary workspaces. |
|
| 31 |
+
| `CoCoOne/ResearchHarness-Data` | Hugging Face dataset receiving collected hosted-run trajectory PRs. |
|
| 32 |
+
|
| 33 |
+
Maintenance rules:
|
| 34 |
+
|
| 35 |
+
- Copy only the runtime/frontend pieces needed by the hosted app.
|
| 36 |
+
- Do not blindly sync the whole main repository into this Space.
|
| 37 |
+
- Space-only deployment logic must not be copied back into the main repo unless
|
| 38 |
+
it is genuinely general-purpose.
|
| 39 |
+
- Public documentation should be updated in the main repo, not duplicated here.
|
| 40 |
+
|
| 41 |
+
## Copied From The Main Repository
|
| 42 |
+
|
| 43 |
+
These files/directories are copied from the main repo and should be refreshed
|
| 44 |
+
when their corresponding upstream implementation changes:
|
| 45 |
+
|
| 46 |
+
- `agent_base/`: core ReAct runtime, prompts, tool registry, provider
|
| 47 |
+
compatibility, trace/session state, image handling, and compaction logic.
|
| 48 |
+
- `agent_base/tools/`: hosted-safe tool implementations used by the frontend.
|
| 49 |
+
- `frontend/static/`: shared browser UI assets, styles, and client logic.
|
| 50 |
+
- `frontend/local_server.py`: WebSocket streaming frontend server base, with
|
| 51 |
+
Space-specific managed-workspace behavior preserved.
|
| 52 |
+
- `requirements.txt`: Python runtime dependencies needed by the hosted app.
|
| 53 |
+
|
| 54 |
+
When updating these files from the main repo, inspect the diff and preserve the
|
| 55 |
+
Space-specific changes listed below.
|
| 56 |
+
|
| 57 |
+
## Space-Specific Changes
|
| 58 |
+
|
| 59 |
+
These behaviors are intentional Space-only deltas:
|
| 60 |
+
|
| 61 |
+
- `app.py` is the Hugging Face entrypoint and owns Space startup, cleanup, and
|
| 62 |
+
trajectory collection configuration.
|
| 63 |
+
- Users cannot select arbitrary server folders. Each new chat gets an isolated
|
| 64 |
+
managed run directory under `RH_SPACE_RUNS_DIR`.
|
| 65 |
+
- The runtime layout is always:
|
| 66 |
+
`run_.../agent_workspace/` for agent-visible files and
|
| 67 |
+
`run_.../agent_trace/` for traces and `_session_state.json`.
|
| 68 |
+
- Uploaded images are saved under `agent_workspace/inputs/images/` and are also
|
| 69 |
+
passed to the model as image inputs when supported.
|
| 70 |
+
- The frontend exposes a per-run model dropdown. Current options are `gpt-5.5`
|
| 71 |
+
and `claude-opus-4-7`; the selection must stay local to that run and must not
|
| 72 |
+
mutate global process environment variables.
|
| 73 |
+
- Completed runs are packaged for trajectory collection and submitted as pull
|
| 74 |
+
requests to the configured Hugging Face dataset after the batch threshold is
|
| 75 |
+
reached.
|
| 76 |
+
- Old inactive runs are cleaned periodically so the Space does not grow without
|
| 77 |
+
bound.
|
| 78 |
+
|
| 79 |
+
## Intentionally Removed From The Space
|
| 80 |
+
|
| 81 |
+
The Space intentionally does not keep the full main-repo surface area:
|
| 82 |
+
|
| 83 |
+
- `run_agent.py`, `run_server.py`, `run_frontend.py`
|
| 84 |
+
- OpenAI-compatible API server code under `api/`
|
| 85 |
+
- benchmark adapters and benchmark documentation under `benchmarks/`
|
| 86 |
+
- long-form tutorials under `docs/`
|
| 87 |
+
- local placeholder directories such as `workspace/`, `api_runs/`, and `traces/`
|
| 88 |
+
- CLI-only console formatting helpers
|
| 89 |
+
- test fixtures and local test suites
|
| 90 |
+
- `.env.example`
|
| 91 |
+
|
| 92 |
+
Removing these files keeps the deployed app small and avoids stale code or
|
| 93 |
+
misleading documentation drifting away from the main repository.
|
| 94 |
|
| 95 |
## Required Secrets
|
| 96 |
|
|
|
|
| 118 |
| `RH_COLLECTION_DATASET_REPO` | `CoCoOne/ResearchHarness-Data` | Dataset repo that receives trajectory PRs. |
|
| 119 |
| `RH_COLLECTION_BATCH_SIZE` | `5` | Create one dataset PR after this many collected runs. |
|
| 120 |
| `RH_COLLECTION_MAX_BUNDLE_BYTES` | `20971520` | Drop a single run bundle if it exceeds this byte limit. |
|
|
|
|
| 121 |
| `PORT` | `7860` | Port used by Hugging Face Docker Spaces. |
|
| 122 |
|
| 123 |
## Runtime Layout
|
|
|
|
| 151 |
```
|
| 152 |
|
| 153 |
Then open `http://127.0.0.1:7860`.
|
| 154 |
+
|
| 155 |
+
Before pushing Space changes, run at least:
|
| 156 |
+
|
| 157 |
+
```bash
|
| 158 |
+
python3 -B - <<'PY'
|
| 159 |
+
from pathlib import Path
|
| 160 |
+
import py_compile
|
| 161 |
+
|
| 162 |
+
for path in Path(".").rglob("*.py"):
|
| 163 |
+
    if ".git" not in path.parts:
|
| 164 |
+
        py_compile.compile(str(path), doraise=True)
|
| 165 |
+
print("syntax ok")
|
| 166 |
+
PY
|
| 167 |
+
|
| 168 |
+
RH_COLLECTION_ENABLED=false python3 -B - <<'PY'
|
| 169 |
+
from fastapi.testclient import TestClient
|
| 170 |
+
import app
|
| 171 |
+
|
| 172 |
+
client = TestClient(app.app)
|
| 173 |
+
response = client.get("/")
|
| 174 |
+
assert response.status_code == 200
|
| 175 |
+
assert "ResearchHarness" in response.text
|
| 176 |
+
print("app ok")
|
| 177 |
+
PY
|
| 178 |
+
|
| 179 |
+
node --check frontend/static/app.js
|
| 180 |
+
git diff --check
|
| 181 |
+
```
|
agent_base/console_utils.py
DELETED
|
@@ -1,223 +0,0 @@
|
|
| 1 |
-
import argparse
|
| 2 |
-
import json
|
| 3 |
-
import os
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
import shutil
|
| 6 |
-
import sys
|
| 7 |
-
import unicodedata
|
| 8 |
-
from typing import Any, Optional
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
ANSI_RESET = "\033[0m"
|
| 12 |
-
ANSI_COLORS = {
|
| 13 |
-
"header": "\033[36m",
|
| 14 |
-
"assistant": "\033[32m",
|
| 15 |
-
"tool": "\033[33m",
|
| 16 |
-
"runtime": "\033[34m",
|
| 17 |
-
"user": "\033[35m",
|
| 18 |
-
"error": "\033[31m",
|
| 19 |
-
}
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
def _char_display_width(char: str) -> int:
|
| 23 |
-
if unicodedata.combining(char):
|
| 24 |
-
return 0
|
| 25 |
-
if unicodedata.category(char) in {"Cc", "Cf"}:
|
| 26 |
-
return 0
|
| 27 |
-
return 2 if unicodedata.east_asian_width(char) in {"F", "W"} else 1
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def _display_width(text: str) -> int:
|
| 31 |
-
return sum(_char_display_width(char) for char in str(text))
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
def _truncate_display(text: str, width: int) -> str:
|
| 35 |
-
if _display_width(text) <= width:
|
| 36 |
-
return text
|
| 37 |
-
suffix = "..."
|
| 38 |
-
target = max(0, width - _display_width(suffix))
|
| 39 |
-
out = []
|
| 40 |
-
used = 0
|
| 41 |
-
for char in text:
|
| 42 |
-
char_width = _char_display_width(char)
|
| 43 |
-
if used + char_width > target:
|
| 44 |
-
break
|
| 45 |
-
out.append(char)
|
| 46 |
-
used += char_width
|
| 47 |
-
return "".join(out) + suffix
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
def _pad_display(text: str, width: int) -> str:
|
| 51 |
-
return text + " " * max(0, width - _display_width(text))
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
def _last_soft_break(chars: list[str]) -> int:
|
| 55 |
-
for index in range(len(chars) - 1, 0, -1):
|
| 56 |
-
if chars[index].isspace() and "".join(chars[:index]).strip():
|
| 57 |
-
return index
|
| 58 |
-
return -1
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
class ConsoleEventPrinter:
|
| 62 |
-
def __init__(self, *, model_name: str, workspace_root: Path, prompt: str):
|
| 63 |
-
self.model_name = model_name
|
| 64 |
-
self.workspace_root = workspace_root
|
| 65 |
-
self.prompt = prompt.strip()
|
| 66 |
-
self._printed_any = False
|
| 67 |
-
self._use_color = (
|
| 68 |
-
"NO_COLOR" not in os.environ
|
| 69 |
-
and os.environ.get("TERM") != "dumb"
|
| 70 |
-
and (sys.stdout.isatty() or bool(os.environ.get("FORCE_COLOR") or os.environ.get("CLICOLOR_FORCE")))
|
| 71 |
-
)
|
| 72 |
-
|
| 73 |
-
def print_header(self) -> None:
|
| 74 |
-
self._print_box(
|
| 75 |
-
"ResearchHarness CLI",
|
| 76 |
-
f"Model: {self.model_name}\nWorkspace Root: {self.workspace_root}\n\nPrompt:\n{self.prompt}",
|
| 77 |
-
"header",
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
def reset_rounds(self) -> None:
|
| 81 |
-
self._printed_any = False
|
| 82 |
-
|
| 83 |
-
def _paint(self, text: str, color_key: str) -> str:
|
| 84 |
-
if not self._use_color:
|
| 85 |
-
return text
|
| 86 |
-
return f"{ANSI_COLORS.get(color_key, '')}{text}{ANSI_RESET}"
|
| 87 |
-
|
| 88 |
-
def _terminal_width(self) -> int:
|
| 89 |
-
return max(60, min(110, shutil.get_terminal_size((100, 20)).columns))
|
| 90 |
-
|
| 91 |
-
def _wrap_line(self, line: str, width: int) -> list[str]:
|
| 92 |
-
expanded = line.expandtabs(2)
|
| 93 |
-
if expanded == "":
|
| 94 |
-
return [""]
|
| 95 |
-
chunks: list[str] = []
|
| 96 |
-
current: list[str] = []
|
| 97 |
-
current_width = 0
|
| 98 |
-
for char in expanded:
|
| 99 |
-
char_width = _char_display_width(char)
|
| 100 |
-
if current and current_width + char_width > width:
|
| 101 |
-
break_at = _last_soft_break(current)
|
| 102 |
-
if break_at > 0:
|
| 103 |
-
chunks.append("".join(current[:break_at]).rstrip())
|
| 104 |
-
current = list("".join(current[break_at + 1 :]).lstrip())
|
| 105 |
-
current_width = _display_width("".join(current))
|
| 106 |
-
else:
|
| 107 |
-
chunks.append("".join(current))
|
| 108 |
-
current = []
|
| 109 |
-
current_width = 0
|
| 110 |
-
current.append(char)
|
| 111 |
-
current_width += char_width
|
| 112 |
-
if current:
|
| 113 |
-
chunks.append("".join(current))
|
| 114 |
-
return chunks or [""]
|
| 115 |
-
|
| 116 |
-
def _print_box(self, title: str, body: str, color_key: str = "runtime") -> None:
|
| 117 |
-
width = self._terminal_width()
|
| 118 |
-
inner_width = width - 4
|
| 119 |
-
title_text = f" {_truncate_display(title.strip(), width - 6)} "
|
| 120 |
-
top = "+" + title_text + "-" * max(0, width - 2 - _display_width(title_text)) + "+"
|
| 121 |
-
bottom = "+" + "-" * (width - 2) + "+"
|
| 122 |
-
if self._printed_any:
|
| 123 |
-
print()
|
| 124 |
-
print(self._paint(top, color_key))
|
| 125 |
-
for raw_line in str(body or "").splitlines() or [""]:
|
| 126 |
-
for line in self._wrap_line(raw_line, inner_width):
|
| 127 |
-
padded = _pad_display(line, inner_width)
|
| 128 |
-
print(f"{self._paint('|', color_key)} {padded} {self._paint('|', color_key)}")
|
| 129 |
-
print(self._paint(bottom, color_key))
|
| 130 |
-
self._printed_any = True
|
| 131 |
-
|
| 132 |
-
def _title(self, label: str, turn_index: int) -> str:
|
| 133 |
-
return f"{label} | round {turn_index}" if turn_index > 0 else label
|
| 134 |
-
|
| 135 |
-
def _format_tool_call(self, tool_name: str, tool_args: Any) -> str:
|
| 136 |
-
try:
|
| 137 |
-
tool_args_text = json.dumps(tool_args, ensure_ascii=False, indent=2)
|
| 138 |
-
except TypeError:
|
| 139 |
-
tool_args_text = str(tool_args)
|
| 140 |
-
return f"- {tool_name}\n{tool_args_text}"
|
| 141 |
-
|
| 142 |
-
def handle_event(self, row: dict[str, Any]) -> None:
|
| 143 |
-
role = str(row.get("role", ""))
|
| 144 |
-
turn_index = int(row.get("turn_index", 0) or 0)
|
| 145 |
-
text = str(row.get("text", ""))
|
| 146 |
-
capture_type = str(row.get("capture_type", ""))
|
| 147 |
-
tool_names = row.get("tool_names") if isinstance(row.get("tool_names"), list) else []
|
| 148 |
-
tool_arguments = row.get("tool_arguments") if isinstance(row.get("tool_arguments"), list) else []
|
| 149 |
-
finish_reason = str(row.get("finish_reason", ""))
|
| 150 |
-
error = str(row.get("error", ""))
|
| 151 |
-
|
| 152 |
-
if capture_type and not text.strip():
|
| 153 |
-
return
|
| 154 |
-
|
| 155 |
-
if role == "system":
|
| 156 |
-
return
|
| 157 |
-
|
| 158 |
-
if role == "user":
|
| 159 |
-
if turn_index == 0:
|
| 160 |
-
return
|
| 161 |
-
self._print_box(self._title("Runtime Message", turn_index), text, "user")
|
| 162 |
-
return
|
| 163 |
-
|
| 164 |
-
if role == "assistant":
|
| 165 |
-
lines: list[str] = []
|
| 166 |
-
if tool_names:
|
| 167 |
-
if text.strip():
|
| 168 |
-
lines.append(text)
|
| 169 |
-
else:
|
| 170 |
-
suffix = f" finish_reason={finish_reason}" if finish_reason else ""
|
| 171 |
-
lines.append(f"(no text; native tool-calls only.{suffix})")
|
| 172 |
-
lines.append("")
|
| 173 |
-
lines.append("Assistant Tool Calls:")
|
| 174 |
-
for idx, tool_name in enumerate(tool_names):
|
| 175 |
-
tool_args = tool_arguments[idx] if idx < len(tool_arguments) else {}
|
| 176 |
-
lines.append(self._format_tool_call(str(tool_name), tool_args))
|
| 177 |
-
elif text.strip():
|
| 178 |
-
lines.append(text)
|
| 179 |
-
else:
|
| 180 |
-
suffix = f" finish_reason={finish_reason}" if finish_reason else ""
|
| 181 |
-
lines.append(f"(empty assistant output.{suffix})")
|
| 182 |
-
if error:
|
| 183 |
-
lines.append("")
|
| 184 |
-
lines.append(f"Assistant Error: {error}")
|
| 185 |
-
self._print_box(self._title("Assistant", turn_index), "\n".join(lines), "error" if error else "assistant")
|
| 186 |
-
return
|
| 187 |
-
|
| 188 |
-
if role == "tool":
|
| 189 |
-
tool_name = str(tool_names[0]) if tool_names else "Tool"
|
| 190 |
-
lines = [text]
|
| 191 |
-
if error:
|
| 192 |
-
lines.extend(["", f"{tool_name} Error: {error}"])
|
| 193 |
-
self._print_box(self._title(f"{tool_name} Result", turn_index), "\n".join(lines), "error" if error else "tool")
|
| 194 |
-
return
|
| 195 |
-
|
| 196 |
-
if role == "runtime":
|
| 197 |
-
lines = [text]
|
| 198 |
-
if error:
|
| 199 |
-
lines.extend(["", f"Runtime Error: {error}"])
|
| 200 |
-
self._print_box(self._title("Runtime", turn_index), "\n".join(lines), "error" if error else "runtime")
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
def main(argv: Optional[list[str]] = None) -> int:
|
| 204 |
-
parser = argparse.ArgumentParser(description="Show a minimal example of the CLI console event formatter.")
|
| 205 |
-
parser.parse_args(argv)
|
| 206 |
-
printer = ConsoleEventPrinter(model_name="demo-model", workspace_root=Path("."), prompt="demo question")
|
| 207 |
-
printer.print_header()
|
| 208 |
-
printer.handle_event(
|
| 209 |
-
{
|
| 210 |
-
"role": "assistant",
|
| 211 |
-
"turn_index": 1,
|
| 212 |
-
"text": "",
|
| 213 |
-
"tool_names": ["Read"],
|
| 214 |
-
"tool_arguments": [{"path": "demo.txt"}],
|
| 215 |
-
"termination": "",
|
| 216 |
-
"error": "",
|
| 217 |
-
}
|
| 218 |
-
)
|
| 219 |
-
return 0
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
if __name__ == "__main__":
|
| 223 |
-
raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agent_base/react_agent.py
CHANGED
|
@@ -1,18 +1,15 @@
|
|
| 1 |
-
import argparse
|
| 2 |
from contextlib import contextmanager
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
import signal
|
| 7 |
-
import sys
|
| 8 |
import threading
|
| 9 |
from pathlib import Path
|
| 10 |
-
from typing import Any, Callable, Dict, List, Optional, Sequence
|
| 11 |
|
| 12 |
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
| 13 |
import tiktoken
|
| 14 |
from agent_base.base import BaseAgent
|
| 15 |
-
from agent_base.console_utils import ConsoleEventPrinter
|
| 16 |
from agent_base.context_compact import compact_messages, should_compact_messages
|
| 17 |
from agent_base.model_profiles import resolve_model_profile
|
| 18 |
from agent_base.provider_compat import apply_sampling_params
|
|
@@ -25,16 +22,8 @@ from agent_base.tools.tool_runtime import Bash, TerminalInterrupt, TerminalKill,
|
|
| 25 |
from agent_base.tools.tool_user import AskUser
|
| 26 |
from agent_base.tools.tool_web import ScholarSearch, WebFetch, WebSearch
|
| 27 |
from agent_base.utils import (
|
| 28 |
-
PROJECT_ROOT,
|
| 29 |
-
MissingRequiredEnvError,
|
| 30 |
-
append_saved_image_paths_to_prompt,
|
| 31 |
env_flag,
|
| 32 |
-
image_input_content_parts,
|
| 33 |
-
load_dotenv,
|
| 34 |
-
read_role_prompt_files,
|
| 35 |
-
require_required_env,
|
| 36 |
safe_jsonable,
|
| 37 |
-
stage_image_file_for_input,
|
| 38 |
)
|
| 39 |
|
| 40 |
import datetime
|
|
@@ -75,6 +64,10 @@ DEFAULT_PRESENCE_PENALTY = 1.1
|
|
| 75 |
DEFAULT_LLM_TIMEOUT_SECONDS = 600.0
|
| 76 |
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
class LLMHardTimeoutError(TimeoutError):
|
| 79 |
pass
|
| 80 |
|
|
@@ -551,10 +544,10 @@ def image_context_trace_text(result: Any) -> str:
|
|
| 551 |
return text
|
| 552 |
|
| 553 |
|
| 554 |
-
def default_llm_config() -> dict:
|
| 555 |
-
|
| 556 |
return {
|
| 557 |
-
"model":
|
| 558 |
"api_key": os.environ.get("API_KEY", "EMPTY"),
|
| 559 |
"api_base": os.environ.get("API_BASE"),
|
| 560 |
"timeout_seconds": float(os.environ.get("LLM_TIMEOUT_SECONDS", str(DEFAULT_LLM_TIMEOUT_SECONDS))),
|
|
@@ -1195,6 +1188,7 @@ class MultiTurnReactAgent(BaseAgent):
|
|
| 1195 |
tool_arguments,
|
| 1196 |
workspace_root=resolved_workspace_root,
|
| 1197 |
runtime_deadline=runtime_deadline,
|
|
|
|
| 1198 |
)
|
| 1199 |
except KeyboardInterrupt:
|
| 1200 |
messages = messages[:tool_turn_message_start]
|
|
@@ -1312,142 +1306,3 @@ class MultiTurnReactAgent(BaseAgent):
|
|
| 1312 |
|
| 1313 |
def custom_call_tool(self, tool_name: str, tool_args: Any, **kwargs):
|
| 1314 |
return execute_tool_by_name(self.tool_map, tool_name, tool_args, **kwargs)
|
| 1315 |
-
|
| 1316 |
-
|
| 1317 |
-
def _path_has_suffix(path: Path, suffix_parts: Sequence[str]) -> bool:
|
| 1318 |
-
normalized_parts = tuple(part.casefold() for part in path.parts)
|
| 1319 |
-
normalized_suffix = tuple(part.casefold() for part in suffix_parts)
|
| 1320 |
-
if len(normalized_parts) < len(normalized_suffix):
|
| 1321 |
-
return False
|
| 1322 |
-
return normalized_parts[-len(normalized_suffix) :] == normalized_suffix
|
| 1323 |
-
|
| 1324 |
-
|
| 1325 |
-
def resolve_agent_class_for_role_prompt_files(role_prompt_files: Sequence[str]) -> Type[MultiTurnReactAgent]:
|
| 1326 |
-
for raw_path in role_prompt_files:
|
| 1327 |
-
path_text = str(raw_path).strip()
|
| 1328 |
-
if not path_text:
|
| 1329 |
-
continue
|
| 1330 |
-
path = Path(path_text).expanduser().resolve(strict=False)
|
| 1331 |
-
if _path_has_suffix(path, ("benchmarks", "ResearchClawBench", "role_prompt.md")):
|
| 1332 |
-
from benchmarks.ResearchClawBench.adapter import ResearchClawBenchAgent
|
| 1333 |
-
|
| 1334 |
-
return ResearchClawBenchAgent
|
| 1335 |
-
return MultiTurnReactAgent
|
| 1336 |
-
|
| 1337 |
-
|
| 1338 |
-
def _parse_cli_args(argv: list[str]) -> tuple[str, Optional[str], Optional[str], str, list[str], list[str], Optional[bool]]:
|
| 1339 |
-
parser = argparse.ArgumentParser(description="Run the local agent directly from agent_base.react_agent.")
|
| 1340 |
-
parser.add_argument("prompt", nargs="*", help="Prompt text.")
|
| 1341 |
-
parser.add_argument("--prompt-file", help="Optional UTF-8 text file containing the prompt.")
|
| 1342 |
-
parser.add_argument("--trace-dir", help="Optional directory where the run trace JSONL should be created.")
|
| 1343 |
-
parser.add_argument(
|
| 1344 |
-
"--workspace-root",
|
| 1345 |
-
help="Optional workspace root for local file tools, Bash, and TerminalStart.",
|
| 1346 |
-
)
|
| 1347 |
-
parser.add_argument(
|
| 1348 |
-
"--role-prompt-file",
|
| 1349 |
-
action="append",
|
| 1350 |
-
default=[],
|
| 1351 |
-
dest="role_prompt_files",
|
| 1352 |
-
metavar="PATH",
|
| 1353 |
-
help="Append one role-specific prompt file to the base system prompt. May be passed multiple times.",
|
| 1354 |
-
)
|
| 1355 |
-
parser.add_argument(
|
| 1356 |
-
"--images",
|
| 1357 |
-
action="append",
|
| 1358 |
-
nargs="+",
|
| 1359 |
-
default=[],
|
| 1360 |
-
dest="image_paths",
|
| 1361 |
-
metavar="PATH",
|
| 1362 |
-
help="Attach one or more local image paths to the initial user message.",
|
| 1363 |
-
)
|
| 1364 |
-
parser.add_argument(
|
| 1365 |
-
"--chat",
|
| 1366 |
-
action=argparse.BooleanOptionalAction,
|
| 1367 |
-
default=None,
|
| 1368 |
-
help="Continue asking for follow-up user messages after each final answer. Defaults to on only in an interactive terminal.",
|
| 1369 |
-
)
|
| 1370 |
-
args = parser.parse_args(argv)
|
| 1371 |
-
|
| 1372 |
-
prompt_text = ""
|
| 1373 |
-
if args.prompt_file:
|
| 1374 |
-
prompt_text = Path(args.prompt_file).read_text(encoding="utf-8").strip()
|
| 1375 |
-
elif args.prompt:
|
| 1376 |
-
prompt_text = " ".join(args.prompt).strip()
|
| 1377 |
-
|
| 1378 |
-
if not prompt_text:
|
| 1379 |
-
raise ValueError("A non-empty prompt is required via positional args or --prompt-file.")
|
| 1380 |
-
role_prompt = read_role_prompt_files(args.role_prompt_files)
|
| 1381 |
-
return (
|
| 1382 |
-
prompt_text,
|
| 1383 |
-
args.trace_dir,
|
| 1384 |
-
args.workspace_root,
|
| 1385 |
-
role_prompt,
|
| 1386 |
-
list(args.role_prompt_files),
|
| 1387 |
-
[path for group in args.image_paths for path in group],
|
| 1388 |
-
args.chat,
|
| 1389 |
-
)
|
| 1390 |
-
|
| 1391 |
-
|
| 1392 |
-
def main(argv: Optional[list[str]] = None) -> int:
|
| 1393 |
-
load_dotenv(PROJECT_ROOT / ".env")
|
| 1394 |
-
try:
|
| 1395 |
-
require_required_env("ResearchHarness agent")
|
| 1396 |
-
prompt_text, trace_dir, workspace_root, role_prompt, role_prompt_files, image_paths, chat_arg = _parse_cli_args(argv or sys.argv[1:])
|
| 1397 |
-
agent_cls = resolve_agent_class_for_role_prompt_files(role_prompt_files)
|
| 1398 |
-
agent = agent_cls(
|
| 1399 |
-
llm=default_llm_config(),
|
| 1400 |
-
trace_dir=trace_dir,
|
| 1401 |
-
role_prompt=role_prompt or None,
|
| 1402 |
-
)
|
| 1403 |
-
resolved_workspace_root = normalize_workspace_root(workspace_root)
|
| 1404 |
-
initial_content_parts: list[dict[str, Any]] = []
|
| 1405 |
-
saved_image_paths: list[str] = []
|
| 1406 |
-
for image_index, image_path in enumerate(image_paths):
|
| 1407 |
-
saved_path, data_url = stage_image_file_for_input(
|
| 1408 |
-
image_path,
|
| 1409 |
-
workspace_root=resolved_workspace_root,
|
| 1410 |
-
image_index=image_index,
|
| 1411 |
-
)
|
| 1412 |
-
saved_image_paths.append(saved_path)
|
| 1413 |
-
initial_content_parts.extend(image_input_content_parts(data_url, saved_path))
|
| 1414 |
-
run_prompt = append_saved_image_paths_to_prompt(prompt_text, saved_image_paths)
|
| 1415 |
-
printer = ConsoleEventPrinter(
|
| 1416 |
-
model_name=agent.model,
|
| 1417 |
-
workspace_root=resolved_workspace_root,
|
| 1418 |
-
prompt=run_prompt,
|
| 1419 |
-
)
|
| 1420 |
-
printer.print_header()
|
| 1421 |
-
session = agent._run_session(
|
| 1422 |
-
run_prompt,
|
| 1423 |
-
workspace_root=str(resolved_workspace_root),
|
| 1424 |
-
event_callback=printer.handle_event,
|
| 1425 |
-
initial_content_parts=initial_content_parts or None,
|
| 1426 |
-
)
|
| 1427 |
-
chat_enabled = chat_arg if chat_arg is not None else (sys.stdin.isatty() and sys.stdout.isatty())
|
| 1428 |
-
messages = session.get("messages", [])
|
| 1429 |
-
while chat_enabled:
|
| 1430 |
-
try:
|
| 1431 |
-
followup = input("\n[ResearchHarness] Follow-up (Ctrl+C to exit): ").strip()
|
| 1432 |
-
except (KeyboardInterrupt, EOFError):
|
| 1433 |
-
print("\n[ResearchHarness] Chat ended.")
|
| 1434 |
-
break
|
| 1435 |
-
if not followup:
|
| 1436 |
-
continue
|
| 1437 |
-
print(f"\n[ResearchHarness] Continuing conversation: {followup}")
|
| 1438 |
-
printer.reset_rounds()
|
| 1439 |
-
session = agent._run_session(
|
| 1440 |
-
followup,
|
| 1441 |
-
workspace_root=str(resolved_workspace_root),
|
| 1442 |
-
event_callback=printer.handle_event,
|
| 1443 |
-
prior_messages=messages,
|
| 1444 |
-
)
|
| 1445 |
-
messages = session.get("messages", messages)
|
| 1446 |
-
return 0
|
| 1447 |
-
except (MissingRequiredEnvError, ValueError) as exc:
|
| 1448 |
-
print(str(exc), file=sys.stderr)
|
| 1449 |
-
return 1
|
| 1450 |
-
|
| 1451 |
-
|
| 1452 |
-
if __name__ == "__main__":
|
| 1453 |
-
raise SystemExit(main())
|
|
|
|
|
|
|
| 1 |
from contextlib import contextmanager
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import re
|
| 5 |
import signal
|
|
|
|
| 6 |
import threading
|
| 7 |
from pathlib import Path
|
| 8 |
+
from typing import Any, Callable, Dict, List, Optional, Sequence
|
| 9 |
|
| 10 |
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
| 11 |
import tiktoken
|
| 12 |
from agent_base.base import BaseAgent
|
|
|
|
| 13 |
from agent_base.context_compact import compact_messages, should_compact_messages
|
| 14 |
from agent_base.model_profiles import resolve_model_profile
|
| 15 |
from agent_base.provider_compat import apply_sampling_params
|
|
|
|
| 22 |
from agent_base.tools.tool_user import AskUser
|
| 23 |
from agent_base.tools.tool_web import ScholarSearch, WebFetch, WebSearch
|
| 24 |
from agent_base.utils import (
|
|
|
|
|
|
|
|
|
|
| 25 |
env_flag,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
safe_jsonable,
|
|
|
|
| 27 |
)
|
| 28 |
|
| 29 |
import datetime
|
|
|
|
| 64 |
DEFAULT_LLM_TIMEOUT_SECONDS = 600.0
|
| 65 |
|
| 66 |
|
| 67 |
+
def default_model_name() -> str:
|
| 68 |
+
return os.environ.get("MODEL_NAME", DEFAULT_MODEL_NAME).strip() or DEFAULT_MODEL_NAME
|
| 69 |
+
|
| 70 |
+
|
| 71 |
class LLMHardTimeoutError(TimeoutError):
|
| 72 |
pass
|
| 73 |
|
|
|
|
| 544 |
return text
|
| 545 |
|
| 546 |
|
| 547 |
+
def default_llm_config(model_name: Optional[str] = None) -> dict:
|
| 548 |
+
selected_model = str(model_name or "").strip() or default_model_name()
|
| 549 |
return {
|
| 550 |
+
"model": selected_model,
|
| 551 |
"api_key": os.environ.get("API_KEY", "EMPTY"),
|
| 552 |
"api_base": os.environ.get("API_BASE"),
|
| 553 |
"timeout_seconds": float(os.environ.get("LLM_TIMEOUT_SECONDS", str(DEFAULT_LLM_TIMEOUT_SECONDS))),
|
|
|
|
| 1188 |
tool_arguments,
|
| 1189 |
workspace_root=resolved_workspace_root,
|
| 1190 |
runtime_deadline=runtime_deadline,
|
| 1191 |
+
model_name=self.model,
|
| 1192 |
)
|
| 1193 |
except KeyboardInterrupt:
|
| 1194 |
messages = messages[:tool_turn_message_start]
|
|
|
|
| 1306 |
|
| 1307 |
def custom_call_tool(self, tool_name: str, tool_args: Any, **kwargs):
|
| 1308 |
return execute_tool_by_name(self.tool_map, tool_name, tool_args, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agent_base/tools/README.md
DELETED
|
@@ -1,457 +0,0 @@
|
|
| 1 |
-
# Tools
|
| 2 |
-
|
| 3 |
-
This document describes the tool surface exposed to the model. Tool names use PascalCase consistently.
|
| 4 |
-
|
| 5 |
-
The current implementation is grouped by category:
|
| 6 |
-
|
| 7 |
-
- `agent_base/tools/tool_file.py`
|
| 8 |
-
- `agent_base/tools/tool_runtime.py`
|
| 9 |
-
- `agent_base/tools/tool_user.py`
|
| 10 |
-
- `agent_base/tools/tool_web.py`
|
| 11 |
-
|
| 12 |
-
## Overview
|
| 13 |
-
|
| 14 |
-
The current tool set is:
|
| 15 |
-
|
| 16 |
-
- `Glob`
|
| 17 |
-
- `Grep`
|
| 18 |
-
- `Read`
|
| 19 |
-
- `ReadPDF`
|
| 20 |
-
- `ReadImage`
|
| 21 |
-
- `Write`
|
| 22 |
-
- `Edit`
|
| 23 |
-
- `Bash`
|
| 24 |
-
- `WebSearch`
|
| 25 |
-
- `ScholarSearch`
|
| 26 |
-
- `WebFetch`
|
| 27 |
-
- `AskUser`
|
| 28 |
-
- `TerminalStart`
|
| 29 |
-
- `TerminalWrite`
|
| 30 |
-
- `TerminalRead`
|
| 31 |
-
- `TerminalInterrupt`
|
| 32 |
-
- `TerminalKill`
|
| 33 |
-
|
| 34 |
-
## Tool Matrix
|
| 35 |
-
|
| 36 |
-
| Tool | Category | Arguments | Description | Return Shape / Notes |
|
| 37 |
-
| --- | --- | --- | --- | --- |
|
| 38 |
-
| `Glob` | Local files | `pattern`, `path?`, `include_dirs?`, `max_results?` | Discover files or directories by pathname pattern inside the workspace. | Returns `root`, `match_count`, `truncated`, and `results`. Best for pathname discovery rather than reading content. |
|
| 39 |
-
| `Grep` | Local files | `pattern`, `path?`, `glob?`, `case_sensitive?`, `max_results?`, `max_chars?` | Search local text files by content and return matching lines. | Returns search metadata plus matched file paths, line numbers, and line text. Skips obvious binary files, images, and PDFs. |
|
| 40 |
-
| `Read` | Local files | `path`, `start_line?`, `end_line?`, `max_chars?` | Read a local text file, optionally by line range. | Returns normalized path, line metadata, truncation status, and `content`. Redirects PDF/image tasks toward `ReadPDF` or `ReadImage`. |
|
| 41 |
-
| `ReadPDF` | Local files | `path`, `max_chars?`, `max_image_paths?` | Read a local PDF, extract text, and expose extracted image paths when available. | Returns text content plus `image_paths` and image-count metadata. Depends on [`structai`](https://github.com/black-yt/structai) and `MINERU_TOKEN`. |
|
| 42 |
-
| `ReadImage` | Local files | `path` | Read a local image and expose image metadata for runtime multimodal use. | Returns image metadata only. During agent runs, the runtime sends a compressed attachment to the LLM API as an `image_url` content part. |
|
| 43 |
-
| `Write` | Local files | `path`, `content`, `overwrite?` | Create a text file or overwrite one when explicitly allowed. | Creates parent directories automatically. Returns an error if the file exists and `overwrite=false`. |
|
| 44 |
-
| `Edit` | Local files | `path`, `patch` | Apply a targeted patch to a local text file. | Expects unified-diff / hunk-style input. Context-based matching, not a full `patch(1)` implementation. |
|
| 45 |
-
| `Bash` | Runtime | `command`, `timeout?`, `workdir?` | Run one-shot shell commands for deterministic local execution, parsing, and validation. | Returns `stdout` and `stderr`. Primary local execution tool for short Python, `rg`, `find`, `git`, and structured local processing. |
|
| 46 |
-
| `WebSearch` | Web | `query` | Perform general web search over one or more complementary queries. | Returns a text summary headed by `## Web Results` with title, link, snippet, and date/source when available. Uses Serper. |
|
| 47 |
-
| `ScholarSearch` | Web | `query` | Search academic results such as papers, year, abstract, and citations. | Returns a text summary headed by `## Scholar Results` with title, PDF link, publication info, year, citation count, and abstract. Uses Serper Scholar. |
|
| 48 |
-
| `WebFetch` | Web | `url`, `goal` | Fetch a page, extract evidence relevant to a concrete goal, and summarize it. | Uses Jina Reader plus the configured summary model. Returns evidence-focused text rather than raw HTML. |
|
| 49 |
-
| `AskUser` | Human interaction | `question`, `context?` | Ask the human user one concise clarification question when essential information cannot be determined from tools or existing instructions. | Writes the question to the interactive terminal and returns the user's answer. If no interactive terminal is available, returns an explicit unavailable message. |
|
| 50 |
-
| `TerminalStart` | Runtime | `cwd?`, `shell?`, `rows?`, `cols?` | Start a persistent terminal session. | Returns session metadata such as `session_id`, `pid`, `cwd`, `shell`, `alive`, and `returncode`. |
|
| 51 |
-
| `TerminalWrite` | Runtime | `session_id`, `input`, `append_newline?`, `yield_time_ms?`, `max_output_chars?` | Send input to a persistent terminal session and read incremental output. | Best for stateful shells, REPLs, and long-running foreground processes. |
|
| 52 |
-
| `TerminalRead` | Runtime | `session_id`, `yield_time_ms?`, `max_output_chars?` | Read unread output from an existing persistent terminal session. | Useful when a process is still running and output arrives over time. |
|
| 53 |
-
| `TerminalInterrupt` | Runtime | `session_id`, `max_output_chars?` | Send `Ctrl-C` to the foreground process in a terminal session without destroying the session. | Use when a long-running process must be interrupted but the shell should remain alive. |
|
| 54 |
-
| `TerminalKill` | Runtime | `session_id`, `force?` | Terminate a persistent terminal session and release resources. | Final cleanup step for terminal sessions that are no longer needed. |
|
| 55 |
-
|
| 56 |
-
## Glob
|
| 57 |
-
|
| 58 |
-
Purpose:
|
| 59 |
-
|
| 60 |
-
- Discover local files or directories by glob pattern.
|
| 61 |
-
- Good for pathname discovery, not for reading file contents.
|
| 62 |
-
|
| 63 |
-
Arguments:
|
| 64 |
-
|
| 65 |
-
- `pattern`: string, a `pathlib`-style glob such as `**/*.py`
|
| 66 |
-
- `path`: optional string, search root, defaults to the current workspace
|
| 67 |
-
- `include_dirs`: optional boolean, defaults to `false`
|
| 68 |
-
- `max_results`: optional integer, defaults to `200`
|
| 69 |
-
|
| 70 |
-
Returns:
|
| 71 |
-
|
| 72 |
-
- `root`
|
| 73 |
-
- `pattern`
|
| 74 |
-
- `include_dirs`
|
| 75 |
-
- `match_count`
|
| 76 |
-
- `truncated`
|
| 77 |
-
- `results`
|
| 78 |
-
|
| 79 |
-
## Grep
|
| 80 |
-
|
| 81 |
-
Purpose:
|
| 82 |
-
|
| 83 |
-
- Search local text files by content.
|
| 84 |
-
- Return matched file paths, line numbers, and line text.
|
| 85 |
-
|
| 86 |
-
Arguments:
|
| 87 |
-
|
| 88 |
-
- `pattern`: string, regular expression
|
| 89 |
-
- `path`: optional string, file or directory path, defaults to the current workspace
|
| 90 |
-
- `glob`: optional string, file filter when scanning a directory, defaults to `**/*`
|
| 91 |
-
- `case_sensitive`: optional boolean, defaults to `false`
|
| 92 |
-
- `max_results`: optional integer, defaults to `100`
|
| 93 |
-
- `max_chars`: optional integer, defaults to `20000`
|
| 94 |
-
|
| 95 |
-
Behavior:
|
| 96 |
-
|
| 97 |
-
- If `path` is a file, only that file is searched.
|
| 98 |
-
- If `path` is a directory, matching text files are searched recursively.
|
| 99 |
-
- Images, PDFs, and obviously binary files are skipped.
|
| 100 |
-
|
| 101 |
-
Returns:
|
| 102 |
-
|
| 103 |
-
- `root`
|
| 104 |
-
- `pattern`
|
| 105 |
-
- `glob`
|
| 106 |
-
- `case_sensitive`
|
| 107 |
-
- `files_scanned`
|
| 108 |
-
- `match_count`
|
| 109 |
-
- `truncated`
|
| 110 |
-
- `results`
|
| 111 |
-
|
| 112 |
-
## Read
|
| 113 |
-
|
| 114 |
-
Purpose:
|
| 115 |
-
|
| 116 |
-
- Read a local text file.
|
| 117 |
-
- Support partial line ranges.
|
| 118 |
-
- Support long-text truncation.
|
| 119 |
-
|
| 120 |
-
Arguments:
|
| 121 |
-
|
| 122 |
-
- `path`: string, file path
|
| 123 |
-
- `start_line`: optional integer, 1-based start line
|
| 124 |
-
- `end_line`: optional integer, 1-based end line
|
| 125 |
-
- `max_chars`: optional integer, maximum returned characters, defaults to `20000`
|
| 126 |
-
|
| 127 |
-
Behavior:
|
| 128 |
-
|
| 129 |
-
- Only text files are handled directly.
|
| 130 |
-
- If the input is a PDF, the tool tells the model to use `ReadPDF`.
|
| 131 |
-
- If the input is an image, the tool tells the model to use `ReadImage`.
|
| 132 |
-
|
| 133 |
-
Returns:
|
| 134 |
-
|
| 135 |
-
- `path`
|
| 136 |
-
- `source_type: text`
|
| 137 |
-
- `start_line`
|
| 138 |
-
- `end_line`
|
| 139 |
-
- `total_lines`
|
| 140 |
-
- `truncated`
|
| 141 |
-
- `content`
|
| 142 |
-
|
| 143 |
-
## ReadPDF
|
| 144 |
-
|
| 145 |
-
Purpose:
|
| 146 |
-
|
| 147 |
-
- Read a local PDF.
|
| 148 |
-
- Return extracted text.
|
| 149 |
-
- Return extracted local image paths when the PDF parser produces image assets.
|
| 150 |
-
|
| 151 |
-
Arguments:
|
| 152 |
-
|
| 153 |
-
- `path`: string, PDF path
|
| 154 |
-
- `max_chars`: optional integer, maximum returned characters, defaults to `20000`
|
| 155 |
-
- `max_image_paths`: optional integer, maximum listed extracted image paths, defaults to `20`
|
| 156 |
-
|
| 157 |
-
Behavior:
|
| 158 |
-
|
| 159 |
-
- Calls `structai.read_pdf(...)` from [`structai`](https://github.com/black-yt/structai) underneath.
|
| 160 |
-
- Uses the returned `text` and `img_paths`.
|
| 161 |
-
- Depends on `MINERU_TOKEN`.
|
| 162 |
-
- If [`structai`](https://github.com/black-yt/structai) is missing, returns a clear dependency error instead of breaking unrelated file tools.
|
| 163 |
-
- For PDF figure tasks, prefer `ReadPDF` first to discover extracted text and extracted image paths, then use `ReadImage` on the actual extracted image file.
|
| 164 |
-
|
| 165 |
-
Returns:
|
| 166 |
-
|
| 167 |
-
- `path`
|
| 168 |
-
- `source_type: pdf`
|
| 169 |
-
- `total_lines`
|
| 170 |
-
- `truncated`
|
| 171 |
-
- `image_count`
|
| 172 |
-
- `image_paths_listed`
|
| 173 |
-
- `image_paths_truncated`
|
| 174 |
-
- `image_paths`
|
| 175 |
-
- `content`
|
| 176 |
-
|
| 177 |
-
## ReadImage
|
| 178 |
-
|
| 179 |
-
Purpose:
|
| 180 |
-
|
| 181 |
-
- Read a local image.
|
| 182 |
-
- Return image metadata.
|
| 183 |
-
- During a main agent run, pass a compressed image to the LLM API as an `image_url` content part instead of stuffing raw base64 text into ordinary message text.
|
| 184 |
-
|
| 185 |
-
Arguments:
|
| 186 |
-
|
| 187 |
-
- `path`: string, image path
|
| 188 |
-
|
| 189 |
-
Behavior:
|
| 190 |
-
|
| 191 |
-
- Uses `PIL.Image.open(...)` underneath.
|
| 192 |
-
- The runtime creates a compressed JPEG attachment for the LLM request and sends it as an inline `data:` URL in an `image_url` content part.
|
| 193 |
-
- Trace records and direct tool output keep image metadata only, not the full binary payload.
|
| 194 |
-
|
| 195 |
-
Returns:
|
| 196 |
-
|
| 197 |
-
- `path`
|
| 198 |
-
- `source_type`
|
| 199 |
-
- `format`
|
| 200 |
-
- `mime_type`
|
| 201 |
-
- `mode`
|
| 202 |
-
- `width`
|
| 203 |
-
- `height`
|
| 204 |
-
- `byte_count`
|
| 205 |
-
- `llm_attachment_format`
|
| 206 |
-
- `llm_attachment_width`
|
| 207 |
-
- `llm_attachment_height`
|
| 208 |
-
- `llm_attachment_byte_count`
|
| 209 |
-
|
| 210 |
-
## Write
|
| 211 |
-
|
| 212 |
-
Purpose:
|
| 213 |
-
|
| 214 |
-
- Create a text file.
|
| 215 |
-
- Overwrite an existing file when explicitly requested.
|
| 216 |
-
|
| 217 |
-
Arguments:
|
| 218 |
-
|
| 219 |
-
- `path`: string, destination file path
|
| 220 |
-
- `content`: string, complete file content
|
| 221 |
-
- `overwrite`: optional boolean, defaults to `false`
|
| 222 |
-
|
| 223 |
-
Behavior:
|
| 224 |
-
|
| 225 |
-
- Parent directories are created automatically.
|
| 226 |
-
- If `overwrite=false` and the file already exists, the tool returns an error.
|
| 227 |
-
|
| 228 |
-
## Edit
|
| 229 |
-
|
| 230 |
-
Purpose:
|
| 231 |
-
|
| 232 |
-
- Edit a local text file partially.
|
| 233 |
-
- Best for targeted patches, not full-file rewrites.
|
| 234 |
-
|
| 235 |
-
Arguments:
|
| 236 |
-
|
| 237 |
-
- `path`: string, destination file path
|
| 238 |
-
- `patch`: string, unified-diff / hunk-style patch
|
| 239 |
-
|
| 240 |
-
Behavior:
|
| 241 |
-
|
| 242 |
-
- Requires explicit hunks such as `@@ -1,2 +1,2 @@`.
|
| 243 |
-
- The current implementation matches by surrounding context blocks rather than implementing full `patch(1)` line-number semantics.
|
| 244 |
-
|
| 245 |
-
Returns:
|
| 246 |
-
|
| 247 |
-
- updated file path on success
|
| 248 |
-
- applied hunk count
|
| 249 |
-
|
| 250 |
-
## Bash
|
| 251 |
-
|
| 252 |
-
Purpose:
|
| 253 |
-
|
| 254 |
-
- Execute one-shot shell commands.
|
| 255 |
-
- Handle paths, search, git, conda, and local script orchestration.
|
| 256 |
-
- Serve as the primary local execution tool for temporary Python, deterministic computation, validation, formatting, and parsing.
|
| 257 |
-
|
| 258 |
-
Arguments:
|
| 259 |
-
|
| 260 |
-
- `command`: string, shell command to execute
|
| 261 |
-
- `timeout`: optional integer, seconds, defaults to `30`
|
| 262 |
-
- `workdir`: optional string, working directory
|
| 263 |
-
|
| 264 |
-
Behavior:
|
| 265 |
-
|
| 266 |
-
- Uses local `bash`.
|
| 267 |
-
- Returns both `stdout` and `stderr`.
|
| 268 |
-
- Timeout produces an explicit error.
|
| 269 |
-
- Short scripts are well suited to a heredoc such as `python3 - <<'PY'`.
|
| 270 |
-
|
| 271 |
-
Recommended use cases:
|
| 272 |
-
|
| 273 |
-
- pathname and file discovery
|
| 274 |
-
- `rg`, `find`, `git`
|
| 275 |
-
- local Python or other CLI programs
|
| 276 |
-
- deterministic CSV / JSON / text processing
|
| 277 |
-
- local computation and validation against absolute paths returned by file tools
|
| 278 |
-
|
| 279 |
-
## WebSearch
|
| 280 |
-
|
| 281 |
-
Purpose:
|
| 282 |
-
|
| 283 |
-
- General web search.
|
| 284 |
-
- Supports passing multiple complementary queries in one call.
|
| 285 |
-
|
| 286 |
-
Arguments:
|
| 287 |
-
|
| 288 |
-
- `query`: array of strings, at least one query
|
| 289 |
-
|
| 290 |
-
Behavior:
|
| 291 |
-
|
| 292 |
-
- Calls Serper's Google Search endpoint.
|
| 293 |
-
- Reads `SERPER_KEY` at runtime.
|
| 294 |
-
|
| 295 |
-
Returns:
|
| 296 |
-
|
| 297 |
-
- query summary text
|
| 298 |
-
- `## Web Results`
|
| 299 |
-
- title, link, snippet, and date/source when available
|
| 300 |
-
|
| 301 |
-
## ScholarSearch
|
| 302 |
-
|
| 303 |
-
Purpose:
|
| 304 |
-
|
| 305 |
-
- Academic search.
|
| 306 |
-
- Return paper title, year, abstract, citation count, and related metadata.
|
| 307 |
-
|
| 308 |
-
Arguments:
|
| 309 |
-
|
| 310 |
-
- `query`: array of strings, at least one query
|
| 311 |
-
|
| 312 |
-
Behavior:
|
| 313 |
-
|
| 314 |
-
- Calls Serper's Google Scholar endpoint.
|
| 315 |
-
- Reads `SERPER_KEY` at runtime.
|
| 316 |
-
|
| 317 |
-
Returns:
|
| 318 |
-
|
| 319 |
-
- query summary text
|
| 320 |
-
- `## Scholar Results`
|
| 321 |
-
- title, PDF link, `publicationInfo`, year, citation count, and abstract
|
| 322 |
-
|
| 323 |
-
## WebFetch
|
| 324 |
-
|
| 325 |
-
Purpose:
|
| 326 |
-
|
| 327 |
-
- Visit a webpage.
|
| 328 |
-
- Extract evidence relevant to a concrete goal.
|
| 329 |
-
- Produce a goal-oriented summary.
|
| 330 |
-
|
| 331 |
-
Arguments:
|
| 332 |
-
|
| 333 |
-
- `url`: string or array of strings, page URL or URLs
|
| 334 |
-
- `goal`: string, the specific goal to extract from the page
|
| 335 |
-
|
| 336 |
-
Behavior:
|
| 337 |
-
|
| 338 |
-
- Fetches page text through Jina Reader first.
|
| 339 |
-
- Then calls the configured summary-model endpoint for evidence extraction and summarization.
|
| 340 |
-
- Returns a fetch-and-extract result, not raw HTML.
|
| 341 |
-
|
| 342 |
-
Dependencies:
|
| 343 |
-
|
| 344 |
-
- `JINA_KEY`
|
| 345 |
-
- `API_KEY`
|
| 346 |
-
- `API_BASE`
|
| 347 |
-
- `MODEL_NAME`
|
| 348 |
-
|
| 349 |
-
Returns:
|
| 350 |
-
|
| 351 |
-
- `The useful information in ...`
|
| 352 |
-
- `Evidence in page:`
|
| 353 |
-
- `Summary:`
|
| 354 |
-
|
| 355 |
-
## TerminalStart
|
| 356 |
-
|
| 357 |
-
Purpose:
|
| 358 |
-
|
| 359 |
-
- Start a persistent terminal session.
|
| 360 |
-
|
| 361 |
-
Arguments:
|
| 362 |
-
|
| 363 |
-
- `cwd`: optional string, working directory
|
| 364 |
-
- `shell`: optional string, shell path
|
| 365 |
-
- `rows`: optional integer, terminal rows, defaults to `30`
|
| 366 |
-
- `cols`: optional integer, terminal columns, defaults to `120`
|
| 367 |
-
|
| 368 |
-
Returns:
|
| 369 |
-
|
| 370 |
-
- `session_id`
|
| 371 |
-
- `pid`
|
| 372 |
-
- `cwd`
|
| 373 |
-
- `shell`
|
| 374 |
-
- `alive`
|
| 375 |
-
- `returncode`
|
| 376 |
-
|
| 377 |
-
## TerminalWrite
|
| 378 |
-
|
| 379 |
-
Purpose:
|
| 380 |
-
|
| 381 |
-
- Send input to an existing terminal session and read output.
|
| 382 |
-
|
| 383 |
-
Arguments:
|
| 384 |
-
|
| 385 |
-
- `session_id`: string, session id
|
| 386 |
-
- `input`: string, text to send
|
| 387 |
-
- `append_newline`: optional boolean, defaults to `true`
|
| 388 |
-
- `yield_time_ms`: optional integer, defaults to `200`
|
| 389 |
-
- `max_output_chars`: optional integer, defaults to `20000`
|
| 390 |
-
|
| 391 |
-
## TerminalRead
|
| 392 |
-
|
| 393 |
-
Purpose:
|
| 394 |
-
|
| 395 |
-
- Read unread output from an existing terminal session.
|
| 396 |
-
|
| 397 |
-
Arguments:
|
| 398 |
-
|
| 399 |
-
- `session_id`: string, session id
|
| 400 |
-
- `yield_time_ms`: optional integer, defaults to `200`
|
| 401 |
-
- `max_output_chars`: optional integer, defaults to `20000`
|
| 402 |
-
|
| 403 |
-
## TerminalInterrupt
|
| 404 |
-
|
| 405 |
-
Purpose:
|
| 406 |
-
|
| 407 |
-
- Send `Ctrl-C` to the foreground process in a terminal session.
|
| 408 |
-
- Keep the session alive.
|
| 409 |
-
|
| 410 |
-
Arguments:
|
| 411 |
-
|
| 412 |
-
- `session_id`: string, session id
|
| 413 |
-
- `max_output_chars`: optional integer, defaults to `20000`
|
| 414 |
-
|
| 415 |
-
## TerminalKill
|
| 416 |
-
|
| 417 |
-
Purpose:
|
| 418 |
-
|
| 419 |
-
- Terminate a terminal session.
|
| 420 |
-
- Release related resources.
|
| 421 |
-
|
| 422 |
-
Arguments:
|
| 423 |
-
|
| 424 |
-
- `session_id`: string, session id
|
| 425 |
-
- `force`: optional boolean, defaults to `false`
|
| 426 |
-
|
| 427 |
-
## AskUser
|
| 428 |
-
|
| 429 |
-
Purpose:
|
| 430 |
-
|
| 431 |
-
- Ask the human user for essential missing information, preference, or approval.
|
| 432 |
-
- Use only when the answer cannot be determined from workspace files, available tools, or existing instructions.
|
| 433 |
-
|
| 434 |
-
Arguments:
|
| 435 |
-
|
| 436 |
-
- `question`: string, concise question to ask.
|
| 437 |
-
- `context`: optional string, brief explanation of why the question is necessary.
|
| 438 |
-
|
| 439 |
-
Behavior:
|
| 440 |
-
|
| 441 |
-
- Writes the question to the interactive terminal and waits for one user answer.
|
| 442 |
-
- Returns an explicit unavailable message instead of blocking when no interactive terminal exists.
|
| 443 |
-
- Not available in ResearchClawBench runs.
|
| 444 |
-
|
| 445 |
-
## Suggested Usage
|
| 446 |
-
|
| 447 |
-
- Use `Glob` first for pathname discovery.
|
| 448 |
-
- Use `Grep` first for local text search.
|
| 449 |
-
- Use `Read` for local text files.
|
| 450 |
-
- Use `ReadPDF` for local PDFs.
|
| 451 |
-
- Use `ReadImage` for local images.
|
| 452 |
-
- Use `Edit` for targeted file changes.
|
| 453 |
-
- Use `Write` for full-file writes.
|
| 454 |
-
- Use `Bash` for one-shot system commands.
|
| 455 |
-
- Use `AskUser` only when a human answer is genuinely necessary.
|
| 456 |
-
- Use `Terminal*` only when persistent interactive shell state is actually needed.
|
| 457 |
-
- Route pure Python analysis through `Bash` rather than introducing a separate Python tool.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agent_base/tools/tool_web.py
CHANGED
|
@@ -373,11 +373,12 @@ class WebFetch(ToolBase):
|
|
| 373 |
except ValueError as exc:
|
| 374 |
return f"[WebFetch] {exc}"
|
| 375 |
runtime_deadline = kwargs.get("runtime_deadline")
|
|
|
|
| 376 |
|
| 377 |
start_time = time.time()
|
| 378 |
|
| 379 |
if isinstance(url, str):
|
| 380 |
-
response = self.readpage_jina(url, goal, runtime_deadline=runtime_deadline)
|
| 381 |
elif isinstance(url, list):
|
| 382 |
response = []
|
| 383 |
start_time = time.time()
|
|
@@ -396,7 +397,12 @@ class WebFetch(ToolBase):
|
|
| 396 |
cur_response += "Evidence in page: \n" + "The provided webpage content could not be accessed. Please check the URL or file format." + "\n\n"
|
| 397 |
cur_response += "Summary: \n" + "The webpage content could not be processed, and therefore, no information is available." + "\n\n"
|
| 398 |
else:
|
| 399 |
-
cur_response = self.readpage_jina(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
response.append(cur_response)
|
| 401 |
response = "\n=======\n".join(response)
|
| 402 |
else:
|
|
@@ -406,11 +412,18 @@ class WebFetch(ToolBase):
|
|
| 406 |
print(f"Summary Length {len(response)}")
|
| 407 |
return response.strip()
|
| 408 |
|
| 409 |
-
def call_server(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
client = self._ensure_summary_client()
|
| 411 |
if client is None or not self._summary_api_base:
|
| 412 |
return "[WebFetch] Summary model error: API_BASE is not set."
|
| 413 |
-
|
|
|
|
| 414 |
return "[WebFetch] Summary model error: MODEL_NAME is not set."
|
| 415 |
last_error = "unknown summary-model error"
|
| 416 |
for attempt in range(max_retries):
|
|
@@ -424,12 +437,12 @@ class WebFetch(ToolBase):
|
|
| 424 |
else client
|
| 425 |
)
|
| 426 |
request_kwargs = {
|
| 427 |
-
"model":
|
| 428 |
"messages": msgs,
|
| 429 |
}
|
| 430 |
apply_sampling_params(
|
| 431 |
request_kwargs,
|
| 432 |
-
model_name=
|
| 433 |
temperature=self._summary_temperature,
|
| 434 |
top_p=self._summary_top_p,
|
| 435 |
presence_penalty=self._summary_presence_penalty,
|
|
@@ -494,8 +507,21 @@ class WebFetch(ToolBase):
|
|
| 494 |
return content
|
| 495 |
return "[WebFetch] Failed to read page: exhausted retries"
|
| 496 |
|
| 497 |
-
def readpage_jina(
|
| 498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
max_retries = int(os.getenv("LLM_MAX_RETRIES", str(DEFAULT_LLM_MAX_RETRIES)))
|
| 500 |
|
| 501 |
content = self.html_readpage_jina(url, runtime_deadline=runtime_deadline)
|
|
|
|
| 373 |
except ValueError as exc:
|
| 374 |
return f"[WebFetch] {exc}"
|
| 375 |
runtime_deadline = kwargs.get("runtime_deadline")
|
| 376 |
+
summary_model_name = str(kwargs.get("model_name") or "").strip()
|
| 377 |
|
| 378 |
start_time = time.time()
|
| 379 |
|
| 380 |
if isinstance(url, str):
|
| 381 |
+
response = self.readpage_jina(url, goal, runtime_deadline=runtime_deadline, summary_model_name=summary_model_name)
|
| 382 |
elif isinstance(url, list):
|
| 383 |
response = []
|
| 384 |
start_time = time.time()
|
|
|
|
| 397 |
cur_response += "Evidence in page: \n" + "The provided webpage content could not be accessed. Please check the URL or file format." + "\n\n"
|
| 398 |
cur_response += "Summary: \n" + "The webpage content could not be processed, and therefore, no information is available." + "\n\n"
|
| 399 |
else:
|
| 400 |
+
cur_response = self.readpage_jina(
|
| 401 |
+
one_url,
|
| 402 |
+
goal,
|
| 403 |
+
runtime_deadline=runtime_deadline,
|
| 404 |
+
summary_model_name=summary_model_name,
|
| 405 |
+
)
|
| 406 |
response.append(cur_response)
|
| 407 |
response = "\n=======\n".join(response)
|
| 408 |
else:
|
|
|
|
| 412 |
print(f"Summary Length {len(response)}")
|
| 413 |
return response.strip()
|
| 414 |
|
| 415 |
+
def call_server(
|
| 416 |
+
self,
|
| 417 |
+
msgs,
|
| 418 |
+
max_retries=2,
|
| 419 |
+
runtime_deadline: Optional[float] = None,
|
| 420 |
+
model_name: str = "",
|
| 421 |
+
):
|
| 422 |
client = self._ensure_summary_client()
|
| 423 |
if client is None or not self._summary_api_base:
|
| 424 |
return "[WebFetch] Summary model error: API_BASE is not set."
|
| 425 |
+
summary_model_name = str(model_name or self._summary_model_name or os.environ.get("MODEL_NAME", "")).strip()
|
| 426 |
+
if not summary_model_name:
|
| 427 |
return "[WebFetch] Summary model error: MODEL_NAME is not set."
|
| 428 |
last_error = "unknown summary-model error"
|
| 429 |
for attempt in range(max_retries):
|
|
|
|
| 437 |
else client
|
| 438 |
)
|
| 439 |
request_kwargs = {
|
| 440 |
+
"model": summary_model_name,
|
| 441 |
"messages": msgs,
|
| 442 |
}
|
| 443 |
apply_sampling_params(
|
| 444 |
request_kwargs,
|
| 445 |
+
model_name=summary_model_name,
|
| 446 |
temperature=self._summary_temperature,
|
| 447 |
top_p=self._summary_top_p,
|
| 448 |
presence_penalty=self._summary_presence_penalty,
|
|
|
|
| 507 |
return content
|
| 508 |
return "[WebFetch] Failed to read page: exhausted retries"
|
| 509 |
|
| 510 |
+
def readpage_jina(
|
| 511 |
+
self,
|
| 512 |
+
url: str,
|
| 513 |
+
goal: str,
|
| 514 |
+
runtime_deadline: Optional[float] = None,
|
| 515 |
+
summary_model_name: str = "",
|
| 516 |
+
) -> str:
|
| 517 |
+
def summary_page_func(messages, max_retries=2, runtime_deadline: Optional[float] = None):
|
| 518 |
+
return self.call_server(
|
| 519 |
+
messages,
|
| 520 |
+
max_retries=max_retries,
|
| 521 |
+
runtime_deadline=runtime_deadline,
|
| 522 |
+
model_name=summary_model_name,
|
| 523 |
+
)
|
| 524 |
+
|
| 525 |
max_retries = int(os.getenv("LLM_MAX_RETRIES", str(DEFAULT_LLM_MAX_RETRIES)))
|
| 526 |
|
| 527 |
content = self.html_readpage_jina(url, runtime_deadline=runtime_deadline)
|
agent_base/utils.py
CHANGED
|
@@ -87,21 +87,6 @@ def require_required_env(context: str = "ResearchHarness") -> None:
|
|
| 87 |
)
|
| 88 |
|
| 89 |
|
| 90 |
-
def read_role_prompt_files(paths: Iterable[str]) -> str:
|
| 91 |
-
blocks: list[str] = []
|
| 92 |
-
for raw_path in paths:
|
| 93 |
-
path_text = str(raw_path).strip()
|
| 94 |
-
if not path_text:
|
| 95 |
-
continue
|
| 96 |
-
path = Path(path_text).expanduser()
|
| 97 |
-
if not path.exists():
|
| 98 |
-
raise ValueError(f"Role prompt file does not exist: {path}")
|
| 99 |
-
if not path.is_file():
|
| 100 |
-
raise ValueError(f"Role prompt path is not a file: {path}")
|
| 101 |
-
blocks.append(path.read_text(encoding="utf-8").strip())
|
| 102 |
-
return "\n\n".join(block for block in blocks if block.strip())
|
| 103 |
-
|
| 104 |
-
|
| 105 |
def _safe_image_stem(name: str, fallback: str) -> str:
|
| 106 |
stem = re.sub(r"[^A-Za-z0-9_.-]+", "_", Path(name).stem).strip("._")
|
| 107 |
return stem or fallback
|
|
|
|
| 87 |
)
|
| 88 |
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
def _safe_image_stem(name: str, fallback: str) -> str:
|
| 91 |
stem = re.sub(r"[^A-Za-z0-9_.-]+", "_", Path(name).stem).strip("._")
|
| 92 |
return stem or fallback
|
api/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
"""OpenAI-compatible API helpers for ResearchHarness."""
|
|
|
|
|
|
api/openai_server.py
DELETED
|
@@ -1,518 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import base64
|
| 4 |
-
import binascii
|
| 5 |
-
import datetime
|
| 6 |
-
import json
|
| 7 |
-
import re
|
| 8 |
-
import time
|
| 9 |
-
from dataclasses import dataclass
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import Any, Optional
|
| 12 |
-
from uuid import uuid4
|
| 13 |
-
|
| 14 |
-
import uvicorn
|
| 15 |
-
from fastapi import Body, FastAPI, Request
|
| 16 |
-
from fastapi.responses import JSONResponse
|
| 17 |
-
|
| 18 |
-
from agent_base.react_agent import (
|
| 19 |
-
AVAILABLE_TOOL_MAP,
|
| 20 |
-
MultiTurnReactAgent,
|
| 21 |
-
assistant_text_content,
|
| 22 |
-
default_llm_config,
|
| 23 |
-
model_supports_runtime_image_parts,
|
| 24 |
-
)
|
| 25 |
-
from agent_base.tools.tooling import normalize_workspace_root
|
| 26 |
-
from agent_base.utils import append_jsonl, image_input_content_parts, read_role_prompt_files, safe_jsonable
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
DATA_IMAGE_RE = re.compile(r"^data:(image/[A-Za-z0-9.+-]+);base64,(.*)$", re.DOTALL)
|
| 30 |
-
IMAGE_EXTENSIONS = {
|
| 31 |
-
"image/png": ".png",
|
| 32 |
-
"image/jpeg": ".jpg",
|
| 33 |
-
"image/jpg": ".jpg",
|
| 34 |
-
"image/webp": ".webp",
|
| 35 |
-
"image/gif": ".gif",
|
| 36 |
-
}
|
| 37 |
-
DEFAULT_MAX_IMAGE_BYTES = 25 * 1024 * 1024
|
| 38 |
-
|
| 39 |
-
INPUT_WRAPPER_SYSTEM_PROMPT = """You are the ResearchHarness input wrapper.
|
| 40 |
-
|
| 41 |
-
Convert the user's OpenAI-compatible chat request into a stable task for a
|
| 42 |
-
tool-using ResearchHarness agent.
|
| 43 |
-
|
| 44 |
-
Return only a JSON object with these string fields:
|
| 45 |
-
- agent_instruction: the task the agent should solve, including all substantive question details.
|
| 46 |
-
- output_contract: the final output format or schema requested by the user. If no strict format is requested, say "plain text".
|
| 47 |
-
- wrapper_notes: brief notes about images, constraints, or benchmark-specific requirements.
|
| 48 |
-
|
| 49 |
-
Rules:
|
| 50 |
-
- Do not answer the task.
|
| 51 |
-
- Do not remove substantive constraints.
|
| 52 |
-
- Keep strict final formatting requirements out of agent_instruction when possible.
|
| 53 |
-
- If images are listed, mention their saved paths in agent_instruction.
|
| 54 |
-
"""
|
| 55 |
-
|
| 56 |
-
OUTPUT_WRAPPER_SYSTEM_PROMPT = """You are the ResearchHarness output wrapper.
|
| 57 |
-
|
| 58 |
-
Format the ResearchHarness agent result so it satisfies the user's requested
|
| 59 |
-
final output contract.
|
| 60 |
-
|
| 61 |
-
Rules:
|
| 62 |
-
- Return only the final answer requested by the user.
|
| 63 |
-
- Do not add markdown fences unless the user explicitly required them.
|
| 64 |
-
- Do not solve the task again.
|
| 65 |
-
- Do not introduce facts not present in the agent result.
|
| 66 |
-
- Make the answer complete and self-contained for a remote user or evaluator.
|
| 67 |
-
- The answer may mention workspace files when useful, but it must not depend on
|
| 68 |
-
local files as the only carrier of the answer.
|
| 69 |
-
- Include the actual answer and any necessary evidence or solution steps in the
|
| 70 |
-
returned text.
|
| 71 |
-
- If reasoning or evidence is required, summarize it directly in the final
|
| 72 |
-
answer according to the requested format.
|
| 73 |
-
- If the requested format is JSON, return valid JSON only.
|
| 74 |
-
- If the agent result does not contain enough information, produce the best
|
| 75 |
-
contract-compliant failure answer instead of inventing evidence.
|
| 76 |
-
"""
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
class OpenAICompatError(Exception):
|
| 80 |
-
def __init__(self, status_code: int, message: str, error_type: str = "invalid_request_error"):
|
| 81 |
-
super().__init__(message)
|
| 82 |
-
self.status_code = status_code
|
| 83 |
-
self.message = message
|
| 84 |
-
self.error_type = error_type
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
@dataclass
|
| 88 |
-
class ServerConfig:
|
| 89 |
-
api_runs_dir: Path
|
| 90 |
-
role_prompt: str = ""
|
| 91 |
-
host: str = "127.0.0.1"
|
| 92 |
-
port: int = 8686
|
| 93 |
-
input_wrapper: bool = True
|
| 94 |
-
output_wrapper: bool = True
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
@dataclass
|
| 98 |
-
class PreparedInput:
|
| 99 |
-
wrapper_messages: list[dict[str, str]]
|
| 100 |
-
initial_content_parts: list[dict[str, Any]]
|
| 101 |
-
image_paths: list[str]
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def openai_error_response(exc: OpenAICompatError) -> JSONResponse:
|
| 105 |
-
return JSONResponse(
|
| 106 |
-
status_code=exc.status_code,
|
| 107 |
-
content={"error": {"message": exc.message, "type": exc.error_type}},
|
| 108 |
-
)
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
def make_chat_completion_response(*, request_id: str, model: str, content: str) -> dict[str, Any]:
|
| 112 |
-
return {
|
| 113 |
-
"id": request_id,
|
| 114 |
-
"object": "chat.completion",
|
| 115 |
-
"created": int(time.time()),
|
| 116 |
-
"model": model,
|
| 117 |
-
"choices": [
|
| 118 |
-
{
|
| 119 |
-
"index": 0,
|
| 120 |
-
"message": {"role": "assistant", "content": content},
|
| 121 |
-
"finish_reason": "stop",
|
| 122 |
-
}
|
| 123 |
-
],
|
| 124 |
-
}
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
def validate_chat_payload(payload: Any) -> dict[str, Any]:
|
| 128 |
-
if not isinstance(payload, dict):
|
| 129 |
-
raise OpenAICompatError(400, "Request body must be a JSON object.")
|
| 130 |
-
if payload.get("stream") is True:
|
| 131 |
-
raise OpenAICompatError(400, "Streaming is not supported by this synchronous endpoint.")
|
| 132 |
-
try:
|
| 133 |
-
n_value = int(payload.get("n", 1) or 1)
|
| 134 |
-
except (TypeError, ValueError) as exc:
|
| 135 |
-
raise OpenAICompatError(400, "n must be an integer.") from exc
|
| 136 |
-
if n_value != 1:
|
| 137 |
-
raise OpenAICompatError(400, "Only n=1 is supported.")
|
| 138 |
-
model = str(payload.get("model", "")).strip()
|
| 139 |
-
if not model:
|
| 140 |
-
raise OpenAICompatError(400, "model is required.")
|
| 141 |
-
messages = payload.get("messages")
|
| 142 |
-
if not isinstance(messages, list) or not messages:
|
| 143 |
-
raise OpenAICompatError(400, "messages must be a non-empty list.")
|
| 144 |
-
return payload
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
def prepare_openai_input(messages: list[Any], workspace_root: Path) -> PreparedInput:
|
| 148 |
-
wrapper_messages: list[dict[str, str]] = []
|
| 149 |
-
initial_content_parts: list[dict[str, Any]] = []
|
| 150 |
-
image_paths: list[str] = []
|
| 151 |
-
image_dir = workspace_root / "inputs" / "images"
|
| 152 |
-
image_index = 0
|
| 153 |
-
|
| 154 |
-
for message in messages:
|
| 155 |
-
if not isinstance(message, dict):
|
| 156 |
-
raise OpenAICompatError(400, "Each message must be an object.")
|
| 157 |
-
role = str(message.get("role", "")).strip()
|
| 158 |
-
if role not in {"system", "user", "assistant"}:
|
| 159 |
-
raise OpenAICompatError(400, f"Unsupported message role: {role!r}.")
|
| 160 |
-
content = message.get("content", "")
|
| 161 |
-
text_parts: list[str] = []
|
| 162 |
-
if isinstance(content, str):
|
| 163 |
-
text_parts.append(content)
|
| 164 |
-
elif isinstance(content, list):
|
| 165 |
-
for part in content:
|
| 166 |
-
if not isinstance(part, dict):
|
| 167 |
-
raise OpenAICompatError(400, "Multimodal content parts must be objects.")
|
| 168 |
-
part_type = str(part.get("type", "")).strip()
|
| 169 |
-
if part_type == "text":
|
| 170 |
-
text_parts.append(str(part.get("text", "")))
|
| 171 |
-
elif part_type == "image_url":
|
| 172 |
-
image_url = part.get("image_url")
|
| 173 |
-
if not isinstance(image_url, dict):
|
| 174 |
-
raise OpenAICompatError(400, "image_url content must contain an image_url object.")
|
| 175 |
-
url = str(image_url.get("url", "")).strip()
|
| 176 |
-
detail = str(image_url.get("detail", "auto") or "auto")
|
| 177 |
-
rel_path = save_data_image(
|
| 178 |
-
url,
|
| 179 |
-
workspace_root=workspace_root,
|
| 180 |
-
image_dir=image_dir,
|
| 181 |
-
image_index=image_index,
|
| 182 |
-
)
|
| 183 |
-
image_index += 1
|
| 184 |
-
image_paths.append(rel_path)
|
| 185 |
-
text_parts.append(f"[image saved at {rel_path}]")
|
| 186 |
-
initial_content_parts.extend(image_input_content_parts(url, rel_path, detail=detail))
|
| 187 |
-
else:
|
| 188 |
-
raise OpenAICompatError(400, f"Unsupported content part type: {part_type!r}.")
|
| 189 |
-
else:
|
| 190 |
-
raise OpenAICompatError(400, "message content must be a string or a list of content parts.")
|
| 191 |
-
wrapper_messages.append({"role": role, "content": "\n".join(part for part in text_parts if part)})
|
| 192 |
-
|
| 193 |
-
return PreparedInput(
|
| 194 |
-
wrapper_messages=wrapper_messages,
|
| 195 |
-
initial_content_parts=initial_content_parts,
|
| 196 |
-
image_paths=image_paths,
|
| 197 |
-
)
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
def save_data_image(url: str, *, workspace_root: Path, image_dir: Path, image_index: int) -> str:
|
| 201 |
-
match = DATA_IMAGE_RE.match(url)
|
| 202 |
-
if not match:
|
| 203 |
-
raise OpenAICompatError(
|
| 204 |
-
400,
|
| 205 |
-
"Only data:image/...;base64,... image_url inputs are supported in the first API version.",
|
| 206 |
-
)
|
| 207 |
-
mime_type = match.group(1).lower()
|
| 208 |
-
extension = IMAGE_EXTENSIONS.get(mime_type)
|
| 209 |
-
if extension is None:
|
| 210 |
-
raise OpenAICompatError(400, f"Unsupported image MIME type: {mime_type}.")
|
| 211 |
-
try:
|
| 212 |
-
image_bytes = base64.b64decode(match.group(2), validate=True)
|
| 213 |
-
except (binascii.Error, ValueError) as exc:
|
| 214 |
-
raise OpenAICompatError(400, "Invalid base64 image data.") from exc
|
| 215 |
-
if len(image_bytes) > DEFAULT_MAX_IMAGE_BYTES:
|
| 216 |
-
raise OpenAICompatError(400, f"Image exceeds the {DEFAULT_MAX_IMAGE_BYTES} byte limit.")
|
| 217 |
-
image_dir.mkdir(parents=True, exist_ok=True)
|
| 218 |
-
filename = f"image_{image_index:03d}{extension}"
|
| 219 |
-
path = image_dir / filename
|
| 220 |
-
path.write_bytes(image_bytes)
|
| 221 |
-
return path.relative_to(workspace_root).as_posix()
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
def wrapper_request_payload(*, prepared: PreparedInput, payload: dict[str, Any]) -> dict[str, Any]:
|
| 225 |
-
return {
|
| 226 |
-
"messages": prepared.wrapper_messages,
|
| 227 |
-
"saved_image_paths": prepared.image_paths,
|
| 228 |
-
"response_format": safe_jsonable(payload.get("response_format")),
|
| 229 |
-
"requested_model_label": str(payload.get("model", "")),
|
| 230 |
-
}
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
def build_input_wrapper_messages(*, prepared: PreparedInput, payload: dict[str, Any]) -> list[dict[str, str]]:
|
| 234 |
-
return [
|
| 235 |
-
{"role": "system", "content": INPUT_WRAPPER_SYSTEM_PROMPT},
|
| 236 |
-
{
|
| 237 |
-
"role": "user",
|
| 238 |
-
"content": json.dumps(wrapper_request_payload(prepared=prepared, payload=payload), ensure_ascii=False, indent=2),
|
| 239 |
-
},
|
| 240 |
-
]
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
def build_passthrough_input_plan(*, prepared: PreparedInput, payload: dict[str, Any]) -> dict[str, str]:
|
| 244 |
-
conversation = "\n\n".join(
|
| 245 |
-
f"{message['role'].upper()}:\n{message['content']}" for message in prepared.wrapper_messages
|
| 246 |
-
).strip()
|
| 247 |
-
response_format = payload.get("response_format")
|
| 248 |
-
output_contract = "Follow the final answer requirements in the original request."
|
| 249 |
-
if response_format is not None:
|
| 250 |
-
output_contract += "\nOpenAI response_format request:\n" + json.dumps(
|
| 251 |
-
safe_jsonable(response_format),
|
| 252 |
-
ensure_ascii=False,
|
| 253 |
-
indent=2,
|
| 254 |
-
)
|
| 255 |
-
return {
|
| 256 |
-
"agent_instruction": conversation or "Answer the user's request.",
|
| 257 |
-
"output_contract": output_contract,
|
| 258 |
-
"wrapper_notes": "Input wrapper disabled; the original normalized conversation was passed through directly.",
|
| 259 |
-
}
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
def build_agent_prompt(input_plan: dict[str, Any], prepared: PreparedInput) -> str:
|
| 263 |
-
image_block = "\n".join(f"- {path}" for path in prepared.image_paths) if prepared.image_paths else "- none"
|
| 264 |
-
return (
|
| 265 |
-
"You are solving a user request through ResearchHarness.\n\n"
|
| 266 |
-
"Task for the agent:\n"
|
| 267 |
-
f"{str(input_plan.get('agent_instruction', '')).strip()}\n\n"
|
| 268 |
-
"User-provided images saved in this workspace:\n"
|
| 269 |
-
f"{image_block}\n\n"
|
| 270 |
-
"The original image content is attached to the initial user message when the backend model supports image parts. "
|
| 271 |
-
"The same images are also saved at the paths above so you may call ReadImage when visual inspection is needed.\n\n"
|
| 272 |
-
"Do not optimize your tool-use loop for the final output schema. Solve the task completely, then finish with a complete, "
|
| 273 |
-
"self-contained internal final text that includes the actual answer, the evidence used, and any concise reasoning needed to understand it. "
|
| 274 |
-
"You may mention files you created or inspected, but the internal final text must not depend on local files as the only carrier of the answer.\n\n"
|
| 275 |
-
"Final output contract that will be enforced by a formatter after your run:\n"
|
| 276 |
-
f"{str(input_plan.get('output_contract', 'plain text')).strip()}\n\n"
|
| 277 |
-
"Wrapper notes:\n"
|
| 278 |
-
f"{str(input_plan.get('wrapper_notes', '')).strip()}"
|
| 279 |
-
)
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
def build_output_wrapper_messages(
|
| 283 |
-
*,
|
| 284 |
-
prepared: PreparedInput,
|
| 285 |
-
payload: dict[str, Any],
|
| 286 |
-
input_plan: dict[str, Any],
|
| 287 |
-
agent_result_text: str,
|
| 288 |
-
) -> list[dict[str, str]]:
|
| 289 |
-
output_payload = {
|
| 290 |
-
"original_messages": prepared.wrapper_messages,
|
| 291 |
-
"saved_image_paths": prepared.image_paths,
|
| 292 |
-
"output_contract": str(input_plan.get("output_contract", "plain text")),
|
| 293 |
-
"response_format": safe_jsonable(payload.get("response_format")),
|
| 294 |
-
"agent_result_text": agent_result_text,
|
| 295 |
-
}
|
| 296 |
-
return [
|
| 297 |
-
{"role": "system", "content": OUTPUT_WRAPPER_SYSTEM_PROMPT},
|
| 298 |
-
{"role": "user", "content": json.dumps(output_payload, ensure_ascii=False, indent=2)},
|
| 299 |
-
]
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
def extract_json_object(text: str) -> dict[str, Any]:
|
| 303 |
-
stripped = text.strip()
|
| 304 |
-
if stripped.startswith("```"):
|
| 305 |
-
stripped = re.sub(r"^```(?:json)?\s*", "", stripped, flags=re.IGNORECASE)
|
| 306 |
-
stripped = re.sub(r"\s*```$", "", stripped)
|
| 307 |
-
try:
|
| 308 |
-
parsed = json.loads(stripped)
|
| 309 |
-
except json.JSONDecodeError:
|
| 310 |
-
start = stripped.find("{")
|
| 311 |
-
end = stripped.rfind("}")
|
| 312 |
-
if start < 0 or end <= start:
|
| 313 |
-
raise OpenAICompatError(500, "Input wrapper did not return a JSON object.", "server_error") from None
|
| 314 |
-
try:
|
| 315 |
-
parsed = json.loads(stripped[start : end + 1])
|
| 316 |
-
except json.JSONDecodeError as exc:
|
| 317 |
-
raise OpenAICompatError(500, f"Input wrapper returned invalid JSON: {exc}", "server_error") from exc
|
| 318 |
-
if not isinstance(parsed, dict):
|
| 319 |
-
raise OpenAICompatError(500, "Input wrapper JSON must be an object.", "server_error")
|
| 320 |
-
if not str(parsed.get("agent_instruction", "")).strip():
|
| 321 |
-
raise OpenAICompatError(500, "Input wrapper JSON missing agent_instruction.", "server_error")
|
| 322 |
-
if not str(parsed.get("output_contract", "")).strip():
|
| 323 |
-
parsed["output_contract"] = "plain text"
|
| 324 |
-
parsed.setdefault("wrapper_notes", "")
|
| 325 |
-
return parsed
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
def call_wrapper_text(
|
| 329 |
-
agent: MultiTurnReactAgent,
|
| 330 |
-
messages: list[dict[str, str]],
|
| 331 |
-
*,
|
| 332 |
-
max_output_tokens: Optional[int] = None,
|
| 333 |
-
) -> str:
|
| 334 |
-
response = agent.call_compaction_api(messages, max_output_tokens=max_output_tokens)
|
| 335 |
-
if not isinstance(response, dict) or response.get("status") == "error":
|
| 336 |
-
error_text = response.get("error", "unknown wrapper error") if isinstance(response, dict) else str(response)
|
| 337 |
-
raise OpenAICompatError(500, error_text, "server_error")
|
| 338 |
-
text = assistant_text_content(response.get("content")).strip()
|
| 339 |
-
if not text:
|
| 340 |
-
raise OpenAICompatError(500, "Wrapper returned empty content.", "server_error")
|
| 341 |
-
return text
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
def final_max_tokens(payload: dict[str, Any]) -> Optional[int]:
|
| 345 |
-
raw_value = payload.get("max_tokens", payload.get("max_completion_tokens"))
|
| 346 |
-
if raw_value is None:
|
| 347 |
-
return None
|
| 348 |
-
try:
|
| 349 |
-
value = int(raw_value)
|
| 350 |
-
except (TypeError, ValueError) as exc:
|
| 351 |
-
raise OpenAICompatError(400, "max_tokens must be an integer.") from exc
|
| 352 |
-
if value <= 0:
|
| 353 |
-
raise OpenAICompatError(400, "max_tokens must be positive.")
|
| 354 |
-
return value
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
def append_api_event(trace_dir: Path, event: str, payload: dict[str, Any]) -> None:
|
| 358 |
-
append_jsonl(
|
| 359 |
-
trace_dir / "api_trace.jsonl",
|
| 360 |
-
{
|
| 361 |
-
"timestamp": int(time.time()),
|
| 362 |
-
"event": event,
|
| 363 |
-
"payload": safe_jsonable(payload),
|
| 364 |
-
},
|
| 365 |
-
)
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
def run_chat_completion(payload: dict[str, Any], config: ServerConfig) -> dict[str, Any]:
|
| 369 |
-
payload = validate_chat_payload(payload)
|
| 370 |
-
request_id = "chatcmpl_" + uuid4().hex
|
| 371 |
-
run_id = "run_" + datetime.datetime.now().astimezone().strftime("%Y%m%d_%H%M%S") + "_" + uuid4().hex[:8]
|
| 372 |
-
run_root = config.api_runs_dir / run_id
|
| 373 |
-
agent_workspace = run_root / "agent_workspace"
|
| 374 |
-
trace_dir = run_root / "agent_trace"
|
| 375 |
-
agent_workspace.mkdir(parents=True, exist_ok=False)
|
| 376 |
-
trace_dir.mkdir(parents=True, exist_ok=False)
|
| 377 |
-
prepared = prepare_openai_input(payload["messages"], agent_workspace)
|
| 378 |
-
llm_config = default_llm_config()
|
| 379 |
-
backend_model = str(llm_config.get("model", ""))
|
| 380 |
-
if prepared.initial_content_parts and not model_supports_runtime_image_parts(backend_model):
|
| 381 |
-
raise OpenAICompatError(
|
| 382 |
-
400,
|
| 383 |
-
f"Backend model {backend_model!r} does not support image content parts.",
|
| 384 |
-
)
|
| 385 |
-
|
| 386 |
-
tool_names = [name for name in AVAILABLE_TOOL_MAP if name != "AskUser"]
|
| 387 |
-
agent = MultiTurnReactAgent(
|
| 388 |
-
function_list=tool_names,
|
| 389 |
-
llm=llm_config,
|
| 390 |
-
trace_dir=str(trace_dir),
|
| 391 |
-
role_prompt=config.role_prompt or None,
|
| 392 |
-
)
|
| 393 |
-
|
| 394 |
-
if config.input_wrapper:
|
| 395 |
-
input_wrapper_messages = build_input_wrapper_messages(prepared=prepared, payload=payload)
|
| 396 |
-
input_wrapper_text = call_wrapper_text(agent, input_wrapper_messages, max_output_tokens=1200)
|
| 397 |
-
input_plan = extract_json_object(input_wrapper_text)
|
| 398 |
-
append_api_event(
|
| 399 |
-
trace_dir,
|
| 400 |
-
"input_wrapper",
|
| 401 |
-
{
|
| 402 |
-
"enabled": True,
|
| 403 |
-
"request": input_wrapper_messages,
|
| 404 |
-
"response_text": input_wrapper_text,
|
| 405 |
-
"input_plan": input_plan,
|
| 406 |
-
},
|
| 407 |
-
)
|
| 408 |
-
else:
|
| 409 |
-
input_plan = build_passthrough_input_plan(prepared=prepared, payload=payload)
|
| 410 |
-
append_api_event(
|
| 411 |
-
trace_dir,
|
| 412 |
-
"input_wrapper",
|
| 413 |
-
{
|
| 414 |
-
"enabled": False,
|
| 415 |
-
"input_plan": input_plan,
|
| 416 |
-
},
|
| 417 |
-
)
|
| 418 |
-
|
| 419 |
-
agent_prompt = build_agent_prompt(input_plan, prepared)
|
| 420 |
-
session = agent._run_session(
|
| 421 |
-
agent_prompt,
|
| 422 |
-
workspace_root=str(agent_workspace),
|
| 423 |
-
initial_content_parts=prepared.initial_content_parts or None,
|
| 424 |
-
)
|
| 425 |
-
agent_result_text = str(session.get("result_text", "")).strip()
|
| 426 |
-
append_api_event(
|
| 427 |
-
trace_dir,
|
| 428 |
-
"agent_result",
|
| 429 |
-
{
|
| 430 |
-
"termination": session.get("termination", ""),
|
| 431 |
-
"result_text": agent_result_text,
|
| 432 |
-
"trace_path": session.get("trace_path", ""),
|
| 433 |
-
},
|
| 434 |
-
)
|
| 435 |
-
|
| 436 |
-
if config.output_wrapper:
|
| 437 |
-
output_wrapper_messages = build_output_wrapper_messages(
|
| 438 |
-
prepared=prepared,
|
| 439 |
-
payload=payload,
|
| 440 |
-
input_plan=input_plan,
|
| 441 |
-
agent_result_text=agent_result_text,
|
| 442 |
-
)
|
| 443 |
-
final_text = call_wrapper_text(agent, output_wrapper_messages, max_output_tokens=final_max_tokens(payload))
|
| 444 |
-
append_api_event(
|
| 445 |
-
trace_dir,
|
| 446 |
-
"output_wrapper",
|
| 447 |
-
{
|
| 448 |
-
"enabled": True,
|
| 449 |
-
"request": output_wrapper_messages,
|
| 450 |
-
"response_text": final_text,
|
| 451 |
-
},
|
| 452 |
-
)
|
| 453 |
-
else:
|
| 454 |
-
final_text = agent_result_text
|
| 455 |
-
append_api_event(
|
| 456 |
-
trace_dir,
|
| 457 |
-
"output_wrapper",
|
| 458 |
-
{
|
| 459 |
-
"enabled": False,
|
| 460 |
-
"response_text": final_text,
|
| 461 |
-
},
|
| 462 |
-
)
|
| 463 |
-
return make_chat_completion_response(
|
| 464 |
-
request_id=request_id,
|
| 465 |
-
model=str(payload.get("model", "researchharness")),
|
| 466 |
-
content=final_text,
|
| 467 |
-
)
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
def create_app(config: ServerConfig) -> FastAPI:
|
| 471 |
-
app = FastAPI(title="ResearchHarness OpenAI-Compatible API", version="1.0")
|
| 472 |
-
|
| 473 |
-
@app.exception_handler(OpenAICompatError)
|
| 474 |
-
async def _handle_openai_compat_error(request: Request, exc: OpenAICompatError) -> JSONResponse:
|
| 475 |
-
return openai_error_response(exc)
|
| 476 |
-
|
| 477 |
-
@app.get("/v1/health")
|
| 478 |
-
async def health() -> dict[str, Any]:
|
| 479 |
-
return {
|
| 480 |
-
"status": "ok",
|
| 481 |
-
"api_runs_dir": str(config.api_runs_dir),
|
| 482 |
-
"input_wrapper": config.input_wrapper,
|
| 483 |
-
"output_wrapper": config.output_wrapper,
|
| 484 |
-
}
|
| 485 |
-
|
| 486 |
-
@app.post("/v1/chat/completions")
|
| 487 |
-
async def chat_completions(payload: dict[str, Any] = Body(...)) -> dict[str, Any]:
|
| 488 |
-
try:
|
| 489 |
-
return run_chat_completion(payload, config)
|
| 490 |
-
except OpenAICompatError:
|
| 491 |
-
raise
|
| 492 |
-
except Exception as exc:
|
| 493 |
-
raise OpenAICompatError(500, f"ResearchHarness API error: {exc}", "server_error") from exc
|
| 494 |
-
|
| 495 |
-
return app
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
def serve(
|
| 499 |
-
*,
|
| 500 |
-
api_runs_dir: str,
|
| 501 |
-
host: str = "127.0.0.1",
|
| 502 |
-
port: int = 8686,
|
| 503 |
-
role_prompt_files: Optional[list[str]] = None,
|
| 504 |
-
input_wrapper: bool = True,
|
| 505 |
-
output_wrapper: bool = True,
|
| 506 |
-
) -> None:
|
| 507 |
-
root = normalize_workspace_root(api_runs_dir)
|
| 508 |
-
role_prompt = read_role_prompt_files(role_prompt_files or [])
|
| 509 |
-
config = ServerConfig(
|
| 510 |
-
api_runs_dir=root,
|
| 511 |
-
role_prompt=role_prompt,
|
| 512 |
-
host=host,
|
| 513 |
-
port=port,
|
| 514 |
-
input_wrapper=input_wrapper,
|
| 515 |
-
output_wrapper=output_wrapper,
|
| 516 |
-
)
|
| 517 |
-
app = create_app(config)
|
| 518 |
-
uvicorn.run(app, host=host, port=port)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api_runs/.gitkeep
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
|
|
|
|
|
|
app.py
CHANGED
|
@@ -7,7 +7,6 @@ from pathlib import Path
|
|
| 7 |
|
| 8 |
import uvicorn
|
| 9 |
|
| 10 |
-
from agent_base.utils import read_role_prompt_files
|
| 11 |
from frontend.local_server import app, configure_frontend
|
| 12 |
|
| 13 |
|
|
@@ -32,18 +31,9 @@ def _bool_env(name: str, default: bool) -> bool:
|
|
| 32 |
raise ValueError(f"{name} must be a boolean, got {raw!r}")
|
| 33 |
|
| 34 |
|
| 35 |
-
def _role_prompt_files() -> list[str]:
|
| 36 |
-
raw = os.getenv("RH_ROLE_PROMPT_FILES", "").strip()
|
| 37 |
-
if not raw:
|
| 38 |
-
return []
|
| 39 |
-
return [item for item in raw.split(os.pathsep) if item]
|
| 40 |
-
|
| 41 |
-
|
| 42 |
def configure_space() -> None:
|
| 43 |
runs_dir = Path(os.getenv("RH_SPACE_RUNS_DIR", "/tmp/researchharness_space/runs")).expanduser()
|
| 44 |
-
role_prompt = read_role_prompt_files(_role_prompt_files())
|
| 45 |
configure_frontend(
|
| 46 |
-
role_prompt=role_prompt,
|
| 47 |
managed_runs_dir=str(runs_dir),
|
| 48 |
cleanup_retention_seconds=_int_env("RH_SPACE_RETENTION_SECONDS", 6 * 60 * 60),
|
| 49 |
cleanup_max_runs=_int_env("RH_SPACE_MAX_RUNS", 40),
|
|
|
|
| 7 |
|
| 8 |
import uvicorn
|
| 9 |
|
|
|
|
| 10 |
from frontend.local_server import app, configure_frontend
|
| 11 |
|
| 12 |
|
|
|
|
| 31 |
raise ValueError(f"{name} must be a boolean, got {raw!r}")
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
def configure_space() -> None:
|
| 35 |
runs_dir = Path(os.getenv("RH_SPACE_RUNS_DIR", "/tmp/researchharness_space/runs")).expanduser()
|
|
|
|
| 36 |
configure_frontend(
|
|
|
|
| 37 |
managed_runs_dir=str(runs_dir),
|
| 38 |
cleanup_retention_seconds=_int_env("RH_SPACE_RETENTION_SECONDS", 6 * 60 * 60),
|
| 39 |
cleanup_max_runs=_int_env("RH_SPACE_MAX_RUNS", 40),
|
benchmarks/QA/README.md
DELETED
|
@@ -1,102 +0,0 @@
|
|
| 1 |
-
# QA / VQA Benchmarks
|
| 2 |
-
|
| 3 |
-
This directory documents the lightweight ResearchHarness contract for
|
| 4 |
-
question-answering benchmarks, including plain-text QA and multimodal VQA-style
|
| 5 |
-
tasks.
|
| 6 |
-
|
| 7 |
-
The recommended integration is the OpenAI-compatible synchronous API server:
|
| 8 |
-
|
| 9 |
-
```bash
|
| 10 |
-
python3 /abs/path/to/ResearchHarness/run_server.py \
|
| 11 |
-
--api-runs-dir ./api_runs
|
| 12 |
-
```
|
| 13 |
-
|
| 14 |
-
For QA/VQA benchmark runs, optionally add this benchmark role overlay:
|
| 15 |
-
|
| 16 |
-
```bash
|
| 17 |
-
python3 /abs/path/to/ResearchHarness/run_server.py \
|
| 18 |
-
--api-runs-dir ./api_runs \
|
| 19 |
-
--role-prompt-file /abs/path/to/ResearchHarness/benchmarks/QA/role_prompt.md
|
| 20 |
-
```
|
| 21 |
-
|
| 22 |
-
Each request creates a fresh run directory:
|
| 23 |
-
|
| 24 |
-
```text
|
| 25 |
-
./api_runs/
|
| 26 |
-
`-- run_YYYYMMDD_HHMMSS_<random>/
|
| 27 |
-
|-- agent_workspace/ # visible to the agent
|
| 28 |
-
| `-- inputs/
|
| 29 |
-
| `-- images/ # user-provided images, when present
|
| 30 |
-
`-- agent_trace/ # server-side trace and session state
|
| 31 |
-
|-- api_trace.jsonl
|
| 32 |
-
|-- trace_*.jsonl
|
| 33 |
-
`-- _session_state.json
|
| 34 |
-
```
|
| 35 |
-
|
| 36 |
-
The input and output LLM wrappers are enabled by default:
|
| 37 |
-
|
| 38 |
-
- `--input-wrapper` / `--no-input-wrapper` controls the input normalization pass.
|
| 39 |
-
- `--output-wrapper` / `--no-output-wrapper` controls the final answer formatting pass.
|
| 40 |
-
|
| 41 |
-
Strict-format benchmarks should usually keep both wrappers enabled. To return
|
| 42 |
-
the agent's direct final text instead, run:
|
| 43 |
-
|
| 44 |
-
```bash
|
| 45 |
-
python3 /abs/path/to/ResearchHarness/run_server.py \
|
| 46 |
-
--api-runs-dir ./api_runs \
|
| 47 |
-
--no-input-wrapper \
|
| 48 |
-
--no-output-wrapper
|
| 49 |
-
```
|
| 50 |
-
|
| 51 |
-
External benchmark runners can then use the regular OpenAI SDK with:
|
| 52 |
-
|
| 53 |
-
```python
|
| 54 |
-
from openai import OpenAI
|
| 55 |
-
|
| 56 |
-
client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
|
| 57 |
-
|
| 58 |
-
response = client.chat.completions.create(
|
| 59 |
-
model="researchharness",
|
| 60 |
-
messages=[{"role": "user", "content": "Answer the question."}],
|
| 61 |
-
)
|
| 62 |
-
|
| 63 |
-
answer = response.choices[0].message.content
|
| 64 |
-
```
|
| 65 |
-
|
| 66 |
-
## Multimodal Input
|
| 67 |
-
|
| 68 |
-
For image benchmarks, send OpenAI-style content parts. The first API version
|
| 69 |
-
supports one or more `data:image/...;base64,...` URLs in the same request.
|
| 70 |
-
|
| 71 |
-
```python
|
| 72 |
-
response = client.chat.completions.create(
|
| 73 |
-
model="researchharness",
|
| 74 |
-
messages=[
|
| 75 |
-
{
|
| 76 |
-
"role": "user",
|
| 77 |
-
"content": [
|
| 78 |
-
{"type": "text", "text": "What is shown? Return JSON with key answer."},
|
| 79 |
-
{"type": "image_url", "image_url": {"url": data_url}},
|
| 80 |
-
],
|
| 81 |
-
}
|
| 82 |
-
],
|
| 83 |
-
)
|
| 84 |
-
```
|
| 85 |
-
|
| 86 |
-
The API saves each submitted image under `agent_workspace/inputs/images/`,
|
| 87 |
-
passes the image content to the first ResearchHarness model call when the
|
| 88 |
-
backend model supports image parts, and includes each saved path in the
|
| 89 |
-
agent-visible text.
|
| 90 |
-
|
| 91 |
-
The returned answer should be self-contained for a remote evaluator. Workspace
|
| 92 |
-
files may support the run, but the response should not only say to consult
|
| 93 |
-
`answer.md`, `report.md`, an image file, or another local artifact.
|
| 94 |
-
|
| 95 |
-
## Scope
|
| 96 |
-
|
| 97 |
-
- The endpoint is synchronous and returns one final text answer.
|
| 98 |
-
- Each request gets a separate workspace subdirectory.
|
| 99 |
-
- The API uses an input wrapper, the ResearchHarness agent, and an output
|
| 100 |
-
wrapper so strict benchmark output formats do not destabilize the agent loop.
|
| 101 |
-
- Streaming, async run status, artifact download, and remote image fetching are
|
| 102 |
-
intentionally out of scope for this minimal QA contract.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/QA/role_prompt.md
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
# Benchmark Role Overlay
|
| 2 |
-
|
| 3 |
-
You are running inside ResearchHarness for a QA or VQA benchmark.
|
| 4 |
-
|
| 5 |
-
Behavior:
|
| 6 |
-
- Solve the user's task directly and carefully.
|
| 7 |
-
- Use tools only when they materially improve answer quality.
|
| 8 |
-
- If the request includes saved image paths, inspect the image evidence when it
|
| 9 |
-
is needed for the answer.
|
| 10 |
-
- Do not ask the user follow-up questions.
|
| 11 |
-
- Do not stop with a plan. Produce the answer once enough evidence has been
|
| 12 |
-
gathered.
|
| 13 |
-
- It is acceptable to explain what evidence was used in the agent's internal
|
| 14 |
-
final text; a downstream formatter will enforce the benchmark's exact output
|
| 15 |
-
contract.
|
| 16 |
-
- Assume the remote evaluator only sees the returned text, not your workspace.
|
| 17 |
-
- Your final text must be a complete, independent plain-text answer.
|
| 18 |
-
- Include the actual answer to the original question.
|
| 19 |
-
- Include supporting evidence, calculations, or reasoning steps when they are
|
| 20 |
-
needed to make the answer understandable.
|
| 21 |
-
- In this benchmark role, do not rely on local workspace files as the answer.
|
| 22 |
-
Files such as `answer.md`, `report.md`, images, or other artifacts may support
|
| 23 |
-
your work, but the returned text itself must contain the answer a remote
|
| 24 |
-
evaluator needs.
|
| 25 |
-
|
| 26 |
-
For visual tasks:
|
| 27 |
-
- Prefer the attached image content when it is available in the model input.
|
| 28 |
-
- Use `ReadImage` on saved image paths when additional visual inspection is
|
| 29 |
-
needed or when the prompt explicitly asks you to inspect local image files.
|
| 30 |
-
- Do not invent visual details that are not supported by the image or tool
|
| 31 |
-
output.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/README.md
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
# Benchmarks
|
| 2 |
-
|
| 3 |
-
This folder records benchmark-specific integration contracts that live
|
| 4 |
-
**outside** `agent_base` so the core harness stays generic, lightweight, and
|
| 5 |
-
fair across different evaluations.
|
| 6 |
-
|
| 7 |
-
| Benchmark | Directory | Tracked contract |
|
| 8 |
-
| --- | --- | --- |
|
| 9 |
-
| ResearchClawBench | `benchmarks/ResearchClawBench/` | `README.md` + `role_prompt.md` + `adapter.py` |
|
| 10 |
-
| QA / VQA-style benchmarks | `benchmarks/QA/` | `README.md` + `role_prompt.md` |
|
| 11 |
-
|
| 12 |
-
## Notes
|
| 13 |
-
|
| 14 |
-
- `agent_base/` stays focused on the reusable harness runtime.
|
| 15 |
-
- Benchmark-specific prompts, adapters, and integration notes should live under
|
| 16 |
-
their own benchmark subdirectory.
|
| 17 |
-
- Local benchmark helpers may exist for private experimentation, but they do
|
| 18 |
-
not define the formal external integration contract.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/ResearchClawBench/README.md
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
# ResearchClawBench
|
| 2 |
-
|
| 3 |
-
This directory contains the tracked files needed to document how `ResearchHarness`
|
| 4 |
-
should be integrated into `ResearchClawBench`.
|
| 5 |
-
|
| 6 |
-
ResearchHarness is intended to serve here as a **general and fair execution
|
| 7 |
-
substrate** for tool-using LLM evaluation, while `ResearchClawBench` remains in
|
| 8 |
-
charge of task construction, hidden-answer isolation, and scoring.
|
| 9 |
-
|
| 10 |
-
## Recommended `agents.json` Entry
|
| 11 |
-
|
| 12 |
-
Use a single direct command that launches the thin top-level ResearchHarness
|
| 13 |
-
entrypoint.
|
| 14 |
-
|
| 15 |
-
```json
|
| 16 |
-
{
|
| 17 |
-
"researchharness": {
|
| 18 |
-
"label": "ResearchHarness",
|
| 19 |
-
"icon": "H",
|
| 20 |
-
"logo": "/static/logos/rh.svg",
|
| 21 |
-
"cmd": "python3 /abs/path/to/ResearchHarness/run_agent.py <PROMPT> --workspace-root <WORKSPACE> --role-prompt-file /abs/path/to/ResearchHarness/benchmarks/ResearchClawBench/role_prompt.md --trace-dir <WORKSPACE>"
|
| 22 |
-
}
|
| 23 |
-
}
|
| 24 |
-
```
|
| 25 |
-
|
| 26 |
-
## Why This Shape
|
| 27 |
-
|
| 28 |
-
- `ResearchClawBench` already prepares the workspace, writes `INSTRUCTIONS.md`,
|
| 29 |
-
and isolates hidden checklist data.
|
| 30 |
-
- `ResearchHarness` should only execute the agent through a stable harness
|
| 31 |
-
interface.
|
| 32 |
-
- The command stays unchanged. The entrypoint automatically selects the
|
| 33 |
-
lightweight adapter in `benchmarks/ResearchClawBench/adapter.py` when this
|
| 34 |
-
benchmark role prompt is used.
|
| 35 |
-
|
| 36 |
-
## Notes
|
| 37 |
-
|
| 38 |
-
- Replace `/abs/path/to/ResearchHarness/` with the real local checkout path.
|
| 39 |
-
- The command should stay one-line and non-interactive.
|
| 40 |
-
- The adapter prevents premature termination on long tasks by refusing to accept
|
| 41 |
-
plain-text completion before `report/report.md` exists in the workspace.
|
| 42 |
-
- The adapter excludes `AskUser`; RCB runs must remain fully non-interactive.
|
| 43 |
-
- Any local batch helpers or ad hoc benchmark scripts should remain untracked
|
| 44 |
-
and live outside the formal integration contract.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/ResearchClawBench/adapter.py
DELETED
|
@@ -1,93 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from typing import Any, Optional, Sequence
|
| 5 |
-
|
| 6 |
-
from agent_base.react_agent import AVAILABLE_TOOL_MAP, MultiTurnReactAgent
|
| 7 |
-
from agent_base.tools.tooling import normalize_workspace_root
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class ResearchClawBenchAgent(MultiTurnReactAgent):
|
| 11 |
-
"""
|
| 12 |
-
Lightweight benchmark adapter for ResearchClawBench.
|
| 13 |
-
|
| 14 |
-
The benchmark task is not complete until the run workspace contains the
|
| 15 |
-
canonical final report at report/report.md. Pure planning text without that
|
| 16 |
-
artifact should not terminate the agent loop.
|
| 17 |
-
"""
|
| 18 |
-
|
| 19 |
-
required_report_relpath = Path("report") / "report.md"
|
| 20 |
-
forbidden_tool_names = {"AskUser"}
|
| 21 |
-
|
| 22 |
-
def __init__(self, function_list: Optional[Sequence[str]] = None, *args: Any, **kwargs: Any):
|
| 23 |
-
if function_list is None:
|
| 24 |
-
function_list = [
|
| 25 |
-
tool_name
|
| 26 |
-
for tool_name in AVAILABLE_TOOL_MAP
|
| 27 |
-
if tool_name not in self.forbidden_tool_names
|
| 28 |
-
]
|
| 29 |
-
else:
|
| 30 |
-
function_list = [str(tool_name).strip() for tool_name in function_list if str(tool_name).strip()]
|
| 31 |
-
forbidden = sorted(set(function_list) & self.forbidden_tool_names)
|
| 32 |
-
if forbidden:
|
| 33 |
-
raise ValueError(f"Tools are not allowed in ResearchClawBench runs: {forbidden}")
|
| 34 |
-
super().__init__(function_list=list(function_list), *args, **kwargs)
|
| 35 |
-
|
| 36 |
-
def _required_report_path(self, workspace_root: Optional[str]) -> Path:
|
| 37 |
-
workspace = Path(normalize_workspace_root(workspace_root))
|
| 38 |
-
return workspace / self.required_report_relpath
|
| 39 |
-
|
| 40 |
-
def should_accept_plaintext_result(
|
| 41 |
-
self,
|
| 42 |
-
*,
|
| 43 |
-
result_text: str,
|
| 44 |
-
workspace_root: Optional[str],
|
| 45 |
-
messages: Sequence[dict[str, Any]],
|
| 46 |
-
) -> bool:
|
| 47 |
-
if not self._required_report_path(workspace_root).exists():
|
| 48 |
-
return False
|
| 49 |
-
return super().should_accept_plaintext_result(
|
| 50 |
-
result_text=result_text,
|
| 51 |
-
workspace_root=workspace_root,
|
| 52 |
-
messages=messages,
|
| 53 |
-
)
|
| 54 |
-
|
| 55 |
-
def rejected_plaintext_result_message(
|
| 56 |
-
self,
|
| 57 |
-
*,
|
| 58 |
-
result_text: str,
|
| 59 |
-
workspace_root: Optional[str],
|
| 60 |
-
messages: Sequence[dict[str, Any]],
|
| 61 |
-
) -> str:
|
| 62 |
-
if not self._required_report_path(workspace_root).exists():
|
| 63 |
-
return (
|
| 64 |
-
"The previous assistant turn was not accepted as the final result because "
|
| 65 |
-
"ResearchClawBench requires report/report.md and that file is still missing. "
|
| 66 |
-
"Continue working and use tool calls to produce or verify report/report.md before finishing."
|
| 67 |
-
)
|
| 68 |
-
return super().rejected_plaintext_result_message(
|
| 69 |
-
result_text=result_text,
|
| 70 |
-
workspace_root=workspace_root,
|
| 71 |
-
messages=messages,
|
| 72 |
-
)
|
| 73 |
-
|
| 74 |
-
def should_accept_terminal_error(
|
| 75 |
-
self,
|
| 76 |
-
*,
|
| 77 |
-
error_text: str,
|
| 78 |
-
workspace_root: Optional[str],
|
| 79 |
-
messages: Sequence[dict[str, Any]],
|
| 80 |
-
) -> bool:
|
| 81 |
-
return self._required_report_path(workspace_root).exists()
|
| 82 |
-
|
| 83 |
-
def accepted_terminal_error_result_text(
|
| 84 |
-
self,
|
| 85 |
-
*,
|
| 86 |
-
error_text: str,
|
| 87 |
-
workspace_root: Optional[str],
|
| 88 |
-
messages: Sequence[dict[str, Any]],
|
| 89 |
-
) -> str:
|
| 90 |
-
return (
|
| 91 |
-
"ResearchClawBench completion recovered after a terminal LLM/runtime error because "
|
| 92 |
-
"report/report.md already exists and the required final artifact has been produced."
|
| 93 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/ResearchClawBench/role_prompt.md
DELETED
|
@@ -1,195 +0,0 @@
|
|
| 1 |
-
# Benchmark Role Overlay
|
| 2 |
-
|
| 3 |
-
## Purpose
|
| 4 |
-
|
| 5 |
-
You are running inside a benchmark-style scientific evaluation.
|
| 6 |
-
|
| 7 |
-
Your job is not just to produce a plausible report. Your job is to produce a
|
| 8 |
-
report whose claims are traceable to concrete artifacts in the workspace and
|
| 9 |
-
whose methods match the task's named scientific commitments as closely as the
|
| 10 |
-
environment allows.
|
| 11 |
-
|
| 12 |
-
This benchmark is non-interactive. Do not use `AskUser` or attempt to ask the
|
| 13 |
-
human for clarification. Resolve ambiguity from `INSTRUCTIONS.md`, workspace
|
| 14 |
-
files, related work, and available local or web tools.
|
| 15 |
-
|
| 16 |
-
## Method Contract
|
| 17 |
-
|
| 18 |
-
- Parse the task into explicit methodological commitments early.
|
| 19 |
-
- Before broad exploration, infer the likely target artifact families required by
|
| 20 |
-
the task, including:
|
| 21 |
-
- primary quantitative answers
|
| 22 |
-
- required comparison tables
|
| 23 |
-
- expected figure families
|
| 24 |
-
- interpretability artifacts
|
| 25 |
-
- subgroup or condition-specific outputs
|
| 26 |
-
- If the task names a framework, protocol, comparison structure,
|
| 27 |
-
interpretability method, simulator, ablation, posterior treatment,
|
| 28 |
-
reconciliation step, or validation design, treat that as part of the
|
| 29 |
-
contract.
|
| 30 |
-
- Do not silently replace an explicitly named method with a looser descriptive
|
| 31 |
-
analysis.
|
| 32 |
-
- Save a concise contract summary to `outputs/method_contract.json`.
|
| 33 |
-
- Save the inferred target artifact inventory to
|
| 34 |
-
`outputs/target_artifact_inventory.json`.
|
| 35 |
-
- After reading the most relevant related-work papers, refresh both files if the
|
| 36 |
-
papers reveal additional named baselines, architectures, figure families,
|
| 37 |
-
comparison strata, or interpretability artifacts central to the task.
|
| 38 |
-
- Save a concise related-work extraction to `outputs/related_work_contract.json`
|
| 39 |
-
whenever related work materially changes the contract or artifact inventory.
|
| 40 |
-
|
| 41 |
-
## Capability Check
|
| 42 |
-
|
| 43 |
-
- Before approximating or skipping a named method, check whether the needed
|
| 44 |
-
dependency, library, or runtime capability is available.
|
| 45 |
-
- Save the result to `outputs/dependency_check.json`.
|
| 46 |
-
- If a named method cannot be implemented exactly, state the exact limitation
|
| 47 |
-
and the fallback.
|
| 48 |
-
- If the task centers on a named model family, simulator, architecture, or
|
| 49 |
-
analysis stack, do not quietly swap to a different family just because it is
|
| 50 |
-
easier. Either implement a minimally faithful version of the named approach
|
| 51 |
-
or make the deviation explicit before proceeding.
|
| 52 |
-
|
| 53 |
-
## Evidence Discipline
|
| 54 |
-
|
| 55 |
-
- Every major scientific claim should have at least one explicit supporting
|
| 56 |
-
artifact in `outputs/` or `report/images/`.
|
| 57 |
-
- Export the exact tables, matrices, or JSON objects used to create each main
|
| 58 |
-
figure.
|
| 59 |
-
- Add a dedicated validation subsection to the report that separates:
|
| 60 |
-
- what was verified directly from workspace data
|
| 61 |
-
- what came from related work
|
| 62 |
-
- what remains an assumption or limitation
|
| 63 |
-
- Answer claim-recovery questions claim-by-claim rather than only with a broad
|
| 64 |
-
narrative.
|
| 65 |
-
- Save a concise claim recovery table before finalizing the report.
|
| 66 |
-
- When the task asks for quantitative constraints, limits, posterior summaries,
|
| 67 |
-
calibration values, or uncertainty summaries, save those values explicitly in
|
| 68 |
-
the requested variables and units rather than only through a proxy
|
| 69 |
-
transformation.
|
| 70 |
-
- If the task ultimately asks for a direct constraint on a named target
|
| 71 |
-
quantity, prefer deriving and reporting that named quantity itself instead of
|
| 72 |
-
stopping at an intermediate proxy axis, surrogate scale, or nearby latent
|
| 73 |
-
variable whenever a defensible derivation is possible from workspace data and
|
| 74 |
-
related work.
|
| 75 |
-
- If posterior samples are a primary input, report canonical distribution
|
| 76 |
-
summaries for each primary source, including mean and standard deviation,
|
| 77 |
-
unless those statistics are mathematically invalid for the variable.
|
| 78 |
-
- If the task names a primary source, cohort, benchmark, or experimental arm,
|
| 79 |
-
produce at least one source-specific artifact for it before emphasizing only
|
| 80 |
-
combined or aggregated results.
|
| 81 |
-
- If the task names a direct target quantity, threshold, or decision criterion,
|
| 82 |
-
export a compact result table that answers it directly before presenting
|
| 83 |
-
broader supporting analyses.
|
| 84 |
-
|
| 85 |
-
## Related Work Use
|
| 86 |
-
|
| 87 |
-
- Read `related_work/` early, but bounded.
|
| 88 |
-
- Start with concise or bounded reads when papers are long.
|
| 89 |
-
- Extract only task-relevant facts into notes or structured outputs.
|
| 90 |
-
- If related work contains validation metrics, methodological caveats,
|
| 91 |
-
baselines, or target comparison axes that matter for the task, incorporate
|
| 92 |
-
them explicitly.
|
| 93 |
-
- Prefer extracting from related work:
|
| 94 |
-
- named methods or architectures to reproduce or compare against
|
| 95 |
-
- target comparison axes and subgroup splits
|
| 96 |
-
- likely main figure families or panel structures
|
| 97 |
-
- explicit quantitative targets, thresholds, or calibration outputs
|
| 98 |
-
|
| 99 |
-
## Figure And Comparison Fidelity
|
| 100 |
-
|
| 101 |
-
- Prefer claim-driven figures over generic exploratory plots.
|
| 102 |
-
- Infer likely figure families and comparison structures from the task and
|
| 103 |
-
related work.
|
| 104 |
-
- If the task is about projections, calibration, method agreement, subgroup
|
| 105 |
-
trends, rankings, level-wise comparisons, or ablations, produce figures that
|
| 106 |
-
directly encode those structures.
|
| 107 |
-
- Keep the main figure set compact: each main figure should support a specific
|
| 108 |
-
target claim.
|
| 109 |
-
- If the task's core claim is source-specific, dataset-specific, or benchmark-
|
| 110 |
-
specific, include at least one main figure at that same granularity rather
|
| 111 |
-
than only a pooled or combined summary figure.
|
| 112 |
-
- If the task implies a named figure family such as ablation curves, PR/ROC
|
| 113 |
-
curves, parity plots, subgroup heatmaps, saliency maps, architecture
|
| 114 |
-
diagrams, or level-wise comparisons, prioritize that family over a generic
|
| 115 |
-
substitute.
|
| 116 |
-
|
| 117 |
-
## Group And Condition Preservation
|
| 118 |
-
|
| 119 |
-
- If the task names groups, conditions, labs, sexes, environments, shells,
|
| 120 |
-
depth levels, or other comparison strata, preserve them in at least one
|
| 121 |
-
exported table or figure.
|
| 122 |
-
- Do not silently collapse mixed categories if the scientific question depends
|
| 123 |
-
on them.
|
| 124 |
-
- When subgroup structure matters over time, prefer a subgroup-by-time matrix
|
| 125 |
-
and save it.
|
| 126 |
-
- If the task is a benchmark or model-comparison study across datasets,
|
| 127 |
-
baselines, cohorts, or conditions, export a compact comparison table with the
|
| 128 |
-
main metric reported as mean ± standard deviation whenever repeated runs,
|
| 129 |
-
folds, or stochastic training are part of the setup.
|
| 130 |
-
- For multi-condition or multi-cohort tasks, save at least one artifact at the
|
| 131 |
-
per-condition granularity before merging across conditions.
|
| 132 |
-
|
| 133 |
-
## Named Method Fidelity
|
| 134 |
-
|
| 135 |
-
- If the task or related work defines a named mechanism, algorithm, or
|
| 136 |
-
protocol central to the scientific claim, save a fidelity checklist to
|
| 137 |
-
`outputs/method_fidelity_checklist.json`.
|
| 138 |
-
- That checklist should capture:
|
| 139 |
-
- the exact definition
|
| 140 |
-
- assumptions
|
| 141 |
-
- invariants
|
| 142 |
-
- non-negotiable structural steps
|
| 143 |
-
- Use it to verify whether the implemented method actually matches the named
|
| 144 |
-
mechanism.
|
| 145 |
-
- If you deviate, explain exactly how and why in the report.
|
| 146 |
-
- If the task revolves around a named architecture or protocol, capture the key
|
| 147 |
-
structural ingredients that distinguish it from nearby alternatives and check
|
| 148 |
-
them explicitly.
|
| 149 |
-
|
| 150 |
-
## Small Sweeps And Ablations
|
| 151 |
-
|
| 152 |
-
- If the named mechanism exposes a small discrete design variable, such as
|
| 153 |
-
levels, layers, stages, shells, bins, or ablation settings, run at least a
|
| 154 |
-
small sweep unless it is genuinely impossible from the available workspace.
|
| 155 |
-
- If the task names a specific interpretability method such as SHAP,
|
| 156 |
-
permutation importance, saliency, or similar, produce at least one artifact
|
| 157 |
-
using that named method.
|
| 158 |
-
- If the task claims improved interpretability, do not stop at aggregate metric
|
| 159 |
-
gains alone; produce at least one explicit interpretability artifact and tie
|
| 160 |
-
it back to domain-relevant entities, groups, or substructures named in the
|
| 161 |
-
task or related work.
|
| 162 |
-
- If the task names multiple groups, labs, cohorts, or environments, prefer an
|
| 163 |
-
interpretability artifact that compares them directly instead of a single
|
| 164 |
-
pooled explanation.
|
| 165 |
-
- If interpretability is central and the chosen model family supports a common
|
| 166 |
-
post hoc explanation method, do not stop at native coefficient or impurity
|
| 167 |
-
magnitudes alone. Add at least one post hoc explanation artifact such as
|
| 168 |
-
SHAP, permutation importance, saliency, attention attribution, or a similarly
|
| 169 |
-
standard method for that model family.
|
| 170 |
-
|
| 171 |
-
## Finalization
|
| 172 |
-
|
| 173 |
-
- Start `report/report.md` as soon as at least two core result families already
|
| 174 |
-
have concrete supporting artifacts in `outputs/` or `report/images/`.
|
| 175 |
-
- Prefer an evidence-backed report draft over one more optional script, one
|
| 176 |
-
more polish pass, or one more non-essential figure.
|
| 177 |
-
- Once the primary quantitative outputs, the main comparison figures, and the
|
| 178 |
-
core validation artifacts exist, write `report/report.md` immediately.
|
| 179 |
-
- Do not postpone the report in order to chase optional supplementary figures,
|
| 180 |
-
extra exploratory analyses, or additional polish that is not required to
|
| 181 |
-
support the task's main claims.
|
| 182 |
-
- Treat optional supplementary work as lower priority than a complete,
|
| 183 |
-
evidence-backed report. If the report can already answer the task directly,
|
| 184 |
-
finish the report first and only then consider extras if there is clear
|
| 185 |
-
remaining need.
|
| 186 |
-
- The final report should be tightly traceable.
|
| 187 |
-
- Important numbers should be reproducible from saved artifacts in the
|
| 188 |
-
workspace.
|
| 189 |
-
- Do not claim exact reproduction if only a rough approximation was achieved.
|
| 190 |
-
- Before finalizing, check that the report contains direct answers to the main
|
| 191 |
-
requested outputs in the named variables, units, and confidence language of
|
| 192 |
-
the task, not only nearby surrogate quantities.
|
| 193 |
-
- Before finalizing, verify that every primary entry in
|
| 194 |
-
`outputs/target_artifact_inventory.json` is either satisfied by a concrete
|
| 195 |
-
saved artifact or explicitly marked as unsatisfied with a reason.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/tutorial_en.md
DELETED
|
@@ -1,531 +0,0 @@
|
|
| 1 |
-
# ResearchHarness Tutorial
|
| 2 |
-
|
| 3 |
-
This tutorial explains how to use ResearchHarness from the command line and as
|
| 4 |
-
an OpenAI-compatible API service.
|
| 5 |
-
|
| 6 |
-
ResearchHarness is a lightweight, general-purpose harness for tool-using LLM
|
| 7 |
-
agents. It can be used as:
|
| 8 |
-
|
| 9 |
-
- a command-line local agent,
|
| 10 |
-
- a fair execution substrate for agent benchmarks,
|
| 11 |
-
- an OpenAI-compatible synchronous API backend,
|
| 12 |
-
- a personal assistant runtime for files, code, reports, PDFs, images, and web tasks.
|
| 13 |
-
|
| 14 |
-
## 1. Install
|
| 15 |
-
|
| 16 |
-
After cloning the repository, install dependencies from the repository root:
|
| 17 |
-
|
| 18 |
-
```bash
|
| 19 |
-
python3 -m pip install -r requirements.txt
|
| 20 |
-
```
|
| 21 |
-
|
| 22 |
-
Python 3.10+ is recommended.
|
| 23 |
-
|
| 24 |
-
## 2. Configure Environment Variables
|
| 25 |
-
|
| 26 |
-
Copy `.env.example` to `.env` and fill in the required values.
|
| 27 |
-
|
| 28 |
-
Required variables:
|
| 29 |
-
|
| 30 |
-
| Variable | Meaning |
|
| 31 |
-
| --- | --- |
|
| 32 |
-
| `API_KEY` | API key for your OpenAI-compatible LLM provider. |
|
| 33 |
-
| `API_BASE` | Base URL for the OpenAI-compatible chat-completions endpoint. |
|
| 34 |
-
| `MODEL_NAME` | Main model used by ResearchHarness. |
|
| 35 |
-
| `SERPER_KEY` | Serper key for `WebSearch` and `ScholarSearch`: https://serper.dev/ |
|
| 36 |
-
| `JINA_KEY` | Jina key for `WebFetch`: https://jina.ai/ |
|
| 37 |
-
| `MINERU_TOKEN` | MinerU token for `ReadPDF`: https://mineru.net/ |
|
| 38 |
-
|
| 39 |
-
Optional variables:
|
| 40 |
-
|
| 41 |
-
| Variable | Default | Meaning |
|
| 42 |
-
| --- | --- | --- |
|
| 43 |
-
| `WORKSPACE_ROOT` | `./workspace` | Default workspace root when no explicit workspace is passed. |
|
| 44 |
-
| `MAX_LLM_CALL_PER_RUN` | `100` | Maximum LLM calls in one agent run. |
|
| 45 |
-
| `MAX_AGENT_ROUNDS` | `100` | Maximum ReAct loop rounds. |
|
| 46 |
-
| `MAX_AGENT_RUNTIME_SECONDS` | `9000` | Maximum wall-clock runtime for one agent run. |
|
| 47 |
-
| `LLM_TIMEOUT_SECONDS` | `600` | Timeout for each LLM API request. |
|
| 48 |
-
| `LLM_MAX_OUTPUT_TOKENS` | `10000` | Requested maximum output tokens. |
|
| 49 |
-
| `MAX_INPUT_TOKENS` | `320000` | Input-token budget used by runtime accounting. |
|
| 50 |
-
| `LLM_MAX_RETRIES` | `10` | Maximum retries for transient LLM API errors. |
|
| 51 |
-
| `TEMPERATURE` | `0.6` | Main model temperature. |
|
| 52 |
-
| `TOP_P` | `0.95` | Main model top-p. |
|
| 53 |
-
| `PRESENCE_PENALTY` | `1.1` | Main model presence penalty when supported. |
|
| 54 |
-
| `AUTO_COMPACT_TRIGGER_TOKENS` | `128k` | Context length threshold for automatic compaction. |
|
| 55 |
-
| `IMAGE_PART_TOKEN_ESTIMATE` | `1536` | Token estimate for each image content part. |
|
| 56 |
-
| `LLM_IMAGE_MAX_EDGE` | `1568` | Maximum image edge sent to multimodal models. |
|
| 57 |
-
| `LLM_IMAGE_MAX_BYTES` | `524288` | Maximum compressed image payload size. |
|
| 58 |
-
| `LLM_IMAGE_JPEG_QUALITY` | `85` | Initial JPEG quality for image compression. |
|
| 59 |
-
| `DEBUG_AGENT` | `false` | Verbose agent-loop logs. |
|
| 60 |
-
| `DEBUG_SEARCH` | `false` | Verbose WebSearch logs. |
|
| 61 |
-
| `DEBUG_SCHOLAR` | `false` | Verbose ScholarSearch logs. |
|
| 62 |
-
| `DEBUG_VISIT` | `false` | Verbose WebFetch logs. |
|
| 63 |
-
|
| 64 |
-
Before real use, run:
|
| 65 |
-
|
| 66 |
-
```bash
|
| 67 |
-
python3 tests/test_tool_availability.py
|
| 68 |
-
```
|
| 69 |
-
|
| 70 |
-
All tools should pass. Missing service keys, missing dependencies, exhausted
|
| 71 |
-
credits, or unavailable external tools should be treated as failures.
|
| 72 |
-
|
| 73 |
-
If `WebSearch`, `ScholarSearch`, `WebFetch`, or `ReadPDF` fails with network,
|
| 74 |
-
TLS, upload, download, or parsing errors, try disabling VPN/proxy and rerun the
|
| 75 |
-
test.
|
| 76 |
-
|
| 77 |
-
## 3. Command-Line Usage
|
| 78 |
-
|
| 79 |
-
Run a simple prompt:
|
| 80 |
-
|
| 81 |
-
```bash
|
| 82 |
-
python3 run_agent.py "Who proposed the transformer architecture, and in what year was the paper published?"
|
| 83 |
-
```
|
| 84 |
-
|
| 85 |
-
Use an explicit workspace:
|
| 86 |
-
|
| 87 |
-
```bash
|
| 88 |
-
python3 run_agent.py "Summarize this project." \
|
| 89 |
-
--workspace-root ./workspace
|
| 90 |
-
```
|
| 91 |
-
|
| 92 |
-
You can replace `./workspace` with any other workspace directory.
|
| 93 |
-
|
| 94 |
-
Save traces to a directory:
|
| 95 |
-
|
| 96 |
-
```bash
|
| 97 |
-
python3 run_agent.py "Summarize this project." \
|
| 98 |
-
--workspace-root ./workspace \
|
| 99 |
-
--trace-dir ./traces
|
| 100 |
-
```
|
| 101 |
-
|
| 102 |
-
You can replace `./traces` with any other trace directory.
|
| 103 |
-
|
| 104 |
-
Without `--trace-dir`, CLI runs do not write a trace file.
|
| 105 |
-
|
| 106 |
-
Append a role prompt:
|
| 107 |
-
|
| 108 |
-
```bash
|
| 109 |
-
python3 run_agent.py "Answer this QA task." \
|
| 110 |
-
--workspace-root ./workspace \
|
| 111 |
-
--role-prompt-file benchmarks/QA/role_prompt.md
|
| 112 |
-
```
|
| 113 |
-
|
| 114 |
-
Attach one or more local images:
|
| 115 |
-
|
| 116 |
-
```bash
|
| 117 |
-
python3 run_agent.py "Read the image and return JSON." \
|
| 118 |
-
--workspace-root ./workspace \
|
| 119 |
-
--images /path/to/image.png /path/to/second-image.png
|
| 120 |
-
```
|
| 121 |
-
|
| 122 |
-
Each image path must exist. ResearchHarness copies images into `./workspace/inputs/images/`,
|
| 123 |
-
sends them as initial `image_url` content parts, and adds each saved relative
|
| 124 |
-
path to the user text so later rounds can call `ReadImage` on the same files.
|
| 125 |
-
|
| 126 |
-
In an interactive terminal, CLI runs continue after a final answer and prompt
|
| 127 |
-
for a follow-up. The follow-up run keeps the prior messages, tool results, and
|
| 128 |
-
saved image path hints. During a running step, `Ctrl+C` interrupts the current
|
| 129 |
-
run at the next safe point and returns to follow-up mode with context preserved.
|
| 130 |
-
Press `Ctrl+C` at the follow-up prompt or send EOF to exit. Use `--no-chat` for
|
| 131 |
-
strict one-shot behavior, or `--chat` to force follow-up mode.
|
| 132 |
-
|
| 133 |
-
For browser-based local use, run `python3 run_frontend.py`. The frontend uses an
|
| 134 |
-
existing workspace selected on the page, streams tool steps live, accepts one or
|
| 135 |
-
more image attachments, and continues the current conversation after each final
|
| 136 |
-
answer until you click **New chat**. While running, the send button becomes
|
| 137 |
-
**Stop**; it interrupts at the next safe point and keeps the conversation
|
| 138 |
-
context for the next message.
|
| 139 |
-
|
| 140 |
-
### CLI Parameters
|
| 141 |
-
|
| 142 |
-
| Parameter | Required | Meaning |
|
| 143 |
-
| --- | --- | --- |
|
| 144 |
-
| positional `prompt` | yes, unless `--prompt-file` is used | Prompt text. |
|
| 145 |
-
| `--prompt-file PATH` | no | Read prompt text from a UTF-8 file. |
|
| 146 |
-
| `--workspace-root PATH` | no | Workspace root for local file tools, Bash, and terminal sessions. Created if missing. |
|
| 147 |
-
| `--trace-dir PATH` | no | Directory where `trace_*.jsonl` is written. |
|
| 148 |
-
| `--role-prompt-file PATH` | no, repeatable | Append role-specific prompt text to the base system prompt. |
|
| 149 |
-
| `--images PATH [PATH ...]` | no | Copy one or more local images into `inputs/images/` and attach them to the initial user message. |
|
| 150 |
-
| `--chat` / `--no-chat` | no | Enable or disable CLI follow-up mode. Default: enabled only when stdin and stdout are interactive terminals. |
|
| 151 |
-
|
| 152 |
-
## 4. OpenAI-Compatible API Server
|
| 153 |
-
|
| 154 |
-
ResearchHarness can serve a synchronous OpenAI-compatible endpoint:
|
| 155 |
-
|
| 156 |
-
```http
|
| 157 |
-
POST /v1/chat/completions
|
| 158 |
-
```
|
| 159 |
-
|
| 160 |
-
This allows existing OpenAI SDK clients to call ResearchHarness by changing only
|
| 161 |
-
`base_url`.
|
| 162 |
-
|
| 163 |
-
### Start the Server
|
| 164 |
-
|
| 165 |
-
Default deployment:
|
| 166 |
-
|
| 167 |
-
```bash
|
| 168 |
-
python3 run_server.py \
|
| 169 |
-
--api-runs-dir ./api_runs \
|
| 170 |
-
--host 127.0.0.1 \
|
| 171 |
-
--port 8686
|
| 172 |
-
```
|
| 173 |
-
|
| 174 |
-
QA/VQA benchmark deployment with a benchmark role overlay:
|
| 175 |
-
|
| 176 |
-
```bash
|
| 177 |
-
python3 run_server.py \
|
| 178 |
-
--api-runs-dir ./api_runs \
|
| 179 |
-
--host 127.0.0.1 \
|
| 180 |
-
--port 8686 \
|
| 181 |
-
--role-prompt-file benchmarks/QA/role_prompt.md
|
| 182 |
-
```
|
| 183 |
-
|
| 184 |
-
### API Server Parameters
|
| 185 |
-
|
| 186 |
-
| Parameter | Required | Default | Meaning |
|
| 187 |
-
| --- | --- | --- | --- |
|
| 188 |
-
| `--api-runs-dir PATH` | yes | none | Parent directory for API runs. Each request gets one subdirectory. |
|
| 189 |
-
| `--host HOST` | no | `127.0.0.1` | Host to bind. |
|
| 190 |
-
| `--port PORT` | no | `8686` | Port to bind. |
|
| 191 |
-
| `--role-prompt-file PATH` | no, repeatable | none | Append role prompt text to the base ResearchHarness prompt. |
|
| 192 |
-
| `--input-wrapper` / `--no-input-wrapper` | no | enabled | Enable or disable the input LLM wrapper. |
|
| 193 |
-
| `--output-wrapper` / `--no-output-wrapper` | no | enabled | Enable or disable the output LLM wrapper. |
|
| 194 |
-
|
| 195 |
-
### Wrapper Modes
|
| 196 |
-
|
| 197 |
-
Both wrappers are enabled by default.
|
| 198 |
-
|
| 199 |
-
Strict-format benchmark mode:
|
| 200 |
-
|
| 201 |
-
```bash
|
| 202 |
-
python3 run_server.py \
|
| 203 |
-
--api-runs-dir ./api_runs \
|
| 204 |
-
--role-prompt-file benchmarks/QA/role_prompt.md \
|
| 205 |
-
--input-wrapper \
|
| 206 |
-
--output-wrapper
|
| 207 |
-
```
|
| 208 |
-
|
| 209 |
-
Direct agent mode:
|
| 210 |
-
|
| 211 |
-
```bash
|
| 212 |
-
python3 run_server.py \
|
| 213 |
-
--api-runs-dir ./api_runs \
|
| 214 |
-
--no-input-wrapper \
|
| 215 |
-
--no-output-wrapper
|
| 216 |
-
```
|
| 217 |
-
|
| 218 |
-
Simple input plus strict final formatting:
|
| 219 |
-
|
| 220 |
-
```bash
|
| 221 |
-
python3 run_server.py \
|
| 222 |
-
--api-runs-dir ./api_runs \
|
| 223 |
-
--no-input-wrapper \
|
| 224 |
-
--output-wrapper
|
| 225 |
-
```
|
| 226 |
-
|
| 227 |
-
The input wrapper rewrites the original user request into a stable task for the
|
| 228 |
-
agent. The output wrapper formats the agent result to match the user's requested
|
| 229 |
-
answer contract. Wrappers must not invent new facts; they only normalize input
|
| 230 |
-
and format output.
|
| 231 |
-
|
| 232 |
-
The API server is intentionally one request -> one answer. It does not keep a
|
| 233 |
-
server-side conversation between HTTP requests. If an application needs API
|
| 234 |
-
multi-turn behavior, keep that state in the client and send the needed prior
|
| 235 |
-
context in later requests.
|
| 236 |
-
|
| 237 |
-
```mermaid
|
| 238 |
-
flowchart LR
|
| 239 |
-
U[User Input] --> IW[Input Wrapper LLM]
|
| 240 |
-
IW --> A[ResearchHarness Agent]
|
| 241 |
-
A --> OW[Output Wrapper LLM]
|
| 242 |
-
OW --> O[Output]
|
| 243 |
-
```
|
| 244 |
-
|
| 245 |
-
## 5. API Workspace Layout
|
| 246 |
-
|
| 247 |
-
Each API request creates one run directory:
|
| 248 |
-
|
| 249 |
-
```text
|
| 250 |
-
./api_runs/
|
| 251 |
-
`-- run_YYYYMMDD_HHMMSS_<random>/
|
| 252 |
-
|-- agent_workspace/
|
| 253 |
-
| `-- inputs/
|
| 254 |
-
| `-- images/
|
| 255 |
-
`-- agent_trace/
|
| 256 |
-
|-- api_trace.jsonl
|
| 257 |
-
|-- trace_*.jsonl
|
| 258 |
-
`-- _session_state.json
|
| 259 |
-
```
|
| 260 |
-
|
| 261 |
-
Meaning:
|
| 262 |
-
|
| 263 |
-
| Path | Meaning |
|
| 264 |
-
| --- | --- |
|
| 265 |
-
| `run_YYYYMMDD_HHMMSS_<random>/` | Per-request run root. |
|
| 266 |
-
| `agent_workspace/` | The only workspace visible to the agent. File tools, Bash, `ls`, and `cat` start here. |
|
| 267 |
-
| `agent_workspace/inputs/images/` | User-provided images saved from API requests. |
|
| 268 |
-
| `agent_trace/` | API trace, agent trace, and runtime records. |
|
| 269 |
-
|
| 270 |
-
For multimodal requests, image inputs are handled in two ways at the same time:
|
| 271 |
-
the image content is passed to the backend model as initial multimodal input
|
| 272 |
-
when the selected model supports it, and each image is saved under
|
| 273 |
-
`agent_workspace/inputs/images/`. Each saved relative path is also included in
|
| 274 |
-
the agent-visible text, so later rounds can call `ReadImage` on a stable local
|
| 275 |
-
path without repeatedly resending image bytes.
|
| 276 |
-
|
| 277 |
-
This layout keeps the agent-visible workspace separate from server-side trace files.
|
| 278 |
-
In API deployment mode, traces are saved by default: every request writes
|
| 279 |
-
`api_trace.jsonl`, `trace_*.jsonl`, and `_session_state.json` under that run's `agent_trace/`
|
| 280 |
-
directory.
|
| 281 |
-
|
| 282 |
-
## 6. Text Request with OpenAI SDK
|
| 283 |
-
|
| 284 |
-
```python
|
| 285 |
-
from openai import OpenAI
|
| 286 |
-
|
| 287 |
-
client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
|
| 288 |
-
|
| 289 |
-
response = client.chat.completions.create(
|
| 290 |
-
model="researchharness",
|
| 291 |
-
messages=[
|
| 292 |
-
{"role": "user", "content": "Answer in one sentence: what is 2 + 2?"}
|
| 293 |
-
],
|
| 294 |
-
)
|
| 295 |
-
|
| 296 |
-
print(response.choices[0].message.content)
|
| 297 |
-
```
|
| 298 |
-
|
| 299 |
-
## 7. Multimodal Request with OpenAI SDK
|
| 300 |
-
|
| 301 |
-
The first API version supports one or more `data:image/...;base64,...` image
|
| 302 |
-
URLs in the same request. Remote image URLs and local file paths are
|
| 303 |
-
intentionally not supported by the API server.
|
| 304 |
-
|
| 305 |
-
The example below generates an image in memory and asks for JSON output.
|
| 306 |
-
|
| 307 |
-
```python
|
| 308 |
-
import base64
|
| 309 |
-
from io import BytesIO
|
| 310 |
-
|
| 311 |
-
from PIL import Image, ImageDraw
|
| 312 |
-
from openai import OpenAI
|
| 313 |
-
|
| 314 |
-
image = Image.new("RGB", (320, 120), "white")
|
| 315 |
-
draw = ImageDraw.Draw(image)
|
| 316 |
-
draw.text((40, 45), "7 + 5 = ?", fill="black")
|
| 317 |
-
buffer = BytesIO()
|
| 318 |
-
image.save(buffer, format="PNG")
|
| 319 |
-
data_url = "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode("ascii")
|
| 320 |
-
|
| 321 |
-
client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
|
| 322 |
-
|
| 323 |
-
response = client.chat.completions.create(
|
| 324 |
-
model="researchharness",
|
| 325 |
-
messages=[
|
| 326 |
-
{
|
| 327 |
-
"role": "user",
|
| 328 |
-
"content": [
|
| 329 |
-
{
|
| 330 |
-
"type": "text",
|
| 331 |
-
"text": (
|
| 332 |
-
"The image contains a simple arithmetic expression. "
|
| 333 |
-
"Return JSON with exactly two keys: expression and answer."
|
| 334 |
-
),
|
| 335 |
-
},
|
| 336 |
-
{"type": "image_url", "image_url": {"url": data_url}},
|
| 337 |
-
],
|
| 338 |
-
}
|
| 339 |
-
],
|
| 340 |
-
)
|
| 341 |
-
|
| 342 |
-
print(response.choices[0].message.content)
|
| 343 |
-
```
|
| 344 |
-
|
| 345 |
-
Expected answer shape:
|
| 346 |
-
|
| 347 |
-
```json
|
| 348 |
-
{"expression":"7 + 5","answer":12}
|
| 349 |
-
```
|
| 350 |
-
|
| 351 |
-
## 8. API Request and Response Contract
|
| 352 |
-
|
| 353 |
-
### `POST /v1/chat/completions`
|
| 354 |
-
|
| 355 |
-
Supported request fields:
|
| 356 |
-
|
| 357 |
-
| Field | Required | Meaning |
|
| 358 |
-
| --- | --- | --- |
|
| 359 |
-
| `model` | yes | Client-visible model label. It does not override `MODEL_NAME`; the backend model comes from `.env`. |
|
| 360 |
-
| `messages` | yes | OpenAI-style chat messages. |
|
| 361 |
-
| `stream` | no | Must be absent or `false`; streaming is not supported. |
|
| 362 |
-
| `n` | no | Must be absent or `1`. |
|
| 363 |
-
| `max_tokens` | no | Maximum output tokens for the output wrapper. |
|
| 364 |
-
| `max_completion_tokens` | no | Alias accepted for output-wrapper max tokens. |
|
| 365 |
-
| `response_format` | no | Passed to the wrappers as an output-format hint. |
|
| 366 |
-
|
| 367 |
-
Supported message roles:
|
| 368 |
-
|
| 369 |
-
| Role | Supported |
|
| 370 |
-
| --- | --- |
|
| 371 |
-
| `system` | yes |
|
| 372 |
-
| `user` | yes |
|
| 373 |
-
| `assistant` | yes |
|
| 374 |
-
| `tool` | no |
|
| 375 |
-
|
| 376 |
-
Supported content forms:
|
| 377 |
-
|
| 378 |
-
```json
|
| 379 |
-
{"role": "user", "content": "plain text"}
|
| 380 |
-
```
|
| 381 |
-
|
| 382 |
-
```json
|
| 383 |
-
{
|
| 384 |
-
"role": "user",
|
| 385 |
-
"content": [
|
| 386 |
-
{"type": "text", "text": "question"},
|
| 387 |
-
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
|
| 388 |
-
]
|
| 389 |
-
}
|
| 390 |
-
```
|
| 391 |
-
|
| 392 |
-
Response shape:
|
| 393 |
-
|
| 394 |
-
```json
|
| 395 |
-
{
|
| 396 |
-
"id": "chatcmpl_...",
|
| 397 |
-
"object": "chat.completion",
|
| 398 |
-
"created": 1770000000,
|
| 399 |
-
"model": "researchharness",
|
| 400 |
-
"choices": [
|
| 401 |
-
{
|
| 402 |
-
"index": 0,
|
| 403 |
-
"message": {
|
| 404 |
-
"role": "assistant",
|
| 405 |
-
"content": "final answer"
|
| 406 |
-
},
|
| 407 |
-
"finish_reason": "stop"
|
| 408 |
-
}
|
| 409 |
-
]
|
| 410 |
-
}
|
| 411 |
-
```
|
| 412 |
-
|
| 413 |
-
Callers usually only need:
|
| 414 |
-
|
| 415 |
-
```python
|
| 416 |
-
response.choices[0].message.content
|
| 417 |
-
```
|
| 418 |
-
|
| 419 |
-
### `GET /v1/health`
|
| 420 |
-
|
| 421 |
-
Returns:
|
| 422 |
-
|
| 423 |
-
```json
|
| 424 |
-
{
|
| 425 |
-
"status": "ok",
|
| 426 |
-
"api_runs_dir": "./api_runs",
|
| 427 |
-
"input_wrapper": true,
|
| 428 |
-
"output_wrapper": true
|
| 429 |
-
}
|
| 430 |
-
```
|
| 431 |
-
|
| 432 |
-
## 9. Tool Surface
|
| 433 |
-
|
| 434 |
-
ResearchHarness currently includes:
|
| 435 |
-
|
| 436 |
-
| Tool | Purpose |
|
| 437 |
-
| --- | --- |
|
| 438 |
-
| `Glob` | Discover files by pattern. |
|
| 439 |
-
| `Grep` | Search text in files. |
|
| 440 |
-
| `Read` | Read text files with bounds. |
|
| 441 |
-
| `ReadPDF` | Parse PDFs with MinerU/structai. |
|
| 442 |
-
| `ReadImage` | Inspect local image files and forward image content to vision-capable models. |
|
| 443 |
-
| `Write` | Write files inside the workspace. |
|
| 444 |
-
| `Edit` | Patch files inside the workspace. |
|
| 445 |
-
| `Bash` | Run shell commands inside the workspace. |
|
| 446 |
-
| `WebSearch` | Web search through Serper. |
|
| 447 |
-
| `ScholarSearch` | Scholar-style search through Serper. |
|
| 448 |
-
| `WebFetch` | Fetch and summarize webpages through Jina and the configured model. |
|
| 449 |
-
| `AskUser` | Ask a human for clarification in interactive runs. Disabled by some benchmark adapters. |
|
| 450 |
-
| `TerminalStart` / `TerminalWrite` / `TerminalRead` / `TerminalInterrupt` / `TerminalKill` | Persistent terminal sessions. |
|
| 451 |
-
|
| 452 |
-
## 10. Traces and Records
|
| 453 |
-
|
| 454 |
-
CLI runs write traces only when `--trace-dir` is provided. Without
|
| 455 |
-
`--trace-dir`, CLI runs do not write a trace file.
|
| 456 |
-
|
| 457 |
-
API runs write traces under:
|
| 458 |
-
|
| 459 |
-
```text
|
| 460 |
-
./api_runs/run_.../agent_trace/
|
| 461 |
-
```
|
| 462 |
-
|
| 463 |
-
Important files:
|
| 464 |
-
|
| 465 |
-
| File | Meaning |
|
| 466 |
-
| --- | --- |
|
| 467 |
-
| `api_trace.jsonl` | Input wrapper, agent result, and output wrapper records. |
|
| 468 |
-
| `trace_*.jsonl` | Flat agent runtime trace. |
|
| 469 |
-
| `_session_state.json` | Current session state, written next to `trace_*.jsonl` when tracing is enabled. |
|
| 470 |
-
|
| 471 |
-
The trace stores tool calls, tool results, LLM call capture payloads, compaction
|
| 472 |
-
events, errors, and final termination state.
|
| 473 |
-
|
| 474 |
-
## 11. Benchmark Adapters
|
| 475 |
-
|
| 476 |
-
Tracked benchmark contracts live under `benchmarks/`.
|
| 477 |
-
|
| 478 |
-
Current tracked adapters:
|
| 479 |
-
|
| 480 |
-
| Benchmark | Directory | Notes |
|
| 481 |
-
| --- | --- | --- |
|
| 482 |
-
| ResearchClawBench | `benchmarks/ResearchClawBench/` | CLI integration with role prompt and adapter. |
|
| 483 |
-
| QA / VQA | `benchmarks/QA/` | OpenAI-compatible API integration for text and multimodal QA. |
|
| 484 |
-
|
| 485 |
-
Benchmark-specific behavior should stay outside `agent_base/`.
|
| 486 |
-
|
| 487 |
-
## 12. Testing
|
| 488 |
-
|
| 489 |
-
Recommended checks:
|
| 490 |
-
|
| 491 |
-
```bash
|
| 492 |
-
python3 tests/test_tool_availability.py
|
| 493 |
-
python3 tests/test_openai_api_checks.py
|
| 494 |
-
python3 tests/test_agent_extension_checks.py
|
| 495 |
-
python3 tests/test_edge_case_checks.py
|
| 496 |
-
python3 tests/test_toolchain_validation.py
|
| 497 |
-
```
|
| 498 |
-
|
| 499 |
-
If using conda:
|
| 500 |
-
|
| 501 |
-
```bash
|
| 502 |
-
conda run -n agent python3 tests/test_openai_api_checks.py
|
| 503 |
-
```
|
| 504 |
-
|
| 505 |
-
## 13. Troubleshooting
|
| 506 |
-
|
| 507 |
-
Common issues:
|
| 508 |
-
|
| 509 |
-
| Symptom | Likely cause | Action |
|
| 510 |
-
| --- | --- | --- |
|
| 511 |
-
| Missing required env error | `.env` is incomplete | Fill required variables. |
|
| 512 |
-
| Web/PDF tools fail | VPN/proxy/TLS/service issue | Disable VPN/proxy and rerun tool availability tests. |
|
| 513 |
-
| Image request returns 400 | Image URL is not a `data:image/...;base64,...` URL | Convert the image to a base64 data URL. |
|
| 514 |
-
| Backend model rejects images | Model endpoint is not vision-capable | Use a vision-capable model or send text-only tasks. |
|
| 515 |
-
| API request fails with streaming error | `stream=true` was sent | Use synchronous requests only. |
|
| 516 |
-
| Unexpected output format | Output wrapper disabled or prompt under-specified | Enable `--output-wrapper` and state the desired format clearly. |
|
| 517 |
-
|
| 518 |
-
## 14. Current Boundaries
|
| 519 |
-
|
| 520 |
-
The first API version intentionally does not include:
|
| 521 |
-
|
| 522 |
-
- streaming,
|
| 523 |
-
- async run status,
|
| 524 |
-
- cancellation,
|
| 525 |
-
- artifact download endpoints,
|
| 526 |
-
- remote image URL downloading,
|
| 527 |
-
- user authentication,
|
| 528 |
-
- multi-tenant access control.
|
| 529 |
-
|
| 530 |
-
These can be added later as separate layers without changing the core harness
|
| 531 |
-
loop.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/tutorial_zh.md
DELETED
|
@@ -1,511 +0,0 @@
|
|
| 1 |
-
# ResearchHarness 教程
|
| 2 |
-
|
| 3 |
-
本文介绍如何通过命令行和 OpenAI-compatible API 使用 ResearchHarness。
|
| 4 |
-
|
| 5 |
-
ResearchHarness 是一个轻量、通用的 tool-using LLM agent harness。它可以作为:
|
| 6 |
-
|
| 7 |
-
- 命令行本地 agent,
|
| 8 |
-
- agent benchmark 的公平执行底座,
|
| 9 |
-
- OpenAI-compatible 同步 API 后端,
|
| 10 |
-
- 面向代码、文件、报告、PDF、图片、网页任务的个人助手运行时。
|
| 11 |
-
|
| 12 |
-
## 1. 安装
|
| 13 |
-
|
| 14 |
-
安装依赖:
|
| 15 |
-
|
| 16 |
-
```bash
|
| 17 |
-
python3 -m pip install -r requirements.txt
|
| 18 |
-
```
|
| 19 |
-
|
| 20 |
-
推荐使用 Python 3.10+。
|
| 21 |
-
|
| 22 |
-
## 2. 配置环境变量
|
| 23 |
-
|
| 24 |
-
复制 `.env.example` 为 `.env`,并填写必需变量。
|
| 25 |
-
|
| 26 |
-
必需变量:
|
| 27 |
-
|
| 28 |
-
| 变量 | 含义 |
|
| 29 |
-
| --- | --- |
|
| 30 |
-
| `API_KEY` | OpenAI-compatible LLM 服务的 API key。 |
|
| 31 |
-
| `API_BASE` | OpenAI-compatible chat-completions endpoint 的 base URL。 |
|
| 32 |
-
| `MODEL_NAME` | ResearchHarness 使用的主模型。 |
|
| 33 |
-
| `SERPER_KEY` | `WebSearch` 和 `ScholarSearch` 使用的 Serper key:https://serper.dev/ |
|
| 34 |
-
| `JINA_KEY` | `WebFetch` 使用的 Jina key:https://jina.ai/ |
|
| 35 |
-
| `MINERU_TOKEN` | `ReadPDF` 使用的 MinerU token:https://mineru.net/ |
|
| 36 |
-
|
| 37 |
-
可选变量:
|
| 38 |
-
|
| 39 |
-
| 变量 | 默认值 | 含义 |
|
| 40 |
-
| --- | --- | --- |
|
| 41 |
-
| `WORKSPACE_ROOT` | `./workspace` | 未显式传入 workspace 时使用的默认 workspace root。 |
|
| 42 |
-
| `MAX_LLM_CALL_PER_RUN` | `100` | 单次 agent run 最多允许的 LLM 调用次数。 |
|
| 43 |
-
| `MAX_AGENT_ROUNDS` | `100` | ReAct loop 最大轮次。 |
|
| 44 |
-
| `MAX_AGENT_RUNTIME_SECONDS` | `9000` | 单次 agent run 的最大运行秒数。 |
|
| 45 |
-
| `LLM_TIMEOUT_SECONDS` | `600` | 单次 LLM API 请求超时时间。 |
|
| 46 |
-
| `LLM_MAX_OUTPUT_TOKENS` | `10000` | 请求模型输出的最大 token 数。 |
|
| 47 |
-
| `MAX_INPUT_TOKENS` | `320000` | runtime token accounting 使用的输入 token 预算。 |
|
| 48 |
-
| `LLM_MAX_RETRIES` | `10` | 瞬时 LLM API 错误最大重试次数。 |
|
| 49 |
-
| `TEMPERATURE` | `0.6` | 主模型 temperature。 |
|
| 50 |
-
| `TOP_P` | `0.95` | 主模型 top-p。 |
|
| 51 |
-
| `PRESENCE_PENALTY` | `1.1` | provider 支持时使用的 presence penalty。 |
|
| 52 |
-
| `AUTO_COMPACT_TRIGGER_TOKENS` | `128k` | 自动上下文压缩触发阈值。 |
|
| 53 |
-
| `IMAGE_PART_TOKEN_ESTIMATE` | `1536` | 每个 image content part 的 token 估计。 |
|
| 54 |
-
| `LLM_IMAGE_MAX_EDGE` | `1568` | 发送给多模态模型的图片最大边长。 |
|
| 55 |
-
| `LLM_IMAGE_MAX_BYTES` | `524288` | 发送给多模态模型的压缩图片最大字节数。 |
|
| 56 |
-
| `LLM_IMAGE_JPEG_QUALITY` | `85` | 图片压缩时的初始 JPEG 质量。 |
|
| 57 |
-
| `DEBUG_AGENT` | `false` | 打印 agent loop 详细调试日志。 |
|
| 58 |
-
| `DEBUG_SEARCH` | `false` | 打印 WebSearch 调试日志。 |
|
| 59 |
-
| `DEBUG_SCHOLAR` | `false` | 打印 ScholarSearch 调试日志。 |
|
| 60 |
-
| `DEBUG_VISIT` | `false` | 打印 WebFetch 调试日志。 |
|
| 61 |
-
|
| 62 |
-
正式使用前,先运行:
|
| 63 |
-
|
| 64 |
-
```bash
|
| 65 |
-
python3 tests/test_tool_availability.py
|
| 66 |
-
```
|
| 67 |
-
|
| 68 |
-
预期结果是全部工具通过。缺 key、缺依赖、服务额度耗尽、外部工具不可用都应该视为失败,不应 skip。
|
| 69 |
-
|
| 70 |
-
如果 `WebSearch`、`ScholarSearch`、`WebFetch` 或 `ReadPDF` 出现 network、TLS、upload、download、PDF parsing 相关错误,优先尝试关闭 VPN / proxy 后重跑测试。
|
| 71 |
-
|
| 72 |
-
## 3. 命令行使用
|
| 73 |
-
|
| 74 |
-
直接运行一个 prompt:
|
| 75 |
-
|
| 76 |
-
```bash
|
| 77 |
-
python3 run_agent.py "Who proposed the transformer architecture, and in what year was the paper published?"
|
| 78 |
-
```
|
| 79 |
-
|
| 80 |
-
指定 workspace:
|
| 81 |
-
|
| 82 |
-
```bash
|
| 83 |
-
python3 run_agent.py "Summarize this project." \
|
| 84 |
-
--workspace-root ./workspace
|
| 85 |
-
```
|
| 86 |
-
|
| 87 |
-
`./workspace` 可以替换为任何其他 workspace 目录。
|
| 88 |
-
|
| 89 |
-
保存 trace:
|
| 90 |
-
|
| 91 |
-
```bash
|
| 92 |
-
python3 run_agent.py "Summarize this project." \
|
| 93 |
-
--workspace-root ./workspace \
|
| 94 |
-
--trace-dir ./traces
|
| 95 |
-
```
|
| 96 |
-
|
| 97 |
-
`./traces` 可以替换为任何其他 trace 目录。
|
| 98 |
-
|
| 99 |
-
如果不传 `--trace-dir`,CLI 运行不会写 trace 文件。
|
| 100 |
-
|
| 101 |
-
追加 role prompt:
|
| 102 |
-
|
| 103 |
-
```bash
|
| 104 |
-
python3 run_agent.py "Answer this QA task." \
|
| 105 |
-
--workspace-root ./workspace \
|
| 106 |
-
--role-prompt-file benchmarks/QA/role_prompt.md
|
| 107 |
-
```
|
| 108 |
-
|
| 109 |
-
附加本地图片:
|
| 110 |
-
|
| 111 |
-
```bash
|
| 112 |
-
python3 run_agent.py "Read the image and return JSON." \
|
| 113 |
-
--workspace-root ./workspace \
|
| 114 |
-
--images /path/to/image.png /path/to/second-image.png
|
| 115 |
-
```
|
| 116 |
-
|
| 117 |
-
每个图片路径都必须存在。RH 会把图片复制到 `./workspace/inputs/images/`,
|
| 118 |
-
作为初始 `image_url` content part 传给模型,同时把每个保存后的相对路径写进
|
| 119 |
-
用户文本,让后续轮次可以用 `ReadImage` 重新读取这些图片。
|
| 120 |
-
|
| 121 |
-
在交互式终端中,CLI 会在最终回答后继续等待 follow-up。下一轮会保留之前的
|
| 122 |
-
messages、工具结果和图片保存路径提示。运行过程中按 `Ctrl+C` 会在下一个安全点
|
| 123 |
-
中断当前 run,并带着上下文回到 follow-up 模式。在 follow-up 输入处按 `Ctrl+C`
|
| 124 |
-
或发送 EOF 可退出。脚本或 benchmark 如果需要严格的一问一答行为,使用
|
| 125 |
-
`--no-chat`;需要强制开启时使用 `--chat`。
|
| 126 |
-
|
| 127 |
-
如果需要浏览器本地界面,运行 `python3 run_frontend.py`。前端使用页面中选择的
|
| 128 |
-
已有 workspace,实时显示工具步骤,支持一张或多张图片附件,并在每次最终回答后
|
| 129 |
-
继续当前对话,直到点击 **New chat**。运行中发送按钮会变成 **Stop**;它会在下一个
|
| 130 |
-
安全点中断,并保留上下文用于下一条消息。
|
| 131 |
-
|
| 132 |
-
### CLI 参数
|
| 133 |
-
|
| 134 |
-
| 参数 | 是否必需 | 含义 |
|
| 135 |
-
| --- | --- | --- |
|
| 136 |
-
| 位置参数 `prompt` | 是,除非使用 `--prompt-file` | prompt 文本。 |
|
| 137 |
-
| `--prompt-file PATH` | 否 | 从 UTF-8 文件读取 prompt。 |
|
| 138 |
-
| `--workspace-root PATH` | 否 | 本地文件工具、Bash、Terminal 使用的 workspace root;不存在会自动创建。 |
|
| 139 |
-
| `--trace-dir PATH` | 否 | 写入 `trace_*.jsonl` 的目录。 |
|
| 140 |
-
| `--role-prompt-file PATH` | 否,可重复 | 追加 role-specific prompt 到 base system prompt。 |
|
| 141 |
-
| `--images PATH [PATH ...]` | 否 | 把一张或多张本地图片复制到 `inputs/images/` 并附加到初始用户消息。 |
|
| 142 |
-
| `--chat` / `--no-chat` | 否 | 开启或关闭 CLI follow-up 模式。默认只在 stdin 和 stdout 都是交互式终端时开启。 |
|
| 143 |
-
|
| 144 |
-
## 4. OpenAI-Compatible API Server
|
| 145 |
-
|
| 146 |
-
ResearchHarness 可以部署为同步 OpenAI-compatible endpoint:
|
| 147 |
-
|
| 148 |
-
```http
|
| 149 |
-
POST /v1/chat/completions
|
| 150 |
-
```
|
| 151 |
-
|
| 152 |
-
这样,现有 OpenAI SDK 客户端只需要修改 `base_url` 就可以调用 ResearchHarness。
|
| 153 |
-
|
| 154 |
-
### 启动服务
|
| 155 |
-
|
| 156 |
-
默认部署:
|
| 157 |
-
|
| 158 |
-
```bash
|
| 159 |
-
python3 run_server.py \
|
| 160 |
-
--api-runs-dir ./api_runs \
|
| 161 |
-
--host 127.0.0.1 \
|
| 162 |
-
--port 8686
|
| 163 |
-
```
|
| 164 |
-
|
| 165 |
-
QA/VQA benchmark 部署,可以额外加 benchmark role overlay:
|
| 166 |
-
|
| 167 |
-
```bash
|
| 168 |
-
python3 run_server.py \
|
| 169 |
-
--api-runs-dir ./api_runs \
|
| 170 |
-
--host 127.0.0.1 \
|
| 171 |
-
--port 8686 \
|
| 172 |
-
--role-prompt-file benchmarks/QA/role_prompt.md
|
| 173 |
-
```
|
| 174 |
-
|
| 175 |
-
### API Server 参数
|
| 176 |
-
|
| 177 |
-
| 参数 | 是否必需 | 默认值 | 含义 |
|
| 178 |
-
| --- | --- | --- | --- |
|
| 179 |
-
| `--api-runs-dir PATH` | 是 | 无 | API runs 的父目录;每个请求会创建一个子目录。 |
|
| 180 |
-
| `--host HOST` | 否 | `127.0.0.1` | 服务监听 host。 |
|
| 181 |
-
| `--port PORT` | 否 | `8686` | 服务监听端口。 |
|
| 182 |
-
| `--role-prompt-file PATH` | 否,可重复 | 无 | 追加 role prompt 到 base ResearchHarness prompt。 |
|
| 183 |
-
| `--input-wrapper` / `--no-input-wrapper` | 否 | 开启 | 开启或关闭输入 LLM wrapper。 |
|
| 184 |
-
| `--output-wrapper` / `--no-output-wrapper` | 否 | 开启 | 开启或关闭输出 LLM wrapper。 |
|
| 185 |
-
|
| 186 |
-
### Wrapper 模式
|
| 187 |
-
|
| 188 |
-
默认两个 wrapper 都开启。
|
| 189 |
-
|
| 190 |
-
严格格式 benchmark 模式:
|
| 191 |
-
|
| 192 |
-
```bash
|
| 193 |
-
python3 run_server.py \
|
| 194 |
-
--api-runs-dir ./api_runs \
|
| 195 |
-
--role-prompt-file benchmarks/QA/role_prompt.md \
|
| 196 |
-
--input-wrapper \
|
| 197 |
-
--output-wrapper
|
| 198 |
-
```
|
| 199 |
-
|
| 200 |
-
直接 agent 模式:
|
| 201 |
-
|
| 202 |
-
```bash
|
| 203 |
-
python3 run_server.py \
|
| 204 |
-
--api-runs-dir ./api_runs \
|
| 205 |
-
--no-input-wrapper \
|
| 206 |
-
--no-output-wrapper
|
| 207 |
-
```
|
| 208 |
-
|
| 209 |
-
输入简单但最终答案需要严格格式:
|
| 210 |
-
|
| 211 |
-
```bash
|
| 212 |
-
python3 run_server.py \
|
| 213 |
-
--api-runs-dir ./api_runs \
|
| 214 |
-
--no-input-wrapper \
|
| 215 |
-
--output-wrapper
|
| 216 |
-
```
|
| 217 |
-
|
| 218 |
-
input wrapper 的作用是把原始用户请求整理为适合 agent 稳定执行的任务。output wrapper 的作用是把 agent 的最终结果整理为用户要求的答案格式。wrapper 不应该引入新事实,只做输入规范化和输出格式化。
|
| 219 |
-
|
| 220 |
-
API server 有意保持一问一答:每个 HTTP 请求创建一次隔离 run,并返回一个最终
|
| 221 |
-
assistant message。服务端不会跨请求保存 conversation state。如果应用需要 API
|
| 222 |
-
多轮对话,应由客户端保存状态,并在后续请求中传入需要的上下文。
|
| 223 |
-
|
| 224 |
-
```mermaid
|
| 225 |
-
flowchart LR
|
| 226 |
-
U[User Input] --> IW[Input Wrapper LLM]
|
| 227 |
-
IW --> A[ResearchHarness Agent]
|
| 228 |
-
A --> OW[Output Wrapper LLM]
|
| 229 |
-
OW --> O[Output]
|
| 230 |
-
```
|
| 231 |
-
|
| 232 |
-
## 5. API Workspace 结构
|
| 233 |
-
|
| 234 |
-
每个 API 请求会创建一个 run 目录:
|
| 235 |
-
|
| 236 |
-
```text
|
| 237 |
-
./api_runs/
|
| 238 |
-
`-- run_YYYYMMDD_HHMMSS_<random>/
|
| 239 |
-
|-- agent_workspace/
|
| 240 |
-
| `-- inputs/
|
| 241 |
-
| `-- images/
|
| 242 |
-
`-- agent_trace/
|
| 243 |
-
|-- api_trace.jsonl
|
| 244 |
-
|-- trace_*.jsonl
|
| 245 |
-
`-- _session_state.json
|
| 246 |
-
```
|
| 247 |
-
|
| 248 |
-
含义:
|
| 249 |
-
|
| 250 |
-
| 路径 | 含义 |
|
| 251 |
-
| --- | --- |
|
| 252 |
-
| `run_YYYYMMDD_HHMMSS_<random>/` | 单个请求对应的 run 根目录。 |
|
| 253 |
-
| `agent_workspace/` | agent 唯一可见的 workspace;文件工具、Bash、`ls`、`cat` 都从这里开始。 |
|
| 254 |
-
| `agent_workspace/inputs/images/` | API 请求中用户提交的图片。 |
|
| 255 |
-
| `agent_trace/` | API trace、agent trace 和 runtime 记录。 |
|
| 256 |
-
|
| 257 |
-
对于多模态请求,每张图片会同时走两条路径:当底层模型支持多模态输入时,
|
| 258 |
-
图片内容会作为初始多模态输入直接传给模型;每张图片也会保存到
|
| 259 |
-
`agent_workspace/inputs/images/`。每个保存后的相对路径也会写进 agent 可见文本,
|
| 260 |
-
让后续轮次可以用 `ReadImage` 读取稳定的本地路径,而不是反复依赖内联图片字节。
|
| 261 |
-
|
| 262 |
-
这个结构把 agent 可见工作目录和服务端记录目录隔离开。
|
| 263 |
-
在 API 部署模式下,trace 默认保存:每个请求都会在自己的 `agent_trace/`
|
| 264 |
-
目录下写入 `api_trace.jsonl`、`trace_*.jsonl` 和 `_session_state.json`。
|
| 265 |
-
|
| 266 |
-
## 6. 纯文本 OpenAI SDK 请求
|
| 267 |
-
|
| 268 |
-
```python
|
| 269 |
-
from openai import OpenAI
|
| 270 |
-
|
| 271 |
-
client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
|
| 272 |
-
|
| 273 |
-
response = client.chat.completions.create(
|
| 274 |
-
model="researchharness",
|
| 275 |
-
messages=[
|
| 276 |
-
{"role": "user", "content": "Answer in one sentence: what is 2 + 2?"}
|
| 277 |
-
],
|
| 278 |
-
)
|
| 279 |
-
|
| 280 |
-
print(response.choices[0].message.content)
|
| 281 |
-
```
|
| 282 |
-
|
| 283 |
-
## 7. 多模态 OpenAI SDK 请求
|
| 284 |
-
|
| 285 |
-
第一版 API 支持同一个请求中包含一张或多张 `data:image/...;base64,...` 形式的图片 URL。API server 不支持远程图片 URL,也不支持让外部请求直接传本地文件路径。
|
| 286 |
-
|
| 287 |
-
下面的示例在代码中生成一张图片,并要求返回 JSON。
|
| 288 |
-
|
| 289 |
-
```python
|
| 290 |
-
import base64
|
| 291 |
-
from io import BytesIO
|
| 292 |
-
|
| 293 |
-
from PIL import Image, ImageDraw
|
| 294 |
-
from openai import OpenAI
|
| 295 |
-
|
| 296 |
-
image = Image.new("RGB", (320, 120), "white")
|
| 297 |
-
draw = ImageDraw.Draw(image)
|
| 298 |
-
draw.text((40, 45), "7 + 5 = ?", fill="black")
|
| 299 |
-
buffer = BytesIO()
|
| 300 |
-
image.save(buffer, format="PNG")
|
| 301 |
-
data_url = "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode("ascii")
|
| 302 |
-
|
| 303 |
-
client = OpenAI(api_key="unused", base_url="http://127.0.0.1:8686/v1")
|
| 304 |
-
|
| 305 |
-
response = client.chat.completions.create(
|
| 306 |
-
model="researchharness",
|
| 307 |
-
messages=[
|
| 308 |
-
{
|
| 309 |
-
"role": "user",
|
| 310 |
-
"content": [
|
| 311 |
-
{
|
| 312 |
-
"type": "text",
|
| 313 |
-
"text": (
|
| 314 |
-
"The image contains a simple arithmetic expression. "
|
| 315 |
-
"Return JSON with exactly two keys: expression and answer."
|
| 316 |
-
),
|
| 317 |
-
},
|
| 318 |
-
{"type": "image_url", "image_url": {"url": data_url}},
|
| 319 |
-
],
|
| 320 |
-
}
|
| 321 |
-
],
|
| 322 |
-
)
|
| 323 |
-
|
| 324 |
-
print(response.choices[0].message.content)
|
| 325 |
-
```
|
| 326 |
-
|
| 327 |
-
预期答案形状:
|
| 328 |
-
|
| 329 |
-
```json
|
| 330 |
-
{"expression":"7 + 5","answer":12}
|
| 331 |
-
```
|
| 332 |
-
|
| 333 |
-
## 8. API 请求与返回协议
|
| 334 |
-
|
| 335 |
-
### `POST /v1/chat/completions`
|
| 336 |
-
|
| 337 |
-
支持的请求字段:
|
| 338 |
-
|
| 339 |
-
| 字段 | 是否必需 | 含义 |
|
| 340 |
-
| --- | --- | --- |
|
| 341 |
-
| `model` | 是 | 客户端看到的 model label;不会覆盖 `.env` 中的 `MODEL_NAME`。 |
|
| 342 |
-
| `messages` | 是 | OpenAI-style chat messages。 |
|
| 343 |
-
| `stream` | 否 | 必须不存在或为 `false`;当前不支持 streaming。 |
|
| 344 |
-
| `n` | 否 | 必须不存在或为 `1`。 |
|
| 345 |
-
| `max_tokens` | 否 | output wrapper 最大输出 token。 |
|
| 346 |
-
| `max_completion_tokens` | 否 | output wrapper 最大输出 token 的兼容别名。 |
|
| 347 |
-
| `response_format` | 否 | 作为输出格式提示传给 wrapper。 |
|
| 348 |
-
|
| 349 |
-
支持的 message role:
|
| 350 |
-
|
| 351 |
-
| Role | 是否支持 |
|
| 352 |
-
| --- | --- |
|
| 353 |
-
| `system` | 支持 |
|
| 354 |
-
| `user` | 支持 |
|
| 355 |
-
| `assistant` | 支持 |
|
| 356 |
-
| `tool` | 不支持 |
|
| 357 |
-
|
| 358 |
-
支持的 content 形式:
|
| 359 |
-
|
| 360 |
-
```json
|
| 361 |
-
{"role": "user", "content": "plain text"}
|
| 362 |
-
```
|
| 363 |
-
|
| 364 |
-
```json
|
| 365 |
-
{
|
| 366 |
-
"role": "user",
|
| 367 |
-
"content": [
|
| 368 |
-
{"type": "text", "text": "question"},
|
| 369 |
-
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
|
| 370 |
-
]
|
| 371 |
-
}
|
| 372 |
-
```
|
| 373 |
-
|
| 374 |
-
返回结构:
|
| 375 |
-
|
| 376 |
-
```json
|
| 377 |
-
{
|
| 378 |
-
"id": "chatcmpl_...",
|
| 379 |
-
"object": "chat.completion",
|
| 380 |
-
"created": 1770000000,
|
| 381 |
-
"model": "researchharness",
|
| 382 |
-
"choices": [
|
| 383 |
-
{
|
| 384 |
-
"index": 0,
|
| 385 |
-
"message": {
|
| 386 |
-
"role": "assistant",
|
| 387 |
-
"content": "final answer"
|
| 388 |
-
},
|
| 389 |
-
"finish_reason": "stop"
|
| 390 |
-
}
|
| 391 |
-
]
|
| 392 |
-
}
|
| 393 |
-
```
|
| 394 |
-
|
| 395 |
-
调用方通常只需要读取:
|
| 396 |
-
|
| 397 |
-
```python
|
| 398 |
-
response.choices[0].message.content
|
| 399 |
-
```
|
| 400 |
-
|
| 401 |
-
### `GET /v1/health`
|
| 402 |
-
|
| 403 |
-
返回:
|
| 404 |
-
|
| 405 |
-
```json
|
| 406 |
-
{
|
| 407 |
-
"status": "ok",
|
| 408 |
-
"api_runs_dir": "./api_runs",
|
| 409 |
-
"input_wrapper": true,
|
| 410 |
-
"output_wrapper": true
|
| 411 |
-
}
|
| 412 |
-
```
|
| 413 |
-
|
| 414 |
-
## 9. 工具能力
|
| 415 |
-
|
| 416 |
-
ResearchHarness 当前包含:
|
| 417 |
-
|
| 418 |
-
| 工具 | 用途 |
|
| 419 |
-
| --- | --- |
|
| 420 |
-
| `Glob` | 按模式发现文件。 |
|
| 421 |
-
| `Grep` | 在文件中搜索文本。 |
|
| 422 |
-
| `Read` | 有边界地读取文本文件。 |
|
| 423 |
-
| `ReadPDF` | 通过 MinerU/structai 解析 PDF。 |
|
| 424 |
-
| `ReadImage` | 读取本地图片,并把图片内容传给支持 vision 的模型。 |
|
| 425 |
-
| `Write` | 在 workspace 内写文件。 |
|
| 426 |
-
| `Edit` | 在 workspace 内 patch 文件。 |
|
| 427 |
-
| `Bash` | 在 workspace 内执行 shell 命令。 |
|
| 428 |
-
| `WebSearch` | 通过 Serper 进行网页搜索。 |
|
| 429 |
-
| `ScholarSearch` | 通过 Serper 进行学术搜索。 |
|
| 430 |
-
| `WebFetch` | 通过 Jina 和配置模型抓取、总结网页。 |
|
| 431 |
-
| `AskUser` | 交互式运行中向用户提问;某些 benchmark adapter 会禁用。 |
|
| 432 |
-
| `TerminalStart` / `TerminalWrite` / `TerminalRead` / `TerminalInterrupt` / `TerminalKill` | 持久终端会话。 |
|
| 433 |
-
|
| 434 |
-
## 10. Trace 与记录
|
| 435 |
-
|
| 436 |
-
CLI 运行只有在传入 `--trace-dir` 时才会写 trace。如果不传
|
| 437 |
-
`--trace-dir`,CLI 运行不会写 trace 文件。
|
| 438 |
-
|
| 439 |
-
API 运行时,记录在:
|
| 440 |
-
|
| 441 |
-
```text
|
| 442 |
-
./api_runs/run_.../agent_trace/
|
| 443 |
-
```
|
| 444 |
-
|
| 445 |
-
重要文件:
|
| 446 |
-
|
| 447 |
-
| 文件 | 含义 |
|
| 448 |
-
| --- | --- |
|
| 449 |
-
| `api_trace.jsonl` | input wrapper、agent result、output wrapper 记录。 |
|
| 450 |
-
| `trace_*.jsonl` | agent runtime 的 flat trace。 |
|
| 451 |
-
| `_session_state.json` | 当前 session state;启用 trace 时和 `trace_*.jsonl` 写在同一目录。 |
|
| 452 |
-
|
| 453 |
-
trace 会记录工具调用、工具结果、LLM call capture payload、context compaction、错误和终止状态。
|
| 454 |
-
|
| 455 |
-
## 11. Benchmark Adapter
|
| 456 |
-
|
| 457 |
-
tracked benchmark contract 放在 `benchmarks/` 下。
|
| 458 |
-
|
| 459 |
-
当前 tracked adapter:
|
| 460 |
-
|
| 461 |
-
| Benchmark | 目录 | 说明 |
|
| 462 |
-
| --- | --- | --- |
|
| 463 |
-
| ResearchClawBench | `benchmarks/ResearchClawBench/` | CLI 方式接入,包含 role prompt 和 adapter。 |
|
| 464 |
-
| QA / VQA | `benchmarks/QA/` | OpenAI-compatible API 方式接入,支持纯文本和多模态 QA。 |
|
| 465 |
-
|
| 466 |
-
benchmark-specific 行为应放在 `benchmarks/`,不要塞进 `agent_base/`。
|
| 467 |
-
|
| 468 |
-
## 12. 测试
|
| 469 |
-
|
| 470 |
-
推荐检查:
|
| 471 |
-
|
| 472 |
-
```bash
|
| 473 |
-
python3 tests/test_tool_availability.py
|
| 474 |
-
python3 tests/test_openai_api_checks.py
|
| 475 |
-
python3 tests/test_agent_extension_checks.py
|
| 476 |
-
python3 tests/test_edge_case_checks.py
|
| 477 |
-
python3 tests/test_toolchain_validation.py
|
| 478 |
-
```
|
| 479 |
-
|
| 480 |
-
如果使用 conda:
|
| 481 |
-
|
| 482 |
-
```bash
|
| 483 |
-
/home/xwh/miniconda3/bin/conda run -n agent python3 tests/test_openai_api_checks.py
|
| 484 |
-
```
|
| 485 |
-
|
| 486 |
-
## 13. 排障
|
| 487 |
-
|
| 488 |
-
常见问题:
|
| 489 |
-
|
| 490 |
-
| 现象 | 可能原因 | 处理 |
|
| 491 |
-
| --- | --- | --- |
|
| 492 |
-
| 缺少 required env | `.env` 不完整 | 填写所有必需变量。 |
|
| 493 |
-
| Web/PDF 工具失败 | VPN/proxy/TLS/服务问题 | 关闭 VPN/proxy 后重跑工具可用性测试。 |
|
| 494 |
-
| 图片请求返回 400 | 图片不是 `data:image/...;base64,...` | 把图片转成 base64 data URL。 |
|
| 495 |
-
| 后端模型拒绝图片 | 当前模型 endpoint 不支持 vision | 换用支持 vision 的模型,或改为纯文本任务。 |
|
| 496 |
-
| API 报 streaming 错误 | 请求里传了 `stream=true` | 当前只支持同步请求。 |
|
| 497 |
-
| 输出格式不符合预期 | output wrapper 关闭,或用户格式要求不明确 | 开启 `--output-wrapper`,并清楚说明输出格式。 |
|
| 498 |
-
|
| 499 |
-
## 14. 当前边界
|
| 500 |
-
|
| 501 |
-
第一版 API 暂不包括:
|
| 502 |
-
|
| 503 |
-
- streaming,
|
| 504 |
-
- async run status,
|
| 505 |
-
- cancellation,
|
| 506 |
-
- artifact download endpoint,
|
| 507 |
-
- 远程图片 URL 下载,
|
| 508 |
-
- 用户认证,
|
| 509 |
-
- 多租户访问控制。
|
| 510 |
-
|
| 511 |
-
这些能力以后可以作为外层服务继续扩展,不需要破坏核心 harness loop。
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
frontend/local_server.py
CHANGED
|
@@ -4,7 +4,6 @@ import asyncio
|
|
| 4 |
import base64
|
| 5 |
import datetime as _dt
|
| 6 |
import json
|
| 7 |
-
import os
|
| 8 |
import re
|
| 9 |
import shutil
|
| 10 |
import threading
|
|
@@ -16,7 +15,7 @@ from typing import Any
|
|
| 16 |
from uuid import uuid4
|
| 17 |
|
| 18 |
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 19 |
-
from fastapi.responses import FileResponse
|
| 20 |
from fastapi.staticfiles import StaticFiles
|
| 21 |
|
| 22 |
from agent_base.react_agent import MultiTurnReactAgent, default_llm_config
|
|
@@ -35,9 +34,6 @@ from agent_base.utils import (
|
|
| 35 |
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
| 36 |
MAX_UPLOAD_IMAGES = 12
|
| 37 |
MAX_IMAGE_BYTES = 12 * 1024 * 1024
|
| 38 |
-
MAX_DIRECTORY_ENTRIES = 800
|
| 39 |
-
FRONTEND_ROLE_PROMPT = ""
|
| 40 |
-
FRONTEND_TRACE_DIR: str | None = None
|
| 41 |
FRONTEND_MANAGED_RUNS_DIR: str | None = None
|
| 42 |
FRONTEND_CLEANUP_RETENTION_SECONDS = 6 * 60 * 60
|
| 43 |
FRONTEND_CLEANUP_MAX_RUNS = 40
|
|
@@ -52,14 +48,12 @@ _ACTIVE_MANAGED_RUNS_LOCK = threading.Lock()
|
|
| 52 |
_COLLECTION_LOCK = threading.Lock()
|
| 53 |
_COLLECTION_CONFIG_WARNED: set[str] = set()
|
| 54 |
|
| 55 |
-
app = FastAPI(title="ResearchHarness
|
| 56 |
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="frontend-static")
|
| 57 |
|
| 58 |
|
| 59 |
def configure_frontend(
|
| 60 |
*,
|
| 61 |
-
role_prompt: str = "",
|
| 62 |
-
trace_dir: str | None = None,
|
| 63 |
managed_runs_dir: str | None = None,
|
| 64 |
cleanup_retention_seconds: int | None = None,
|
| 65 |
cleanup_max_runs: int | None = None,
|
|
@@ -69,11 +63,10 @@ def configure_frontend(
|
|
| 69 |
collection_batch_size: int | None = None,
|
| 70 |
collection_max_bundle_bytes: int | None = None,
|
| 71 |
) -> None:
|
| 72 |
-
global
|
| 73 |
global FRONTEND_CLEANUP_RETENTION_SECONDS, FRONTEND_CLEANUP_MAX_RUNS, FRONTEND_CLEANUP_INTERVAL_SECONDS
|
| 74 |
global FRONTEND_COLLECTION_ENABLED, FRONTEND_COLLECTION_DATASET_REPO
|
| 75 |
global FRONTEND_COLLECTION_BATCH_SIZE, FRONTEND_COLLECTION_MAX_BUNDLE_BYTES
|
| 76 |
-
FRONTEND_ROLE_PROMPT = str(role_prompt or "").strip()
|
| 77 |
if collection_enabled is not None:
|
| 78 |
FRONTEND_COLLECTION_ENABLED = bool(collection_enabled)
|
| 79 |
if collection_dataset_repo is not None:
|
|
@@ -82,32 +75,22 @@ def configure_frontend(
|
|
| 82 |
FRONTEND_COLLECTION_BATCH_SIZE = max(1, int(collection_batch_size))
|
| 83 |
if collection_max_bundle_bytes is not None:
|
| 84 |
FRONTEND_COLLECTION_MAX_BUNDLE_BYTES = max(1, int(collection_max_bundle_bytes))
|
| 85 |
-
if
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
if
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
FRONTEND_CLEANUP_RETENTION_SECONDS = max(60, int(cleanup_retention_seconds))
|
| 102 |
-
if cleanup_max_runs is not None:
|
| 103 |
-
FRONTEND_CLEANUP_MAX_RUNS = max(1, int(cleanup_max_runs))
|
| 104 |
-
if cleanup_interval_seconds is not None:
|
| 105 |
-
FRONTEND_CLEANUP_INTERVAL_SECONDS = max(60, int(cleanup_interval_seconds))
|
| 106 |
-
_collection_root()
|
| 107 |
-
cleanup_managed_runs_once()
|
| 108 |
-
_start_managed_cleanup_thread()
|
| 109 |
-
else:
|
| 110 |
-
FRONTEND_MANAGED_RUNS_DIR = None
|
| 111 |
|
| 112 |
|
| 113 |
class FrontendRunBridge:
|
|
@@ -543,26 +526,25 @@ def _run_agent_thread(
|
|
| 543 |
prompt: str,
|
| 544 |
workspace_root: Path,
|
| 545 |
initial_content_parts: list[dict[str, Any]],
|
| 546 |
-
trace_dir: str
|
| 547 |
prior_messages: list[dict[str, Any]] | None = None,
|
| 548 |
managed_run_root: str = "",
|
|
|
|
| 549 |
) -> None:
|
| 550 |
try:
|
| 551 |
load_dotenv(PROJECT_ROOT / ".env")
|
| 552 |
require_required_env("ResearchHarness frontend")
|
| 553 |
-
effective_trace_dir = trace_dir if trace_dir is not None else FRONTEND_TRACE_DIR
|
| 554 |
agent = FrontendInteractiveAgent(
|
| 555 |
bridge=bridge,
|
| 556 |
-
llm=default_llm_config(),
|
| 557 |
-
trace_dir=
|
| 558 |
-
role_prompt=FRONTEND_ROLE_PROMPT or None,
|
| 559 |
)
|
| 560 |
bridge.send(
|
| 561 |
{
|
| 562 |
"type": "run_started",
|
| 563 |
"model": agent.model,
|
| 564 |
"workspace_root": str(workspace_root),
|
| 565 |
-
"trace_dir":
|
| 566 |
}
|
| 567 |
)
|
| 568 |
result = agent._run_session(
|
|
@@ -590,98 +572,6 @@ def _run_agent_thread(
|
|
| 590 |
bridge.send({"type": "run_error", "error": str(exc), "traceback": traceback.format_exc()})
|
| 591 |
|
| 592 |
|
| 593 |
-
def _resolve_existing_workspace(raw_path: str) -> Path:
|
| 594 |
-
if not str(raw_path or "").strip():
|
| 595 |
-
raise ValueError("workspace path is required")
|
| 596 |
-
path = Path(raw_path).expanduser()
|
| 597 |
-
if not path.is_absolute():
|
| 598 |
-
path = (Path.cwd() / path).resolve()
|
| 599 |
-
else:
|
| 600 |
-
path = path.resolve()
|
| 601 |
-
if not path.exists() or not path.is_dir():
|
| 602 |
-
raise ValueError(f"workspace must be an existing directory: {path}")
|
| 603 |
-
return path
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
def _resolve_directory_browser_path(raw_path: str = "") -> Path:
|
| 607 |
-
text = str(raw_path or "").strip()
|
| 608 |
-
if text:
|
| 609 |
-
path = Path(text).expanduser()
|
| 610 |
-
else:
|
| 611 |
-
path = Path.home() if Path.home().exists() else PROJECT_ROOT
|
| 612 |
-
if not path.is_absolute():
|
| 613 |
-
path = (Path.cwd() / path).resolve()
|
| 614 |
-
else:
|
| 615 |
-
path = path.resolve()
|
| 616 |
-
if not path.exists() or not path.is_dir():
|
| 617 |
-
raise ValueError(f"directory does not exist: {path}")
|
| 618 |
-
return path
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
def _directory_root_choices() -> list[dict[str, str]]:
|
| 622 |
-
candidates = [Path.home(), PROJECT_ROOT, PROJECT_ROOT / "workspace", Path.cwd(), Path("/mnt"), Path("/")]
|
| 623 |
-
if os.name == "nt":
|
| 624 |
-
for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
| 625 |
-
candidates.append(Path(f"{letter}:\\"))
|
| 626 |
-
|
| 627 |
-
seen: set[str] = set()
|
| 628 |
-
roots: list[dict[str, str]] = []
|
| 629 |
-
for candidate in candidates:
|
| 630 |
-
try:
|
| 631 |
-
resolved = candidate.expanduser().resolve()
|
| 632 |
-
except (OSError, RuntimeError):
|
| 633 |
-
continue
|
| 634 |
-
if not resolved.exists() or not resolved.is_dir():
|
| 635 |
-
continue
|
| 636 |
-
key = str(resolved)
|
| 637 |
-
if key in seen:
|
| 638 |
-
continue
|
| 639 |
-
seen.add(key)
|
| 640 |
-
label = "Home" if resolved == Path.home().resolve() else (resolved.name or key)
|
| 641 |
-
roots.append({"label": label, "path": key})
|
| 642 |
-
return roots
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
def _workspace_directory_payload(raw_path: str = "") -> dict[str, Any]:
|
| 646 |
-
directory = _resolve_directory_browser_path(raw_path)
|
| 647 |
-
entries: list[dict[str, str]] = []
|
| 648 |
-
truncated = False
|
| 649 |
-
try:
|
| 650 |
-
children = sorted(directory.iterdir(), key=lambda item: item.name.casefold())
|
| 651 |
-
except PermissionError as exc:
|
| 652 |
-
raise ValueError(f"permission denied: {directory}") from exc
|
| 653 |
-
except OSError as exc:
|
| 654 |
-
raise ValueError(f"cannot read directory {directory}: {exc}") from exc
|
| 655 |
-
|
| 656 |
-
for child in children:
|
| 657 |
-
if len(entries) >= MAX_DIRECTORY_ENTRIES:
|
| 658 |
-
truncated = True
|
| 659 |
-
break
|
| 660 |
-
try:
|
| 661 |
-
if not child.is_dir():
|
| 662 |
-
continue
|
| 663 |
-
except OSError:
|
| 664 |
-
continue
|
| 665 |
-
entries.append({"name": child.name or str(child), "path": str(child)})
|
| 666 |
-
|
| 667 |
-
parent = directory.parent if directory.parent != directory else None
|
| 668 |
-
return {
|
| 669 |
-
"path": str(directory),
|
| 670 |
-
"parent": str(parent) if parent else "",
|
| 671 |
-
"entries": entries,
|
| 672 |
-
"truncated": truncated,
|
| 673 |
-
"roots": _directory_root_choices(),
|
| 674 |
-
}
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
@app.get("/api/workspace-directories")
|
| 678 |
-
def workspace_directories(path: str = "") -> JSONResponse:
|
| 679 |
-
try:
|
| 680 |
-
return JSONResponse(_workspace_directory_payload(path))
|
| 681 |
-
except ValueError as exc:
|
| 682 |
-
return JSONResponse({"error": str(exc)}, status_code=400)
|
| 683 |
-
|
| 684 |
-
|
| 685 |
@app.get("/")
|
| 686 |
def index() -> FileResponse:
|
| 687 |
return FileResponse(STATIC_DIR / "index.html")
|
|
@@ -705,7 +595,7 @@ async def websocket_endpoint(websocket: WebSocket) -> None:
|
|
| 705 |
|
| 706 |
sender_task = asyncio.create_task(sender())
|
| 707 |
try:
|
| 708 |
-
await websocket.send_json({"type": "ready", "managed_workspace":
|
| 709 |
while True:
|
| 710 |
message = await websocket.receive_json()
|
| 711 |
message_type = str(message.get("type", "")).strip()
|
|
@@ -719,30 +609,18 @@ async def websocket_endpoint(websocket: WebSocket) -> None:
|
|
| 719 |
continue
|
| 720 |
try:
|
| 721 |
continue_conversation = bool(message.get("continue_conversation"))
|
|
|
|
| 722 |
prior_messages = None
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
effective_trace_dir = bridge.managed_trace_dir or FRONTEND_TRACE_DIR
|
| 731 |
-
prior_messages = bridge.conversation_messages
|
| 732 |
-
else:
|
| 733 |
-
_release_managed_run(bridge)
|
| 734 |
-
workspace_root, effective_trace_dir = _create_managed_run(bridge)
|
| 735 |
else:
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
if not bridge.conversation_messages:
|
| 739 |
-
bridge.send({"type": "run_error", "error": "No active conversation is available on the server. Click New chat and start again."})
|
| 740 |
-
continue
|
| 741 |
-
elif bridge.conversation_workspace_root and bridge.conversation_workspace_root != str(workspace_root):
|
| 742 |
-
bridge.send({"type": "run_error", "error": "Workspace changed. Start a new chat before using a different workspace."})
|
| 743 |
-
continue
|
| 744 |
-
else:
|
| 745 |
-
prior_messages = bridge.conversation_messages
|
| 746 |
image_parts, saved_paths = save_uploaded_images(
|
| 747 |
workspace_root,
|
| 748 |
message.get("images", []) if isinstance(message.get("images", []), list) else [],
|
|
@@ -768,6 +646,7 @@ async def websocket_endpoint(websocket: WebSocket) -> None:
|
|
| 768 |
"trace_dir": effective_trace_dir,
|
| 769 |
"prior_messages": prior_messages,
|
| 770 |
"managed_run_root": bridge.managed_run_root,
|
|
|
|
| 771 |
},
|
| 772 |
daemon=True,
|
| 773 |
)
|
|
|
|
| 4 |
import base64
|
| 5 |
import datetime as _dt
|
| 6 |
import json
|
|
|
|
| 7 |
import re
|
| 8 |
import shutil
|
| 9 |
import threading
|
|
|
|
| 15 |
from uuid import uuid4
|
| 16 |
|
| 17 |
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
| 18 |
+
from fastapi.responses import FileResponse
|
| 19 |
from fastapi.staticfiles import StaticFiles
|
| 20 |
|
| 21 |
from agent_base.react_agent import MultiTurnReactAgent, default_llm_config
|
|
|
|
| 34 |
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
| 35 |
MAX_UPLOAD_IMAGES = 12
|
| 36 |
MAX_IMAGE_BYTES = 12 * 1024 * 1024
|
|
|
|
|
|
|
|
|
|
| 37 |
FRONTEND_MANAGED_RUNS_DIR: str | None = None
|
| 38 |
FRONTEND_CLEANUP_RETENTION_SECONDS = 6 * 60 * 60
|
| 39 |
FRONTEND_CLEANUP_MAX_RUNS = 40
|
|
|
|
| 48 |
_COLLECTION_LOCK = threading.Lock()
|
| 49 |
_COLLECTION_CONFIG_WARNED: set[str] = set()
|
| 50 |
|
| 51 |
+
app = FastAPI(title="ResearchHarness Space UI")
|
| 52 |
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="frontend-static")
|
| 53 |
|
| 54 |
|
| 55 |
def configure_frontend(
|
| 56 |
*,
|
|
|
|
|
|
|
| 57 |
managed_runs_dir: str | None = None,
|
| 58 |
cleanup_retention_seconds: int | None = None,
|
| 59 |
cleanup_max_runs: int | None = None,
|
|
|
|
| 63 |
collection_batch_size: int | None = None,
|
| 64 |
collection_max_bundle_bytes: int | None = None,
|
| 65 |
) -> None:
|
| 66 |
+
global FRONTEND_MANAGED_RUNS_DIR
|
| 67 |
global FRONTEND_CLEANUP_RETENTION_SECONDS, FRONTEND_CLEANUP_MAX_RUNS, FRONTEND_CLEANUP_INTERVAL_SECONDS
|
| 68 |
global FRONTEND_COLLECTION_ENABLED, FRONTEND_COLLECTION_DATASET_REPO
|
| 69 |
global FRONTEND_COLLECTION_BATCH_SIZE, FRONTEND_COLLECTION_MAX_BUNDLE_BYTES
|
|
|
|
| 70 |
if collection_enabled is not None:
|
| 71 |
FRONTEND_COLLECTION_ENABLED = bool(collection_enabled)
|
| 72 |
if collection_dataset_repo is not None:
|
|
|
|
| 75 |
FRONTEND_COLLECTION_BATCH_SIZE = max(1, int(collection_batch_size))
|
| 76 |
if collection_max_bundle_bytes is not None:
|
| 77 |
FRONTEND_COLLECTION_MAX_BUNDLE_BYTES = max(1, int(collection_max_bundle_bytes))
|
| 78 |
+
if not managed_runs_dir:
|
| 79 |
+
raise ValueError("managed_runs_dir is required for the Space frontend")
|
| 80 |
+
path = Path(managed_runs_dir).expanduser()
|
| 81 |
+
if path.exists() and not path.is_dir():
|
| 82 |
+
raise ValueError(f"managed-runs-dir is not a directory: {path}")
|
| 83 |
+
path.mkdir(parents=True, exist_ok=True)
|
| 84 |
+
FRONTEND_MANAGED_RUNS_DIR = str(path)
|
| 85 |
+
if cleanup_retention_seconds is not None:
|
| 86 |
+
FRONTEND_CLEANUP_RETENTION_SECONDS = max(60, int(cleanup_retention_seconds))
|
| 87 |
+
if cleanup_max_runs is not None:
|
| 88 |
+
FRONTEND_CLEANUP_MAX_RUNS = max(1, int(cleanup_max_runs))
|
| 89 |
+
if cleanup_interval_seconds is not None:
|
| 90 |
+
FRONTEND_CLEANUP_INTERVAL_SECONDS = max(60, int(cleanup_interval_seconds))
|
| 91 |
+
_collection_root()
|
| 92 |
+
cleanup_managed_runs_once()
|
| 93 |
+
_start_managed_cleanup_thread()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
class FrontendRunBridge:
|
|
|
|
| 526 |
prompt: str,
|
| 527 |
workspace_root: Path,
|
| 528 |
initial_content_parts: list[dict[str, Any]],
|
| 529 |
+
trace_dir: str,
|
| 530 |
prior_messages: list[dict[str, Any]] | None = None,
|
| 531 |
managed_run_root: str = "",
|
| 532 |
+
model_name: str = "",
|
| 533 |
) -> None:
|
| 534 |
try:
|
| 535 |
load_dotenv(PROJECT_ROOT / ".env")
|
| 536 |
require_required_env("ResearchHarness frontend")
|
|
|
|
| 537 |
agent = FrontendInteractiveAgent(
|
| 538 |
bridge=bridge,
|
| 539 |
+
llm=default_llm_config(model_name=model_name or None),
|
| 540 |
+
trace_dir=trace_dir,
|
|
|
|
| 541 |
)
|
| 542 |
bridge.send(
|
| 543 |
{
|
| 544 |
"type": "run_started",
|
| 545 |
"model": agent.model,
|
| 546 |
"workspace_root": str(workspace_root),
|
| 547 |
+
"trace_dir": trace_dir,
|
| 548 |
}
|
| 549 |
)
|
| 550 |
result = agent._run_session(
|
|
|
|
| 572 |
bridge.send({"type": "run_error", "error": str(exc), "traceback": traceback.format_exc()})
|
| 573 |
|
| 574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
@app.get("/")
|
| 576 |
def index() -> FileResponse:
|
| 577 |
return FileResponse(STATIC_DIR / "index.html")
|
|
|
|
| 595 |
|
| 596 |
sender_task = asyncio.create_task(sender())
|
| 597 |
try:
|
| 598 |
+
await websocket.send_json({"type": "ready", "managed_workspace": True})
|
| 599 |
while True:
|
| 600 |
message = await websocket.receive_json()
|
| 601 |
message_type = str(message.get("type", "")).strip()
|
|
|
|
| 609 |
continue
|
| 610 |
try:
|
| 611 |
continue_conversation = bool(message.get("continue_conversation"))
|
| 612 |
+
model_name = str(message.get("model_name", "") or "").strip()
|
| 613 |
prior_messages = None
|
| 614 |
+
if continue_conversation:
|
| 615 |
+
if not bridge.conversation_messages or not bridge.managed_workspace_root:
|
| 616 |
+
bridge.send({"type": "run_error", "error": "No active conversation is available on the server. Click New chat and start again."})
|
| 617 |
+
continue
|
| 618 |
+
workspace_root = Path(bridge.managed_workspace_root)
|
| 619 |
+
effective_trace_dir = bridge.managed_trace_dir
|
| 620 |
+
prior_messages = bridge.conversation_messages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
else:
|
| 622 |
+
_release_managed_run(bridge)
|
| 623 |
+
workspace_root, effective_trace_dir = _create_managed_run(bridge)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
image_parts, saved_paths = save_uploaded_images(
|
| 625 |
workspace_root,
|
| 626 |
message.get("images", []) if isinstance(message.get("images", []), list) else [],
|
|
|
|
| 646 |
"trace_dir": effective_trace_dir,
|
| 647 |
"prior_messages": prior_messages,
|
| 648 |
"managed_run_root": bridge.managed_run_root,
|
| 649 |
+
"model_name": model_name,
|
| 650 |
},
|
| 651 |
daemon=True,
|
| 652 |
)
|
frontend/static/app.css
CHANGED
|
@@ -201,7 +201,8 @@ button {
|
|
| 201 |
|
| 202 |
.plain,
|
| 203 |
.send-button,
|
| 204 |
-
.icon-button
|
|
|
|
| 205 |
border: 1px solid var(--border);
|
| 206 |
border-radius: 999px;
|
| 207 |
background: var(--panel-strong);
|
|
@@ -214,12 +215,24 @@ button {
|
|
| 214 |
padding: 8px 12px;
|
| 215 |
}
|
| 216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
.plain:hover,
|
| 218 |
-
.icon-button:hover
|
|
|
|
|
|
|
| 219 |
border-color: rgba(var(--glow-rgb), 0.38);
|
| 220 |
transform: translateY(-1px);
|
| 221 |
}
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
.workspace-strip {
|
| 224 |
position: sticky;
|
| 225 |
top: 66px;
|
|
@@ -266,13 +279,11 @@ button {
|
|
| 266 |
-webkit-overflow-scrolling: touch;
|
| 267 |
}
|
| 268 |
|
| 269 |
-
.messages::-webkit-scrollbar
|
| 270 |
-
.workspace-list::-webkit-scrollbar {
|
| 271 |
width: 10px;
|
| 272 |
}
|
| 273 |
|
| 274 |
-
.messages::-webkit-scrollbar-thumb
|
| 275 |
-
.workspace-list::-webkit-scrollbar-thumb {
|
| 276 |
border: 3px solid transparent;
|
| 277 |
border-radius: 999px;
|
| 278 |
background: rgba(var(--glow-rgb), 0.24);
|
|
@@ -696,172 +707,6 @@ button:disabled {
|
|
| 696 |
text-align: center;
|
| 697 |
}
|
| 698 |
|
| 699 |
-
.modal {
|
| 700 |
-
position: fixed;
|
| 701 |
-
inset: 0;
|
| 702 |
-
z-index: 30;
|
| 703 |
-
display: grid;
|
| 704 |
-
place-items: center;
|
| 705 |
-
padding: 18px;
|
| 706 |
-
background: rgba(0, 0, 0, 0.24);
|
| 707 |
-
backdrop-filter: blur(14px);
|
| 708 |
-
}
|
| 709 |
-
|
| 710 |
-
.modal.hidden {
|
| 711 |
-
display: none;
|
| 712 |
-
}
|
| 713 |
-
|
| 714 |
-
.modal-card {
|
| 715 |
-
display: grid;
|
| 716 |
-
grid-template-rows: auto auto auto minmax(0, 1fr) auto;
|
| 717 |
-
gap: 12px;
|
| 718 |
-
width: min(780px, 100%);
|
| 719 |
-
max-height: min(760px, 82vh);
|
| 720 |
-
border: 1px solid var(--border);
|
| 721 |
-
border-radius: 28px;
|
| 722 |
-
background: var(--panel-strong);
|
| 723 |
-
box-shadow: 0 24px 88px rgba(0, 0, 0, 0.22);
|
| 724 |
-
padding: 18px;
|
| 725 |
-
}
|
| 726 |
-
|
| 727 |
-
.modal-head,
|
| 728 |
-
.modal-path-row,
|
| 729 |
-
.modal-actions {
|
| 730 |
-
display: flex;
|
| 731 |
-
align-items: center;
|
| 732 |
-
gap: 12px;
|
| 733 |
-
}
|
| 734 |
-
|
| 735 |
-
.modal-head {
|
| 736 |
-
justify-content: space-between;
|
| 737 |
-
}
|
| 738 |
-
|
| 739 |
-
.modal-head h2,
|
| 740 |
-
.modal-head p {
|
| 741 |
-
margin: 0;
|
| 742 |
-
}
|
| 743 |
-
|
| 744 |
-
.modal-head h2 {
|
| 745 |
-
font-size: 1.18rem;
|
| 746 |
-
letter-spacing: -0.025em;
|
| 747 |
-
}
|
| 748 |
-
|
| 749 |
-
.modal-head p,
|
| 750 |
-
.modal-actions span {
|
| 751 |
-
color: var(--muted);
|
| 752 |
-
font-size: 0.86rem;
|
| 753 |
-
}
|
| 754 |
-
|
| 755 |
-
.modal-path-row {
|
| 756 |
-
border: 1px solid var(--border);
|
| 757 |
-
border-radius: 18px;
|
| 758 |
-
background: var(--hover);
|
| 759 |
-
padding: 8px;
|
| 760 |
-
}
|
| 761 |
-
|
| 762 |
-
.modal-path-row input {
|
| 763 |
-
min-width: 0;
|
| 764 |
-
flex: 1;
|
| 765 |
-
border: 0;
|
| 766 |
-
outline: 0;
|
| 767 |
-
background: transparent;
|
| 768 |
-
color: var(--text);
|
| 769 |
-
}
|
| 770 |
-
|
| 771 |
-
.workspace-roots {
|
| 772 |
-
display: flex;
|
| 773 |
-
flex-wrap: wrap;
|
| 774 |
-
gap: 8px;
|
| 775 |
-
}
|
| 776 |
-
|
| 777 |
-
.root-chip {
|
| 778 |
-
max-width: 190px;
|
| 779 |
-
overflow: hidden;
|
| 780 |
-
border: 1px solid var(--border);
|
| 781 |
-
border-radius: 999px;
|
| 782 |
-
background: var(--panel);
|
| 783 |
-
color: var(--text);
|
| 784 |
-
font-weight: 800;
|
| 785 |
-
padding: 7px 11px;
|
| 786 |
-
text-overflow: ellipsis;
|
| 787 |
-
white-space: nowrap;
|
| 788 |
-
}
|
| 789 |
-
|
| 790 |
-
.workspace-list {
|
| 791 |
-
display: grid;
|
| 792 |
-
align-content: start;
|
| 793 |
-
gap: 7px;
|
| 794 |
-
min-height: 0;
|
| 795 |
-
overflow: auto;
|
| 796 |
-
padding-right: 4px;
|
| 797 |
-
}
|
| 798 |
-
|
| 799 |
-
.dir-row {
|
| 800 |
-
display: grid;
|
| 801 |
-
grid-template-columns: auto minmax(0, 1fr) auto;
|
| 802 |
-
align-items: center;
|
| 803 |
-
gap: 10px;
|
| 804 |
-
width: 100%;
|
| 805 |
-
border: 1px solid var(--border);
|
| 806 |
-
border-radius: 18px;
|
| 807 |
-
background: var(--panel);
|
| 808 |
-
color: var(--text);
|
| 809 |
-
padding: 10px 12px;
|
| 810 |
-
text-align: left;
|
| 811 |
-
}
|
| 812 |
-
|
| 813 |
-
.dir-row:hover,
|
| 814 |
-
.root-chip:hover {
|
| 815 |
-
border-color: rgba(var(--glow-rgb), 0.38);
|
| 816 |
-
background: var(--hover);
|
| 817 |
-
}
|
| 818 |
-
|
| 819 |
-
.dir-icon {
|
| 820 |
-
display: grid;
|
| 821 |
-
place-items: center;
|
| 822 |
-
width: 24px;
|
| 823 |
-
height: 24px;
|
| 824 |
-
border-radius: 50%;
|
| 825 |
-
background: rgba(var(--glow-rgb), 0.1);
|
| 826 |
-
font-weight: 900;
|
| 827 |
-
}
|
| 828 |
-
|
| 829 |
-
.dir-main {
|
| 830 |
-
min-width: 0;
|
| 831 |
-
}
|
| 832 |
-
|
| 833 |
-
.dir-main strong,
|
| 834 |
-
.dir-main small {
|
| 835 |
-
display: block;
|
| 836 |
-
overflow: hidden;
|
| 837 |
-
text-overflow: ellipsis;
|
| 838 |
-
white-space: nowrap;
|
| 839 |
-
}
|
| 840 |
-
|
| 841 |
-
.dir-main small {
|
| 842 |
-
margin-top: 2px;
|
| 843 |
-
color: var(--muted);
|
| 844 |
-
font-size: 0.78rem;
|
| 845 |
-
}
|
| 846 |
-
|
| 847 |
-
.dir-action {
|
| 848 |
-
color: var(--muted);
|
| 849 |
-
font-size: 0.76rem;
|
| 850 |
-
font-weight: 850;
|
| 851 |
-
}
|
| 852 |
-
|
| 853 |
-
.dir-empty {
|
| 854 |
-
border: 1px dashed var(--border);
|
| 855 |
-
border-radius: 18px;
|
| 856 |
-
padding: 18px;
|
| 857 |
-
color: var(--muted);
|
| 858 |
-
text-align: center;
|
| 859 |
-
}
|
| 860 |
-
|
| 861 |
-
.modal-actions {
|
| 862 |
-
justify-content: space-between;
|
| 863 |
-
}
|
| 864 |
-
|
| 865 |
#theme-switcher {
|
| 866 |
position: fixed;
|
| 867 |
right: 22px;
|
|
@@ -984,22 +829,6 @@ button:disabled {
|
|
| 984 |
max-width: none;
|
| 985 |
}
|
| 986 |
|
| 987 |
-
.modal-card {
|
| 988 |
-
max-height: 88vh;
|
| 989 |
-
padding: 14px;
|
| 990 |
-
}
|
| 991 |
-
|
| 992 |
-
.modal-head,
|
| 993 |
-
.modal-actions {
|
| 994 |
-
align-items: stretch;
|
| 995 |
-
flex-direction: column;
|
| 996 |
-
}
|
| 997 |
-
|
| 998 |
-
.modal-path-row {
|
| 999 |
-
align-items: stretch;
|
| 1000 |
-
flex-direction: column;
|
| 1001 |
-
}
|
| 1002 |
-
|
| 1003 |
.message,
|
| 1004 |
.event {
|
| 1005 |
max-width: 96%;
|
|
|
|
| 201 |
|
| 202 |
.plain,
|
| 203 |
.send-button,
|
| 204 |
+
.icon-button,
|
| 205 |
+
.model-select {
|
| 206 |
border: 1px solid var(--border);
|
| 207 |
border-radius: 999px;
|
| 208 |
background: var(--panel-strong);
|
|
|
|
| 215 |
padding: 8px 12px;
|
| 216 |
}
|
| 217 |
|
| 218 |
+
.model-select {
|
| 219 |
+
min-width: 150px;
|
| 220 |
+
padding: 8px 34px 8px 12px;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
.plain:hover,
|
| 224 |
+
.icon-button:hover,
|
| 225 |
+
.model-select:hover:not(:disabled),
|
| 226 |
+
.model-select:focus-visible {
|
| 227 |
border-color: rgba(var(--glow-rgb), 0.38);
|
| 228 |
transform: translateY(-1px);
|
| 229 |
}
|
| 230 |
|
| 231 |
+
.model-select:disabled {
|
| 232 |
+
cursor: not-allowed;
|
| 233 |
+
opacity: 0.58;
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
.workspace-strip {
|
| 237 |
position: sticky;
|
| 238 |
top: 66px;
|
|
|
|
| 279 |
-webkit-overflow-scrolling: touch;
|
| 280 |
}
|
| 281 |
|
| 282 |
+
.messages::-webkit-scrollbar {
|
|
|
|
| 283 |
width: 10px;
|
| 284 |
}
|
| 285 |
|
| 286 |
+
.messages::-webkit-scrollbar-thumb {
|
|
|
|
| 287 |
border: 3px solid transparent;
|
| 288 |
border-radius: 999px;
|
| 289 |
background: rgba(var(--glow-rgb), 0.24);
|
|
|
|
| 707 |
text-align: center;
|
| 708 |
}
|
| 709 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 710 |
#theme-switcher {
|
| 711 |
position: fixed;
|
| 712 |
right: 22px;
|
|
|
|
| 829 |
max-width: none;
|
| 830 |
}
|
| 831 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 832 |
.message,
|
| 833 |
.event {
|
| 834 |
max-width: 96%;
|
frontend/static/app.js
CHANGED
|
@@ -131,28 +131,16 @@
|
|
| 131 |
var images = [];
|
| 132 |
var COLLAPSED_STEP_HEIGHT = 220;
|
| 133 |
|
| 134 |
-
var workspaceInput = document.getElementById("workspaceInput");
|
| 135 |
-
var workspaceStrip = document.getElementById("workspaceStrip");
|
| 136 |
var promptInput = document.getElementById("promptInput");
|
| 137 |
var runBtn = document.getElementById("runBtn");
|
| 138 |
var newBtn = document.getElementById("newBtn");
|
| 139 |
-
var
|
| 140 |
var attachBtn = document.getElementById("attachBtn");
|
| 141 |
var imageInput = document.getElementById("imageInput");
|
| 142 |
var imagePreview = document.getElementById("imagePreview");
|
| 143 |
var dropZone = document.getElementById("dropZone");
|
| 144 |
var timeline = document.getElementById("timeline");
|
| 145 |
var statusPill = document.getElementById("statusPill");
|
| 146 |
-
var workspaceMeta = document.getElementById("workspaceMeta");
|
| 147 |
-
var workspaceModal = document.getElementById("workspaceModal");
|
| 148 |
-
var workspaceCloseBtn = document.getElementById("workspaceCloseBtn");
|
| 149 |
-
var workspacePathInput = document.getElementById("workspacePathInput");
|
| 150 |
-
var workspaceGoBtn = document.getElementById("workspaceGoBtn");
|
| 151 |
-
var workspaceRoots = document.getElementById("workspaceRoots");
|
| 152 |
-
var workspaceList = document.getElementById("workspaceList");
|
| 153 |
-
var workspaceUseBtn = document.getElementById("workspaceUseBtn");
|
| 154 |
-
var workspacePickerHint = document.getElementById("workspacePickerHint");
|
| 155 |
-
var currentWorkspacePath = "";
|
| 156 |
var defaultPromptPlaceholder = promptInput.getAttribute("placeholder") || "Message ResearchHarness";
|
| 157 |
|
| 158 |
function escapeHtml(value) {
|
|
@@ -223,23 +211,20 @@
|
|
| 223 |
statusPill.className = "status " + (kind || "idle");
|
| 224 |
}
|
| 225 |
|
| 226 |
-
function setWorkspaceSelected(path) {
|
| 227 |
-
workspaceInput.value = path;
|
| 228 |
-
workspaceMeta.textContent = "Workspace selected: " + path;
|
| 229 |
-
}
|
| 230 |
-
|
| 231 |
function updateComposerMode() {
|
| 232 |
if (pendingAskId) {
|
| 233 |
runBtn.disabled = false;
|
| 234 |
runBtn.classList.remove("is-running");
|
| 235 |
runBtn.textContent = "Reply";
|
| 236 |
promptInput.placeholder = defaultPromptPlaceholder;
|
|
|
|
| 237 |
return;
|
| 238 |
}
|
| 239 |
runBtn.disabled = running && interrupting;
|
| 240 |
runBtn.classList.toggle("is-running", running);
|
| 241 |
runBtn.textContent = running ? (interrupting ? "Stopping" : "Stop") : "Run";
|
| 242 |
promptInput.placeholder = defaultPromptPlaceholder;
|
|
|
|
| 243 |
}
|
| 244 |
|
| 245 |
function setRunning(active, statusText) {
|
|
@@ -254,7 +239,7 @@
|
|
| 254 |
timeline.innerHTML = ''
|
| 255 |
+ '<div class="welcome">'
|
| 256 |
+ '<h1>What should the agent do?</h1>'
|
| 257 |
-
+ '<p>Ask a question, attach images,
|
| 258 |
+ '</div>';
|
| 259 |
}
|
| 260 |
|
|
@@ -551,7 +536,7 @@
|
|
| 551 |
ws.send(JSON.stringify({
|
| 552 |
type: "start",
|
| 553 |
prompt: prompt,
|
| 554 |
-
|
| 555 |
images: sentImages,
|
| 556 |
continue_conversation: continueConversation
|
| 557 |
}));
|
|
@@ -611,89 +596,6 @@
|
|
| 611 |
});
|
| 612 |
}
|
| 613 |
|
| 614 |
-
function openWorkspaceModal() {
|
| 615 |
-
workspaceModal.classList.remove("hidden");
|
| 616 |
-
loadWorkspaceDirectory(workspaceInput.value.trim());
|
| 617 |
-
}
|
| 618 |
-
|
| 619 |
-
function closeWorkspaceModal() {
|
| 620 |
-
workspaceModal.classList.add("hidden");
|
| 621 |
-
}
|
| 622 |
-
|
| 623 |
-
function setWorkspacePickerBusy(text) {
|
| 624 |
-
workspaceList.innerHTML = '<div class="dir-empty">' + escapeHtml(text || "Loading...") + "</div>";
|
| 625 |
-
workspacePickerHint.textContent = text || "Loading...";
|
| 626 |
-
}
|
| 627 |
-
|
| 628 |
-
function renderWorkspaceError(message) {
|
| 629 |
-
workspaceList.innerHTML = '<div class="dir-empty error-text">' + escapeHtml(message) + "</div>";
|
| 630 |
-
workspacePickerHint.textContent = "Paste a valid existing folder path, then press Go.";
|
| 631 |
-
}
|
| 632 |
-
|
| 633 |
-
function directoryRow(label, path, actionLabel, onClick) {
|
| 634 |
-
var row = document.createElement("button");
|
| 635 |
-
row.type = "button";
|
| 636 |
-
row.className = "dir-row";
|
| 637 |
-
row.innerHTML = ''
|
| 638 |
-
+ '<span class="dir-icon">›</span>'
|
| 639 |
-
+ '<span class="dir-main"><strong>' + escapeHtml(label) + '</strong><small>' + escapeHtml(path) + '</small></span>'
|
| 640 |
-
+ '<span class="dir-action">' + escapeHtml(actionLabel || "Open") + '</span>';
|
| 641 |
-
row.addEventListener("click", onClick);
|
| 642 |
-
return row;
|
| 643 |
-
}
|
| 644 |
-
|
| 645 |
-
function renderWorkspacePicker(payload) {
|
| 646 |
-
currentWorkspacePath = payload.path || "";
|
| 647 |
-
workspacePathInput.value = currentWorkspacePath;
|
| 648 |
-
workspaceRoots.innerHTML = "";
|
| 649 |
-
(payload.roots || []).forEach(function (root) {
|
| 650 |
-
var chip = document.createElement("button");
|
| 651 |
-
chip.type = "button";
|
| 652 |
-
chip.className = "root-chip";
|
| 653 |
-
chip.textContent = root.label || root.path;
|
| 654 |
-
chip.title = root.path || "";
|
| 655 |
-
chip.addEventListener("click", function () {
|
| 656 |
-
loadWorkspaceDirectory(root.path || "");
|
| 657 |
-
});
|
| 658 |
-
workspaceRoots.appendChild(chip);
|
| 659 |
-
});
|
| 660 |
-
|
| 661 |
-
workspaceList.innerHTML = "";
|
| 662 |
-
if (payload.parent) {
|
| 663 |
-
workspaceList.appendChild(directoryRow("..", payload.parent, "Parent", function () {
|
| 664 |
-
loadWorkspaceDirectory(payload.parent);
|
| 665 |
-
}));
|
| 666 |
-
}
|
| 667 |
-
(payload.entries || []).forEach(function (entry) {
|
| 668 |
-
workspaceList.appendChild(directoryRow(entry.name, entry.path, "Open", function () {
|
| 669 |
-
loadWorkspaceDirectory(entry.path);
|
| 670 |
-
}));
|
| 671 |
-
});
|
| 672 |
-
if (!payload.parent && !(payload.entries || []).length) {
|
| 673 |
-
workspaceList.innerHTML = '<div class="dir-empty">No readable child folders.</div>';
|
| 674 |
-
}
|
| 675 |
-
workspacePickerHint.textContent = payload.truncated
|
| 676 |
-
? "Directory list was truncated. Paste a deeper path if needed."
|
| 677 |
-
: "Current folder will be used when you click Use this folder.";
|
| 678 |
-
}
|
| 679 |
-
|
| 680 |
-
async function loadWorkspaceDirectory(path) {
|
| 681 |
-
setWorkspacePickerBusy("Loading folders...");
|
| 682 |
-
try {
|
| 683 |
-
var url = "/api/workspace-directories";
|
| 684 |
-
if (path) url += "?path=" + encodeURIComponent(path);
|
| 685 |
-
var response = await fetch(url);
|
| 686 |
-
var payload = await response.json();
|
| 687 |
-
if (!response.ok || payload.error) {
|
| 688 |
-
renderWorkspaceError(payload.error || "Cannot open this folder.");
|
| 689 |
-
return;
|
| 690 |
-
}
|
| 691 |
-
renderWorkspacePicker(payload);
|
| 692 |
-
} catch (error) {
|
| 693 |
-
renderWorkspaceError(String(error));
|
| 694 |
-
}
|
| 695 |
-
}
|
| 696 |
-
|
| 697 |
runBtn.addEventListener("click", sendStart);
|
| 698 |
timeline.addEventListener("scroll", syncTimelineFollowMode);
|
| 699 |
timeline.addEventListener("wheel", function (event) {
|
|
@@ -730,29 +632,6 @@
|
|
| 730 |
});
|
| 731 |
imageInput.addEventListener("change", function (event) { addImageFiles(event.target.files); });
|
| 732 |
|
| 733 |
-
pickWorkspaceBtn.addEventListener("click", function () {
|
| 734 |
-
openWorkspaceModal();
|
| 735 |
-
});
|
| 736 |
-
|
| 737 |
-
workspaceCloseBtn.addEventListener("click", closeWorkspaceModal);
|
| 738 |
-
workspaceModal.addEventListener("click", function (event) {
|
| 739 |
-
if (event.target === workspaceModal) closeWorkspaceModal();
|
| 740 |
-
});
|
| 741 |
-
workspaceGoBtn.addEventListener("click", function () {
|
| 742 |
-
loadWorkspaceDirectory(workspacePathInput.value.trim());
|
| 743 |
-
});
|
| 744 |
-
workspacePathInput.addEventListener("keydown", function (event) {
|
| 745 |
-
if (event.key === "Enter") {
|
| 746 |
-
event.preventDefault();
|
| 747 |
-
loadWorkspaceDirectory(workspacePathInput.value.trim());
|
| 748 |
-
}
|
| 749 |
-
});
|
| 750 |
-
workspaceUseBtn.addEventListener("click", function () {
|
| 751 |
-
if (!currentWorkspacePath) return;
|
| 752 |
-
setWorkspaceSelected(currentWorkspacePath);
|
| 753 |
-
closeWorkspaceModal();
|
| 754 |
-
});
|
| 755 |
-
|
| 756 |
["dragenter", "dragover"].forEach(function (name) {
|
| 757 |
dropZone.addEventListener(name, function (event) {
|
| 758 |
event.preventDefault();
|
|
|
|
| 131 |
var images = [];
|
| 132 |
var COLLAPSED_STEP_HEIGHT = 220;
|
| 133 |
|
|
|
|
|
|
|
| 134 |
var promptInput = document.getElementById("promptInput");
|
| 135 |
var runBtn = document.getElementById("runBtn");
|
| 136 |
var newBtn = document.getElementById("newBtn");
|
| 137 |
+
var modelSelect = document.getElementById("modelSelect");
|
| 138 |
var attachBtn = document.getElementById("attachBtn");
|
| 139 |
var imageInput = document.getElementById("imageInput");
|
| 140 |
var imagePreview = document.getElementById("imagePreview");
|
| 141 |
var dropZone = document.getElementById("dropZone");
|
| 142 |
var timeline = document.getElementById("timeline");
|
| 143 |
var statusPill = document.getElementById("statusPill");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
var defaultPromptPlaceholder = promptInput.getAttribute("placeholder") || "Message ResearchHarness";
|
| 145 |
|
| 146 |
function escapeHtml(value) {
|
|
|
|
| 211 |
statusPill.className = "status " + (kind || "idle");
|
| 212 |
}
|
| 213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
function updateComposerMode() {
|
| 215 |
if (pendingAskId) {
|
| 216 |
runBtn.disabled = false;
|
| 217 |
runBtn.classList.remove("is-running");
|
| 218 |
runBtn.textContent = "Reply";
|
| 219 |
promptInput.placeholder = defaultPromptPlaceholder;
|
| 220 |
+
if (modelSelect) modelSelect.disabled = true;
|
| 221 |
return;
|
| 222 |
}
|
| 223 |
runBtn.disabled = running && interrupting;
|
| 224 |
runBtn.classList.toggle("is-running", running);
|
| 225 |
runBtn.textContent = running ? (interrupting ? "Stopping" : "Stop") : "Run";
|
| 226 |
promptInput.placeholder = defaultPromptPlaceholder;
|
| 227 |
+
if (modelSelect) modelSelect.disabled = running;
|
| 228 |
}
|
| 229 |
|
| 230 |
function setRunning(active, statusText) {
|
|
|
|
| 239 |
timeline.innerHTML = ''
|
| 240 |
+ '<div class="welcome">'
|
| 241 |
+ '<h1>What should the agent do?</h1>'
|
| 242 |
+
+ '<p>Ask a question, attach images, and watch tool calls stream from an isolated temporary workspace.</p>'
|
| 243 |
+ '</div>';
|
| 244 |
}
|
| 245 |
|
|
|
|
| 536 |
ws.send(JSON.stringify({
|
| 537 |
type: "start",
|
| 538 |
prompt: prompt,
|
| 539 |
+
model_name: modelSelect ? modelSelect.value : "",
|
| 540 |
images: sentImages,
|
| 541 |
continue_conversation: continueConversation
|
| 542 |
}));
|
|
|
|
| 596 |
});
|
| 597 |
}
|
| 598 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
runBtn.addEventListener("click", sendStart);
|
| 600 |
timeline.addEventListener("scroll", syncTimelineFollowMode);
|
| 601 |
timeline.addEventListener("wheel", function (event) {
|
|
|
|
| 632 |
});
|
| 633 |
imageInput.addEventListener("change", function (event) { addImageFiles(event.target.files); });
|
| 634 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
["dragenter", "dragover"].forEach(function (name) {
|
| 636 |
dropZone.addEventListener(name, function (event) {
|
| 637 |
event.preventDefault();
|
frontend/static/index.html
CHANGED
|
@@ -19,13 +19,15 @@
|
|
| 19 |
</div>
|
| 20 |
</div>
|
| 21 |
<div class="top-actions">
|
| 22 |
-
<
|
|
|
|
|
|
|
|
|
|
| 23 |
<button id="newBtn" class="plain" type="button">New chat</button>
|
| 24 |
</div>
|
| 25 |
</header>
|
| 26 |
|
| 27 |
<section id="workspaceStrip" class="workspace-strip">
|
| 28 |
-
<input id="workspaceInput" type="hidden" value="" />
|
| 29 |
<span id="workspaceMeta">Managed temporary workspace. Each chat uses an isolated runtime directory.</span>
|
| 30 |
</section>
|
| 31 |
|
|
@@ -48,27 +50,6 @@
|
|
| 48 |
</footer>
|
| 49 |
</main>
|
| 50 |
|
| 51 |
-
<section id="workspaceModal" class="modal hidden" role="dialog" aria-modal="true" aria-labelledby="workspaceModalTitle">
|
| 52 |
-
<div class="modal-card">
|
| 53 |
-
<header class="modal-head">
|
| 54 |
-
<div>
|
| 55 |
-
<h2 id="workspaceModalTitle">Open workspace</h2>
|
| 56 |
-
<p>Choose an existing local folder. Unicode paths are supported.</p>
|
| 57 |
-
</div>
|
| 58 |
-
<button id="workspaceCloseBtn" class="plain" type="button" aria-label="Close workspace picker">Close</button>
|
| 59 |
-
</header>
|
| 60 |
-
<div class="modal-path-row">
|
| 61 |
-
<input id="workspacePathInput" type="text" autocomplete="off" placeholder="Paste a folder path..." />
|
| 62 |
-
<button id="workspaceGoBtn" class="plain" type="button">Go</button>
|
| 63 |
-
</div>
|
| 64 |
-
<div id="workspaceRoots" class="workspace-roots"></div>
|
| 65 |
-
<div id="workspaceList" class="workspace-list"></div>
|
| 66 |
-
<footer class="modal-actions">
|
| 67 |
-
<span id="workspacePickerHint">Select a folder to use as the agent workspace.</span>
|
| 68 |
-
<button id="workspaceUseBtn" class="send-button" type="button">Use this folder</button>
|
| 69 |
-
</footer>
|
| 70 |
-
</div>
|
| 71 |
-
</section>
|
| 72 |
<nav class="space-links" aria-label="Project links">
|
| 73 |
<a href="https://github.com/black-yt/ResearchHarness" target="_blank" rel="noopener noreferrer" title="GitHub">
|
| 74 |
<svg viewBox="0 0 16 16" aria-hidden="true"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/></svg>
|
|
|
|
| 19 |
</div>
|
| 20 |
</div>
|
| 21 |
<div class="top-actions">
|
| 22 |
+
<select id="modelSelect" class="model-select" aria-label="Model">
|
| 23 |
+
<option value="gpt-5.5">gpt-5.5</option>
|
| 24 |
+
<option value="claude-opus-4-7">claude-opus-4-7</option>
|
| 25 |
+
</select>
|
| 26 |
<button id="newBtn" class="plain" type="button">New chat</button>
|
| 27 |
</div>
|
| 28 |
</header>
|
| 29 |
|
| 30 |
<section id="workspaceStrip" class="workspace-strip">
|
|
|
|
| 31 |
<span id="workspaceMeta">Managed temporary workspace. Each chat uses an isolated runtime directory.</span>
|
| 32 |
</section>
|
| 33 |
|
|
|
|
| 50 |
</footer>
|
| 51 |
</main>
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
<nav class="space-links" aria-label="Project links">
|
| 54 |
<a href="https://github.com/black-yt/ResearchHarness" target="_blank" rel="noopener noreferrer" title="GitHub">
|
| 55 |
<svg viewBox="0 0 16 16" aria-hidden="true"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/></svg>
|
run_agent.py
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
"""Thin top-level CLI entrypoint for the ResearchHarness agent."""
|
| 2 |
-
|
| 3 |
-
from agent_base.react_agent import main
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
if __name__ == "__main__":
|
| 7 |
-
raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
run_frontend.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
"""Launch the local ResearchHarness browser UI."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import argparse
|
| 6 |
-
import sys
|
| 7 |
-
import threading
|
| 8 |
-
import webbrowser
|
| 9 |
-
|
| 10 |
-
import uvicorn
|
| 11 |
-
|
| 12 |
-
from agent_base.utils import read_role_prompt_files
|
| 13 |
-
from frontend.local_server import app, configure_frontend
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def main(argv: list[str] | None = None) -> int:
|
| 17 |
-
parser = argparse.ArgumentParser(description="Run the local ResearchHarness frontend.")
|
| 18 |
-
parser.add_argument("--host", default="127.0.0.1", help="Host to bind. Default: 127.0.0.1")
|
| 19 |
-
parser.add_argument("--port", type=int, default=8765, help="Port to bind. Default: 8765")
|
| 20 |
-
parser.add_argument("--no-browser", action="store_true", help="Do not open the browser automatically.")
|
| 21 |
-
parser.add_argument("--trace-dir", help="Optional directory where frontend agent traces are written.")
|
| 22 |
-
parser.add_argument(
|
| 23 |
-
"--role-prompt-file",
|
| 24 |
-
action="append",
|
| 25 |
-
default=[],
|
| 26 |
-
dest="role_prompt_files",
|
| 27 |
-
metavar="PATH",
|
| 28 |
-
help="Append one role-specific prompt file to the frontend agent. May be passed multiple times.",
|
| 29 |
-
)
|
| 30 |
-
args = parser.parse_args(argv)
|
| 31 |
-
|
| 32 |
-
try:
|
| 33 |
-
role_prompt = read_role_prompt_files(args.role_prompt_files)
|
| 34 |
-
configure_frontend(role_prompt=role_prompt, trace_dir=args.trace_dir)
|
| 35 |
-
except (OSError, ValueError) as exc:
|
| 36 |
-
print(str(exc), file=sys.stderr)
|
| 37 |
-
return 1
|
| 38 |
-
|
| 39 |
-
url = f"http://{args.host}:{args.port}"
|
| 40 |
-
if not args.no_browser:
|
| 41 |
-
threading.Timer(0.8, lambda: webbrowser.open(url)).start()
|
| 42 |
-
print(f"ResearchHarness frontend: {url}")
|
| 43 |
-
uvicorn.run(app, host=args.host, port=args.port, reload=False)
|
| 44 |
-
return 0
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
if __name__ == "__main__":
|
| 48 |
-
raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
run_server.py
DELETED
|
@@ -1,61 +0,0 @@
|
|
| 1 |
-
"""Run ResearchHarness as a minimal OpenAI-compatible API server."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import argparse
|
| 6 |
-
import sys
|
| 7 |
-
|
| 8 |
-
from agent_base.utils import PROJECT_ROOT, MissingRequiredEnvError, load_dotenv, require_required_env
|
| 9 |
-
from api.openai_server import serve
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def main(argv: list[str] | None = None) -> int:
|
| 13 |
-
parser = argparse.ArgumentParser(description="Serve ResearchHarness through /v1/chat/completions.")
|
| 14 |
-
parser.add_argument(
|
| 15 |
-
"--api-runs-dir",
|
| 16 |
-
required=True,
|
| 17 |
-
dest="api_runs_dir",
|
| 18 |
-
help="Directory where the server creates one isolated subdirectory per request.",
|
| 19 |
-
)
|
| 20 |
-
parser.add_argument("--host", default="127.0.0.1", help="Host to bind. Defaults to 127.0.0.1.")
|
| 21 |
-
parser.add_argument("--port", type=int, default=8686, help="Port to bind. Defaults to 8686.")
|
| 22 |
-
parser.add_argument(
|
| 23 |
-
"--role-prompt-file",
|
| 24 |
-
action="append",
|
| 25 |
-
default=[],
|
| 26 |
-
dest="role_prompt_files",
|
| 27 |
-
help="Optional role prompt file appended to the base ResearchHarness prompt.",
|
| 28 |
-
)
|
| 29 |
-
parser.add_argument(
|
| 30 |
-
"--input-wrapper",
|
| 31 |
-
action=argparse.BooleanOptionalAction,
|
| 32 |
-
default=True,
|
| 33 |
-
help="Enable or disable the input LLM wrapper. Enabled by default.",
|
| 34 |
-
)
|
| 35 |
-
parser.add_argument(
|
| 36 |
-
"--output-wrapper",
|
| 37 |
-
action=argparse.BooleanOptionalAction,
|
| 38 |
-
default=True,
|
| 39 |
-
help="Enable or disable the output LLM wrapper. Enabled by default.",
|
| 40 |
-
)
|
| 41 |
-
args = parser.parse_args(argv)
|
| 42 |
-
|
| 43 |
-
load_dotenv(PROJECT_ROOT / ".env")
|
| 44 |
-
try:
|
| 45 |
-
require_required_env("ResearchHarness API server")
|
| 46 |
-
serve(
|
| 47 |
-
api_runs_dir=args.api_runs_dir,
|
| 48 |
-
host=args.host,
|
| 49 |
-
port=args.port,
|
| 50 |
-
role_prompt_files=list(args.role_prompt_files),
|
| 51 |
-
input_wrapper=args.input_wrapper,
|
| 52 |
-
output_wrapper=args.output_wrapper,
|
| 53 |
-
)
|
| 54 |
-
except (MissingRequiredEnvError, ValueError) as exc:
|
| 55 |
-
print(str(exc), file=sys.stderr)
|
| 56 |
-
return 1
|
| 57 |
-
return 0
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
if __name__ == "__main__":
|
| 61 |
-
raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
traces/.gitkeep
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
|
|
|
|
|
|
workspace/.gitkeep
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
|
|
|
|
|
|