Skip to content

Commit a9cd827

Browse files
haoranpb and Copilot authored
Record tool usage using hooks instead (#621)
Co-authored-by: Copilot <copilot@github.com>
1 parent 2026842 commit a9cd827

18 files changed

Lines changed: 367 additions & 308 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "bcbench"
7-
version = "0.5.1"
7+
version = "0.5.2"
88
description = "Benchmarking tool for Business Central (AL) ecosystem, inspired by SWE-Bench"
99
readme = "README.md"
1010
requires-python = ">=3.13"

src/bcbench/agent/claude/agent.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
import yaml
77

88
from bcbench.agent.claude.metrics import parse_metrics
9-
from bcbench.agent.shared import build_mcp_config, build_prompt
9+
from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
1010
from bcbench.config import get_config
1111
from bcbench.dataset import BaseDatasetEntry
1212
from bcbench.exceptions import AgentError, AgentTimeoutError
1313
from bcbench.logger import get_logger
14-
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
14+
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
1515
from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration
1616

1717
logger = get_logger(__name__)
@@ -36,6 +36,7 @@ def run_claude_code(
3636
instructions_enabled: bool = setup_instructions_from_config(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
3737
skills_enabled: bool = setup_agent_skills(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
3838
custom_agent: str | None = setup_custom_agent(claude_config, entry, repo_path, agent_type=AgentType.CLAUDE)
39+
tool_log_path: Path = setup_hooks(repo_path, AgentType.CLAUDE, output_dir)
3940
config = ExperimentConfiguration(
4041
mcp_servers=mcp_server_names,
4142
custom_instructions=instructions_enabled,
@@ -51,13 +52,9 @@ def run_claude_code(
5152
raise AgentError("Claude Code not found in PATH. Please ensure it is installed and available.")
5253

5354
try:
54-
debug_log_path: Path = output_dir.resolve() / "claude_debug.log"
5555
cmd_args = [
5656
claude_cmd,
5757
"--output-format=json",
58-
"--no-session-persistence",
59-
f"--debug-file={debug_log_path}",
60-
# "--verbose", # required for when using --print, --output-format=stream-json
6158
"--strict-mcp-config", # Only use MCP servers from --mcp-config, ignoring all other MCP configurations
6259
f"--model={model}",
6360
"--permission-mode=bypassPermissions", # bypassPermissions is needed to use tools and mcp servers
@@ -98,10 +95,14 @@ def run_claude_code(
9895
data = json.loads(striped_line)
9996
if "result" in data:
10097
print(data["result"], flush=True)
101-
metrics = parse_metrics(data, debug_log_path=debug_log_path)
98+
metrics = parse_metrics(data)
10299
except json.JSONDecodeError:
103100
logger.warning(f"Skipping non-JSON line: {striped_line}")
104101

102+
tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
103+
if metrics and tool_usage:
104+
metrics = metrics.model_copy(update={"tool_usage": tool_usage})
105+
105106
return metrics, config
106107
except subprocess.TimeoutExpired:
107108
logger.error(f"Claude Code timed out after {_config.timeout.agent_execution} seconds")

src/bcbench/agent/claude/metrics.py

Lines changed: 1 addition & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,92 +1,42 @@
1-
import re
2-
from collections import Counter
3-
from pathlib import Path
4-
51
from bcbench.logger import get_logger
62
from bcbench.types import AgentMetrics
73

84
logger = get_logger(__name__)
95

10-
TOOL_USE_PATTERN = re.compile(r"executePreToolHooks called for tool: (.+)")
11-
12-
13-
def parse_debug_log(log_path: Path) -> dict[str, int]:
14-
content = log_path.read_text(encoding="utf-8")
15-
return dict(Counter(TOOL_USE_PATTERN.findall(content)))
16-
176

18-
def parse_metrics(data: dict, debug_log_path: Path | None = None) -> AgentMetrics | None:
19-
"""Parse metrics from Claude Code result object.
20-
21-
The Claude Code CLI outputs JSON when run with --output-format json.
22-
Expected format:
23-
{
24-
"type": "result",
25-
"subtype": "success",
26-
"is_error": false,
27-
"duration_ms": 2814,
28-
"duration_api_ms": 4819,
29-
"num_turns": 1,
30-
"result": "...",
31-
"session_id": "uuid",
32-
"total_cost_usd": 0.024,
33-
"usage": {
34-
"input_tokens": 2,
35-
"cache_creation_input_tokens": 4974,
36-
"cache_read_input_tokens": 12673,
37-
"output_tokens": 5,
38-
...
39-
},
40-
...
41-
}
42-
"""
7+
def parse_metrics(data: dict) -> AgentMetrics | None:
    """Extract AgentMetrics from a Claude Code CLI result object.

    When run with ``--output-format=json`` the CLI emits a result dict
    containing ``duration_ms``, ``duration_api_ms``, ``num_turns`` and a
    ``usage`` block with token counts.

    Returns:
        AgentMetrics when at least one metric field was found, else None.
    """
    logger.debug(f"Parsing metrics from Claude Code output: {data}")

    # Durations are reported in milliseconds; convert to seconds.
    execution_time: float | None = data["duration_ms"] / 1000.0 if "duration_ms" in data else None
    llm_duration: float | None = data["duration_api_ms"] / 1000.0 if "duration_api_ms" in data else None
    turn_count: int | None = data["num_turns"] if "num_turns" in data else None

    prompt_tokens: int | None = None
    completion_tokens: int | None = None
    usage = data.get("usage", {})
    if usage:
        # Effective input = direct input plus prompt-cache creation and reads.
        prompt_tokens = (
            usage.get("input_tokens", 0)
            + usage.get("cache_creation_input_tokens", 0)
            + usage.get("cache_read_input_tokens", 0)
        )
        completion_tokens = usage.get("output_tokens")

    collected = (execution_time, llm_duration, turn_count, prompt_tokens, completion_tokens)
    if any(value is not None for value in collected):
        return AgentMetrics(
            execution_time=execution_time,
            llm_duration=llm_duration,
            turn_count=turn_count,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
        )

    logger.warning("No metrics found in Claude Code output")
    return None

src/bcbench/agent/copilot/agent.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@
88
import yaml
99

1010
from bcbench.agent.copilot.metrics import parse_metrics
11-
from bcbench.agent.shared import build_mcp_config, build_prompt
11+
from bcbench.agent.shared import build_mcp_config, build_prompt, parse_tool_usage_from_hooks
1212
from bcbench.config import get_config
1313
from bcbench.dataset import BaseDatasetEntry
1414
from bcbench.exceptions import AgentError, AgentTimeoutError
1515
from bcbench.logger import get_logger
16-
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_instructions_from_config
16+
from bcbench.operations import setup_agent_skills, setup_custom_agent, setup_hooks, setup_instructions_from_config
1717
from bcbench.types import AgentMetrics, AgentType, EvaluationCategory, ExperimentConfiguration
1818

1919
logger = get_logger(__name__)
@@ -38,6 +38,7 @@ def run_copilot_agent(
3838
instructions_enabled: bool = setup_instructions_from_config(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
3939
skills_enabled: bool = setup_agent_skills(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
4040
custom_agent: str | None = setup_custom_agent(copilot_config, entry, repo_path, agent_type=AgentType.COPILOT)
41+
tool_log_path: Path = setup_hooks(repo_path, AgentType.COPILOT, output_dir)
4142
config = ExperimentConfiguration(
4243
mcp_servers=mcp_server_names,
4344
custom_instructions=instructions_enabled,
@@ -87,12 +88,16 @@ def run_copilot_agent(
8788
stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""
8889
stderr_lines = stderr.splitlines()
8990

90-
# Find the most recent session log for tool usage parsing
91+
# Find the most recent session log for turn count parsing
9192
session_logs = list(output_dir.glob("process-*.log"))
9293
session_log_path = max(session_logs, key=lambda p: p.stat().st_mtime) if session_logs else None
9394

9495
metrics = parse_metrics(stderr_lines, session_log_path=session_log_path)
9596

97+
tool_usage: dict[str, int] | None = parse_tool_usage_from_hooks(tool_log_path)
98+
if metrics and tool_usage:
99+
metrics = metrics.model_copy(update={"tool_usage": tool_usage})
100+
96101
return metrics, config
97102
except subprocess.TimeoutExpired:
98103
logger.error(f"Copilot CLI timed out after {_config.timeout.agent_execution} seconds")

src/bcbench/agent/copilot/metrics.py

Lines changed: 6 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import re
2-
from collections import Counter
32
from pathlib import Path
43
from typing import Sequence
54

@@ -8,14 +7,6 @@
87

98
logger = get_logger(__name__)
109

11-
# Regex to find tool call function names in the log content
12-
# Matches tool calls (with "arguments") but NOT tool definitions (with "description")
13-
# Pattern: "function": {"name": "tool_name", "arguments": ...}
14-
TOOL_CALL_PATTERN = re.compile(
15-
r'"function"\s*:\s*\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"arguments"',
16-
re.MULTILINE,
17-
)
18-
1910
# Regex to count LLM requests (turns) in the log
2011
# Each "--- Start of group: Sending request to the AI model ---" indicates a new LLM call
2112
TURN_COUNT_PATTERN = re.compile(r"--- Start of group: Sending request to the AI model ---")
@@ -29,23 +20,9 @@ def _parse_token_count(s: str) -> int:
2920
return int(float(s))
3021

3122

32-
def parse_session_log(log_path: Path) -> tuple[dict[str, int], int]:
33-
"""Parse tool usage and step count from a single Copilot CLI log file.
34-
35-
The log file format is timestamped text with embedded JSON responses.
36-
Tool calls appear in response JSON under choices[].message.tool_calls[].
37-
Step count is determined by counting LLM requests.
38-
39-
Args:
40-
log_path: Path to the Copilot CLI log file
41-
42-
Returns:
43-
Tuple of (tool_usage dict mapping tool names to call counts, turn_count)
44-
"""
23+
def parse_turn_count_from_log(log_path: Path) -> int:
    """Count LLM requests (turns) recorded in a Copilot CLI session log.

    Each occurrence of the "Start of group: Sending request to the AI
    model" marker corresponds to one LLM call.
    """
    text = log_path.read_text(encoding="utf-8")
    return sum(1 for _ in TURN_COUNT_PATTERN.finditer(text))
4926

5027

5128
def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = None) -> AgentMetrics | None:
@@ -81,20 +58,14 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
8158
llm_duration: float | None = None
8259
prompt_tokens: int | None = None
8360
completion_tokens: int | None = None
84-
tool_usage: dict[str, int] | None = None
8561
turn_count: int | None = None
8662

87-
# Parse tool usage and turn count from session log if provided
63+
# Parse turn count from session log if provided
8864
if session_log_path:
8965
try:
90-
tool_usage, turn_count = parse_session_log(session_log_path)
91-
if not tool_usage:
92-
tool_usage = None # Convert empty dict to None
93-
if turn_count == 0:
94-
turn_count = None # Convert zero to None
66+
turn_count = parse_turn_count_from_log(session_log_path) or None
9567
except Exception as e:
96-
logger.warning(f"Failed to parse tool usage from {session_log_path}: {e}")
97-
tool_usage = None
68+
logger.warning(f"Failed to parse turn count from {session_log_path}: {e}")
9869
turn_count = None
9970

10071
try:
@@ -133,14 +104,13 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
133104
prompt_tokens = _parse_token_count(tokens_match.group(1))
134105
completion_tokens = _parse_token_count(tokens_match.group(2))
135106

136-
if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or tool_usage is not None or turn_count is not None:
107+
if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or turn_count is not None:
137108
return AgentMetrics(
138109
execution_time=execution_time,
139110
llm_duration=llm_duration,
140111
turn_count=turn_count,
141112
prompt_tokens=prompt_tokens,
142113
completion_tokens=completion_tokens,
143-
tool_usage=tool_usage,
144114
)
145115

146116
logger.warning("No metrics found in output")
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Shared code for CLI-based agents (Claude, Copilot)."""
22

3+
from bcbench.agent.shared.hooks_parser import parse_tool_usage_from_hooks
34
from bcbench.agent.shared.mcp import build_mcp_config
45
from bcbench.agent.shared.prompt import build_prompt
56

6-
__all__ = ["build_mcp_config", "build_prompt"]
7+
__all__ = ["build_mcp_config", "build_prompt", "parse_tool_usage_from_hooks"]
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
$ErrorActionPreference = "Stop"

try {
    # Read the entire hook payload from stdin and parse it as JSON.
    $payload = [Console]::In.ReadToEnd() | ConvertFrom-Json

    # The payload key casing varies between agents (`tool_name` vs `toolName`),
    # so fall back to the camelCase variant when the snake_case one is absent.
    $name = if ($payload.tool_name) { $payload.tool_name } else { $payload.toolName }
    $stamp = $payload.timestamp

    if ($name -and $env:BCBENCH_TOOL_LOG) {
        # Append one compact JSON line per tool invocation to the shared log.
        $record = @{ tool_name = $name; timestamp = $stamp } | ConvertTo-Json -Compress
        Add-Content -Path $env:BCBENCH_TOOL_LOG -Value $record -Encoding UTF8
    }

    exit 0
}
catch {
    # A hook failure must never block the agent's tool execution — exit cleanly.
    exit 0
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import json
2+
from collections import Counter
3+
from pathlib import Path
4+
5+
6+
def parse_tool_usage_from_hooks(hooks_output_path: Path) -> dict[str, int] | None:
7+
if not hooks_output_path.exists():
8+
return None
9+
10+
counts: Counter[str] = Counter()
11+
for line in hooks_output_path.read_text(encoding="utf-8").splitlines():
12+
try:
13+
entry = json.loads(line)
14+
if name := entry.get("tool_name"):
15+
counts[name] += 1
16+
except (json.JSONDecodeError, AttributeError):
17+
continue
18+
19+
return dict(counts) or None

src/bcbench/commands/run.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
from bcbench.agent.claude import run_claude_code
99
from bcbench.agent.copilot import run_copilot_agent
10-
from bcbench.agent.copilot.metrics import parse_session_log
1110
from bcbench.agent.mini import run_mini_agent
1211
from bcbench.cli_options import (
1312
ClaudeCodeModel,
@@ -127,25 +126,3 @@ def run_mini_inspector(
127126

128127
inspector = TrajectoryInspector(trajectory_files)
129128
inspector.run()
130-
131-
132-
@run_app.command("copilot-inspector")
133-
def run_copilot_tool_analyzer(path: Annotated[Path, typer.Argument(help="Directory to search for log files or specific log file", exists=True, file_okay=True, dir_okay=False)]):
134-
"""
135-
Inspect GitHub Copilot CLI session log(s)
136-
137-
Example:
138-
uv run bcbench run copilot-inspector ./evaluation_results/
139-
"""
140-
141-
usage, turn_count = parse_session_log(path)
142-
143-
print("Tool Usage Summary:")
144-
print("-" * 40)
145-
146-
for tool_name, count in sorted(usage.items(), key=lambda x: (-x[1], x[0])):
147-
print(f" {tool_name}: {count}")
148-
149-
print("-" * 40)
150-
print(f"Total tool calls: {sum(usage.values())}")
151-
print(f"Total LLM calls: {turn_count}")

0 commit comments

Comments
 (0)