Skip to content

Commit 569c3fd

Browse files
authored
Support new Copilot CLI metrics output format (#611)
1 parent 4a054c3 commit 569c3fd

2 files changed

Lines changed: 75 additions & 16 deletions

File tree

src/bcbench/agent/copilot/metrics.py

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@
2121
TURN_COUNT_PATTERN = re.compile(r"--- Start of group: Sending request to the AI model ---")
2222

2323

24+
def _parse_token_count(s: str) -> int:
25+
if s.endswith("m"):
26+
return int(float(s[:-1]) * 1000000)
27+
if s.endswith("k"):
28+
return int(float(s[:-1]) * 1000)
29+
return int(float(s))
30+
31+
2432
def parse_session_log(log_path: Path) -> tuple[dict[str, int], int]:
2533
"""Parse tool usage and step count from a single Copilot CLI log file.
2634
@@ -49,7 +57,12 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
4957
output_lines: Lines from Copilot CLI stderr output
5058
session_log_path: Optional path to session log file for tool usage parsing
5159
52-
Expected output format at the end:
60+
Expected output format (new, v1.0.2+):
61+
Changes +17 -0
62+
Requests 0.33 Premium (1m 45s)
63+
Tokens ↑ 317.5k • ↓ 4.3k • 255.0k (cached)
64+
65+
Legacy output format:
5366
Total usage est: 0.33 Premium requests
5467
API time spent: 2m 10.145s
5568
Total session time: 2m 41.651s
@@ -85,35 +98,40 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
8598
turn_count = None
8699

87100
try:
88-
# Parse LLM duration (API time)
101+
# Parse LLM duration (API time) — legacy format
89102
llm_duration_match = re.search(r"API time spent:\s*(?:(\d+)m\s*)?(\d+(?:\.\d+)?)s", output_text)
90103
if llm_duration_match:
91104
minutes = int(llm_duration_match.group(1)) if llm_duration_match.group(1) else 0
92105
seconds = float(llm_duration_match.group(2))
93106
llm_duration = minutes * 60 + seconds
94107

95-
# Parse wall clock duration
108+
# Parse wall clock duration — legacy format
96109
duration_match = re.search(r"Total session time:\s*(?:(\d+)m\s*)?(\d+(?:\.\d+)?)s", output_text)
97110
if duration_match:
98111
minutes = int(duration_match.group(1)) if duration_match.group(1) else 0
99112
seconds = float(duration_match.group(2))
100113
execution_time = minutes * 60 + seconds
101114

102-
# Token usage: "1.3m in, 11.6k out"
115+
# New format: "Requests 0.33 Premium (1m 45s)" — extract session time from parenthesized duration
116+
if execution_time is None:
117+
requests_match = re.search(r"Requests\s+[\d.]+\s+Premium\s+\((?:(\d+)m\s*)?(\d+(?:\.\d+)?)s\)", output_text)
118+
if requests_match:
119+
minutes = int(requests_match.group(1)) if requests_match.group(1) else 0
120+
seconds = float(requests_match.group(2))
121+
execution_time = minutes * 60 + seconds
122+
123+
# Token usage — legacy format: "1.3m in, 11.6k out"
103124
usage_match = re.search(r"(\d+(?:\.\d+)?[km]?)\s+in,\s*(\d+(?:\.\d+)?[km]?)\s+out", output_text)
104125
if usage_match:
105-
input_str = usage_match.group(1)
106-
output_str = usage_match.group(2)
107-
108-
def parse_token_count(s: str) -> int:
109-
if s.endswith("m"):
110-
return int(float(s[:-1]) * 1000000)
111-
if s.endswith("k"):
112-
return int(float(s[:-1]) * 1000)
113-
return int(float(s))
114-
115-
prompt_tokens = parse_token_count(input_str)
116-
completion_tokens = parse_token_count(output_str)
126+
prompt_tokens = _parse_token_count(usage_match.group(1))
127+
completion_tokens = _parse_token_count(usage_match.group(2))
128+
129+
# New format: "Tokens ↑ 317.5k • ↓ 4.3k • 255.0k (cached)"
130+
if prompt_tokens is None:
131+
tokens_match = re.search(r"Tokens\s+[^\d]*(\d+(?:\.\d+)?[km]?)\s*[•·]\s*[^\d]*(\d+(?:\.\d+)?[km]?)", output_text)
132+
if tokens_match:
133+
prompt_tokens = _parse_token_count(tokens_match.group(1))
134+
completion_tokens = _parse_token_count(tokens_match.group(2))
117135

118136
if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or tool_usage is not None or turn_count is not None:
119137
return AgentMetrics(

tests/test_copilot_metrics_parsing.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,47 @@ def test_parse_metrics_minimal_real_output():
198198
assert result.completion_tokens == 1500
199199

200200

201+
def test_parse_metrics_new_format_full():
    """A complete new-format summary yields execution time and both token counts."""
    metrics = parse_metrics(
        [
            "Changes +17 -0\n",
            "Requests 0.33 Premium (1m 45s)\n",
            "Tokens ↑ 317.5k • ↓ 4.3k • 255.0k (cached)\n",
        ]
    )

    assert metrics is not None
    # 1m 45s -> 105 seconds; the trailing "(cached)" figure is not asserted here.
    observed = (metrics.execution_time, metrics.prompt_tokens, metrics.completion_tokens)
    assert observed == (105.0, 317500, 4300)
214+
215+
216+
def test_parse_metrics_new_format_seconds_only():
    """A Requests duration with no minutes component is read as plain seconds."""
    metrics = parse_metrics(
        [
            "Requests 1 Premium (45s)\n",
            "Tokens ↑ 125.5k • ↓ 3.6k • 0 (cached)\n",
        ]
    )

    assert metrics is not None
    observed = (metrics.execution_time, metrics.prompt_tokens, metrics.completion_tokens)
    assert observed == (45.0, 125500, 3600)
228+
229+
230+
def test_parse_metrics_new_format_tokens_with_m():
    """Token figures carrying an ``m`` suffix are scaled to millions."""
    metrics = parse_metrics(["Tokens ↑ 1.3m • ↓ 11.6k • 1.2m (cached)\n"])

    assert metrics is not None
    observed = (metrics.prompt_tokens, metrics.completion_tokens)
    assert observed == (1300000, 11600)
240+
241+
201242
def test_parse_session_log_extracts_turn_count():
202243
log_content = """
203244
2026-01-20T08:55:10.767Z [INFO] --- Start of group: Sending request to the AI model ---

0 commit comments

Comments
 (0)