# Evaluation with GitHub Copilot #522
# Workflow display name.
name: 'Evaluation with GitHub Copilot'

# Workflow-level token permissions; individual jobs below declare their own
# `permissions` blocks where they need something different.
permissions:
  actions: write
  contents: read
# Manual trigger only. Each input below is offered when dispatching the workflow.
on:
  workflow_dispatch:
    inputs:
      # Model name forwarded to the evaluation CLI via `--model`.
      model:
        description: 'Copilot model to use for evaluation'
        type: choice
        required: false
        default: 'claude-haiku-4.5'
        options:
          - 'claude-sonnet-4.6'
          - 'claude-haiku-4.5'
          - 'claude-opus-4.6'
          - 'claude-opus-4.7'
          - 'gpt-5.4'
          - 'gpt-5.3-codex'
          - 'gpt-5.2-codex'
          - 'gpt-5.2'
          - 'gpt-4.1'

      # Evaluation category; forwarded to entry selection, the CLI and the summary.
      category:
        description: 'Evaluation category to run'
        type: choice
        required: false
        default: 'bug-fix'
        options:
          - 'bug-fix'
          - 'test-generation'

      # Defaults to true, so a dispatch without changes is a test run.
      test-run:
        description: 'Indicate this is a test run (with few entries)'
        type: boolean
        required: false
        default: true

      # Opt-in switch: installs the AL tool and adds `--al-mcp` to the CLI call.
      al-mcp:
        description: 'Enable AL MCP server'
        type: boolean
        required: false
        default: false

      # Kept as quoted strings: choice options are strings, not integers.
      repeat:
        description: 'Number of times to run sequentially (ignored for test runs)'
        type: choice
        required: false
        default: '1'
        options:
          - '1'
          - '2'
          - '3'
          - '4'
          - '5'
# Test runs and full runs are placed in separate concurrency groups
# (`copilot-evaluation-test` / `copilot-evaluation-full`). A run that is
# already in progress is never cancelled by a newer one.
concurrency:
  cancel-in-progress: false
  group: "copilot-evaluation-${{ inputs.test-run && 'test' || 'full' }}"
# Directory name shared by the workflow: the evaluation step writes to it
# (`--output-dir`), the artifact upload reads `*.jsonl` files from it, and it
# is exported as the `results-dir` job output.
env:
  EVALUATION_RESULTS_DIR: 'evaluation_results'
jobs:
  # Reusable workflow that produces the `entries` output (a JSON array string)
  # consumed by the matrix below.
  get-entries:
    uses: ./.github/workflows/get-entries.yml
    with:
      test-run: ${{ inputs.test-run }}
      category: ${{ inputs.category }}

  # One matrix job per entry. Each job sets up a BC container, runs the
  # Copilot CLI evaluation for its entry and uploads the resulting *.jsonl files.
  evaluate-with-copilot-cli:
    runs-on: [GitHub-BCBench]
    needs: get-entries
    outputs:
      results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
    # Skip the fan-out when get-entries reports an empty JSON array.
    if: needs.get-entries.outputs.entries != '[]'
    environment:
      name: ado-read
      deployment: false
    permissions:
      contents: read
      id-token: write
    name: ${{ matrix.entry }}
    strategy:
      # A failing entry must not cancel the remaining entries.
      fail-fast: false
      max-parallel: 32
      matrix:
        entry: ${{ fromJson(needs.get-entries.outputs.entries) }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Setup BC container
        id: setup-env
        timeout-minutes: 40
        uses: ./.github/actions/setup-bc-container
        with:
          instance-id: ${{ matrix.entry }}
          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Python with UV
        uses: ./.github/actions/setup-python-uv

      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: '24'

      # NOTE(review): the `run` scripts in this job use PowerShell syntax and
      # therefore assume the runner's default shell is PowerShell - confirm.
      - name: Install AL Tool
        if: ${{ inputs.al-mcp }}
        run: |
          dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 17.0.33.55542
          # Environment files must be UTF-8; `>>` uses the shell's default
          # encoding, so write with an explicit encoding instead.
          "$HOME\.dotnet\tools" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append

      # Map the matrix job index onto one of four PAT slots (index modulo 4).
      - name: Select PAT based on job index
        id: select-pat
        run: |
          $patIndex = ${{ strategy.job-index }} % 4
          "pat_index=$patIndex" | Out-File -FilePath $env:GITHUB_OUTPUT -Encoding utf8 -Append

      - name: Run GitHub Copilot CLI for entry ${{ matrix.entry }}
        timeout-minutes: 120
        env:
          # pat_index 0/1/2 select COPILOT_PAT/PAT2/PAT3; anything else (and any
          # empty secret along the way) falls through to COPILOT_PAT4.
          # Keep every continuation line at the same indent so the folded
          # scalar collapses to a single-line expression.
          COPILOT_GITHUB_TOKEN: >-
            ${{ steps.select-pat.outputs.pat_index == '0' &&
            secrets.COPILOT_PAT || (steps.select-pat.outputs.pat_index == '1' &&
            secrets.COPILOT_PAT2 || (steps.select-pat.outputs.pat_index == '2' &&
            secrets.COPILOT_PAT3 || secrets.COPILOT_PAT4)) }}
          # Workflow values are handed to the script through the environment
          # instead of being expanded into the script text, so their content
          # can never be interpreted as PowerShell code.
          EVAL_ENTRY_ID: ${{ matrix.entry }}
          EVAL_MODEL: ${{ inputs.model }}
          EVAL_CATEGORY: ${{ inputs.category }}
          EVAL_REPO_PATH: ${{ steps.setup-env.outputs.repo_path }}
          EVAL_AL_MCP: ${{ inputs.al-mcp }}
        run: |
          Write-Output "::add-mask::$env:COPILOT_GITHUB_TOKEN"
          $bcbenchArgs = @(
            'evaluate', 'copilot', $env:EVAL_ENTRY_ID,
            '--model', $env:EVAL_MODEL,
            '--category', $env:EVAL_CATEGORY,
            '--repo-path', $env:EVAL_REPO_PATH,
            '--output-dir', $env:EVALUATION_RESULTS_DIR
          )
          # Boolean inputs arrive in the environment as the strings 'true'/'false'.
          if ($env:EVAL_AL_MCP -eq 'true') { $bcbenchArgs += '--al-mcp' }
          uv run bcbench @bcbenchArgs

      # Runs even when an earlier step failed, so any results written so far
      # are still uploaded.
      - name: Upload evaluation results
        uses: actions/upload-artifact@v6
        if: always()
        with:
          name: evaluation-results-${{ github.run_id }}-${{ matrix.entry }}
          path: ${{ env.EVALUATION_RESULTS_DIR }}/**/*.jsonl
          # Test-run artifacts are kept for 1 day, full-run artifacts for 30.
          retention-days: ${{ inputs.test-run && 1 || 30 }}

  # Reusable workflow that receives the results directory and run parameters.
  summarize-results:
    needs: evaluate-with-copilot-cli
    uses: ./.github/workflows/summarize-results.yml
    permissions:
      contents: write
      id-token: write
    with:
      results-dir: ${{ needs.evaluate-with-copilot-cli.outputs.results-dir }}
      model: ${{ inputs.model }}
      agent: "GitHub Copilot CLI"
      mock: ${{ inputs.test-run }}
      category: ${{ inputs.category }}
    secrets: inherit

  # Full runs only: hand the workflow file name, the repeat count and the
  # current inputs (as a JSON object string) to the requeue workflow.
  requeue:
    needs: summarize-results
    if: ${{ !inputs.test-run }}
    uses: ./.github/workflows/requeue-evaluation.yml
    permissions:
      contents: write
      actions: write
    with:
      workflow-file: copilot-evaluation.yml
      repeat: ${{ inputs.repeat }}
      workflow-inputs: |
        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}"}