# Evaluation with GitHub Copilot (#15)
---
name: Evaluation with GitHub Copilot
# NOTE(review): `description` is not a documented top-level key in the
# workflow schema (it belongs to action.yml / workflow inputs) — confirm
# the Actions parser tolerates it, or fold the text into this comment.
description: Evaluate using GitHub Copilot CLI

# Least-privilege token for the whole workflow; summarize-results
# escalates to contents:write on its own.
permissions:
  contents: read

on:
  workflow_dispatch:
    inputs:
      model:
        description: 'Copilot model to use for evaluation'
        required: false
        default: 'claude-haiku-4.5'
        type: choice
        options:
          - 'claude-sonnet-4.5'
          - 'claude-sonnet-4'
          - 'claude-haiku-4.5'
          - 'gpt-5'
      test-run:
        description: 'Indicate this is a test run (with few entries)'
        required: false
        default: false
        type: boolean

env:
  # Shared between the evaluate step, the artifact upload, and the
  # results-dir job output consumed by summarize-results.
  EVALUATION_RESULTS_DIR: evaluation_results

jobs:
  # Resolves the list of benchmark entries to evaluate (small set when
  # test-run is true).
  get-entries:
    uses: ./.github/workflows/get-entries.yml
    with:
      test-run: ${{ inputs.test-run }}

  # One matrix job per entry; runs Copilot CLI against a BC container.
  evaluate-with-copilot-cli:
    runs-on: windows-latest
    needs: get-entries
    # Skip entirely when get-entries produced an empty JSON array.
    if: needs.get-entries.outputs.entries != '[]'
    name: ${{ matrix.entry }}
    outputs:
      # For a matrix job this output is written by every leg; all legs set
      # the same constant value, so last-writer-wins is harmless here.
      results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
    env:
      BC_CONTAINER_NAME: bcbench
      BC_CONTAINER_USERNAME: admin
      BC_CONTAINER_PASSWORD: ${{ secrets.BC_CONTAINER_PASSWORD }}
    strategy:
      # Let every entry run to completion even if some fail.
      fail-fast: false
      max-parallel: 15
      matrix:
        entry: ${{ fromJson(needs.get-entries.outputs.entries) }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup BC container
        id: setup-env
        uses: ./.github/actions/setup-bc-container
        with:
          instance-id: ${{ matrix.entry }}
          ado-token: ${{ secrets.ADO_TOKEN }}

      - name: Setup Python with UV
        uses: ./.github/actions/setup-python-uv

      - name: Install GitHub Copilot CLI
        # Pinned CLI version for reproducible evaluations.
        run: npm install -g @github/copilot@0.0.351

      - name: Run GitHub Copilot CLI for entry ${{ matrix.entry }}
        env:
          GH_TOKEN: ${{ secrets.COPILOT_PAT }}
        # Backticks are PowerShell line continuations — the default shell
        # on windows-latest runners is pwsh.
        run: |
          uv run bcbench evaluate copilot "${{ matrix.entry }}" `
            --model "${{ inputs.model }}" `
            --repo-path "${{ steps.setup-env.outputs.repo_path }}" `
            --output-dir "${{ env.EVALUATION_RESULTS_DIR }}"

      - name: Upload evaluation results
        uses: actions/upload-artifact@v4
        # Upload whatever was produced even when the evaluate step failed.
        if: always()
        with:
          name: evaluation-results-${{ github.run_id }}-${{ matrix.entry }}
          path: ${{ env.EVALUATION_RESULTS_DIR }}
          # Short retention for throwaway test runs.
          retention-days: ${{ inputs.test-run && 1 || 30 }}

  summarize-results:
    needs: evaluate-with-copilot-cli
    # Run after the matrix finishes, pass or fail — but not on cancel.
    if: success() || failure()
    uses: ./.github/workflows/summarize-results.yml
    permissions:
      contents: write
    with:
      results-dir: ${{ needs.evaluate-with-copilot-cli.outputs.results-dir }}
      model: ${{ inputs.model }}
      agent: 'GitHub Copilot CLI'
      mock: ${{ inputs.test-run }}
    secrets: inherit