Evaluation with GitHub Copilot #13

Workflow file for this run

.github/workflows/copilot-evaluation.yml at 1fcc5e2

	# Evaluate using GitHub Copilot CLI.
	# (Kept as a comment: `description` is not a documented top-level workflow key.)
	name: Evaluation with GitHub Copilot

	# Read-only default token scope; the summarize-results job requests
	# `contents: write` for itself.
	permissions:
	  contents: read

	# Manual trigger only.
	on:
	  workflow_dispatch:
	    inputs:
	      # Model name forwarded to the evaluation command and to the summary job.
	      model:
	        type: choice
	        description: 'Copilot model to use for evaluation'
	        required: false
	        default: 'claude-haiku-4.5'
	        options:
	          - 'claude-sonnet-4.5'
	          - 'claude-sonnet-4'
	          - 'claude-haiku-4.5'
	          - 'gpt-5'
	      # Flag forwarded to get-entries and summarize-results; it also feeds
	      # the artifact retention expression.
	      test-run:
	        type: boolean
	        description: 'Indicate this is a test run (with few entries)'
	        required: false
	        default: false

	# Directory name shared by the evaluation command's --output-dir, the
	# artifact upload path, and the results-dir job output.
	env:
	  EVALUATION_RESULTS_DIR: 'evaluation_results'

	jobs:
	  # Reusable workflow whose `entries` output (a JSON array string) drives
	  # the evaluation matrix below.
	  get-entries:
	    uses: ./.github/workflows/get-entries.yml
	    with:
	      test-run: ${{ github.event.inputs.test-run }}

	  # One matrix job per entry: set up the BC container, run the Copilot CLI
	  # evaluation for that entry, and upload its results as an artifact.
	  evaluate-with-copilot-cli:
	    name: ${{ matrix.entry }}
	    needs: get-entries
	    # Skip the whole matrix when get-entries reported an empty list.
	    if: needs.get-entries.outputs.entries != '[]'
	    runs-on: windows-latest
	    strategy:
	      # Let the remaining entries finish even if one of them fails.
	      fail-fast: false
	      max-parallel: 15
	      matrix:
	        entry: ${{ fromJson(needs.get-entries.outputs.entries) }}
	    env:
	      BC_CONTAINER_NAME: bcbench
	      BC_CONTAINER_USERNAME: admin
	      BC_CONTAINER_PASSWORD: ${{ secrets.BC_CONTAINER_PASSWORD }}
	    outputs:
	      # Consumed by summarize-results as its results-dir input.
	      results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Setup BC container
	        id: setup-env
	        uses: ./.github/actions/setup-bc-container
	        with:
	          instance-id: ${{ matrix.entry }}
	          ado-token: ${{ secrets.ADO_TOKEN }}

	      - name: Setup Python with UV
	        uses: ./.github/actions/setup-python-uv

	      - name: Install GitHub Copilot CLI
	        run: npm install -g @github/copilot@0.0.351

	      - name: Run GitHub Copilot CLI for entry ${{ matrix.entry }}
	        env:
	          GH_TOKEN: ${{ secrets.COPILOT_PAT }}
	        # Trailing backticks are PowerShell line continuations.
	        run: |
	          uv run bcbench evaluate copilot "${{ matrix.entry }}" `
	            --model "${{ github.event.inputs.model }}" `
	            --repo-path "${{ steps.setup-env.outputs.repo_path }}" `
	            --output-dir "${{ env.EVALUATION_RESULTS_DIR }}"

	      - name: Upload evaluation results
	        uses: actions/upload-artifact@v4
	        # Upload whatever was produced, even when an earlier step failed.
	        if: always()
	        with:
	          name: evaluation-results-${{ github.run_id }}-${{ matrix.entry }}
	          path: ${{ env.EVALUATION_RESULTS_DIR }}
	          # github.event.inputs.* values are strings, so a bare 'false' is
	          # truthy and would always select 1; compare against 'true' instead.
	          retention-days: ${{ github.event.inputs.test-run == 'true' && 1 || 30 }}

	  # Reusable workflow that summarizes the uploaded results; it needs write
	  # access to repository contents and inherits the caller's secrets.
	  summarize-results:
	    needs: evaluate-with-copilot-cli
	    # Run whether the evaluation jobs succeeded or failed.
	    if: success() || failure()
	    uses: ./.github/workflows/summarize-results.yml
	    permissions:
	      contents: write
	    with:
	      results-dir: ${{ needs.evaluate-with-copilot-cli.outputs.results-dir }}
	      model: ${{ github.event.inputs.model }}
	      agent: 'GitHub Copilot CLI'
	      mock: ${{ github.event.inputs.test-run }}
	    secrets: inherit

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Evaluation with GitHub Copilot #13

Workflow file

Evaluation with GitHub Copilot #13

Uh oh!

Workflow file for this run