Evaluation with GitHub Copilot #539

Workflow file for this run

.github/workflows/copilot-evaluation.yml at 2dab94d

	# Workflow display name.
	name: Evaluation with GitHub Copilot

	# Workflow-level token permissions; individual jobs below declare their own
	# `permissions` where they need something different.
	permissions:
	  contents: read
	  actions: write

	# Manual trigger only. Every input is optional and has a default.
	on:
	  workflow_dispatch:
	    inputs:
	      # Model name forwarded to the evaluation CLI via `--model`.
	      model:
	        description: "Copilot model to use for evaluation"
	        required: false
	        default: "claude-haiku-4.5"
	        type: choice
	        options:
	          - "claude-sonnet-4.6"
	          - "claude-haiku-4.5"
	          - "claude-opus-4.6"
	          - "claude-opus-4.7"
	          - "gpt-5.4"
	          - "gpt-5.3-codex"
	          - "gpt-5.2-codex"
	          - "gpt-5.2"
	          - "gpt-4.1"
	      # Category forwarded to get-entries, the evaluation CLI (`--category`)
	      # and summarize-results.
	      category:
	        description: "Evaluation category to run"
	        required: false
	        default: "bug-fix"
	        type: choice
	        options:
	          - "bug-fix"
	          - "test-generation"
	          - "cf-1"
	          - "cf-2"
	          - "cf-3"
	          - "cf-4"
	      # Also selects the concurrency group, the artifact retention period,
	      # the `mock` flag of summarize-results and whether requeue runs.
	      test-run:
	        description: "Indicate this is a test run (with few entries)"
	        required: false
	        default: true
	        type: boolean
	      # When true, the AL tool is installed and `--al-mcp` is passed to the CLI.
	      al-mcp:
	        description: "Enable AL MCP server"
	        required: false
	        default: false
	        type: boolean
	      # Non-empty value skips the get-entries job and is used as the matrix.
	      entries-override:
	        description: "Optional JSON array of entry IDs to run (overrides get-entries). Example: [\"microsoftInternal__NAV-213524__cf-4\"]"
	        required: false
	        default: ""
	        type: string
	      # Forwarded to the requeue job; quoted so the choices stay strings.
	      repeat:
	        description: "Number of times to run sequentially (ignored for test runs)"
	        required: false
	        default: "1"
	        type: choice
	        options:
	          - "1"
	          - "2"
	          - "3"
	          - "4"
	          - "5"

	# Test runs and full runs use separate concurrency groups
	# (`copilot-evaluation-test` / `copilot-evaluation-full`).
	# A run already in progress in a group is not cancelled by a newer one.
	concurrency:
	  group: copilot-evaluation-${{ inputs.test-run && 'test' || 'full' }}
	  cancel-in-progress: false

	# Workflow-wide environment. EVALUATION_RESULTS_DIR is used as the CLI
	# `--output-dir`, as the root of the uploaded artifact path and as the
	# `results-dir` output of the evaluation job.
	env:
	  EVALUATION_RESULTS_DIR: evaluation_results

	jobs:
	  # Calls the get-entries reusable workflow to obtain the entry list for the
	  # chosen category. Skipped when an explicit entries-override was supplied.
	  get-entries:
	    if: ${{ inputs.entries-override == '' }}
	    uses: ./.github/workflows/get-entries.yml
	    with:
	      test-run: ${{ inputs.test-run }}
	      category: ${{ inputs.category }}

	  # Publishes one `entries` output: the manual override when present,
	  # otherwise the `entries` output of get-entries.
	  resolve-entries:
	    runs-on: ubuntu-latest
	    needs: get-entries
	    # always() keeps this job running when get-entries was skipped.
	    if: always()
	    outputs:
	      entries: ${{ steps.pick.outputs.entries }}
	    steps:
	      - id: pick
	        # Values are handed over through the environment instead of being
	        # pasted into the script text: the override is JSON full of double
	        # quotes, which the shell would otherwise strip (or execute).
	        env:
	          ENTRIES_OVERRIDE: ${{ inputs.entries-override }}
	          ENTRIES_FROM_JOB: ${{ needs.get-entries.outputs.entries }}
	        run: |
	          if [[ -n "$ENTRIES_OVERRIDE" ]]; then
	            echo "entries=$ENTRIES_OVERRIDE" >> "$GITHUB_OUTPUT"
	          else
	            echo "entries=$ENTRIES_FROM_JOB" >> "$GITHUB_OUTPUT"
	          fi

	  # One matrix job per entry ID; each job is named after its entry.
	  # The run steps below are written in PowerShell syntax.
	  # NOTE(review): on the override path get-entries is skipped; confirm this
	  # job is not skipped by the implicit success() check over the needs chain.
	  evaluate-with-copilot-cli:
	    name: ${{ matrix.entry }}
	    runs-on: [GitHub-BCBench]
	    needs: resolve-entries
	    # Do not expand the matrix when there is nothing to evaluate.
	    if: needs.resolve-entries.outputs.entries != '[]' && needs.resolve-entries.outputs.entries != ''
	    environment:
	      name: ado-read
	      deployment: false
	    permissions:
	      contents: read
	      id-token: write
	    outputs:
	      results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
	    strategy:
	      # A failing entry does not cancel the remaining entries.
	      fail-fast: false
	      max-parallel: 32
	      matrix:
	        entry: ${{ fromJson(needs.resolve-entries.outputs.entries) }}
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v5

	      # Local action; its `repo_path` output is consumed by the evaluation step.
	      - name: Setup BC container
	        id: setup-env
	        timeout-minutes: 40
	        uses: ./.github/actions/setup-bc-container
	        with:
	          instance-id: ${{ matrix.entry }}
	          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
	          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
	          github-token: ${{ secrets.GITHUB_TOKEN }}

	      - name: Setup Python with UV
	        uses: ./.github/actions/setup-python-uv

	      - uses: actions/setup-node@v6
	        with:
	          node-version: "24"

	      # Only needed when the AL MCP server is enabled; the second line adds
	      # the dotnet global-tools directory to PATH for the following steps.
	      - name: Install AL Tool
	        if: ${{ inputs.al-mcp }}
	        run: |
	          dotnet tool install -g Microsoft.Dynamics.BusinessCentral.Development.Tools --version 17.0.33.55542
	          echo "$HOME\.dotnet\tools" >> $env:GITHUB_PATH

	      - name: Install GitHub Copilot CLI
	        run: npm install -g @github/copilot@1.0.31

	      # Spreads matrix jobs over four PATs: index = job-index modulo 4.
	      - name: Select PAT based on job index
	        id: select-pat
	        run: |
	          $patIndex = ${{ strategy.job-index }} % 4
	          echo "pat_index=$patIndex" >> $env:GITHUB_OUTPUT

	      - name: Run GitHub Copilot CLI for entry ${{ matrix.entry }}
	        timeout-minutes: 120
	        env:
	          # pat_index 0 -> COPILOT_PAT, 1 -> COPILOT_PAT2, 2 -> COPILOT_PAT3,
	          # anything else -> COPILOT_PAT4.
	          COPILOT_GITHUB_TOKEN: >-
	            ${{ steps.select-pat.outputs.pat_index == '0' &&
	            secrets.COPILOT_PAT || (steps.select-pat.outputs.pat_index == '1' &&
	            secrets.COPILOT_PAT2 || (steps.select-pat.outputs.pat_index == '2' &&
	            secrets.COPILOT_PAT3 || secrets.COPILOT_PAT4)) }}
	          # Expression values reach the script as environment variables so
	          # that an entry ID (which may come from the free-form override)
	          # is never parsed as PowerShell source.
	          EVAL_ENTRY: ${{ matrix.entry }}
	          EVAL_MODEL: ${{ inputs.model }}
	          EVAL_CATEGORY: ${{ inputs.category }}
	          EVAL_REPO_PATH: ${{ steps.setup-env.outputs.repo_path }}
	          EVAL_AL_MCP: ${{ inputs.al-mcp }}
	        run: |
	          Write-Output "::add-mask::$env:COPILOT_GITHUB_TOKEN"

	          $evaluateArgs = @(
	            'evaluate', 'copilot', $env:EVAL_ENTRY,
	            '--model', $env:EVAL_MODEL,
	            '--category', $env:EVAL_CATEGORY,
	            '--repo-path', $env:EVAL_REPO_PATH,
	            '--output-dir', $env:EVALUATION_RESULTS_DIR
	          )
	          # The flag is appended only when the al-mcp input is true.
	          if ($env:EVAL_AL_MCP -eq 'true') { $evaluateArgs += '--al-mcp' }

	          uv run bcbench @evaluateArgs

	      # Runs even after a failed evaluation so partial results are kept.
	      - name: Upload evaluation results
	        uses: actions/upload-artifact@v6
	        if: always()
	        with:
	          name: evaluation-results-${{ github.run_id }}-${{ matrix.entry }}
	          # Recursive glob: every *.jsonl file below the results directory.
	          path: ${{ env.EVALUATION_RESULTS_DIR }}/**/*.jsonl
	          # 1 day for test runs, 30 days otherwise.
	          retention-days: ${{ inputs.test-run && 1 || 30 }}

	  # Hands the results directory to the summarize-results reusable workflow.
	  summarize-results:
	    needs: evaluate-with-copilot-cli
	    uses: ./.github/workflows/summarize-results.yml
	    permissions:
	      contents: write
	      id-token: write
	    with:
	      results-dir: ${{ needs.evaluate-with-copilot-cli.outputs.results-dir }}
	      model: ${{ inputs.model }}
	      agent: "GitHub Copilot CLI"
	      mock: ${{ inputs.test-run }}
	      category: ${{ inputs.category }}
	    secrets: inherit

	  # After summarizing a non-test run, calls the requeue-evaluation reusable
	  # workflow with the repeat count and this workflow's inputs as JSON.
	  # The JSON carries model, category, test-run and al-mcp only.
	  requeue:
	    needs: summarize-results
	    if: ${{ !inputs.test-run }}
	    uses: ./.github/workflows/requeue-evaluation.yml
	    permissions:
	      contents: write
	      actions: write
	    with:
	      workflow-file: copilot-evaluation.yml
	      repeat: ${{ inputs.repeat }}
	      workflow-inputs: |
	        {"model": "${{ inputs.model }}", "category": "${{ inputs.category }}", "test-run": "${{ inputs.test-run }}", "al-mcp": "${{ inputs.al-mcp }}"}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Evaluation with GitHub Copilot #539

Workflow file

Evaluation with GitHub Copilot #539

Uh oh!

Workflow file for this run