microsoft
diff --git a/.github/actions/setup-bc-container/action.yml b/.github/actions/setup-bc-container/action.yml
Lines changed: 7 additions & 1 deletion
diff --git a/.github/actions/setup-python-uv/action.yml b/.github/actions/setup-python-uv/action.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
Lines changed: 4 additions & 0 deletions
diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/dataset-validation.yml b/.github/workflows/dataset-validation.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/mini-evaluation.yml b/.github/workflows/mini-evaluation.yml
Lines changed: 1 addition & 1 deletion
@@ -14,6 +14,10 @@ inputs:
   github-token:
     description: GitHub token for accessing public repositories
     required: true
+  skip-container:
+    description: Skip BC container setup (only clone repository)
+    required: false
+    default: "false"
 
 outputs:
   repo_path:
@@ -24,6 +28,7 @@ runs:
   using: composite
   steps:
     - name: Generate BC container name and credentials
+      if: inputs.skip-container != 'true'
       run: |
         # Generate a 32-character random password using Get-Random
         # The password is short-lived and only used for the duration of the workflow
@@ -38,6 +43,7 @@ runs:
       shell: pwsh
 
     - name: Install BcContainerHelper module
+      if: inputs.skip-container != 'true'
       run: Install-Module -Name BcContainerHelper -Force -AllowClobber -AllowPrerelease
       shell: pwsh
 
@@ -59,5 +65,5 @@ runs:
         $env:ADO_TOKEN = az account get-access-token --resource "499b84ac-1321-427f-aa17-267ca6975798" --query accessToken -o tsv
         Write-Output "::add-mask::$env:ADO_TOKEN"
 
-        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}"
+        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" ${{ inputs.skip-container == 'true' && '-SkipContainer' || '' }}
       shell: pwsh
@@ -11,7 +11,7 @@ runs:
   using: composite
   steps:
     - name: Install uv
-      uses: astral-sh/setup-uv@v7
+      uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57
       with:
         enable-cache: true
 
 
@@ -5,6 +5,7 @@ This is a benchmark for evaluating coding agents on real-world Business Central
 - **Dataset**: Benchmark entries following SWE-Bench schema with BC-specific adjustments
 - **Python Package** (`src/bcbench/`): CLI tools, agent implementations, and validation utilities
 - **PowerShell Scripts** (`scripts/`): Environment setup and dataset verification using AL-GO/BCContainerHelper
+- **Tools** (`tools/`): Standalone scripts for downloading and analyzing GitHub Actions artifacts
 - **Agent Evaluations**: Focuses on GitHub Copilot CLI and Claude Code
 - **Experiments**: MCP Servers, custom instructions, custom agents, skills, etc. and their performance on the benchmark
 - **Notebooks** (`notebooks/`): Analysis and visualization of benchmark results
@@ -14,6 +15,9 @@ This is a benchmark for evaluating coding agents on real-world Business Central
 - Uses `uv` for dependency management: e.g. `uv add <package>` to add packages, `uv run <command>` to run commands
 - Uses `pre-commit` for code quality checks (ruff linting/formatting, trailing whitespace, etc.)
 
+## Categories
+BC-Bench is category-based and designed to grow over time. It currently has two categories, `bug-fix` and `test-generation`. They share the same dataset tasks and execution-based setup, but use different prompts, expected outputs, and evaluation pipelines. Future categories such as `code-review` can be added within the same overall benchmark structure, though they may require different inputs, setup, or evaluation methods.
+
 ## Coding Patterns and Guidelines
 
 - Prefer strong typing and type hints
 
@@ -60,7 +60,7 @@ jobs:
       category: ${{ inputs.category }}
 
   evaluate-with-claude-code:
-    runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
+    runs-on: [GitHub-BCBench]
     needs: get-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
 
@@ -68,7 +68,7 @@ jobs:
       category: ${{ inputs.category }}
 
   evaluate-with-copilot-cli:
-    runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
+    runs-on: [GitHub-BCBench]
     needs: get-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
 
@@ -21,7 +21,7 @@ jobs:
         uses: actions/checkout@v5
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57
         with:
           enable-cache: true
 
 
@@ -22,7 +22,7 @@ jobs:
       category: "bug-fix"
 
   verify-build-and-tests:
-    runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
+    runs-on: [GitHub-BCBench]
     needs: get-entries
     if: needs.get-entries.outputs.entries != '[]'
     environment:
 
@@ -41,7 +41,7 @@ jobs:
       category: ${{ inputs.category }}
 
   evaluate-with-mini-agent:
-    runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
+    runs-on: [GitHub-BCBench]
     needs: get-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}