microsoft
diff --git a/.github/actions/setup-bc-container/action.yml b/.github/actions/setup-bc-container/action.yml
Lines changed: 7 additions & 1 deletion
diff --git a/.github/actions/setup-python-uv/action.yml b/.github/actions/setup-python-uv/action.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
Lines changed: 3 additions & 0 deletions
diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/dataset-validation.yml b/.github/workflows/dataset-validation.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/mini-evaluation.yml b/.github/workflows/mini-evaluation.yml
Lines changed: 1 addition & 1 deletion
diff --git a/notebooks/bug-fix/overview.ipynb b/notebooks/bug-fix/overview.ipynb
Lines changed: 2 additions & 2 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 1 deletion
@@ -14,6 +14,10 @@ inputs:
   github-token:
     description: GitHub token for accessing public repositories
     required: true
+  skip-container:
+    description: Skip BC container setup (only clone repository)
+    required: false
+    default: "false"
 
 outputs:
   repo_path:
@@ -24,6 +28,7 @@ runs:
   using: composite
   steps:
     - name: Generate BC container name and credentials
+      if: inputs.skip-container != 'true'
       run: |
         # Generate a 32-character random password using Get-Random
         # The password is short-lived and only used for the duration of the workflow
@@ -38,6 +43,7 @@ runs:
       shell: pwsh
 
     - name: Install BcContainerHelper module
+      if: inputs.skip-container != 'true'
       run: Install-Module -Name BcContainerHelper -Force -AllowClobber -AllowPrerelease
       shell: pwsh
 
@@ -59,5 +65,5 @@ runs:
         $env:ADO_TOKEN = az account get-access-token --resource "499b84ac-1321-427f-aa17-267ca6975798" --query accessToken -o tsv
         Write-Output "::add-mask::$env:ADO_TOKEN"
 
-        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}"
+        .\scripts\Setup-ContainerAndRepository.ps1 -InstanceId "${{ inputs.instance-id }}" ${{ inputs.skip-container == 'true' && '-SkipContainer' || '' }}
       shell: pwsh
@@ -11,7 +11,7 @@ runs:
   using: composite
   steps:
     - name: Install uv
-      uses: astral-sh/setup-uv@v7
+      uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57
       with:
         enable-cache: true
 
 
@@ -14,6 +14,9 @@ This is a benchmark for evaluating coding agents on real-world Business Central
 - Uses `uv` for dependency management: e.g. `uv add <package>` to add packages, `uv run <command>` to run commands
 - Uses `pre-commit` for code quality checks (ruff linting/formatting, trailing whitespace, etc.)
 
+## Categories
+BC-Bench is category-based and designed to grow over time. It currently has two categories, `bug-fix` and `test-generation`. They share the same dataset tasks and execution-based setup, but use different prompts, expected outputs, and evaluation pipelines. Future categories such as `code-review` can be added within the same overall benchmark structure, though they may require different inputs, setup, or evaluation methods.
+
 ## Coding Patterns and Guidelines
 
 - Prefer strong typing and type hints
 
@@ -60,7 +60,7 @@ jobs:
       category: ${{ inputs.category }}
 
   evaluate-with-claude-code:
-    runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
+    runs-on: [GitHub-BCBench]
     needs: get-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
 
@@ -68,7 +68,7 @@ jobs:
       category: ${{ inputs.category }}
 
   evaluate-with-copilot-cli:
-    runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
+    runs-on: [GitHub-BCBench]
     needs: get-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
 
@@ -21,7 +21,7 @@ jobs:
         uses: actions/checkout@v5
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57
         with:
           enable-cache: true
 
 
@@ -22,7 +22,7 @@ jobs:
       category: "bug-fix"
 
   verify-build-and-tests:
-    runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
+    runs-on: [GitHub-BCBench]
     needs: get-entries
     if: needs.get-entries.outputs.entries != '[]'
     environment:
 
@@ -41,7 +41,7 @@ jobs:
       category: ${{ inputs.category }}
 
   evaluate-with-mini-agent:
-    runs-on: [self-hosted, 1ES.Pool=GitHub-BCBench]
+    runs-on: [GitHub-BCBench]
     needs: get-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
 
@@ -269,7 +269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "8b5bb1be",
    "metadata": {},
    "outputs": [
@@ -291,7 +291,7 @@
     "merged_df[\"image_bin\"] = pd.cut(merged_df[\"image_count\"], bins=bins, labels=labels)\n",
     "\n",
     "# Add problem statement char count\n",
-    "ps_chars = {entry.instance_id: len(entry.get_task(transform_image_paths=False)) for entry in bcbench_dataset}\n",
+    "ps_chars = {entry.instance_id: len(entry.get_task()) for entry in bcbench_dataset}\n",
     "merged_df[\"ps_chars\"] = merged_df[\"instance_id\"].map(ps_chars)\n",
     "\n",
     "instance_df = (\n",
 
@@ -79,7 +79,7 @@ analysis = [
     "plotly>=6.5.0",
 ]
 dev = [
-    "pytest>=8.0",
+    "pytest>=9.0.3",
     "pytest-cov>=7.0",
     "ruff>=0.13.0",
     "pre-commit>=4.3.0",
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ analysis = [`
`79`	`79`	`"plotly>=6.5.0",`
`80`	`80`	`]`
`81`	`81`	`dev = [`
`82`		`- "pytest>=8.0",`
	`82`	`+ "pytest>=9.0.3",`
`83`	`83`	`"pytest-cov>=7.0",`
`84`	`84`	`"ruff>=0.13.0",`
`85`	`85`	`"pre-commit>=4.3.0",`