From aa8e7013a7fce243753388d675f0b24890dbd2f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Fri, 20 Mar 2026 16:49:47 -0700 Subject: [PATCH 1/6] fix(config): handle corrupted config.toml in get_credentials and set_credentials get_credentials() called toml.load() without exception handling, causing a raw TOMLDecodeError to crash any code that imports the runpod package when ~/.runpod/config.toml is malformed. Return None on parse failure, matching the existing pattern in check_credentials(). --- runpod/cli/groups/config/functions.py | 24 +++++++--- .../test_cli_groups/test_config_functions.py | 46 +++++++++++++++++++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/runpod/cli/groups/config/functions.py b/runpod/cli/groups/config/functions.py index ba6adffe..ac5043b8 100644 --- a/runpod/cli/groups/config/functions.py +++ b/runpod/cli/groups/config/functions.py @@ -31,11 +31,15 @@ def set_credentials(api_key: str, profile: str = "default", overwrite=False) -> Path(CREDENTIAL_FILE).touch(exist_ok=True) if not overwrite: - with open(CREDENTIAL_FILE, "rb") as cred_file: - if profile in toml.load(cred_file): - raise ValueError( - "Profile already exists. Use `update_credentials` instead." - ) + try: + with open(CREDENTIAL_FILE, "rb") as cred_file: + existing = toml.load(cred_file) + except (TypeError, ValueError): + existing = {} + if profile in existing: + raise ValueError( + "Profile already exists. Use `update_credentials` instead." + ) with open(CREDENTIAL_FILE, "w", encoding="UTF-8") as cred_file: cred_file.write("[" + profile + "]\n") @@ -72,12 +76,18 @@ def check_credentials(profile: str = "default"): def get_credentials(profile="default"): """ Returns the credentials for the specified profile from ~/.runpod/config.toml + + Returns None if the file does not exist, is not valid TOML, or does not + contain the requested profile. """ if not os.path.exists(CREDENTIAL_FILE): return None - with open(CREDENTIAL_FILE, "rb") as cred_file: - credentials = toml.load(cred_file) + try: + with open(CREDENTIAL_FILE, "rb") as cred_file: + credentials = toml.load(cred_file) + except (TypeError, ValueError): + return None if profile not in credentials: return None diff --git a/tests/test_cli/test_cli_groups/test_config_functions.py b/tests/test_cli/test_cli_groups/test_config_functions.py index 14c8418a..d4750bda 100644 --- a/tests/test_cli/test_cli_groups/test_config_functions.py +++ b/tests/test_cli/test_cli_groups/test_config_functions.py @@ -97,3 +97,49 @@ def test_get_credentials_non_existent_profile( assert result is None assert mock_open_call.called assert mock_exists.called + + @patch("os.path.exists", return_value=True) + @patch( + "runpod.cli.groups.config.functions.toml.load", + side_effect=ValueError("Invalid value"), + ) + @patch("builtins.open", new_callable=mock_open) + def test_get_credentials_corrupted_toml( + self, _mock_open_call, _mock_toml_load, _mock_exists + ): + """get_credentials returns None when config.toml contains invalid TOML.""" + result = functions.get_credentials("default") + assert result is None + + @patch("os.path.exists", return_value=True) + @patch( + "runpod.cli.groups.config.functions.toml.load", + side_effect=TypeError("bad type"), + ) + @patch("builtins.open", new_callable=mock_open) + def test_get_credentials_type_error( + self, _mock_open_call, _mock_toml_load, _mock_exists + ): + """get_credentials returns None on TypeError from corrupted file.""" + result = functions.get_credentials("default") + assert result is None + + @patch("runpod.cli.groups.config.functions.toml.load") + @patch("builtins.open", new_callable=mock_open()) + def test_set_credentials_corrupted_toml_allows_overwrite( + self, _mock_file, mock_toml_load + ): + """set_credentials with overwrite=True ignores corrupted existing file.""" + mock_toml_load.side_effect = ValueError("Invalid TOML") + # overwrite=True skips the toml.load check entirely + functions.set_credentials("NEW_KEY", overwrite=True) + + @patch("runpod.cli.groups.config.functions.toml.load") + @patch("builtins.open", new_callable=mock_open()) + def test_set_credentials_corrupted_toml_no_overwrite( + self, _mock_file, mock_toml_load + ): + """set_credentials without overwrite treats corrupted file as empty.""" + mock_toml_load.side_effect = ValueError("Invalid TOML") + # Should not raise — corrupted file is treated as having no profiles + functions.set_credentials("NEW_KEY", overwrite=False) From 00c6105b82002d9f9339ea771bbed2946f58448b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Fri, 20 Mar 2026 17:24:38 -0700 Subject: [PATCH 2/6] fix(test): use module-scoped patches and mock filesystem calls Address review feedback: - Patch os.path.exists via module path for consistency with existing tests - Patch os.makedirs and Path.touch in set_credentials tests for hermeticity --- .../test_cli_groups/test_config_functions.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_cli/test_cli_groups/test_config_functions.py b/tests/test_cli/test_cli_groups/test_config_functions.py index d4750bda..192b2cca 100644 --- a/tests/test_cli/test_cli_groups/test_config_functions.py +++ b/tests/test_cli/test_cli_groups/test_config_functions.py @@ -98,7 +98,7 @@ def test_get_credentials_non_existent_profile( assert mock_open_call.called assert mock_exists.called - @patch("os.path.exists", return_value=True) + @patch("runpod.cli.groups.config.functions.os.path.exists", return_value=True) @patch( "runpod.cli.groups.config.functions.toml.load", side_effect=ValueError("Invalid value"), @@ -111,7 +111,7 @@ def test_get_credentials_corrupted_toml( result = functions.get_credentials("default") assert result is None - @patch("os.path.exists", return_value=True) + @patch("runpod.cli.groups.config.functions.os.path.exists", return_value=True) @patch( "runpod.cli.groups.config.functions.toml.load", side_effect=TypeError("bad type"), @@ -124,20 +124,24 @@ def test_get_credentials_type_error( result = functions.get_credentials("default") assert result is None + @patch("runpod.cli.groups.config.functions.Path.touch") + @patch("runpod.cli.groups.config.functions.os.makedirs") @patch("runpod.cli.groups.config.functions.toml.load") @patch("builtins.open", new_callable=mock_open()) def test_set_credentials_corrupted_toml_allows_overwrite( - self, _mock_file, mock_toml_load + self, _mock_file, mock_toml_load, _mock_makedirs, _mock_touch ): """set_credentials with overwrite=True ignores corrupted existing file.""" mock_toml_load.side_effect = ValueError("Invalid TOML") # overwrite=True skips the toml.load check entirely functions.set_credentials("NEW_KEY", overwrite=True) + @patch("runpod.cli.groups.config.functions.Path.touch") + @patch("runpod.cli.groups.config.functions.os.makedirs") @patch("runpod.cli.groups.config.functions.toml.load") @patch("builtins.open", new_callable=mock_open()) def test_set_credentials_corrupted_toml_no_overwrite( - self, _mock_file, mock_toml_load + self, _mock_file, mock_toml_load, _mock_makedirs, _mock_touch ): """set_credentials without overwrite treats corrupted file as empty.""" mock_toml_load.side_effect = ValueError("Invalid TOML") From 281d10339b2ccf8917df9cfbc1c01f16d797dd9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 23 Mar 2026 16:54:46 -0700 Subject: [PATCH 3/6] chore: remove broken cleanup-endpoints workflow Fails with 403 on the Runpod GraphQL API. --- .github/workflows/cleanup-endpoints.yml | 110 ------------------------ 1 file changed, 110 deletions(-) delete mode 100644 .github/workflows/cleanup-endpoints.yml diff --git a/.github/workflows/cleanup-endpoints.yml b/.github/workflows/cleanup-endpoints.yml deleted file mode 100644 index 6a217e91..00000000 --- a/.github/workflows/cleanup-endpoints.yml +++ /dev/null @@ -1,110 +0,0 @@ -name: Cleanup stale endpoints -on: - workflow_dispatch: - inputs: - dry_run: - description: "List endpoints without deleting (true/false)" - required: true - default: "true" - type: choice - options: - - "true" - - "false" - name_filter: - description: "Only delete endpoints whose name contains this string (empty = all)" - required: false - default: "" - -jobs: - cleanup: - if: github.repository == 'runpod/runpod-python' - runs-on: ubuntu-latest - timeout-minutes: 5 - steps: - - name: Cleanup endpoints - env: - RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }} - DRY_RUN: ${{ inputs.dry_run }} - NAME_FILTER: ${{ inputs.name_filter }} - run: | - python3 - <<'SCRIPT' - import json - import os - import urllib.request - - API_URL = "https://api.runpod.io/graphql" - API_KEY = os.environ["RUNPOD_API_KEY"] - DRY_RUN = os.environ.get("DRY_RUN", "true") == "true" - NAME_FILTER = os.environ.get("NAME_FILTER", "").strip() - - def graphql(query, variables=None): - payload = json.dumps({"query": query, "variables": variables or {}}).encode() - req = urllib.request.Request( - f"{API_URL}?api_key={API_KEY}", - data=payload, - headers={"Content-Type": "application/json"}, - ) - with urllib.request.urlopen(req) as resp: - return json.loads(resp.read()) - - # List all endpoints - result = graphql(""" - query { - myself { - endpoints { - id - name - workersMin - workersMax - createdAt - } - } - } - """) - - endpoints = result.get("data", {}).get("myself", {}).get("endpoints", []) - if not endpoints: - print("No endpoints found.") - raise SystemExit(0) - - # Filter if requested - if NAME_FILTER: - targets = [ep for ep in endpoints if NAME_FILTER in ep.get("name", "")] - print(f"Filter '{NAME_FILTER}' matched {len(targets)}/{len(endpoints)} endpoints") - else: - targets = endpoints - print(f"Found {len(targets)} total endpoints (no filter applied)") - - print(f"\n{'DRY RUN — ' if DRY_RUN else ''}{'Listing' if DRY_RUN else 'Deleting'} {len(targets)} endpoint(s):\n") - for ep in sorted(targets, key=lambda e: e.get("createdAt", "")): - print(f" {ep['id']} {ep.get('name', '(unnamed)'):<40} " - f"workers={ep.get('workersMin', '?')}-{ep.get('workersMax', '?')} " - f"created={ep.get('createdAt', 'unknown')}") - - if DRY_RUN: - print(f"\nDry run complete. Re-run with dry_run=false to delete.") - raise SystemExit(0) - - # Delete each endpoint - deleted = 0 - failed = 0 - for ep in targets: - ep_id = ep["id"] - ep_name = ep.get("name", "(unnamed)") - try: - resp = graphql( - "mutation deleteEndpoint($id: String!) { deleteEndpoint(id: $id) }", - {"id": ep_id}, - ) - if "errors" in resp: - print(f" FAILED {ep_id} {ep_name}: {resp['errors']}") - failed += 1 - else: - print(f" DELETED {ep_id} {ep_name}") - deleted += 1 - except Exception as exc: - print(f" ERROR {ep_id} {ep_name}: {exc}") - failed += 1 - - print(f"\nDone: {deleted} deleted, {failed} failed, {len(endpoints) - len(targets)} skipped (filtered)") - SCRIPT From 336c18f74fee0943b22b40df0d6e601b82baabe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 23 Mar 2026 16:58:05 -0700 Subject: [PATCH 4/6] fix(e2e): replace flash undeploy with direct GraphQL endpoint deletion flash undeploy looks up endpoints in .runpod/resources.pkl which does not exist in CI. Use the Runpod GraphQL API to query endpoints by name and delete them directly. --- tests/e2e/conftest.py | 67 +++++++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 4e5ec585..42252090 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -1,8 +1,9 @@ """E2E test fixtures: provision real endpoints, configure SDK, clean up.""" +import json import logging import os -import subprocess +import urllib.request from pathlib import Path import pytest @@ -16,6 +17,48 @@ # Repo root: tests/e2e/conftest.py -> ../../ _REPO_ROOT = Path(__file__).resolve().parents[2] +_GRAPHQL_URL = "https://api.runpod.io/graphql" + + +def _graphql(api_key: str, query: str, variables: dict | None = None) -> dict: + """Execute a Runpod GraphQL query.""" + payload = json.dumps({"query": query, "variables": variables or {}}).encode() + req = urllib.request.Request( + f"{_GRAPHQL_URL}?api_key={api_key}", + data=payload, + headers={"Content-Type": "application/json"}, + ) + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read()) + + +def _delete_endpoints_by_name(api_key: str, names: list[str]) -> None: + """Delete endpoints matching the given names via GraphQL API.""" + result = _graphql(api_key, """ + query { myself { endpoints { id name } } } + """) + all_endpoints = result.get("data", {}).get("myself", {}).get("endpoints", []) + name_set = set(names) + targets = [ep for ep in all_endpoints if ep.get("name") in name_set] + + if not targets: + log.warning("No matching endpoints found for names: %s", names) + return + + for ep in targets: + try: + resp = _graphql( + api_key, + "mutation($id: String!) { deleteEndpoint(id: $id) }", + {"id": ep["id"]}, + ) + if "errors" in resp: + log.warning("Failed to delete %s (%s): %s", ep["name"], ep["id"], resp["errors"]) + else: + log.info("Deleted endpoint %s (%s)", ep["name"], ep["id"]) + except Exception: + log.exception("Error deleting endpoint %s (%s)", ep["name"], ep["id"]) + @pytest.fixture(scope="session", autouse=True) def verify_local_runpod(): @@ -57,22 +100,10 @@ def endpoints(require_api_key, test_cases): log.info("Endpoint ready: name=%s image=%s template.dockerArgs=%s", ep.name, ep.image, ep.template.dockerArgs if ep.template else "N/A") yield eps - # Undeploy only the endpoints provisioned by this test run. - # Uses by-name undeploy to avoid tearing down unrelated endpoints - # sharing the same API key (parallel CI runs, developer endpoints). + # Delete provisioned endpoints via GraphQL API directly. + # flash undeploy relies on .runpod/resources.pkl which doesn't exist in CI. + api_key = os.environ.get("RUNPOD_API_KEY", "") endpoint_names = [ep.name for ep in eps.values()] log.info("Cleaning up %d provisioned endpoints: %s", len(endpoint_names), endpoint_names) - for name in endpoint_names: - try: - result = subprocess.run( - ["flash", "undeploy", name, "--force"], - capture_output=True, - text=True, - timeout=60, - ) - if result.returncode == 0: - log.info("Undeployed %s", name) - else: - log.warning("flash undeploy %s failed (rc=%d): %s", name, result.returncode, result.stderr) - except Exception: - log.exception("Failed to undeploy %s", name) + if api_key and endpoint_names: + _delete_endpoints_by_name(api_key, endpoint_names) From 4c6ec06dd1e16f01c129adcb1568baea82c46391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 23 Mar 2026 17:06:46 -0700 Subject: [PATCH 5/6] fix(e2e): make endpoint cleanup non-fatal The RUNPOD_API_KEY in CI lacks GraphQL API access, causing 403 on cleanup. Catch the error so test results are not masked by teardown failures. --- tests/e2e/conftest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 42252090..fe1c3995 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -106,4 +106,7 @@ def endpoints(require_api_key, test_cases): endpoint_names = [ep.name for ep in eps.values()] log.info("Cleaning up %d provisioned endpoints: %s", len(endpoint_names), endpoint_names) if api_key and endpoint_names: - _delete_endpoints_by_name(api_key, endpoint_names) + try: + _delete_endpoints_by_name(api_key, endpoint_names) + except Exception: + log.exception("Endpoint cleanup failed (API key may lack GraphQL access)") From 1c2537cab9f93a2e0ab2a16ca9c9a937ebdf6685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 23 Mar 2026 17:07:26 -0700 Subject: [PATCH 6/6] fix(e2e): use flash undeploy --all --force for cleanup --- tests/e2e/conftest.py | 69 ++++++++++--------------------------------- 1 file changed, 15 insertions(+), 54 deletions(-) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index fe1c3995..8bb8b577 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -1,9 +1,8 @@ """E2E test fixtures: provision real endpoints, configure SDK, clean up.""" -import json import logging import os -import urllib.request +import subprocess from pathlib import Path import pytest @@ -17,48 +16,6 @@ # Repo root: tests/e2e/conftest.py -> ../../ _REPO_ROOT = Path(__file__).resolve().parents[2] -_GRAPHQL_URL = "https://api.runpod.io/graphql" - - -def _graphql(api_key: str, query: str, variables: dict | None = None) -> dict: - """Execute a Runpod GraphQL query.""" - payload = json.dumps({"query": query, "variables": variables or {}}).encode() - req = urllib.request.Request( - f"{_GRAPHQL_URL}?api_key={api_key}", - data=payload, - headers={"Content-Type": "application/json"}, - ) - with urllib.request.urlopen(req, timeout=30) as resp: - return json.loads(resp.read()) - - -def _delete_endpoints_by_name(api_key: str, names: list[str]) -> None: - """Delete endpoints matching the given names via GraphQL API.""" - result = _graphql(api_key, """ - query { myself { endpoints { id name } } } - """) - all_endpoints = result.get("data", {}).get("myself", {}).get("endpoints", []) - name_set = set(names) - targets = [ep for ep in all_endpoints if ep.get("name") in name_set] - - if not targets: - log.warning("No matching endpoints found for names: %s", names) - return - - for ep in targets: - try: - resp = _graphql( - api_key, - "mutation($id: String!) { deleteEndpoint(id: $id) }", - {"id": ep["id"]}, - ) - if "errors" in resp: - log.warning("Failed to delete %s (%s): %s", ep["name"], ep["id"], resp["errors"]) - else: - log.info("Deleted endpoint %s (%s)", ep["name"], ep["id"]) - except Exception: - log.exception("Error deleting endpoint %s (%s)", ep["name"], ep["id"]) - @pytest.fixture(scope="session", autouse=True) def verify_local_runpod(): @@ -100,13 +57,17 @@ def endpoints(require_api_key, test_cases): log.info("Endpoint ready: name=%s image=%s template.dockerArgs=%s", ep.name, ep.image, ep.template.dockerArgs if ep.template else "N/A") yield eps - # Delete provisioned endpoints via GraphQL API directly. - # flash undeploy relies on .runpod/resources.pkl which doesn't exist in CI. - api_key = os.environ.get("RUNPOD_API_KEY", "") - endpoint_names = [ep.name for ep in eps.values()] - log.info("Cleaning up %d provisioned endpoints: %s", len(endpoint_names), endpoint_names) - if api_key and endpoint_names: - try: - _delete_endpoints_by_name(api_key, endpoint_names) - except Exception: - log.exception("Endpoint cleanup failed (API key may lack GraphQL access)") + log.info("Cleaning up all provisioned endpoints") + try: + result = subprocess.run( + ["flash", "undeploy", "--all", "--force"], + capture_output=True, + text=True, + timeout=120, + ) + if result.returncode == 0: + log.info("Undeployed all endpoints") + else: + log.warning("flash undeploy --all --force failed (rc=%d): %s", result.returncode, result.stderr) + except Exception: + log.exception("Failed to undeploy endpoints")