From aa8e7013a7fce243753388d675f0b24890dbd2f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Fri, 20 Mar 2026 16:49:47 -0700
Subject: [PATCH 1/6] fix(config): handle corrupted config.toml in
 get_credentials and set_credentials

get_credentials() called toml.load() without exception handling, causing
a raw TOMLDecodeError to crash any code that imports the runpod package
when ~/.runpod/config.toml is malformed. Return None on parse failure,
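matching the existing pattern in check_credentials().

set_credentials() gets the same guard: when overwrite is False and the
existing file cannot be parsed, it is treated as containing no profiles
instead of raising.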
---
 runpod/cli/groups/config/functions.py         | 24 +++++++---
 .../test_cli_groups/test_config_functions.py  | 46 +++++++++++++++++++
 2 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/runpod/cli/groups/config/functions.py b/runpod/cli/groups/config/functions.py
index ba6adffe..ac5043b8 100644
--- a/runpod/cli/groups/config/functions.py
+++ b/runpod/cli/groups/config/functions.py
@@ -31,11 +31,15 @@ def set_credentials(api_key: str, profile: str = "default", overwrite=False) ->
     Path(CREDENTIAL_FILE).touch(exist_ok=True)
 
     if not overwrite:
-        with open(CREDENTIAL_FILE, "rb") as cred_file:
-            if profile in toml.load(cred_file):
-                raise ValueError(
-                    "Profile already exists. Use `update_credentials` instead."
-                )
+        try:
+            with open(CREDENTIAL_FILE, "rb") as cred_file:
+                existing = toml.load(cred_file)
+        except (TypeError, ValueError):
+            existing = {}
+        if profile in existing:
+            raise ValueError(
+                "Profile already exists. Use `update_credentials` instead."
+            )
 
     with open(CREDENTIAL_FILE, "w", encoding="UTF-8") as cred_file:
         cred_file.write("[" + profile + "]\n")
@@ -72,12 +76,18 @@ def check_credentials(profile: str = "default"):
 def get_credentials(profile="default"):
     """
     Returns the credentials for the specified profile from ~/.runpod/config.toml
+
+    Returns None if the file does not exist, is not valid TOML, or does not
+    contain the requested profile.
     """
     if not os.path.exists(CREDENTIAL_FILE):
         return None
 
-    with open(CREDENTIAL_FILE, "rb") as cred_file:
-        credentials = toml.load(cred_file)
+    try:
+        with open(CREDENTIAL_FILE, "rb") as cred_file:
+            credentials = toml.load(cred_file)
+    except (TypeError, ValueError):
+        return None
 
     if profile not in credentials:
         return None
diff --git a/tests/test_cli/test_cli_groups/test_config_functions.py b/tests/test_cli/test_cli_groups/test_config_functions.py
index 14c8418a..d4750bda 100644
--- a/tests/test_cli/test_cli_groups/test_config_functions.py
+++ b/tests/test_cli/test_cli_groups/test_config_functions.py
@@ -97,3 +97,49 @@ def test_get_credentials_non_existent_profile(
         assert result is None
         assert mock_open_call.called
         assert mock_exists.called
+
+    @patch("os.path.exists", return_value=True)
+    @patch(
+        "runpod.cli.groups.config.functions.toml.load",
+        side_effect=ValueError("Invalid value"),
+    )
+    @patch("builtins.open", new_callable=mock_open)
+    def test_get_credentials_corrupted_toml(
+        self, _mock_open_call, _mock_toml_load, _mock_exists
+    ):
+        """get_credentials returns None when config.toml contains invalid TOML."""
+        result = functions.get_credentials("default")
+        assert result is None
+
+    @patch("os.path.exists", return_value=True)
+    @patch(
+        "runpod.cli.groups.config.functions.toml.load",
+        side_effect=TypeError("bad type"),
+    )
+    @patch("builtins.open", new_callable=mock_open)
+    def test_get_credentials_type_error(
+        self, _mock_open_call, _mock_toml_load, _mock_exists
+    ):
+        """get_credentials returns None on TypeError from corrupted file."""
+        result = functions.get_credentials("default")
+        assert result is None
+
+    @patch("runpod.cli.groups.config.functions.toml.load")
+    @patch("builtins.open", new_callable=mock_open())
+    def test_set_credentials_corrupted_toml_allows_overwrite(
+        self, _mock_file, mock_toml_load
+    ):
+        """set_credentials with overwrite=True ignores corrupted existing file."""
+        mock_toml_load.side_effect = ValueError("Invalid TOML")
+        # overwrite=True skips the toml.load check entirely
+        functions.set_credentials("NEW_KEY", overwrite=True)
+
+    @patch("runpod.cli.groups.config.functions.toml.load")
+    @patch("builtins.open", new_callable=mock_open())
+    def test_set_credentials_corrupted_toml_no_overwrite(
+        self, _mock_file, mock_toml_load
+    ):
+        """set_credentials without overwrite treats corrupted file as empty."""
+        mock_toml_load.side_effect = ValueError("Invalid TOML")
+        # Should not raise — corrupted file is treated as having no profiles
+        functions.set_credentials("NEW_KEY", overwrite=False)

From 00c6105b82002d9f9339ea771bbed2946f58448b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Fri, 20 Mar 2026 17:24:38 -0700
Subject: [PATCH 2/6] fix(test): use module-scoped patches and mock filesystem
 calls

Address review feedback:
- Patch os.path.exists via module path for consistency with existing tests
- Patch os.makedirs and Path.touch in set_credentials tests for hermeticity
---
 .../test_cli_groups/test_config_functions.py         | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/test_cli/test_cli_groups/test_config_functions.py b/tests/test_cli/test_cli_groups/test_config_functions.py
index d4750bda..192b2cca 100644
--- a/tests/test_cli/test_cli_groups/test_config_functions.py
+++ b/tests/test_cli/test_cli_groups/test_config_functions.py
@@ -98,7 +98,7 @@ def test_get_credentials_non_existent_profile(
         assert mock_open_call.called
         assert mock_exists.called
 
-    @patch("os.path.exists", return_value=True)
+    @patch("runpod.cli.groups.config.functions.os.path.exists", return_value=True)
     @patch(
         "runpod.cli.groups.config.functions.toml.load",
         side_effect=ValueError("Invalid value"),
@@ -111,7 +111,7 @@ def test_get_credentials_corrupted_toml(
         result = functions.get_credentials("default")
         assert result is None
 
-    @patch("os.path.exists", return_value=True)
+    @patch("runpod.cli.groups.config.functions.os.path.exists", return_value=True)
     @patch(
         "runpod.cli.groups.config.functions.toml.load",
         side_effect=TypeError("bad type"),
@@ -124,20 +124,24 @@ def test_get_credentials_type_error(
         result = functions.get_credentials("default")
         assert result is None
 
+    @patch("runpod.cli.groups.config.functions.Path.touch")
+    @patch("runpod.cli.groups.config.functions.os.makedirs")
     @patch("runpod.cli.groups.config.functions.toml.load")
     @patch("builtins.open", new_callable=mock_open())
     def test_set_credentials_corrupted_toml_allows_overwrite(
-        self, _mock_file, mock_toml_load
+        self, _mock_file, mock_toml_load, _mock_makedirs, _mock_touch
     ):
         """set_credentials with overwrite=True ignores corrupted existing file."""
         mock_toml_load.side_effect = ValueError("Invalid TOML")
         # overwrite=True skips the toml.load check entirely
         functions.set_credentials("NEW_KEY", overwrite=True)
 
+    @patch("runpod.cli.groups.config.functions.Path.touch")
+    @patch("runpod.cli.groups.config.functions.os.makedirs")
     @patch("runpod.cli.groups.config.functions.toml.load")
     @patch("builtins.open", new_callable=mock_open())
     def test_set_credentials_corrupted_toml_no_overwrite(
-        self, _mock_file, mock_toml_load
+        self, _mock_file, mock_toml_load, _mock_makedirs, _mock_touch
     ):
         """set_credentials without overwrite treats corrupted file as empty."""
         mock_toml_load.side_effect = ValueError("Invalid TOML")

From 281d10339b2ccf8917df9cfbc1c01f16d797dd9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 23 Mar 2026 16:54:46 -0700
Subject: [PATCH 3/6] chore: remove broken cleanup-endpoints workflow

Fails with 403 on the Runpod GraphQL API.
---
 .github/workflows/cleanup-endpoints.yml | 110 ------------------------
 1 file changed, 110 deletions(-)
 delete mode 100644 .github/workflows/cleanup-endpoints.yml

diff --git a/.github/workflows/cleanup-endpoints.yml b/.github/workflows/cleanup-endpoints.yml
deleted file mode 100644
index 6a217e91..00000000
--- a/.github/workflows/cleanup-endpoints.yml
+++ /dev/null
@@ -1,110 +0,0 @@
-name: Cleanup stale endpoints
-on:
-  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: "List endpoints without deleting (true/false)"
-        required: true
-        default: "true"
-        type: choice
-        options:
-          - "true"
-          - "false"
-      name_filter:
-        description: "Only delete endpoints whose name contains this string (empty = all)"
-        required: false
-        default: ""
-
-jobs:
-  cleanup:
-    if: github.repository == 'runpod/runpod-python'
-    runs-on: ubuntu-latest
-    timeout-minutes: 5
-    steps:
-      - name: Cleanup endpoints
-        env:
-          RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
-          DRY_RUN: ${{ inputs.dry_run }}
-          NAME_FILTER: ${{ inputs.name_filter }}
-        run: |
-          python3 - <<'SCRIPT'
-          import json
-          import os
-          import urllib.request
-
-          API_URL = "https://api.runpod.io/graphql"
-          API_KEY = os.environ["RUNPOD_API_KEY"]
-          DRY_RUN = os.environ.get("DRY_RUN", "true") == "true"
-          NAME_FILTER = os.environ.get("NAME_FILTER", "").strip()
-
-          def graphql(query, variables=None):
-              payload = json.dumps({"query": query, "variables": variables or {}}).encode()
-              req = urllib.request.Request(
-                  f"{API_URL}?api_key={API_KEY}",
-                  data=payload,
-                  headers={"Content-Type": "application/json"},
-              )
-              with urllib.request.urlopen(req) as resp:
-                  return json.loads(resp.read())
-
-          # List all endpoints
-          result = graphql("""
-              query {
-                  myself {
-                      endpoints {
-                          id
-                          name
-                          workersMin
-                          workersMax
-                          createdAt
-                      }
-                  }
-              }
-          """)
-
-          endpoints = result.get("data", {}).get("myself", {}).get("endpoints", [])
-          if not endpoints:
-              print("No endpoints found.")
-              raise SystemExit(0)
-
-          # Filter if requested
-          if NAME_FILTER:
-              targets = [ep for ep in endpoints if NAME_FILTER in ep.get("name", "")]
-              print(f"Filter '{NAME_FILTER}' matched {len(targets)}/{len(endpoints)} endpoints")
-          else:
-              targets = endpoints
-              print(f"Found {len(targets)} total endpoints (no filter applied)")
-
-          print(f"\n{'DRY RUN — ' if DRY_RUN else ''}{'Listing' if DRY_RUN else 'Deleting'} {len(targets)} endpoint(s):\n")
-          for ep in sorted(targets, key=lambda e: e.get("createdAt", "")):
-              print(f"  {ep['id']}  {ep.get('name', '(unnamed)'):<40}  "
-                    f"workers={ep.get('workersMin', '?')}-{ep.get('workersMax', '?')}  "
-                    f"created={ep.get('createdAt', 'unknown')}")
-
-          if DRY_RUN:
-              print(f"\nDry run complete. Re-run with dry_run=false to delete.")
-              raise SystemExit(0)
-
-          # Delete each endpoint
-          deleted = 0
-          failed = 0
-          for ep in targets:
-              ep_id = ep["id"]
-              ep_name = ep.get("name", "(unnamed)")
-              try:
-                  resp = graphql(
-                      "mutation deleteEndpoint($id: String!) { deleteEndpoint(id: $id) }",
-                      {"id": ep_id},
-                  )
-                  if "errors" in resp:
-                      print(f"  FAILED  {ep_id}  {ep_name}: {resp['errors']}")
-                      failed += 1
-                  else:
-                      print(f"  DELETED {ep_id}  {ep_name}")
-                      deleted += 1
-              except Exception as exc:
-                  print(f"  ERROR   {ep_id}  {ep_name}: {exc}")
-                  failed += 1
-
-          print(f"\nDone: {deleted} deleted, {failed} failed, {len(endpoints) - len(targets)} skipped (filtered)")
-          SCRIPT

From 336c18f74fee0943b22b40df0d6e601b82baabe7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 23 Mar 2026 16:58:05 -0700
Subject: [PATCH 4/6] fix(e2e): replace flash undeploy with direct GraphQL
 endpoint deletion

flash undeploy looks up endpoints in .runpod/resources.pkl which does
not exist in CI. Use the Runpod GraphQL API to query endpoints by name
and delete them directly.
---
 tests/e2e/conftest.py | 67 +++++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index 4e5ec585..42252090 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -1,8 +1,9 @@
 """E2E test fixtures: provision real endpoints, configure SDK, clean up."""
 
+import json
 import logging
 import os
-import subprocess
+import urllib.request
 from pathlib import Path
 
 import pytest
@@ -16,6 +17,48 @@
 # Repo root: tests/e2e/conftest.py -> ../../
 _REPO_ROOT = Path(__file__).resolve().parents[2]
 
+_GRAPHQL_URL = "https://api.runpod.io/graphql"
+
+
+def _graphql(api_key: str, query: str, variables: dict | None = None) -> dict:
+    """Execute a Runpod GraphQL query."""
+    payload = json.dumps({"query": query, "variables": variables or {}}).encode()
+    req = urllib.request.Request(
+        f"{_GRAPHQL_URL}?api_key={api_key}",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+    )
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return json.loads(resp.read())
+
+
+def _delete_endpoints_by_name(api_key: str, names: list[str]) -> None:
+    """Delete endpoints matching the given names via GraphQL API."""
+    result = _graphql(api_key, """
+        query { myself { endpoints { id name } } }
+    """)
+    all_endpoints = result.get("data", {}).get("myself", {}).get("endpoints", [])
+    name_set = set(names)
+    targets = [ep for ep in all_endpoints if ep.get("name") in name_set]
+
+    if not targets:
+        log.warning("No matching endpoints found for names: %s", names)
+        return
+
+    for ep in targets:
+        try:
+            resp = _graphql(
+                api_key,
+                "mutation($id: String!) { deleteEndpoint(id: $id) }",
+                {"id": ep["id"]},
+            )
+            if "errors" in resp:
+                log.warning("Failed to delete %s (%s): %s", ep["name"], ep["id"], resp["errors"])
+            else:
+                log.info("Deleted endpoint %s (%s)", ep["name"], ep["id"])
+        except Exception:
+            log.exception("Error deleting endpoint %s (%s)", ep["name"], ep["id"])
+
 
 @pytest.fixture(scope="session", autouse=True)
 def verify_local_runpod():
@@ -57,22 +100,10 @@ def endpoints(require_api_key, test_cases):
         log.info("Endpoint ready: name=%s image=%s template.dockerArgs=%s", ep.name, ep.image, ep.template.dockerArgs if ep.template else "N/A")
     yield eps
 
-    # Undeploy only the endpoints provisioned by this test run.
-    # Uses by-name undeploy to avoid tearing down unrelated endpoints
-    # sharing the same API key (parallel CI runs, developer endpoints).
+    # Delete provisioned endpoints via GraphQL API directly.
+    # flash undeploy relies on .runpod/resources.pkl which doesn't exist in CI.
+    api_key = os.environ.get("RUNPOD_API_KEY", "")
     endpoint_names = [ep.name for ep in eps.values()]
     log.info("Cleaning up %d provisioned endpoints: %s", len(endpoint_names), endpoint_names)
-    for name in endpoint_names:
-        try:
-            result = subprocess.run(
-                ["flash", "undeploy", name, "--force"],
-                capture_output=True,
-                text=True,
-                timeout=60,
-            )
-            if result.returncode == 0:
-                log.info("Undeployed %s", name)
-            else:
-                log.warning("flash undeploy %s failed (rc=%d): %s", name, result.returncode, result.stderr)
-        except Exception:
-            log.exception("Failed to undeploy %s", name)
+    if api_key and endpoint_names:
+        _delete_endpoints_by_name(api_key, endpoint_names)

From 4c6ec06dd1e16f01c129adcb1568baea82c46391 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 23 Mar 2026 17:06:46 -0700
Subject: [PATCH 5/6] fix(e2e): make endpoint cleanup non-fatal

The RUNPOD_API_KEY in CI lacks GraphQL API access, causing 403 on
cleanup. Catch the error so test results are not masked by teardown
failures.
---
 tests/e2e/conftest.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index 42252090..fe1c3995 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -106,4 +106,7 @@ def endpoints(require_api_key, test_cases):
     endpoint_names = [ep.name for ep in eps.values()]
     log.info("Cleaning up %d provisioned endpoints: %s", len(endpoint_names), endpoint_names)
     if api_key and endpoint_names:
-        _delete_endpoints_by_name(api_key, endpoint_names)
+        try:
+            _delete_endpoints_by_name(api_key, endpoint_names)
+        except Exception:
+            log.exception("Endpoint cleanup failed (API key may lack GraphQL access)")

From 1c2537cab9f93a2e0ab2a16ca9c9a937ebdf6685 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 23 Mar 2026 17:07:26 -0700
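Subject: [PATCH 6/6] fix(e2e): use flash undeploy --all --force for cleanup

Drop the direct GraphQL deletion helpers, which fail with 403 for the CI
API key, and shell out to `flash undeploy --all --force` during fixture
teardown instead. A non-zero exit or an exception is logged rather than
raised, so cleanup problems do not mask test results.

NOTE(review): the earlier per-name undeploy was chosen to avoid tearing
down unrelated endpoints sharing the same API key; confirm that --all is
acceptable here.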
---
 tests/e2e/conftest.py | 69 ++++++++++---------------------------------
 1 file changed, 15 insertions(+), 54 deletions(-)

diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index fe1c3995..8bb8b577 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -1,9 +1,8 @@
 """E2E test fixtures: provision real endpoints, configure SDK, clean up."""
 
-import json
 import logging
 import os
-import urllib.request
+import subprocess
 from pathlib import Path
 
 import pytest
@@ -17,48 +16,6 @@
 # Repo root: tests/e2e/conftest.py -> ../../
 _REPO_ROOT = Path(__file__).resolve().parents[2]
 
-_GRAPHQL_URL = "https://api.runpod.io/graphql"
-
-
-def _graphql(api_key: str, query: str, variables: dict | None = None) -> dict:
-    """Execute a Runpod GraphQL query."""
-    payload = json.dumps({"query": query, "variables": variables or {}}).encode()
-    req = urllib.request.Request(
-        f"{_GRAPHQL_URL}?api_key={api_key}",
-        data=payload,
-        headers={"Content-Type": "application/json"},
-    )
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        return json.loads(resp.read())
-
-
-def _delete_endpoints_by_name(api_key: str, names: list[str]) -> None:
-    """Delete endpoints matching the given names via GraphQL API."""
-    result = _graphql(api_key, """
-        query { myself { endpoints { id name } } }
-    """)
-    all_endpoints = result.get("data", {}).get("myself", {}).get("endpoints", [])
-    name_set = set(names)
-    targets = [ep for ep in all_endpoints if ep.get("name") in name_set]
-
-    if not targets:
-        log.warning("No matching endpoints found for names: %s", names)
-        return
-
-    for ep in targets:
-        try:
-            resp = _graphql(
-                api_key,
-                "mutation($id: String!) { deleteEndpoint(id: $id) }",
-                {"id": ep["id"]},
-            )
-            if "errors" in resp:
-                log.warning("Failed to delete %s (%s): %s", ep["name"], ep["id"], resp["errors"])
-            else:
-                log.info("Deleted endpoint %s (%s)", ep["name"], ep["id"])
-        except Exception:
-            log.exception("Error deleting endpoint %s (%s)", ep["name"], ep["id"])
-
 
 @pytest.fixture(scope="session", autouse=True)
 def verify_local_runpod():
@@ -100,13 +57,17 @@ def endpoints(require_api_key, test_cases):
         log.info("Endpoint ready: name=%s image=%s template.dockerArgs=%s", ep.name, ep.image, ep.template.dockerArgs if ep.template else "N/A")
     yield eps
 
-    # Delete provisioned endpoints via GraphQL API directly.
-    # flash undeploy relies on .runpod/resources.pkl which doesn't exist in CI.
-    api_key = os.environ.get("RUNPOD_API_KEY", "")
-    endpoint_names = [ep.name for ep in eps.values()]
-    log.info("Cleaning up %d provisioned endpoints: %s", len(endpoint_names), endpoint_names)
-    if api_key and endpoint_names:
-        try:
-            _delete_endpoints_by_name(api_key, endpoint_names)
-        except Exception:
-            log.exception("Endpoint cleanup failed (API key may lack GraphQL access)")
+    log.info("Cleaning up all provisioned endpoints")
+    try:
+        result = subprocess.run(
+            ["flash", "undeploy", "--all", "--force"],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        if result.returncode == 0:
+            log.info("Undeployed all endpoints")
+        else:
+            log.warning("flash undeploy --all --force failed (rc=%d): %s", result.returncode, result.stderr)
+    except Exception:
+        log.exception("Failed to undeploy endpoints")