From fa90567e996d23cbfc5433f162d78edc266ad4bc Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Thu, 28 May 2026 11:27:30 +0000 Subject: [PATCH] chore(skills): remove only the most-drifted vendored skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Narrowed in response to @andrelandgraf's feedback on #103. devhub's vendored skills aren't customer-facing — they're for internal devhub development — and most of them are close enough to their upstream sources to keep. Only the most-divergent forks are removed here. Removed (4): - databricks-apps — missing State Storage Rule, Data Access Decision Gate, Genie / Model Serving routing rows, AppKit version / TypeScript cast guidance, smoke test selector rules (≈29% line drift but structurally far behind upstream). - databricks-jobs — 189 lines here vs 409 upstream (54% drift). - databricks-lakebase — 182 lines here vs 315 upstream (42% drift). - databricks — legacy combined CLI / data exploration / asset-bundles skill; upstream has split this into databricks-core + databricks-dabs, so the vendored copy can't be cleanly synced. Kept (11): databricks-core, databricks-pipelines, agent-browser, building-components, deploy-to-vercel, frontend-design, mcp-builder, seo-audit, shadcn, vercel-cli, vercel-composition-patterns. These are close enough to upstream that the maintenance cost of keeping them in tree is low, and shipping them vendored means a fresh clone has the workflow skills ready without extra install steps. The agent-confusion concern (agents misreading vendored skills as something devhub exposes to customers) is mitigated upstream by vercel-labs/skills#1281. This pull request and its description were written by Claude (james.broadhead@databricks.com). 
--- .agents/skills/databricks-apps/SKILL.md | 158 ----- .../references/appkit/appkit-sdk.md | 112 ---- .../references/appkit/frontend.md | 174 ------ .../references/appkit/lakebase.md | 215 ------- .../references/appkit/overview.md | 144 ----- .../references/appkit/proto-contracts.md | 205 ------- .../references/appkit/proto-first.md | 312 ---------- .../references/appkit/sql-queries.md | 272 --------- .../databricks-apps/references/appkit/trpc.md | 148 ----- .../references/other-frameworks.md | 282 --------- .../references/platform-guide.md | 173 ------ .../databricks-apps/references/testing.md | 111 ---- .agents/skills/databricks-jobs/SKILL.md | 189 ------ .agents/skills/databricks-lakebase/SKILL.md | 182 ------ .agents/skills/databricks/SKILL.md | 142 ----- .agents/skills/databricks/asset-bundles.md | 509 ---------------- .agents/skills/databricks/data-exploration.md | 347 ----------- .../skills/databricks/databricks-cli-auth.md | 557 ------------------ .../databricks/databricks-cli-install.md | 212 ------- .claude/skills/databricks | 1 - .claude/skills/databricks-apps | 1 - .claude/skills/databricks-jobs | 1 - .claude/skills/databricks-lakebase | 1 - AGENTS.md | 13 + skills-lock.json | 20 - 25 files changed, 13 insertions(+), 4468 deletions(-) delete mode 100644 .agents/skills/databricks-apps/SKILL.md delete mode 100644 .agents/skills/databricks-apps/references/appkit/appkit-sdk.md delete mode 100644 .agents/skills/databricks-apps/references/appkit/frontend.md delete mode 100644 .agents/skills/databricks-apps/references/appkit/lakebase.md delete mode 100644 .agents/skills/databricks-apps/references/appkit/overview.md delete mode 100644 .agents/skills/databricks-apps/references/appkit/proto-contracts.md delete mode 100644 .agents/skills/databricks-apps/references/appkit/proto-first.md delete mode 100644 .agents/skills/databricks-apps/references/appkit/sql-queries.md delete mode 100644 .agents/skills/databricks-apps/references/appkit/trpc.md delete mode 100644 
.agents/skills/databricks-apps/references/other-frameworks.md delete mode 100644 .agents/skills/databricks-apps/references/platform-guide.md delete mode 100644 .agents/skills/databricks-apps/references/testing.md delete mode 100644 .agents/skills/databricks-jobs/SKILL.md delete mode 100644 .agents/skills/databricks-lakebase/SKILL.md delete mode 100644 .agents/skills/databricks/SKILL.md delete mode 100644 .agents/skills/databricks/asset-bundles.md delete mode 100644 .agents/skills/databricks/data-exploration.md delete mode 100644 .agents/skills/databricks/databricks-cli-auth.md delete mode 100644 .agents/skills/databricks/databricks-cli-install.md delete mode 120000 .claude/skills/databricks delete mode 120000 .claude/skills/databricks-apps delete mode 120000 .claude/skills/databricks-jobs delete mode 120000 .claude/skills/databricks-lakebase diff --git a/.agents/skills/databricks-apps/SKILL.md b/.agents/skills/databricks-apps/SKILL.md deleted file mode 100644 index 4a26d97..0000000 --- a/.agents/skills/databricks-apps/SKILL.md +++ /dev/null @@ -1,158 +0,0 @@ ---- -name: databricks-apps -description: Build apps on Databricks Apps platform. Use when asked to create dashboards, data apps, analytics tools, or visualizations. Invoke BEFORE starting implementation. -compatibility: Requires databricks CLI (>= v0.294.0) -metadata: - version: "0.1.1" -parent: databricks-core ---- - -# Databricks Apps Development - -**FIRST**: Use the parent `databricks-core` skill for CLI basics, authentication, and profile selection. - -Build apps that deploy to Databricks Apps platform. 
- -## Required Reading by Phase - -| Phase | READ BEFORE proceeding | -| ----------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Scaffolding | Parent `databricks-core` skill (auth, warehouse discovery); run `databricks apps manifest` and use its plugins/resources to build `databricks apps init` with `--features` and `--set` (see AppKit section below) | -| Writing SQL queries | [SQL Queries Guide](references/appkit/sql-queries.md) | -| Writing UI components | [Frontend Guide](references/appkit/frontend.md) | -| Using `useAnalyticsQuery` | [AppKit SDK](references/appkit/appkit-sdk.md) | -| Adding API endpoints | [tRPC Guide](references/appkit/trpc.md) | -| Using Lakebase (OLTP database) | [Lakebase Guide](references/appkit/lakebase.md) | -| Typed data contracts (proto-first design) | [Proto-First Guide](references/appkit/proto-first.md) and [Plugin Contracts](references/appkit/proto-contracts.md) | -| Platform rules (permissions, deployment, limits) | [Platform Guide](references/platform-guide.md) — READ for ALL apps including AppKit | -| Non-AppKit app (Streamlit, FastAPI, Flask, Gradio, Next.js, etc.) | [Other Frameworks](references/other-frameworks.md) | - -## Generic Guidelines - -- **App name**: ≤26 characters, lowercase letters/numbers/hyphens only (no underscores). dev- prefix adds 4 chars, max 30 total. -- **Validation**: `databricks apps validate --profile ` before deploying. -- **Smoke tests** (AppKit only): ALWAYS update `tests/smoke.spec.ts` selectors BEFORE running validation. Default template checks for "Minimal Databricks App" heading and "hello world" text — these WILL fail in your custom app. See [testing guide](references/testing.md). -- **Authentication**: covered by parent `databricks-core` skill. 
- -## Project Structure (after `databricks apps init --features analytics`) - -- `client/src/App.tsx` — main React component (start here) -- `config/queries/*.sql` — SQL query files (queryKey = filename without .sql) -- `server/server.ts` — backend entry (tRPC routers) -- `tests/smoke.spec.ts` — smoke test (⚠️ MUST UPDATE selectors for your app) -- `client/src/appKitTypes.d.ts` — auto-generated types (`npm run typegen`) - -## Project Structure (after `databricks apps init --features lakebase`) - -- `server/server.ts` — backend with Lakebase pool + tRPC routes -- `client/src/App.tsx` — React frontend -- `app.yaml` — manifest with `database` resource declaration -- `package.json` — includes `@databricks/lakebase` dependency -- Note: **No `config/queries/`** — Lakebase apps use `pool.query()` in tRPC, not SQL files - -## Data Discovery - -Before writing any SQL, use the parent `databricks-core` skill for data exploration — search `information_schema` by keyword, then batch `discover-schema` for the tables you need. Do NOT skip this step. - -## Development Workflow (FOLLOW THIS ORDER) - -**Analytics apps** (`--features analytics`): - -1. Create SQL files in `config/queries/` -2. Run `npm run typegen` — verify all queries show ✓ -3. Read `client/src/appKitTypes.d.ts` to see generated types -4. **THEN** write `App.tsx` using the generated types -5. Update `tests/smoke.spec.ts` selectors -6. Run `databricks apps validate --profile ` - -**DO NOT** write UI code before running typegen — types won't exist and you'll waste time on compilation errors. - -**Lakebase apps** (`--features lakebase`): No SQL files or typegen. See [Lakebase Guide](references/appkit/lakebase.md) for the tRPC pattern: initialize schema at startup, write procedures in `server/server.ts`, then build the React frontend. 
- -## When to Use What - -- **Read analytics data → display in chart/table**: Use visualization components with `queryKey` prop -- **Read analytics data → custom display (KPIs, cards)**: Use `useAnalyticsQuery` hook -- **Read analytics data → need computation before display**: Still use `useAnalyticsQuery`, transform client-side -- **Read/write persistent data (users, orders, CRUD state)**: Use Lakebase pool via tRPC — see [Lakebase Guide](references/appkit/lakebase.md) -- **Call ML model endpoint**: Use tRPC -- **⚠️ NEVER use tRPC to run SELECT queries against the warehouse** — always use SQL files in `config/queries/` -- **⚠️ NEVER use `useAnalyticsQuery` for Lakebase data** — it queries the SQL warehouse only - -## Frameworks - -### AppKit (Recommended) - -TypeScript/React framework with type-safe SQL queries and built-in components. - -**Official Documentation** — the source of truth for all API details: - -```bash -npx @databricks/appkit docs # ← ALWAYS start here to see available pages -npx @databricks/appkit docs # view a section by name or doc path -npx @databricks/appkit docs --full # full index with all API entries -npx @databricks/appkit docs "appkit-ui API reference" # example: section by name -npx @databricks/appkit docs ./docs/plugins/analytics.md # example: specific doc file -``` - -**DO NOT guess doc paths.** Run without args first, pick from the index. The `` argument accepts both section names (from the index) and file paths. Docs are the authority on component props, hook signatures, and server APIs — skill files only cover anti-patterns and gotchas. - -**App Manifest and Scaffolding** - -**Agent workflow for scaffolding: get the manifest first, then build the init command.** - -1. 
**Get the manifest** (JSON schema describing plugins and their resources): - - ```bash - databricks apps manifest --profile - # See plugins available in a specific AppKit version: - databricks apps manifest --version --profile - # Custom template: - databricks apps manifest --template --profile - ``` - - The output defines: - - **Plugins**: each has a key (plugin ID for `--features`), plus `requiredByTemplate`, and `resources`. - - **requiredByTemplate**: If **true**, that plugin is **mandatory** for this template — do **not** add it to `--features` (it is included automatically); you must still supply all of its required resources via `--set`. If **false** or absent, the plugin is **optional** — add it to `--features` only when the user's prompt indicates they want that capability (e.g. analytics/SQL), and then supply its required resources via `--set`. - - **Resources**: Each plugin has `resources.required` and `resources.optional` (arrays). Each item has `resourceKey` and `fields` (object: field name → description/env). Use `--set ..=` for each required resource field of every plugin you include. - -2. **Scaffold** (DO NOT use `npx`; use the CLI only): - ```bash - databricks apps init --name --features , \ - --set ..= \ - --set ..= \ - --description "" --run none --profile - # --run none: skip auto-run after scaffolding (review code first) - # With custom template: - databricks apps init --template --name --features ... --set ... --profile - ``` - Optionally use `--version ` to target a specific AppKit version. - - **Required**: `--name`, `--profile`. Name: ≤26 chars, lowercase letters/numbers/hyphens only. Use `--features` only for **optional** plugins the user wants (plugins with `requiredByTemplate: false` or absent); mandatory plugins must not be listed in `--features`. 
- - **Resources**: Pass `--set` for every required resource (each field in `resources.required`) for (1) all plugins with `requiredByTemplate: true`, and (2) any optional plugins you added to `--features`. Add `--set` for `resources.optional` only when the user requests them. - - **Discovery**: Use the parent `databricks-core` skill to resolve IDs (e.g. warehouse: `databricks warehouses list --profile ` or `databricks experimental aitools tools get-default-warehouse --profile `). - -**DO NOT guess** plugin names, resource keys, or property names — always derive them from `databricks apps manifest` output. Example: if the manifest shows plugin `analytics` with a required resource `resourceKey: "sql-warehouse"` and `fields: { "id": ... }`, include `--set analytics.sql-warehouse.id=`. - -**READ [AppKit Overview](references/appkit/overview.md)** for project structure, workflow, and pre-implementation checklist. - -### Common Scaffolding Mistakes - -```bash -# ❌ WRONG: name is NOT a positional argument -databricks apps init --features analytics my-app-name -# → "unknown command" error - -# ✅ CORRECT: use --name flag -databricks apps init --name my-app-name --features analytics --set "..." --profile -``` - -### Directory Naming - -`databricks apps init` creates directories in kebab-case matching the app name. -App names must be lowercase with hyphens only (≤26 chars). - -### Other Frameworks (Streamlit, FastAPI, Flask, Gradio, Dash, Next.js, etc.) - -Databricks Apps supports any framework that runs as an HTTP server. LLMs already know these frameworks — the challenge is Databricks platform integration. - -**READ [Other Frameworks Guide](references/other-frameworks.md) BEFORE building any non-AppKit app.** It covers port/host configuration, `app.yaml` and `databricks.yml` setup, dependency management, networking, and framework-specific gotchas. 
diff --git a/.agents/skills/databricks-apps/references/appkit/appkit-sdk.md b/.agents/skills/databricks-apps/references/appkit/appkit-sdk.md deleted file mode 100644 index 368a444..0000000 --- a/.agents/skills/databricks-apps/references/appkit/appkit-sdk.md +++ /dev/null @@ -1,112 +0,0 @@ -# Databricks App Kit SDK - -## TypeScript Import Rules - -This template uses strict TypeScript settings with `verbatimModuleSyntax: true`. **Always use `import type` for type-only imports**. - -Template enforces `noUnusedLocals` - remove unused imports immediately or build fails. - -```typescript -// ✅ CORRECT - use import type for types -import type { MyInterface, MyType } from "./types"; - -// ❌ WRONG - will fail compilation -import { MyInterface, MyType } from "./types"; -``` - -## Server Setup - -For server configuration, see: `npx @databricks/appkit docs ./docs/plugins.md` - -## useAnalyticsQuery Hook - -**ONLY use when displaying data in a custom way that isn't a chart or table.** For charts/tables, pass `queryKey` directly to the component — don't double-fetch. Charts also accept a `format` option (`"json"` | `"arrow"` | `"auto"`, default `"auto"`) to control the data transfer format. 
- -Use cases: - -- Custom HTML layouts (cards, lists, grids) -- Summary statistics and KPIs -- Conditional rendering based on data values -- Data that needs transformation before display - -### ⚠️ Memoize Parameters to Prevent Infinite Loops - -```typescript -// ❌ WRONG - creates new object every render → infinite refetch loop -const { data } = useAnalyticsQuery("query", { id: sql.string(selectedId) }); - -// ✅ CORRECT - memoize parameters -const params = useMemo(() => ({ id: sql.string(selectedId) }), [selectedId]); -const { data } = useAnalyticsQuery("query", params); -``` - -### Conditional Queries - -```typescript -// ❌ WRONG - `enabled` is NOT a valid option (this is a React Query pattern) -const { data } = useAnalyticsQuery('query', params, { enabled: !!selectedId }); - -// ✅ CORRECT - use autoStart: false -const { data } = useAnalyticsQuery('query', params, { autoStart: false }); - -// ✅ ALSO CORRECT - conditional rendering (component only mounts when data exists) -{selectedId && } -``` - -### Type Inference - -When `appKitTypes.d.ts` has been generated (via `npm run typegen`), types are inferred automatically: - -```typescript -// ✅ After typegen - types are automatic, no generic needed -const { data } = useAnalyticsQuery("my_query", params); - -// ⚠️ Before typegen - data is `unknown`, you must provide type manually -const { data } = useAnalyticsQuery("my_query", params); -``` - -**Common mistake** — don't define interfaces that duplicate generated types: - -```typescript -// ❌ WRONG - manual interface may conflict with generated QueryRegistry -interface MyData { - id: string; - value: number; -} -const { data } = useAnalyticsQuery("my_query", params); - -// ✅ CORRECT - run `npm run typegen` and let it provide types -const { data } = useAnalyticsQuery("my_query", params); -``` - -### Basic Usage - -```typescript -import { useAnalyticsQuery, Skeleton } from '@databricks/appkit-ui/react'; -import { sql } from '@databricks/appkit-ui/js'; -import { useMemo } 
from 'react'; - -function CustomDisplay() { - const params = useMemo(() => ({ - start_date: sql.date('2024-01-01'), - category: sql.string("tools") - }), []); - - const { data, loading, error } = useAnalyticsQuery('query_name', params); - - if (loading) return ; - if (error) return
Error: {error}
; - if (!data) return null; - - return ( -
- {data.map(row => ( -
-

{row.column_name}

-

{Number(row.value).toFixed(2)}

-
- ))} -
- ); -} -``` diff --git a/.agents/skills/databricks-apps/references/appkit/frontend.md b/.agents/skills/databricks-apps/references/appkit/frontend.md deleted file mode 100644 index 3359a65..0000000 --- a/.agents/skills/databricks-apps/references/appkit/frontend.md +++ /dev/null @@ -1,174 +0,0 @@ -# Frontend Guidelines - -**For full component API**: run `npx @databricks/appkit docs` and navigate to the component you need. - -## Common Anti-Patterns - -These mistakes appear frequently — check the official docs for actual prop names: - -| Mistake | Why it's wrong | What to do | -| ---------------------------------------------------------- | --------------------------------------------------------------------------------- | ------------------------------------------------------------- | -| `xAxisKey`, `dataKey` on charts | Recharts naming, not AppKit | Use `xKey`, `yKey` (auto-detected from schema if omitted) | -| `yAxisKeys`, `yKeys` on charts | Recharts naming | Use `yKey` (string or string[]) | -| `config` on charts | Not a valid prop name | Use `options` for ECharts overrides | -| ``, `` children | AppKit charts are ECharts-based, NOT Recharts wrappers — configure via props only | | -| `columns` on DataTable | DataTable auto-generates columns from data | Use `queryKey` + `parameters`; use `transform` for formatting | -| Double-fetching with `useAnalyticsQuery` + chart component | Components handle their own fetching | Just pass `queryKey` to the component | - -**Always verify props against docs before using a component.** - -## Chart Data Modes - -All chart/data components support two modes: - -- **Query mode**: pass `queryKey` + `parameters` — component fetches data automatically. `parameters` is REQUIRED even if empty (`parameters={{}}`). -- **Data mode**: pass static data via `data` prop (JSON array or Arrow Table) — no `queryKey`/`parameters` needed. 
- -```tsx -// Query mode (recommended for Databricks SQL) - - -// Data mode (static/pre-fetched data) - -``` - -## Chart Props Quick Reference - -All charts accept these core props (verify full list via `npx @databricks/appkit docs`): - -```tsx - d} // transform raw data before rendering - colors={['#40d1f5']} // custom colors (overrides colorPalette) - colorPalette="categorical" // "categorical" | "sequential" | "diverging" - title="Sales by Region" // chart title - showLegend // show legend - options={{}} // additional ECharts options to merge - height={400} // default: 300 - orientation="vertical" // "vertical" | "horizontal" (BarChart/LineChart/AreaChart) - stacked // stack bars/areas (BarChart/AreaChart) -/> - - -``` - -Charts are **ECharts-based** — configure via props, not Recharts-style children. Components handle data fetching, loading, and error states internally. - -> ⚠️ **`parameters` is REQUIRED on all data components**, even when the query has no params. Always include `parameters={{}}`. - -```typescript -// ❌ Don't double-fetch -const { data } = useAnalyticsQuery('sales_data', {}); -return ; // fetches again! -``` - -## DataTable - -DataTable auto-generates columns from data and handles fetching, loading, error, and empty states. - -**For full props**: `npx @databricks/appkit docs "DataTable"`. - -```tsx -// ❌ WRONG - missing required `parameters` prop - - -// ✅ CORRECT - minimal - - -// ✅ CORRECT - with filtering and pagination - - -// ✅ CORRECT - with row selection - console.log(selection)} -/> -``` - -**Custom column formatting** — use the `transform` prop or format in SQL: - -```typescript - data.map(row => ({ - ...row, - price: `$${Number(row.price).toFixed(2)}`, - }))} -/> -``` - -## Available Components (Quick Reference) - -**For full prop details**: `npx @databricks/appkit docs "appkit-ui API reference"`. - -All data components support both query mode (`queryKey` + `parameters`) and data mode (static `data` prop). 
Common props across all charts: `format`, `transformer`, `colors`, `colorPalette`, `title`, `showLegend`, `height`, `options`, `ariaLabel`, `testId`. - -### Data Components (`@databricks/appkit-ui/react`) - -| Component | Extra Props | Use For | -| -------------- | ---------------------------------------------------------------------------------------------- | ----------------------------- | -| `BarChart` | `xKey`, `yKey`, `orientation`, `stacked` | Categorical comparisons | -| `LineChart` | `xKey`, `yKey`, `smooth`, `showSymbol`, `orientation` | Time series, trends | -| `AreaChart` | `xKey`, `yKey`, `smooth`, `showSymbol`, `stacked`, `orientation` | Cumulative/stacked trends | -| `PieChart` | `xKey`, `yKey`, `innerRadius`, `showLabels`, `labelPosition` | Part-of-whole | -| `DonutChart` | `xKey`, `yKey`, `innerRadius`, `showLabels`, `labelPosition` | Donut (pie with inner radius) | -| `ScatterChart` | `xKey`, `yKey`, `symbolSize` | Correlation, distribution | -| `HeatmapChart` | `xKey`, `yKey`, `yAxisKey`, `min`, `max`, `showLabels` | Matrix-style data | -| `RadarChart` | `xKey`, `yKey`, `showArea` | Multi-dimensional comparison | -| `DataTable` | `filterColumn`, `filterPlaceholder`, `transform`, `pageSize`, `enableRowSelection`, `children` | Tabular data display | - -### UI Components (`@databricks/appkit-ui/react`) - -| Component | Common Props | -| -------------------------------------------------------- | ----------------------------------------------------------------- | -| `Card`, `CardHeader`, `CardTitle`, `CardContent` | Standard container | -| `Badge` | `variant`: "default" \| "secondary" \| "destructive" \| "outline" | -| `Button` | `variant`, `size`, `onClick` | -| `Input` | `placeholder`, `value`, `onChange` | -| `Select`, `SelectTrigger`, `SelectContent`, `SelectItem` | Dropdown; `SelectItem` value cannot be "" | -| `Skeleton` | `className` — use for loading states | -| `Separator` | Visual divider | -| `Tabs`, `TabsList`, `TabsTrigger`, `TabsContent` 
| Tabbed interface | - -All data components **require `parameters={{}}`** even when the query has no params. - -## Layout Structure - -```tsx -
-

Page Title

-
{/* form inputs */}
-
{/* list items */}
-
-``` - -## Component Organization - -- Shared UI components: `@databricks/appkit-ui/react` -- Feature components: `client/src/components/FeatureName.tsx` -- Split components when logic exceeds ~100 lines or component is reused - -## Gotchas - -- `SelectItem` cannot have `value=""`. Use sentinel value like `"all"` for "show all" options. -- Use `` components instead of plain "Loading..." text -- Handle nullable fields: `value={field || ''}` for inputs -- For maps with React 19, use react-leaflet v5: `npm install react-leaflet@^5.0.0 leaflet @types/leaflet` - -Databricks brand colors: `['#40d1f5', '#4462c9', '#EB1600', '#0B2026', '#4A4A4A', '#353a4a']` diff --git a/.agents/skills/databricks-apps/references/appkit/lakebase.md b/.agents/skills/databricks-apps/references/appkit/lakebase.md deleted file mode 100644 index e485998..0000000 --- a/.agents/skills/databricks-apps/references/appkit/lakebase.md +++ /dev/null @@ -1,215 +0,0 @@ -# Lakebase: OLTP Database for Apps - -Use Lakebase when your app needs **persistent read/write storage** — forms, CRUD operations, user-generated data. For analytics dashboards reading from a SQL warehouse, use `config/queries/` instead. - -## When to Use Lakebase vs Analytics - -| Pattern | Use Case | Data Source | -| --------- | ------------------------------------------- | --------------------------------- | -| Analytics | Read-only dashboards, charts, KPIs | Databricks SQL Warehouse | -| Lakebase | CRUD operations, persistent state, forms | PostgreSQL (Lakebase Autoscaling) | -| Both | Dashboard with user preferences/saved state | Warehouse + Lakebase | - -## Scaffolding - -**ALWAYS scaffold with the correct feature flags** — do not add Lakebase manually to an analytics-only scaffold. 
- -**Lakebase only** (no analytics SQL warehouse): - -```bash -databricks apps init --name --features lakebase \ - --set "lakebase.postgres.branch=" \ - --set "lakebase.postgres.database=" \ - --run none --profile -``` - -**Both Lakebase and analytics**: - -```bash -databricks apps init --name --features analytics,lakebase \ - --set "analytics.sql-warehouse.id=" \ - --set "lakebase.postgres.branch=" \ - --set "lakebase.postgres.database=" \ - --run none --profile -``` - -Where `` and `` are full resource names (e.g. `projects//branches/` and `projects//branches//databases/`). - -Use the `databricks-lakebase` skill to create a Lakebase project and discover branch/database resource names before running this command. - -**Get resource names** (if you have an existing project): - -```bash -# List branches → use the name field of a READY branch -databricks postgres list-branches projects/ --profile -# List databases → use the name field -databricks postgres list-databases projects//branches/ --profile -``` - -## Project Structure (after `databricks apps init --features lakebase`) - -``` -my-app/ -├── server/ -│ └── server.ts # Backend with Lakebase pool + tRPC routes -├── client/ -│ └── src/ -│ └── App.tsx # React frontend -├── app.yaml # Manifest with database resource declaration -└── package.json # Includes @databricks/lakebase dependency -``` - -Note: **No `config/queries/` directory** — Lakebase apps use server-side `pool.query()` calls, not SQL files. 
- -## `createLakebasePool` API - -```typescript -import { createLakebasePool } from "@databricks/lakebase"; -// or: import { createLakebasePool } from "@databricks/appkit"; - -const pool = createLakebasePool({ - // All fields optional — auto-populated from env vars when deployed - host: process.env.PGHOST, // Lakebase hostname - database: process.env.PGDATABASE, // Database name - endpoint: process.env.LAKEBASE_ENDPOINT, // Endpoint resource path - user: process.env.PGUSER, // Service principal client ID - max: 10, // Connection pool size - idleTimeoutMillis: 30000, - connectionTimeoutMillis: 10000, -}); -``` - -Call `createLakebasePool()` **once at module level** (server startup), not inside request handlers. - -## Environment Variables (auto-set when deployed with database resource) - -| Variable | Description | -| ------------------- | --------------------------- | -| `PGHOST` | Lakebase hostname | -| `PGPORT` | Port (default 5432) | -| `PGDATABASE` | Database name | -| `PGUSER` | Service principal client ID | -| `PGSSLMODE` | SSL mode (`require`) | -| `LAKEBASE_ENDPOINT` | Endpoint resource path | - -## tRPC CRUD Pattern - -Always use tRPC for Lakebase operations — do NOT call `pool.query()` from the client. 
- -```typescript -// server/server.ts -import { initTRPC } from "@trpc/server"; -import { createLakebasePool } from "@databricks/lakebase"; -import { z } from "zod"; -import superjson from "superjson"; // requires: npm install superjson - -const pool = createLakebasePool(); // reads env vars automatically - -const t = initTRPC.create({ transformer: superjson }); -const publicProcedure = t.procedure; - -export const appRouter = t.router({ - listItems: publicProcedure.query(async () => { - const { rows } = await pool.query( - "SELECT * FROM app_data.items ORDER BY created_at DESC LIMIT 100", - ); - return rows; - }), - - createItem: publicProcedure - .input(z.object({ name: z.string().min(1) })) - .mutation(async ({ input }) => { - const { rows } = await pool.query( - "INSERT INTO app_data.items (name) VALUES ($1) RETURNING *", - [input.name], - ); - return rows[0]; - }), - - deleteItem: publicProcedure - .input(z.object({ id: z.number() })) - .mutation(async ({ input }) => { - await pool.query("DELETE FROM app_data.items WHERE id = $1", [input.id]); - return { success: true }; - }), -}); -``` - -## Schema Initialization - -**Always create a custom schema** — the Service Principal has `CONNECT_AND_CREATE` permission but **cannot access the `public` schema**. 
Initialize tables on server startup: - -```typescript -// server/server.ts — run once at startup before handling requests -await pool.query(` - CREATE SCHEMA IF NOT EXISTS app_data; - CREATE TABLE IF NOT EXISTS app_data.items ( - id SERIAL PRIMARY KEY, - name TEXT NOT NULL, - created_at TIMESTAMPTZ DEFAULT NOW() - ); -`); -``` - -## ORM Integration (Optional) - -The pool returned by `createLakebasePool()` is a standard `pg.Pool` — works with any PostgreSQL library: - -```typescript -// Drizzle ORM -import { drizzle } from "drizzle-orm/node-postgres"; -const db = drizzle(pool); - -// Prisma (with @prisma/adapter-pg) -import { PrismaPg } from "@prisma/adapter-pg"; -const adapter = new PrismaPg(pool); -const prisma = new PrismaClient({ adapter }); -``` - -## Key Differences from Analytics Pattern - -| | Analytics | Lakebase | -| -------------- | ------------------------------------------- | ------------------------------------------ | -| SQL dialect | Databricks SQL (Spark SQL) | Standard PostgreSQL | -| Query location | `config/queries/*.sql` files | `pool.query()` in tRPC routes | -| Data retrieval | `useAnalyticsQuery` hook | tRPC query procedure | -| Date functions | `CURRENT_TIMESTAMP()`, `DATEDIFF(DAY, ...)` | `NOW()`, `AGE(...)` | -| Auto-increment | N/A | `SERIAL` or `GENERATED ALWAYS AS IDENTITY` | -| Insert pattern | N/A | `INSERT ... VALUES ($1) RETURNING *` | -| Params | Named (`:param`) | Positional (`$1, $2, ...`) | - -**NEVER use `useAnalyticsQuery` for Lakebase data** — it queries the SQL warehouse, not Lakebase. -**NEVER put Lakebase SQL in `config/queries/`** — those files are only for warehouse queries. - -## Local Development - -The Lakebase env vars (`PGHOST`, `PGDATABASE`, etc.) are auto-set only when deployed. 
For local development, get the connection details from your endpoint and set them manually: - -```bash -# Get endpoint connection details -databricks postgres get-endpoint \ - projects//branches//endpoints/ \ - --profile -``` - -Then create `server/.env` with the values from the endpoint response: - -``` -PGHOST= -PGPORT=5432 -PGDATABASE= -PGUSER= -PGSSLMODE=require -LAKEBASE_ENDPOINT=projects//branches//endpoints/ -``` - -Load `server/.env` in your dev server (e.g. via `dotenv` or `node --env-file=server/.env`). Never commit `.env` files — add `server/.env` to `.gitignore`. - -## Troubleshooting - -| Error | Cause | Solution | -| ------------------------------------- | ------------------------------------------ | ------------------------------------------------------------ | -| `permission denied for schema public` | Service Principal lacks access to `public` | Create custom schema: `CREATE SCHEMA IF NOT EXISTS app_data` | -| `connection refused` | Pool not connected or wrong env vars | Check `PGHOST`, `PGPORT`, `LAKEBASE_ENDPOINT` are set | -| `relation "X" does not exist` | Tables not initialized | Run `CREATE TABLE IF NOT EXISTS` at startup | -| App builds but pool fails at runtime | Env vars not set locally | Set vars in `server/.env` — see Local Development above | diff --git a/.agents/skills/databricks-apps/references/appkit/overview.md b/.agents/skills/databricks-apps/references/appkit/overview.md deleted file mode 100644 index e3c9a1a..0000000 --- a/.agents/skills/databricks-apps/references/appkit/overview.md +++ /dev/null @@ -1,144 +0,0 @@ -# AppKit Overview - -AppKit is the recommended way to build Databricks Apps - provides type-safe SQL queries, React components, and seamless deployment. 
- -## Choose Your Data Pattern FIRST - -Before scaffolding, decide which data pattern the app needs: - -| Pattern | When to use | Init command | -| -------------------------------- | --------------------------------------- | --------------------------------------------------------------------------------------------------- | -| **Analytics** (read-only) | Dashboards, charts, KPIs from warehouse | `--features analytics --set analytics.sql-warehouse.id=` | -| **Lakebase (OLTP)** (read/write) | CRUD forms, persistent state, user data | `--features lakebase --set lakebase.postgres.branch= --set lakebase.postgres.database=` | -| **Both** | Dashboard + user data or preferences | `--features analytics,lakebase` with all required `--set` flags | - -See [Lakebase Guide](lakebase.md) for full Lakebase scaffolding and app-code patterns. - -## Workflow - -1. **Scaffold**: Run `databricks apps manifest`, then `databricks apps init` with `--features` and `--set` as in parent SKILL.md (App Manifest and Scaffolding) -2. **Develop**: `cd && npm install && npm run dev` -3. **Validate**: `databricks apps validate` -4. **Deploy**: `databricks apps deploy --profile ` (⚠️ USER CONSENT REQUIRED) - -## Data Discovery (Before Writing SQL) - -**Use the parent `databricks-core` skill for data discovery** (table search, schema exploration, query execution). - -## Pre-Implementation Checklist - -Before writing App.tsx, complete these steps: - -1. ✅ Create SQL files in `config/queries/` -2. ✅ Run `npm run typegen` to generate query types -3. ✅ Read `client/src/appKitTypes.d.ts` to see available query result types -4. ✅ Verify component props via `npx @databricks/appkit docs` (check the relevant component page) -5. ✅ Plan smoke test updates (default expects "Minimal Databricks App") - -**DO NOT** write UI code until types are generated and verified. - -## Post-Implementation Checklist - -Before running `databricks apps validate`: - -1. 
✅ Update `tests/smoke.spec.ts` heading selector to match your app title -2. ✅ Update or remove the 'hello world' text assertion -3. ✅ Verify `npm run typegen` has been run after all SQL files are finalized -4. ✅ Ensure all numeric SQL values use `Number()` conversion in display code - -## Project Structure - -``` -my-app/ -├── server/ -│ ├── server.ts # Backend entry point (AppKit) -│ └── .env # Optional local dev env vars (do not commit) -├── client/ -│ ├── index.html -│ ├── vite.config.ts -│ └── src/ -│ ├── main.tsx -│ └── App.tsx # <- Main app component (start here) -├── config/ -│ └── queries/ -│ └── my_query.sql # -> queryKey: "my_query" -├── app.yaml # Deployment config -├── package.json -└── tsconfig.json -``` - -**Key files to modify:** -| Task | File | -|------|------| -| Build UI | `client/src/App.tsx` | -| Add SQL query | `config/queries/.sql` | -| Add API endpoint | `server/server.ts` (tRPC) | -| Add shared helpers (optional) | create `shared/types.ts` or `client/src/lib/formatters.ts` | -| Fix smoke test | `tests/smoke.spec.ts` | - -## Type Safety - -For type generation details, see: `npx @databricks/appkit docs ./docs/development/type-generation.md` - -**Quick workflow:** - -1. Add/modify SQL in `config/queries/` -2. Types auto-generate during dev via the Vite plugin (or run `npm run typegen` manually) -3. Types appear in `client/src/appKitTypes.d.ts` - -## Adding Visualizations - -**Step 1**: Create SQL file `config/queries/my_data.sql` - -```sql -SELECT category, COUNT(*) as count FROM my_table GROUP BY category -``` - -**Step 2**: Use component (types auto-generated!) 
- -```typescript -import { BarChart } from '@databricks/appkit-ui/react'; -// Query mode: fetches data automatically - - -// Data mode: pass static data directly (no queryKey/parameters needed) - -``` - -## AppKit Official Documentation - -**Always use AppKit docs as the source of truth for API details.** - -```bash -npx @databricks/appkit docs # show the docs index (start here) -npx @databricks/appkit docs # look up a section by name or doc path -``` - -Do not guess paths — run without args first, then pick from the index. - -## References - -| When you're about to... | Read | -| ---------------------------------------- | ------------------------------------------------------------------------- | -| Write SQL files | [SQL Queries](sql-queries.md) — parameterization, dialect, sql.\* helpers | -| Use `useAnalyticsQuery` | [AppKit SDK](appkit-sdk.md) — memoization, conditional queries | -| Add chart/table components | [Frontend](frontend.md) — component quick reference, anti-patterns | -| Add API mutation endpoints | [tRPC](trpc.md) — only if you need server-side logic | -| Use Lakebase for CRUD / persistent state | [Lakebase](lakebase.md) — createLakebasePool, tRPC patterns, schema init | - -## Critical Rules - -1. **SQL for data retrieval**: Use `config/queries/` + visualization components. Never tRPC for SELECT. -2. **Numeric types**: SQL numbers may return as strings. Always convert: `Number(row.amount)` -3. **Type imports**: Use `import type { ... }` (verbatimModuleSyntax enabled). -4. **Charts are ECharts**: No Recharts children — use props (`xKey`, `yKey`, `colors`). `xKey`/`yKey` auto-detect from schema if omitted. -5. **Two data modes**: Charts/tables support query mode (`queryKey` + `parameters`) and data mode (static `data` prop). -6. **Conditional queries**: Use `autoStart: false` option or conditional rendering to control query execution. 
- -## Decision Tree - -- **Display data from SQL?** - - Chart/Table → `BarChart`, `LineChart`, `DataTable` components - - Custom layout (KPIs, cards) → `useAnalyticsQuery` hook -- **Call Databricks API?** → tRPC (serving endpoints, MLflow, Jobs) -- **Modify data?** → tRPC mutations diff --git a/.agents/skills/databricks-apps/references/appkit/proto-contracts.md b/.agents/skills/databricks-apps/references/appkit/proto-contracts.md deleted file mode 100644 index dc0a1e4..0000000 --- a/.agents/skills/databricks-apps/references/appkit/proto-contracts.md +++ /dev/null @@ -1,205 +0,0 @@ -# Plugin Contract Reference - -Concrete proto↔plugin mappings for the three core AppKit plugins. - -## Files Plugin Contract - -**Plugin manifest**: `files/manifest.json` -**Resource**: UC Volume with `WRITE_VOLUME` permission -**Env**: `DATABRICKS_VOLUME_FILES` for volume path - -### Boundary: What the files plugin owns - -The files plugin is the ONLY module that touches UC Volumes. Other modules -interact with files through typed proto messages, never raw paths. 
- -``` -┌─────────────┐ UploadRequest ┌──────────────┐ -│ api module │ ──────────────────→ │ files plugin │ -│ │ ←────────────────── │ │ -│ │ StoredArtifact │ UC Volumes │ -└─────────────┘ └──────────────┘ -``` - -### Proto → Plugin Method Mapping - -| Proto Message | Plugin Method | Direction | -| ---------------- | --------------------------------------- | --------- | -| `UploadRequest` | `files.upload(path, content, opts)` | IN | -| `StoredArtifact` | Return type of upload/getInfo | OUT | -| `VolumeLayout` | `files.config.volumePath` + conventions | CONFIG | - -### Volume Path Convention (from VolumeLayout proto) - -``` -/Volumes/{catalog}/{schema}/{volume}/ -├── uploads/ # User uploads (UploadRequest.destination_path) -├── results/ # Computed outputs (StoredArtifact) -│ └── {run_id}/ -│ ├── output.proto.bin # Binary proto serialization -│ └── output.json # JSON for debugging -└── artifacts/ # Build artifacts, archives - └── {app_name}/ - └── {version}/ -``` - -### Config ↔ Proto Mapping - -| manifest.json field | Proto field | Notes | -| ---------------------------- | -------------------------------- | ---------------------- | -| `config.timeout` (30000) | Not in proto | Plugin-internal config | -| `config.maxUploadSize` (5GB) | `UploadRequest.content` max size | Validation constraint | -| `resources.path` env | `VolumeLayout.root` | Runtime injection | - ---- - -## Lakebase Plugin Contract - -**Plugin manifest**: `lakebase/manifest.json` -**Resource**: Postgres with `CAN_CONNECT_AND_CREATE` permission -**Env**: `PGHOST`, `PGDATABASE`, `PGPORT`, `PGSSLMODE`, `LAKEBASE_ENDPOINT` - -### Boundary: What the lakebase plugin owns - -Lakebase owns ALL structured data. Every table's schema is derived from a proto -message in `database.proto`. No ad-hoc `CREATE TABLE` statements. 
- -``` -┌─────────────┐ RunRecord ┌──────────────┐ -│ compute mod │ ──────────────────→ │ lakebase │ -│ │ │ plugin │ -│ │ MetricRecord │ │ -│ │ ──────────────────→ │ Postgres │ -└─────────────┘ └──────┬───────┘ - │ -┌─────────────┐ SQL query │ -│ analytics │ ←──────────────────────────┘ -│ module │ RunRecord[] -└─────────────┘ -``` - -### Proto → Table Mapping - -| Proto Message | Table Name | Primary Key | Notes | -| -------------- | ---------- | -------------------- | ----------------- | -| `RunRecord` | `runs` | `(run_id, app_name)` | One row per run | -| `MetricRecord` | `metrics` | auto-increment | FK to runs.run_id | -| `ConfigRecord` | `configs` | `config_id` | Versioned configs | - -### Proto → DDL Type Mapping - -| Proto Type | SQL Type | Column Default | -| -------------- | ------------------ | ---------------- | -| `string` | `TEXT` | `''` | -| `bool` | `BOOLEAN` | `false` | -| `int32` | `INTEGER` | `0` | -| `int64` | `BIGINT` | `0` | -| `double` | `DOUBLE PRECISION` | `0.0` | -| `bytes` | `BYTEA` | `NULL` | -| `Timestamp` | `TIMESTAMPTZ` | `NOW()` | -| `repeated T` | `JSONB` | `'[]'::jsonb` | -| `map` | `JSONB` | `'{}'::jsonb` | -| nested message | `JSONB` | `NULL` | -| `enum` | `TEXT` | First value name | - -### Migration Convention - -``` -migrations/ -├── 001_create_runs.sql -├── 002_create_metrics.sql -├── 003_create_configs.sql -└── 004_add_metrics_index.sql -``` - -Each migration is idempotent (`CREATE TABLE IF NOT EXISTS`, `CREATE INDEX IF NOT EXISTS`). 
- -### Config ↔ Proto Mapping - -| manifest.json field | Proto usage | Notes | -| --------------------------------------- | ------------------ | --------------------- | -| `resources.branch` | Not in proto | Infrastructure config | -| `resources.database` | Not in proto | Infrastructure config | -| `resources.host` (`PGHOST`) | Connection string | Runtime injection | -| `resources.databaseName` (`PGDATABASE`) | Database selection | Runtime injection | - ---- - -## Jobs / Compute Contract - -**No plugin manifest** — Jobs are invoked via `@databricks/sdk-experimental` -**Resource**: Databricks Jobs API -**Auth**: Workspace token or OAuth - -### Boundary: What the jobs module owns - -The jobs module owns compute execution. It receives typed task inputs, runs them -on Databricks clusters, and produces typed task outputs. - -``` -┌─────────────┐ JobConfig ┌──────────────┐ -│ api module │ ──────────────────→ │ jobs module │ -│ │ │ │ -│ │ JobTaskInput │ Databricks │ -│ │ ──────────────────→ │ Jobs API │ -│ │ │ │ -│ │ JobTaskOutput │ Clusters │ -│ │ ←────────────────── │ │ -└─────────────┘ └──────────────┘ -``` - -### Proto → Jobs SDK Mapping - -| Proto Message | SDK Method | Direction | -| --------------- | ------------------------------- | ---------------------- | -| `JobConfig` | `jobs.create(config)` | IN — defines the job | -| `TaskConfig` | Task within a job | IN — defines task deps | -| `JobTaskInput` | Task params (base64 proto) | IN — task receives | -| `JobTaskOutput` | Task output (written to Volume) | OUT — task produces | - -### Task Parameter Convention - -Job tasks receive their typed input via: - -1. **Small payloads (<256KB)**: Base64-encoded proto in task params -2. 
**Large payloads**: Proto binary written to UC Volume, path passed as param - -```typescript -// Producer (api module) -const input: JobTaskInput = { taskId, taskType, runId, inputPayload }; -const encoded = Buffer.from(JobTaskInput.encode(input).finish()).toString( - "base64", -); -// Pass as notebook parameter: { "input": encoded } - -// Consumer (job task code) -const decoded = JobTaskInput.decode(Buffer.from(params.input, "base64")); -``` - -### Task Output Convention - -Job tasks write their typed output to: - -``` -/Volumes/{catalog}/{schema}/{volume}/results/{run_id}/{task_id}.output.bin -``` - -The output is a serialized `JobTaskOutput` proto. The orchestrator reads it -back with the generated decoder. - -### Jobs API Patterns - -```typescript -// Create a multi-task job from JobConfig proto -const jobConfig: JobConfig = { - jobName: `${appName}-${runId}`, - clusterSpec: '{"num_workers": 1}', - maxRetries: 2, - timeoutSeconds: 3600, - tasks: [ - { taskKey: "generate", taskType: "generate", dependsOn: [] }, - { taskKey: "evaluate", taskType: "evaluate", dependsOn: ["generate"] }, - { taskKey: "aggregate", taskType: "aggregate", dependsOn: ["evaluate"] }, - ], -}; -``` diff --git a/.agents/skills/databricks-apps/references/appkit/proto-first.md b/.agents/skills/databricks-apps/references/appkit/proto-first.md deleted file mode 100644 index 6015c1c..0000000 --- a/.agents/skills/databricks-apps/references/appkit/proto-first.md +++ /dev/null @@ -1,312 +0,0 @@ -# Proto-First App Design - -Schema-first approach for AppKit apps using protobuf data contracts. Define contracts BEFORE implementation — derive TypeScript types, Lakebase DDL, and Volume paths from `.proto` files. - -**When to use:** New apps with multiple plugins (files + lakebase + jobs), or adding typed boundaries to existing apps. Skip for quick prototypes. - -**Requires:** `buf` CLI for proto linting and code generation. - -**Rule: No implementation before contracts. 
No contracts without consumers.** - -Define protobuf data contracts FIRST, then derive everything else (TypeScript types, Lakebase DDL, Volume paths, API shapes) from those contracts. - -## When to Use - -| Scenario | Use this skill | -| --------------------------------------------- | ---------------------------------------------------- | -| Creating a new Databricks app | YES — define contracts before `databricks apps init` | -| Adding a new data boundary to an existing app | YES — add proto before implementation | -| Quick prototype / hackathon | NO — skip contracts, move fast | -| Modifying existing typed code | NO — contracts already exist | - -## Core Principle - -``` -User intent → Module map → Proto contracts → Generated types → Implementation - ↓ ↓ - Lakebase DDL TypeScript interfaces - ↓ ↓ - Migrations Plugin code -``` - -The `.proto` file is the single source of truth. If it's not in a proto, it doesn't cross a module boundary. - -## Phase 1: Decompose into Modules - -Every Databricks app decomposes into a combination of these plugin modules: - -| Module | Plugin | Data Boundary | Owns | -| ------------- | --------- | ------------------- | --------------------------------------- | -| **Storage** | files | UC Volumes | Blobs, uploads, artifacts, archives | -| **Database** | lakebase | Postgres tables | Structured records, queries, migrations | -| **Compute** | jobs | Databricks Jobs API | Job runs, task results, cluster configs | -| **Analytics** | analytics | SQL Warehouse | Read-only queries, dashboards | -| **Serving** | server | HTTP/tRPC routes | API endpoints, SSE streams | - -### Decomposition Rules - -1. **Each module owns its data** — files plugin never writes to lakebase, lakebase never writes to volumes. -2. **Cross-module communication is typed** — a proto message, never a raw JSON blob. -3. **Every proto message has exactly one producer module.** -4. **Multiple modules can consume** — but the producer defines the schema. -5. 
**No god messages** — if a message has >12 fields, split it. - -### Output: Module Map - -Before proceeding, produce a module map for the user to confirm: - -``` -App: -Modules: - storage: files plugin → uploads/, results/, artifacts/ - db: lakebase plugin → runs, metrics, configs tables - compute: jobs → generation tasks, eval tasks - api: server plugin → POST /run, GET /status, SSE /stream -``` - -## Phase 2: Define Proto Contracts - -### Directory Structure - -``` -proto/ -├── buf.yaml -├── buf.gen.yaml -└── / - └── v1/ - ├── common.proto # Shared enums, IDs - ├── storage.proto # Files plugin boundary - ├── database.proto # Lakebase plugin boundary - ├── compute.proto # Jobs boundary - └── api.proto # Server/API boundary -``` - -### Proto Style Rules - -- **Package**: `.v1` (versioned from day one) -- **One file per module boundary**, not per message -- **Every field has a consumer** — if no code reads it, delete it -- **snake_case** for all field names -- **proto3** syntax only - -### Files Plugin Boundary (`storage.proto`) - -The files plugin operates on UC Volumes. Type every file path and payload: - -```protobuf -syntax = "proto3"; -package .v1; - -import "google/protobuf/timestamp.proto"; - -// StoredArtifact — produced by files plugin after upload. -message StoredArtifact { - string volume_path = 1; - string content_type = 2; - int64 size_bytes = 3; - google.protobuf.Timestamp created_at = 4; - string checksum_sha256 = 5; -} - -// UploadRequest — sent to files plugin by api module. -message UploadRequest { - string destination_path = 1; - string content_type = 2; - bytes content = 3; - map metadata = 4; -} - -// VolumeLayout — design-time contract for volume directory structure. 
-message VolumeLayout { - string root = 1; // /Volumes/catalog/schema/app_name - string uploads_dir = 2; // uploads/ - string results_dir = 3; // results/ - string artifacts_dir = 4; // artifacts/ -} -``` - -### Lakebase Plugin Boundary (`database.proto`) - -Every Lakebase table has a corresponding proto message. The message IS the schema: - -```protobuf -syntax = "proto3"; -package .v1; - -import "google/protobuf/timestamp.proto"; - -// RunRecord — one row in the `runs` table. -// Producer: compute module. Consumers: api, analytics. -message RunRecord { - string run_id = 1; - string app_name = 2; - RunStatus status = 3; - google.protobuf.Timestamp started_at = 4; - google.protobuf.Timestamp completed_at = 5; - string error_message = 6; - string config_json = 7; -} - -// MetricRecord — one row in the `metrics` table. -// Producer: compute module. Consumers: analytics, api. -message MetricRecord { - string run_id = 1; - string metric_name = 2; - double value = 3; - google.protobuf.Timestamp recorded_at = 4; - map dimensions = 5; -} -``` - -### Jobs Boundary (`compute.proto`) - -Type job task inputs and outputs: - -```protobuf -syntax = "proto3"; -package .v1; - -// JobTaskInput — typed payload sent to a Databricks job task. -// Producer: api module. Consumer: job task code. -message JobTaskInput { - string task_id = 1; - string task_type = 2; - string run_id = 3; - bytes input_payload = 4; - map env = 5; -} - -// JobTaskOutput — typed result from a completed job task. -// Producer: job task code. Consumer: api module. -message JobTaskOutput { - string task_id = 1; - string run_id = 2; - bool success = 3; - string error = 4; - bytes output_payload = 5; - int64 duration_ms = 6; - map metrics = 7; -} -``` - -## Phase 3: Generate Types and DDL - -### 3a. 
Buf configuration - -```yaml -# buf.yaml -version: v2 -lint: - use: - - STANDARD -breaking: - use: - - FILE -``` - -```yaml -# buf.gen.yaml -version: v2 -plugins: - - remote: buf.build/connectrpc/es - out: proto/gen - opt: target=ts -``` - -### 3b. Generate TypeScript types - -```bash -buf lint proto/ -buf generate proto/ -``` - -### 3c. Generate Lakebase DDL - -For each message in `database.proto`, generate a numbered migration file. - -**Proto→SQL type mapping:** - -| Proto Type | SQL Type | Default | -| -------------- | ------------------ | ---------------- | -| `string` | `TEXT` | `''` | -| `bool` | `BOOLEAN` | `false` | -| `int32` | `INTEGER` | `0` | -| `int64` | `BIGINT` | `0` | -| `double` | `DOUBLE PRECISION` | `0.0` | -| `bytes` | `BYTEA` | `NULL` | -| `Timestamp` | `TIMESTAMPTZ` | `NOW()` | -| `repeated T` | `JSONB` | `'[]'::jsonb` | -| `map` | `JSONB` | `'{}'::jsonb` | -| nested message | `JSONB` | `NULL` | -| `enum` | `TEXT` | first value name | - -Example migration: - -```sql --- migrations/001_create_runs.sql -CREATE TABLE IF NOT EXISTS runs ( - run_id TEXT NOT NULL, - app_name TEXT NOT NULL, - status TEXT NOT NULL DEFAULT 'RUN_STATUS_PENDING', - started_at TIMESTAMPTZ, - completed_at TIMESTAMPTZ, - error_message TEXT, - config_json JSONB, - PRIMARY KEY (run_id, app_name) -); -``` - -### 3d. Validate - -```bash -npx tsc --noEmit # all generated types compile -buf lint proto/ # proto style checks -``` - -## Phase 4: Implement Against Contracts - -NOW implementation begins. Each module uses ONLY its generated types: - -```typescript -import type { - StoredArtifact, - UploadRequest, -} from "../proto/gen//v1/storage"; -import type { RunRecord, MetricRecord } from "../proto/gen//v1/database"; -import type { - JobTaskInput, - JobTaskOutput, -} from "../proto/gen//v1/compute"; -``` - -No `any`, no `unknown`, no `JSON.parse()` at module boundaries. 
- -## Validation Checklist - -Before writing implementation code: - -- [ ] Module map exists with clear data boundaries -- [ ] Proto files exist for every cross-boundary data structure -- [ ] `buf lint proto/` passes -- [ ] `buf generate proto/` produces TypeScript types -- [ ] Lakebase DDL derived from `database.proto` messages -- [ ] No proto message exceeds 12 fields -- [ ] Every field has at least one identified consumer -- [ ] Every message has exactly one producer module -- [ ] Volume layout documented (not freeform paths) -- [ ] Job inputs/outputs typed (no raw JSON params) - -## Common Traps - -| Trap | Why it fails | Fix | -| ------------------------------------ | ------------------------------------------------ | -------------------------------------------------- | -| "I'll add the proto later" | Boundaries calcify around untyped shapes | Proto first or not at all | -| `any` at a module boundary | Type errors surface at runtime, not compile time | Use generated types | -| `JSON.parse()` crossing a boundary | No schema validation | Deserialize with proto decoder | -| Giant 30-field message | Impossible to review, version, or extend | Split by concern, max 12 fields | -| Storing raw JSON in Lakebase | Loses queryability and type safety | Map to `repeated`, `map`, or nested message fields | -| Shared mutable state between modules | Race conditions, unclear ownership | Communicate through typed messages | - -## References - -- [Plugin Contract Details](proto-contracts.md) — proto↔plugin type mappings for files, lakebase, jobs diff --git a/.agents/skills/databricks-apps/references/appkit/sql-queries.md b/.agents/skills/databricks-apps/references/appkit/sql-queries.md deleted file mode 100644 index 7a90e49..0000000 --- a/.agents/skills/databricks-apps/references/appkit/sql-queries.md +++ /dev/null @@ -1,272 +0,0 @@ -# SQL Query Files - -**IMPORTANT**: ALWAYS use SQL files in `config/queries/` for data retrieval. NEVER use tRPC for SQL queries.
- -- Store ALL SQL queries in the `config/queries/` directory -- Name files descriptively: `trip_statistics.sql`, `user_metrics.sql`, `sales_by_region.sql` -- Reference by filename (without extension) in `useAnalyticsQuery` or directly in a visualization component passing it as `queryKey` -- AppKit automatically executes queries against the configured Databricks SQL warehouse -- Benefits: Built-in caching, proper connection pooling, better performance - -## Type Generation - -For full type generation details, see: `npx @databricks/appkit docs ./docs/development/type-generation.md` - -**Type generation:** Types are auto-regenerated during dev whenever SQL files change. - -**Quick workflow:** Add SQL files → Types auto-generate during dev → Types appear in `client/src/appKitTypes.d.ts` - -## Query Schemas (Optional) - -Create `config/queries/schema.ts` only if you need **runtime validation** with Zod. - -```typescript -import { z } from "zod"; - -export const querySchemas = { - my_query: z.array( - z.object({ - category: z.string(), - // Use z.coerce.number() - handles both string and number from SQL - amount: z.coerce.number(), - }), - ), -}; -``` - -**Why `z.coerce.number()`?** - -- Auto-generated types use `number` based on SQL column types -- But some SQL types (DECIMAL, large BIGINT) return as strings at runtime -- `z.coerce.number()` handles both cases safely - -## SQL Type Handling (Critical) - -**Understanding Type Generation vs Runtime:** - -1. **Auto-generated types** (`appKitTypes.d.ts`): Based on SQL column types - - `BIGINT`, `INT`, `DECIMAL` → TypeScript `number` - - These are the types you'll see in IntelliSense - -2.
**Runtime JSON values**: Some numeric types arrive as strings - - `DECIMAL` often returns as string (e.g., `"123.45"`) - - Large `BIGINT` values return as string - - `ROUND()`, `AVG()`, `SUM()` results may be strings - -**Best Practice - Always convert before numeric operations:** - -```typescript -// ❌ WRONG - may fail if value is string at runtime -{row.total_amount.toFixed(2)} - -// ✅ CORRECT - convert to number first -{Number(row.total_amount).toFixed(2)} -``` - -**Helper Functions:** - -Create app-specific helpers for consistent numeric formatting (for example in `client/src/lib/formatters.ts`): - -```typescript -// client/src/lib/formatters.ts -export const toNumber = (value: number | string): number => Number(value); -export const formatCurrency = (value: number | string): string => - `$${Number(value).toFixed(2)}`; -export const formatPercent = (value: number | string): string => - `${Number(value).toFixed(1)}%`; -``` - -Use them wherever you render query results: - -```typescript -import { toNumber, formatCurrency, formatPercent } from "./formatters"; // adjust import path to your file layout - -// Convert to number -const amount = toNumber(row.amount); // "123.45" → 123.45 - -// Format as currency -const formatted = formatCurrency(row.amount); // "123.45" → "$123.45" - -// Format as percentage -const percent = formatPercent(row.rate); // "85.5" → "85.5%" -``` - -## Available sql.\* Helpers - -**Full API reference**: `npx @databricks/appkit docs ./docs/api/appkit/Variable.sql.md` — always check this for the latest available helpers. 
- -```typescript -import { sql } from "@databricks/appkit-ui/js"; - -// ✅ These exist: -sql.string(value); // For STRING parameters -sql.number(value); // For NUMERIC parameters (INT, BIGINT, DOUBLE, DECIMAL) -sql.boolean(value); // For BOOLEAN parameters -sql.date(value); // For DATE parameters (YYYY-MM-DD format) -sql.timestamp(value); // For TIMESTAMP parameters -sql.binary(value); // For BINARY (returns hex string, use UNHEX() in SQL) - -// ❌ These DO NOT exist: -// sql.null() - use sentinel values instead -// sql.array() - use comma-separated sql.string() and split in SQL -// sql.int() - use sql.number() -// sql.float() - use sql.number() -``` - -**For nullable string parameters**, use sentinel values or empty strings. **For nullable date parameters**, use sentinel dates only (empty strings cause validation errors) — see "Optional Date Parameters" section below. - -## Databricks SQL Dialect - -Databricks uses Databricks SQL (based on Spark SQL), NOT PostgreSQL/MySQL. Common mistakes: - -| PostgreSQL | Databricks SQL | -| ------------------------ | --------------------------------------- | -| `GENERATE_SERIES(1, 10)` | `explode(sequence(1, 10))` | -| `DATEDIFF(date1, date2)` | `DATEDIFF(DAY, date2, date1)` (3 args!) | -| `NOW()` | `CURRENT_TIMESTAMP()` | -| `INTERVAL '7 days'` | `INTERVAL 7 DAY` | -| `STRING_AGG(col, ',')` | `CONCAT_WS(',', COLLECT_LIST(col))` | -| `ILIKE` | `LOWER(col) LIKE LOWER(pattern)` | - -**Sample data date ranges** — do NOT use `CURRENT_DATE()` on historical datasets: - -- `samples.tpch.*` — historical dates, check with `SELECT MIN(o_orderdate), MAX(o_orderdate) FROM samples.tpch.orders` -- `samples.nyctaxi.trips` — NYC taxi data with specific date ranges -- `samples.tpcds.*` — data from 1998-2003 - -Always check date ranges before writing date-filtered queries. 
- -## Before Running `npm run typegen` - -Verify each SQL file before running typegen: - -- [ ] Uses Databricks SQL syntax (NOT PostgreSQL) — check dialect table above -- [ ] `DATEDIFF` has 3 arguments: `DATEDIFF(DAY, start, end)` -- [ ] Uses `LOWER(col) LIKE LOWER(pattern)` instead of `ILIKE` -- [ ] Column aliases in `ORDER BY` match `SELECT` aliases exactly -- [ ] Date columns are not passed to numeric functions like `ROUND()` -- [ ] Date range filters use actual data dates (NOT `CURRENT_DATE()` on historical data — check date ranges first) - -## Query Parameterization - -SQL queries can accept parameters to make them dynamic and reusable. - -**Key Points:** - -- Parameters use colon prefix: `:parameter_name` -- Databricks infers types from values automatically -- For optional string parameters, use pattern: `(:param = '' OR column = :param)` -- **For optional date parameters, use sentinel dates** (`'1900-01-01'` and `'9999-12-31'`) instead of empty strings - -### SQL Parameter Syntax - -```sql --- config/queries/filtered_data.sql -SELECT * -FROM my_table -WHERE column_value >= :min_value - AND column_value <= :max_value - AND category = :category - AND (:optional_filter = '' OR status = :optional_filter) -``` - -### Frontend Parameter Passing - -```typescript -import { sql } from "@databricks/appkit-ui/js"; - -const { data } = useAnalyticsQuery("filtered_data", { - min_value: sql.number(minValue), - max_value: sql.number(maxValue), - category: sql.string(category), - optional_filter: sql.string(optionalFilter || ""), // empty string for optional params -}); -``` - -### Date Parameters - -Use `sql.date()` for date parameters with `YYYY-MM-DD` format strings. 
- -**Frontend - Using Date Parameters:** - -```typescript -import { sql } from "@databricks/appkit-ui/js"; -import { useState } from "react"; - -function MyComponent() { - const [startDate, setStartDate] = useState("2016-02-01"); - const [endDate, setEndDate] = useState("2016-02-29"); - - const queryParams = { - start_date: sql.date(startDate), // Pass YYYY-MM-DD string to sql.date() - end_date: sql.date(endDate), - }; - - const { data } = useAnalyticsQuery("my_query", queryParams); - - // ... -} -``` - -**SQL - Date Filtering:** - -```sql --- Filter by date range using DATE() function -SELECT COUNT(*) as trip_count -FROM samples.nyctaxi.trips -WHERE DATE(tpep_pickup_datetime) >= :start_date - AND DATE(tpep_pickup_datetime) <= :end_date -``` - -**Date Helper Functions:** - -```typescript -// Helper to get YYYY-MM-DD string for dates relative to today -const daysAgo = (n: number): string => { - const date = new Date(Date.now() - n * 86400000); - return date.toISOString().split("T")[0]; // "2024-01-15" -}; - -const params = { - start_date: sql.date(daysAgo(7)), // 7 days ago - end_date: sql.date(daysAgo(0)), // Today -}; -``` - -### Optional Date Parameters - Use Sentinel Dates - -Databricks App Kit validates parameter types before query execution. **DO NOT use empty strings (`''`) for optional date parameters** as this causes validation errors. 
- -**✅ CORRECT - Use Sentinel Dates:** - -```typescript -// Frontend: Use sentinel dates for "no filter" instead of empty strings -const revenueParams = { - group_by: "month", - start_date: sql.date("1900-01-01"), // Sentinel: effectively no lower bound - end_date: sql.date("9999-12-31"), // Sentinel: effectively no upper bound - country: sql.string(country || ""), - property_type: sql.string(propertyType || ""), -}; -``` - -```sql --- SQL: Simple comparison since sentinel dates are always valid -WHERE b.check_in >= CAST(:start_date AS DATE) - AND b.check_in <= CAST(:end_date AS DATE) -``` - -**Why Sentinel Dates Work:** - -- `1900-01-01` is before any real data (effectively no lower bound filter) -- `9999-12-31` is after any real data (effectively no upper bound filter) -- Always valid DATE types, so no parameter validation errors -- All real dates fall within this range, so no filtering occurs - -**Parameter Types Summary:** - -- ALWAYS use sql.\* helper functions from the `@databricks/appkit-ui/js` package to define SQL parameters -- **Strings/Numbers**: Use directly in SQL with `:param_name` -- **Dates**: Use with `CAST(:param AS DATE)` in SQL -- **Optional Strings**: Use empty string default, check with `(:param = '' OR column = :param)` -- **Optional Dates**: Use sentinel dates (`sql.date('1900-01-01')` and `sql.date('9999-12-31')`) instead of empty strings diff --git a/.agents/skills/databricks-apps/references/appkit/trpc.md b/.agents/skills/databricks-apps/references/appkit/trpc.md deleted file mode 100644 index 790c040..0000000 --- a/.agents/skills/databricks-apps/references/appkit/trpc.md +++ /dev/null @@ -1,148 +0,0 @@ -# tRPC for Custom Endpoints - -**CRITICAL**: Do NOT use tRPC for SQL queries or data retrieval. Use `config/queries/` + `useAnalyticsQuery` instead. - -**CRITICAL**: Do NOT use tRPC for accessing Unity Catalog and File operations. Use the Files plugin instead. 
- -Use tRPC ONLY for: - -- **Mutations**: Creating, updating, or deleting data (INSERT, UPDATE, DELETE) -- **External APIs**: Calling Databricks APIs (serving endpoints, jobs, MLflow, etc.) -- **Complex business logic**: Multi-step operations that cannot be expressed in SQL -- **File processing**: Processing and transforming uploaded files (for file storage and retrieval, use the Files plugin) -- **Custom computations**: Operations requiring TypeScript/Node.js logic - -## Before Writing New Routes - -**ALWAYS complete these checks before adding tRPC routes:** - -### 1. Check AppKit Version - -Read `package.json` to identify the installed `@databricks/appkit` version. Available server APIs and plugins differ across versions. - -```bash -# From the project root -cat package.json | grep @databricks/appkit -``` - -### 2. Review Available Plugins - -Check what plugins are already enabled and what server-side functionality they provide — avoid reimplementing what a plugin already handles. - -```bash -# See plugin docs for the installed version -npx @databricks/appkit docs ./docs/plugins.md - -# See all plugins available for a specific version -databricks apps manifest --version <version> --profile <profile> - -# See plugins available for the default template -databricks apps manifest --profile <profile> -``` - -**Key plugins to check for:** - -- **analytics** — provides SQL warehouse query execution (do NOT reimplement with tRPC) -- **lakebase** — provides `createLakebasePool` for PostgreSQL CRUD (use pool in tRPC routes, don't create raw connections) -- **genie** — provides Genie AI-powered data exploration (check before building custom natural-language-to-SQL routes) -- **files** — provides file storage and retrieval helpers (check before writing custom file upload/download routes) - -If a plugin already covers your use case, use the plugin's API instead of writing a custom tRPC route. - -If a newer version of `@databricks/appkit` has a plugin that fits the use case, -prompt the user to update. - -### 3. 
Check Existing Routes - -Read `server/server.ts` (or `server/trpc.ts`) to see what routes already exist. Extend the existing router rather than creating a parallel one. - -## Server-side Pattern - -```tsx -// server/trpc.ts -import { initTRPC } from "@trpc/server"; -import { getExecutionContext } from "@databricks/appkit"; -import { z } from "zod"; -import superjson from "superjson"; - -const t = initTRPC.create({ transformer: superjson }); -const publicProcedure = t.procedure; - -export const appRouter = t.router({ - // Example: Query a serving endpoint - queryModel: publicProcedure - .input(z.object({ prompt: z.string() })) - .query(async ({ input: { prompt } }) => { - const { serviceDatabricksClient: client } = getExecutionContext(); - const response = await client.servingEndpoints.query({ - name: "your-endpoint-name", - messages: [{ role: "user", content: prompt }], - }); - return response; - }), - - // Example: Mutation - createRecord: publicProcedure - .input(z.object({ name: z.string() })) - .mutation(async ({ input }) => { - // Custom logic here - return { success: true, id: 123 }; - }), -}); -``` - -## Client-side Pattern - -```typescript -// client/src/components/MyComponent.tsx -import { trpc } from '@/lib/trpc'; -import { useState, useEffect } from 'react'; - -function MyComponent() { - const [result, setResult] = useState(null); - - useEffect(() => { - trpc.queryModel - .query({ prompt: "Hello" }) - .then(setResult) - .catch(console.error); - }, []); - - const handleCreate = async () => { - await trpc.createRecord.mutate({ name: "test" }); - }; - - return
<div>{/* component JSX */}</div>
; -} -``` - -## Decision Tree for Data Operations - -1. **Need to display data from SQL?** - - **Chart or Table?** → Use visualization components (`BarChart`, `LineChart`, `DataTable`, etc.) - - **Custom display (KPIs, cards, lists)?** → Use `useAnalyticsQuery` hook - - **Never** use tRPC for SQL SELECT statements - -2. **Need to call a Databricks API?** → Use tRPC - - Serving endpoints (model inference) - - MLflow operations - - Jobs API - - Workspace API - -3. **Need to modify data?** → Use tRPC mutations - - INSERT, UPDATE, DELETE operations - - Multi-step transactions - - Business logic with side effects - -4. **Need non-SQL custom logic?** → Use tRPC - - File processing - - External API calls - - Complex computations in TypeScript - -**Summary:** - -- ✅ SQL queries → Visualization components or `useAnalyticsQuery` -- ✅ Databricks APIs → tRPC -- ✅ Data mutations → tRPC -- ❌ SQL queries → tRPC (NEVER do this) -- ❌ Files operations → tRPC (NEVER do this) diff --git a/.agents/skills/databricks-apps/references/other-frameworks.md b/.agents/skills/databricks-apps/references/other-frameworks.md deleted file mode 100644 index 49d65ea..0000000 --- a/.agents/skills/databricks-apps/references/other-frameworks.md +++ /dev/null @@ -1,282 +0,0 @@ -# Databricks Apps — Other Frameworks (Non-AppKit) - -Setup guide for non-AppKit apps: Streamlit, FastAPI, Flask, Gradio, Dash, Django, Next.js, React, etc. - -For universal platform rules (permissions, deployment, timeouts, resource injection), see [Platform Guide](platform-guide.md). - -## 1. Port & Host Configuration - -**The #1 cause of 502 Bad Gateway errors.** - -| Setting | Required Value | Common Mistake | -| ------- | ----------------------------- | ------------------------------------- | -| Port | `DATABRICKS_APP_PORT` env var | Hardcoding 8080, 3000, or 3001 | -| Host | `0.0.0.0` | Binding to `localhost` or `127.0.0.1` | - -The platform dynamically assigns a port via `DATABRICKS_APP_PORT`. 
Use `8000` as a local dev fallback only. - -### Framework-Specific Port Configuration - -#### Streamlit - -```yaml -# app.yaml -command: - - streamlit - - run - - app.py - - --server.port - - "${DATABRICKS_APP_PORT:-8000}" - - --server.address - - "0.0.0.0" -``` - -#### FastAPI / Uvicorn - -```python -if __name__ == "__main__": - import uvicorn - port = int(os.environ.get("DATABRICKS_APP_PORT", 8000)) - uvicorn.run(app, host="0.0.0.0", port=port) -``` - -#### Flask - -```python -port = int(os.environ.get("DATABRICKS_APP_PORT", 8000)) -app.run(host="0.0.0.0", port=port) -``` - -#### Gradio - -```python -demo.launch(server_name="0.0.0.0", - server_port=int(os.environ.get("DATABRICKS_APP_PORT", 8000))) -``` - -#### Dash - -```python -app.run(host="0.0.0.0", - port=int(os.environ.get("DATABRICKS_APP_PORT", 8000))) -``` - -#### Next.js - -```jsonc -// package.json -"scripts": { - "start": "next start -p ${DATABRICKS_APP_PORT:-8000} -H 0.0.0.0" -} -``` - -⚠️ **Only ONE service can bind to `DATABRICKS_APP_PORT`.** If you need multiple services (e.g., frontend + backend), use a reverse proxy or serve everything from one process. - -## 2. app.yaml vs databricks.yml - -These two files serve different purposes. Getting them wrong causes silent deployment failures. 
- -### app.yaml — Runtime Configuration - -- Defines the **start command** and **environment variables** for the running app -- Used by the Databricks Apps runtime directly -- `valueFrom:` injects resource IDs from workspace configuration - -```yaml -# app.yaml -command: - - python - - app.py -env: - - name: DATABRICKS_WAREHOUSE_ID - valueFrom: sql-warehouse - - name: MY_CUSTOM_VAR - value: "some-value" -``` - -### databricks.yml — Bundle/Deployment Configuration - -- Defines the **app resource** for DABs (Declarative Automation Bundles) -- `config:` section only takes effect after `bundle run`, NOT just `bundle deploy` - -```yaml -# databricks.yml -bundle: - name: my-app-bundle - -resources: - apps: - my-app: - name: my-app - source_code_path: . - config: - command: ["python", "app.py"] - env: - - name: DATABRICKS_WAREHOUSE_ID - valueFrom: sql-warehouse - permissions: - - service_principal_name: ${bundle.target}.my-app - level: CAN_MANAGE - -targets: - dev: - default: true -``` - -### Critical Rules - -| Rule | Why | -| ---------------------------------------------------------- | -------------------------------------------------------------- | -| Always provide BOTH `app.yaml` AND `databricks.yml` config | UI deployments use app.yaml; DABs uses databricks.yml | -| Always run `bundle deploy` THEN `bundle run ` | `deploy` uploads code; `run` applies config and starts the app | -| Never use `${var.xxx}` in config env values | Variables are NOT resolved in config — values appear literally | - -## 3. 
Using OBO in Non-AppKit Apps - -```python -# FastAPI example -from fastapi import Request -from databricks.sdk import WorkspaceClient - -@app.get("/user-data") -def get_user_data(request: Request): - token = request.headers.get("x-forwarded-access-token") - - # create user-scoped client - w = WorkspaceClient(token=token, host=os.environ["DATABRICKS_HOST"]) - # use w for user-scoped operations -``` - -```python -# SP auth is auto-configured — just use the SDK -from databricks.sdk import WorkspaceClient -w = WorkspaceClient() # picks up auto-injected env vars -``` - -## 4. Framework-Specific Timeout Gotchas - -| Framework | Default Timeout | Fix | -| --------- | --------------------------- | --------------------------------------------------- | -| Gradio | 30 seconds (internal) | Set `fn` timeout explicitly or use `gradio.queue()` | -| Gunicorn | 30 seconds (worker timeout) | Set `--timeout 120` in gunicorn command | -| Uvicorn | None (no default timeout) | Already fine | - -## 5. Common Errors (Non-AppKit Specific) - -| Error | Cause | Fix | -| --------------------------------- | -------------------------------------------------------- | ---------------------------------------------------- | -| 502 Bad Gateway | Wrong port or host | Bind to `0.0.0.0:${DATABRICKS_APP_PORT:-8000}` | -| App works locally but 502 in prod | Binding to localhost | Change to `0.0.0.0` | -| `ModuleNotFoundError` at runtime | Dependency not in requirements.txt or version conflict | Pin exact versions; validate locally first | -| Wrong script runs on deploy | No `command` in app.yaml, platform picked wrong .py file | Always specify `command` explicitly in app.yaml | -| `apt-get: command not found` | No root access in container | Use pure-Python wheels from PyPI; no system packages | - -## 6. Dependency Management - -### Python - -Only `requirements.txt` is natively supported. No native support for `pyproject.toml`, `uv.lock`, or Poetry. 
- -**Workaround for `uv`:** - -``` -# requirements.txt -uv -``` - -```yaml -# app.yaml -command: - - uv - - run - - app.py -``` - -Define actual dependencies in `pyproject.toml`. Note: This moves dependency installation from build to run step, slowing startup. - -**Custom package repositories:** - -- Set `PIP_INDEX_URL` as a secret in the app configuration -- Deploying user needs **MANAGE** permission on the secret scope (not just USE/READ) - -### Node.js - -- `package.json` is supported — `npm install` runs at startup -- Do NOT include `node_modules/` in source code (10 MB file limit) -- Large npm installs may exceed the 10-minute startup window -- In egress-restricted workspaces, add `registry.npmjs.org` to egress policy AND restart the app (egress changes require restart) - -## 7. Networking & CORS - -### CORS - -- CORS headers are **not customizable** on the Databricks Apps reverse proxy -- Workspace origin (`*.databricks.com`) differs from app origin (`*.databricksapps.com`) -- Cross-app API calls return **302 redirect to login page** instead of the expected response - -**Workaround:** Keep frontend and backend in a single app to avoid CORS entirely. - -### Private Link / Hardened Environments - -- Azure apps use `*.azure.databricksapps.com` — NOT `*.azuredatabricks.net` -- Existing Private Link DNS zones don't cover the apps domain -- Fix: Create a separate Private DNS Zone for `azure.databricksapps.com` with conditional DNS forwarding - -### Egress Restrictions - -- Egress policy changes require **app restart** to take effect -- For npm: allowlist `registry.npmjs.org` -- For pip: allowlist `pypi.org` and `files.pythonhosted.org` -- For custom registries: use `PIP_INDEX_URL` secret (see Dependency Management) - -## 8. 
Streamlit-Specific Gotchas - -### Required Environment Variables - -```yaml -# app.yaml -command: - - streamlit - - run - - app.py - - --server.port - - "${DATABRICKS_APP_PORT:-8000}" - - --server.address - - "0.0.0.0" -env: - - name: STREAMLIT_SERVER_ENABLE_CORS - value: "false" - - name: STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION - value: "false" -``` - -⚠️ **Both CORS and XSRF must be disabled** for Streamlit on Databricks Apps. The reverse proxy origin (`*.databricksapps.com`) differs from the workspace origin, triggering Streamlit's CORS/XSRF protection. - -### OBO Token Staleness - -Streamlit caches initial HTTP request headers, then switches to WebSocket. The OBO token from `x-forwarded-access-token` **never refreshes** — it goes stale. - -**Workaround:** Periodically trigger a full page refresh. No clean in-Streamlit solution exists. - -### Connection Exhaustion (Hangs After Initial Queries) - -Streamlit re-runs the entire script on every user interaction. If `sql.connect()` is called during each render cycle, the rapid succession of TCP handshakes and OAuth negotiations exhausts the connection pool, causing 2-3 minute freezes. - -**Fix:** Use `@st.cache_resource` to maintain persistent connections: - -```python -@st.cache_resource -def get_connection(): - from databricks import sql - from databricks.sdk.core import Config - cfg = Config() - return sql.connect( - server_hostname=cfg.host, - http_path=f"/sql/1.0/warehouses/{os.environ['DATABRICKS_WAREHOUSE_ID']}", - credentials_provider=lambda: cfg.authenticate, - ) -``` - -### Transient 502s During Startup - -Streamlit apps commonly show brief 502 errors during startup. This is expected and does not indicate a problem. 
diff --git a/.agents/skills/databricks-apps/references/platform-guide.md b/.agents/skills/databricks-apps/references/platform-guide.md deleted file mode 100644 index 7c2519d..0000000 --- a/.agents/skills/databricks-apps/references/platform-guide.md +++ /dev/null @@ -1,173 +0,0 @@ -# Databricks Apps Platform Guide - -Universal platform rules that apply to ALL Databricks Apps regardless of framework (AppKit, Streamlit, FastAPI, etc.). - -For non-AppKit framework-specific setup (port config, app.yaml, Streamlit gotchas), see [Other Frameworks](other-frameworks.md). - -## Service Principal Permissions - -**The #1 cause of runtime crashes after deployment.** - -When your app uses a Databricks resource (SQL warehouse, model serving endpoint, vector search index, volume, secret scope), the app's **service principal** must have explicit permissions on that resource. - -### How Permissions Work - -When you declare a resource in `app.yaml` / `databricks.yml` with a `permission` field, the platform **automatically grants** that permission to the app's SP on deployment. You do NOT need to run manual `set-permissions` commands for declared resources. 
- -```yaml -# databricks.yml — declaring resources with permissions -resources: - apps: - my_app: - resources: - - name: my-warehouse - sql_warehouse: - id: ${var.warehouse_id} - permission: CAN_USE # auto-granted to SP on deploy - - name: my-endpoint - serving_endpoint: - name: ${var.endpoint_name} - permission: CAN_QUERY # auto-granted to SP on deploy -``` - -### Default Permissions by Resource Type - -| Resource Type | Default Permission | Notes | -| ------------------------ | ---------------------- | ---------------------------------------- | -| SQL Warehouse | CAN_USE | Minimum for query execution | -| Model Serving Endpoint | CAN_QUERY | For inference calls | -| Vector Search Index (UC) | SELECT | UC securable of type TABLE | -| Volume (UC) | READ_VOLUME | Via UC securable | -| Secret Scope | READ | Deploying user needs MANAGE on the scope | -| Job | CAN_MANAGE_RUN | | -| Lakebase Database | CAN_CONNECT_AND_CREATE | | -| Genie Space | CAN_VIEW | | - -### ⚠️ CRITICAL AGENT BEHAVIOR - -Always declare resources in `databricks.yml` with the correct `permission` field — do NOT skip this. The platform handles granting automatically on deploy. - -## Resource Types & Injection - -**NEVER hardcode workspace-specific IDs in source code.** Always inject via environment variables with `valueFrom`. 
- -| Resource Type | Default Key | Use Case | -| ---------------------- | --------------------- | ------------------------ | -| SQL Warehouse | `sql-warehouse` | Query compute | -| Model Serving Endpoint | `serving-endpoint` | Model inference | -| Vector Search Index | `vector-search-index` | Semantic search | -| Lakebase Database | `database` | OLTP storage | -| Secret | `secret` | Sensitive values | -| UC Table | `table` | Structured data | -| UC Connection | `connection` | External data sources | -| Genie Space | `genie-space` | AI analytics | -| MLflow Experiment | `experiment` | ML tracking | -| Lakeflow Job | `job` | Data workflows | -| UDF | `function` | SQL/Python functions | -| Databricks App | `app` | App-to-app communication | - -```python -# ✅ GOOD -warehouse_id = os.environ["DATABRICKS_WAREHOUSE_ID"] -``` - -```yaml -# app.yaml / databricks.yml env section -env: - - name: DATABRICKS_WAREHOUSE_ID - valueFrom: sql-warehouse - - name: SERVING_ENDPOINT - valueFrom: serving-endpoint -``` - -## Authentication: OBO vs Service Principal - -| Context | When Used | Token Source | Cached Per | -| -------------------------- | -------------------------------------- | ----------------------------------------------------------------- | ------------------ | -| **Service Principal (SP)** | Default; background tasks, shared data | Auto-injected `DATABRICKS_CLIENT_ID` + `DATABRICKS_CLIENT_SECRET` | All users (shared) | -| **On-Behalf-Of (OBO)** | User-specific data, user-scoped access | `x-forwarded-access-token` header | Per user | - -**SP auth** is auto-configured — `WorkspaceClient()` picks up injected env vars. 
- -**OBO** requires extracting the token from request headers and declaring scopes: - -| Scope | Purpose | -| ------------------------- | -------------------------------- | -| `sql` | Query SQL warehouses | -| `dashboards.genie` | Manage Genie spaces | -| `files.files` | Manage files/directories | -| `iam.access-control:read` | Read permissions (default) | -| `iam.current-user:read` | Read current user info (default) | - -⚠️ Databricks blocks access outside approved scopes even if the user has permission. - -## Deployment Workflow - -⚠️ **USER CONSENT REQUIRED** — always confirm with the user before deploying. - -```bash -# Option A: single command (recommended) — validates, deploys, and runs -databricks apps deploy -t --profile - -# Option B: step by step -databricks apps validate --profile -databricks bundle deploy -t --profile -databricks bundle run -t --profile -``` - -❌ **Common mistake:** Running only `bundle deploy` and expecting the app to update. Deploy uploads code but does NOT apply config changes or restart the app. Use `databricks apps deploy` or add `bundle run` after `bundle deploy`. - -### ⚠️ Destructive Updates Warning - -`databricks apps update` (and `bundle run`) performs a **full replacement**, not a merge: - -- Adding a new resource can silently **wipe** existing `user_api_scopes` -- OBO permissions may be stripped on every deployment - -**Workaround:** After each deployment, verify OBO scopes are intact. 
- -## Runtime Environment - -| Constraint | Value | -| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| Max file size | 10 MB per file | -| Available port | Only `DATABRICKS_APP_PORT` | -| Auto-injected env vars | `DATABRICKS_HOST`, `DATABRICKS_APP_PORT`, `DATABRICKS_APP_NAME`, `DATABRICKS_WORKSPACE_ID`, `DATABRICKS_CLIENT_ID`, `DATABRICKS_CLIENT_SECRET` | -| No root access | Cannot use `apt-get`, `yum`, or `apk` — use PyPI/npm packages only | -| Graceful shutdown | SIGTERM → 15 seconds to shut down → SIGKILL | -| Logging | Only stdout/stderr are captured — file-based logs are lost on container recycle | -| Filesystem | Ephemeral — no persistent local storage; use UC Volumes/tables | - -## Compute & Limits - -| Size | RAM | vCPU | DBU/hour | Notes | -| ------ | ----- | ------- | -------- | ---------------------------------- | -| Medium | 6 GB | Up to 2 | 0.5 | Default | -| Large | 12 GB | Up to 4 | 1.0 | Select during app creation or edit | - -- No GPU access. Use model serving endpoints for inference. -- Apps must start within **10 minutes** (including dependency installation). -- Max apps per workspace: **100**. - -## HTTP Proxy & Streaming - -The Databricks Apps reverse proxy enforces a **120-second per-request timeout** (NOT configurable). - -| Behavior | Detail | -| ---------------- | ------------------------------------------------------------------------- | -| 504 in app logs? | **No** — the error is generated at the proxy. App logs show nothing. | -| SSE streaming | Responses may be **buffered** and delivered in chunks, not token-by-token | -| WebSockets | Bypass the 120s limit — working but undocumented | - -For long-running agent interactions, use **WebSockets** instead of SSE. 
- -## Common Errors - -| Error | Cause | Fix | -| ------------------------------------- | -------------------------------- | ----------------------------------------- | -| `PERMISSION_DENIED` after deploy | SP missing permissions | Grant SP access to all declared resources | -| App deploys but config doesn't change | Only ran `bundle deploy` | Also run `bundle run ` | -| `File is larger than 10485760 bytes` | Bundled dependencies | Use requirements.txt / package.json | -| OBO scopes missing after deploy | Destructive update wiped them | Re-apply scopes after each deploy | -| `${var.xxx}` appears literally in env | Variables not resolved in config | Use literal values, not bundle variables | -| 504 Gateway Timeout | Request exceeded 120s | Use WebSockets for long operations | diff --git a/.agents/skills/databricks-apps/references/testing.md b/.agents/skills/databricks-apps/references/testing.md deleted file mode 100644 index 0792406..0000000 --- a/.agents/skills/databricks-apps/references/testing.md +++ /dev/null @@ -1,111 +0,0 @@ -# Testing Guidelines - -## Unit Tests (Vitest) - -**CRITICAL**: Use vitest for all tests. Put tests next to the code (e.g. 
src/\*.test.ts) - -```typescript -import { describe, it, expect } from "vitest"; - -describe("Feature Name", () => { - it("should do something", () => { - expect(true).toBe(true); - }); - - it("should handle async operations", async () => { - const result = await someAsyncFunction(); - expect(result).toBeDefined(); - }); -}); -``` - -**Best Practices:** - -- Use `describe` blocks to group related tests -- Use `it` for individual test cases -- Use `expect` for assertions -- Tests run with `npm test` (runs `vitest run`) - -❌ **Do not write unit tests for:** - -- SQL files under `config/queries/` - little value in testing static SQL -- Types associated with queries - these are just schema definitions - -## Smoke Test (Playwright) - -The template includes a smoke test at `tests/smoke.spec.ts` that verifies the app loads correctly. - -**⚠️ MUST UPDATE after customizing the app:** - -- The heading selector checks for `'Minimal Databricks App'` — change it to match your app's actual title -- The text assertion checks for `'hello world'` — update or remove it to match your app's content -- Failing to update these will cause the smoke test to fail on `databricks apps validate` - -```typescript -// tests/smoke.spec.ts - update these selectors: -// ⚠️ PLAYWRIGHT STRICT MODE: each selector must match exactly ONE element. -// Use { exact: true }, .first(), or role-based selectors. See "Playwright Strict Mode" below. 
- -// ❌ Template default - will fail after customization -await expect( - page.getByRole("heading", { name: "Minimal Databricks App" }), -).toBeVisible(); -await expect(page.getByText("hello world")).toBeVisible(); - -// ✅ Update to match YOUR app -await expect( - page.getByRole("heading", { name: "Your App Title" }), -).toBeVisible(); -await expect(page.locator("h1").first()).toBeVisible({ timeout: 30000 }); // Or just check any h1 -``` - -**What the smoke test does:** - -- Opens the app -- Waits for data to load (SQL query results) -- Verifies key UI elements are visible -- Captures screenshots and console logs to `.smoke-test/` directory -- Always captures artifacts, even on test failure - -## Playwright Strict Mode - -Playwright uses strict mode by default — selectors matching multiple elements WILL FAIL. - -### Selector Priority (use in this order) - -1. ✅ `getByRole('heading', { name: 'Your App Title' })` — headings (most reliable) -2. ✅ `getByRole('button', { name: 'Submit' })` — interactive elements -3. ✅ `getByText('Unique text', { exact: true })` — exact match for unique strings -4. ⚠️ `getByText('Common text').first()` — last resort for repeated text -5. ❌ `getByText('Revenue')` — NEVER without `exact` or `.first()` (strict mode will fail) - -**Common mistake**: text like "Revenue" may appear in a heading, a card, AND a description. Always verify your selector targets exactly ONE element. 
- -```typescript -// ❌ FAILS if "Revenue" appears in multiple places (heading + card + description) -await expect(page.getByText("Revenue")).toBeVisible(); - -// ✅ Use role-based selectors for headings -await expect( - page.getByRole("heading", { name: "Revenue Dashboard" }), -).toBeVisible(); - -// ✅ Use exact matching -await expect(page.getByText("Revenue", { exact: true })).toBeVisible(); - -// ✅ Use .first() as last resort -await expect(page.getByText("Revenue").first()).toBeVisible(); -``` - -**Keep smoke tests simple:** - -- Only verify that the app loads and displays initial data -- Wait for key elements to appear (page title, main content) -- Capture artifacts for debugging -- Run quickly (< 5 seconds) - -**For extended E2E tests:** - -- Create separate test files in `tests/` directory (e.g., `tests/user-flow.spec.ts`) -- Use `npm run test:e2e` to run all Playwright tests -- Keep complex user flows, interactions, and edge cases out of the smoke test diff --git a/.agents/skills/databricks-jobs/SKILL.md b/.agents/skills/databricks-jobs/SKILL.md deleted file mode 100644 index 2d4b87b..0000000 --- a/.agents/skills/databricks-jobs/SKILL.md +++ /dev/null @@ -1,189 +0,0 @@ ---- -name: databricks-jobs -description: Develop and deploy Lakeflow Jobs on Databricks. Use when creating data engineering jobs with notebooks, Python wheels, or SQL tasks. Invoke BEFORE starting implementation. -compatibility: Requires databricks CLI (>= v0.292.0) -metadata: - version: "0.1.0" -parent: databricks-core ---- - -# Lakeflow Jobs Development - -**FIRST**: Use the parent `databricks-core` skill for CLI basics, authentication, profile selection, and data exploration commands. - -Lakeflow Jobs are scheduled workflows that run notebooks, Python scripts, SQL queries, and other tasks on Databricks. - -## Scaffolding a New Job Project - -Use `databricks bundle init` with a config file to scaffold non-interactively. 
This creates a project in the `/` directory: - -```bash -databricks bundle init default-python --config-file <(echo '{"project_name": "my_job", "include_job": "yes", "include_pipeline": "no", "include_python": "yes", "serverless": "yes"}') --profile < /dev/null -``` - -- `project_name`: letters, numbers, underscores only - -After scaffolding, create `CLAUDE.md` and `AGENTS.md` in the project directory. These files are essential to provide agents with guidance on how to work with the project. Use this content: - -``` -# Declarative Automation Bundles Project - -This project uses Declarative Automation Bundles (formerly Databricks Asset Bundles) for deployment. - -## Prerequisites - -Install the Databricks CLI (>= v0.288.0) if not already installed: -- macOS: `brew tap databricks/tap && brew install databricks` -- Linux: `curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh` -- Windows: `winget install Databricks.DatabricksCLI` - -Verify: `databricks -v` - -## For AI Agents - -Read the `databricks-core` skill for CLI basics, authentication, and deployment workflow. -Read the `databricks-jobs` skill for job-specific guidance. 
- -If skills are not available, install them: `databricks experimental aitools skills install` -``` - -## Project Structure - -``` -my-job-project/ -├── databricks.yml # Bundle configuration -├── resources/ -│ └── my_job.job.yml # Job definition -├── src/ -│ ├── my_notebook.ipynb # Notebook tasks -│ └── my_module/ # Python wheel package -│ ├── __init__.py -│ └── main.py -├── tests/ -│ └── test_main.py -└── pyproject.toml # Python project config (if using wheels) -``` - -## Configuring Tasks - -Edit `resources/.job.yml` to configure tasks: - -```yaml -resources: - jobs: - my_job: - name: my_job - - tasks: - - task_key: my_notebook - notebook_task: - notebook_path: ../src/my_notebook.ipynb - - - task_key: my_python - depends_on: - - task_key: my_notebook - python_wheel_task: - package_name: my_package - entry_point: main -``` - -Task types: `notebook_task`, `python_wheel_task`, `spark_python_task`, `pipeline_task`, `sql_task` - -## Job Parameters - -Parameters defined at job level are passed to ALL tasks (no need to repeat per task): - -```yaml -resources: - jobs: - my_job: - parameters: - - name: catalog - default: ${var.catalog} - - name: schema - default: ${var.schema} -``` - -Access parameters in notebooks with `dbutils.widgets.get("catalog")`. - -## Writing Notebook Code - -```python -# Read parameters -catalog = dbutils.widgets.get("catalog") -schema = dbutils.widgets.get("schema") - -# Read tables -df = spark.read.table(f"{catalog}.{schema}.my_table") - -# SQL queries -result = spark.sql(f"SELECT * FROM {catalog}.{schema}.my_table LIMIT 10") - -# Write output -df.write.mode("overwrite").saveAsTable(f"{catalog}.{schema}.output_table") -``` - -## Scheduling - -```yaml -resources: - jobs: - my_job: - trigger: - periodic: - interval: 1 - unit: DAYS -``` - -Or with cron: - -```yaml -schedule: - quartz_cron_expression: "0 0 2 * * ?" 
- timezone_id: "UTC" -``` - -## Multi-Task Jobs with Dependencies - -```yaml -resources: - jobs: - my_pipeline_job: - tasks: - - task_key: extract - notebook_task: - notebook_path: ../src/extract.ipynb - - - task_key: transform - depends_on: - - task_key: extract - notebook_task: - notebook_path: ../src/transform.ipynb - - - task_key: load - depends_on: - - task_key: transform - notebook_task: - notebook_path: ../src/load.ipynb -``` - -## Unit Testing - -Run unit tests locally: - -```bash -uv run pytest -``` - -## Development Workflow - -1. **Validate**: `databricks bundle validate --profile ` -2. **Deploy**: `databricks bundle deploy -t dev --profile ` -3. **Run**: `databricks bundle run -t dev --profile ` -4. **Check run status**: `databricks jobs get-run --run-id --profile ` - -## Documentation - -- Lakeflow Jobs: https://docs.databricks.com/jobs -- Task types: https://docs.databricks.com/jobs/configure-task -- Declarative Automation Bundles: https://docs.databricks.com/dev-tools/bundles/ diff --git a/.agents/skills/databricks-lakebase/SKILL.md b/.agents/skills/databricks-lakebase/SKILL.md deleted file mode 100644 index 85aee0e..0000000 --- a/.agents/skills/databricks-lakebase/SKILL.md +++ /dev/null @@ -1,182 +0,0 @@ ---- -name: databricks-lakebase -description: "Manage Lakebase Postgres Autoscaling projects, branches, and endpoints via Databricks CLI. Use when asked to create, configure, or manage Lakebase Postgres databases, projects, branches, computes, or endpoints." -compatibility: Requires databricks CLI (>= v0.294.0) -metadata: - version: "0.1.0" -parent: databricks-core ---- - -# Lakebase Postgres Autoscaling - -**FIRST**: Use the parent `databricks-core` skill for CLI basics, authentication, and profile selection. - -Lakebase is Databricks' serverless Postgres-compatible database (similar to Neon). It provides fully managed OLTP storage with autoscaling, branching, and scale-to-zero. 
- -Manage Lakebase Postgres projects, branches, endpoints, and databases via `databricks postgres` CLI commands. - -## Resource Hierarchy - -``` -Project (top-level container) - └── Branch (isolated database environment, copy-on-write) - ├── Endpoint (read-write or read-only) - ├── Database (standard Postgres DB) - └── Role (Postgres role) -``` - -- **Project**: Top-level container. Creating one auto-provisions a `production` branch and a `primary` read-write endpoint. -- **Branch**: Isolated database environment sharing storage with parent (copy-on-write). States: `READY`, `ARCHIVED`. -- **Endpoint** (called **Compute** in the Lakebase UI): Compute resource powering a branch. Types: `ENDPOINT_TYPE_READ_WRITE`, `ENDPOINT_TYPE_READ_ONLY` (read replica). -- **Database**: Standard Postgres database within a branch. Default: `databricks_postgres`. -- **Role**: Postgres role within a branch. Manage roles via `databricks postgres create-role -h`. - -### Resource Name Formats - -| Resource | Format | -| -------- | -------------------------------------------------------------------- | -| Project | `projects/{project_id}` | -| Branch | `projects/{project_id}/branches/{branch_id}` | -| Endpoint | `projects/{project_id}/branches/{branch_id}/endpoints/{endpoint_id}` | -| Database | `projects/{project_id}/branches/{branch_id}/databases/{database_id}` | - -All IDs: 1-63 characters, start with lowercase letter, lowercase letters/numbers/hyphens only (RFC 1123). - -## CLI Discovery — ALWAYS Do This First - -> **Note:** "Lakebase" is the product name; the CLI command group is `postgres`. All commands use `databricks postgres ...`. - -**Do NOT guess command syntax.** Discover available commands and their usage dynamically: - -```bash -# List all postgres subcommands -databricks postgres -h - -# Get detailed usage for any subcommand (flags, args, JSON fields) -databricks postgres <subcommand> -h -``` - -Run `databricks postgres -h` before constructing any command.
Run `databricks postgres <subcommand> -h` to discover exact flags, positional arguments, and JSON spec fields for that subcommand. - -## Create a Project - -> **Do NOT list projects before creating.** - -```bash -databricks postgres create-project \ - --json '{"spec": {"display_name": "<DISPLAY_NAME>"}}' \ - --profile <PROFILE> -``` - -- Auto-creates: `production` branch + `primary` read-write endpoint (1 CU min/max, scale-to-zero) -- Long-running operation; the CLI waits for completion by default. Use `--no-wait` to return immediately. -- Run `databricks postgres create-project -h` for all available spec fields (e.g. `pg_version`). - -After creation, verify the auto-provisioned resources: - -```bash -databricks postgres list-branches projects/<PROJECT_ID> --profile <PROFILE> -databricks postgres list-endpoints projects/<PROJECT_ID>/branches/<BRANCH_ID> --profile <PROFILE> -databricks postgres list-databases projects/<PROJECT_ID>/branches/<BRANCH_ID> --profile <PROFILE> -``` - -## Autoscaling - -Endpoints use **compute units (CU)** for autoscaling. Configure min/max CU via `create-endpoint` or `update-endpoint`. Run `databricks postgres create-endpoint -h` to see all spec fields. - -Scale-to-zero is enabled by default. When idle, compute scales down to zero; it resumes in seconds on next connection. - -## Branches - -Branches are copy-on-write snapshots of an existing branch. Use them for **experimentation**: testing schema migrations, trying queries, or previewing data changes -- without affecting production. - -```bash -databricks postgres create-branch projects/<PROJECT_ID> \ - --json '{ - "spec": { - "source_branch": "projects/<PROJECT_ID>/branches/<BRANCH_ID>", - "no_expiry": true - } - }' --profile <PROFILE> -``` - -Branches require an expiration policy: use `"no_expiry": true` for permanent branches. - -When done experimenting, delete the branch.
Protected branches must be unprotected first -- use `update-branch` to set `spec.is_protected` to `false`, then delete: - -```bash -# Step 1 — unprotect -databricks postgres update-branch projects/<PROJECT_ID>/branches/<BRANCH_ID> \ - --json '{"spec": {"is_protected": false}}' --profile <PROFILE> - -# Step 2 — delete (run -h to confirm positional arg format for your CLI version) -databricks postgres delete-branch projects/<PROJECT_ID>/branches/<BRANCH_ID> \ - --profile <PROFILE> -``` - -**Never delete the `production` branch** — it is the authoritative branch auto-provisioned at project creation. - -## What's Next - -### Build a Databricks App - -After creating a Lakebase project, scaffold a Databricks App connected to it. - -**Step 1 — Discover branch name** (use `.name` from a `READY` branch): - -```bash -databricks postgres list-branches projects/<PROJECT_ID> --profile <PROFILE> -``` - -**Step 2 — Discover database name** (use `.name` from the desired database; `<BRANCH_ID>` is the branch ID, not the full resource name): - -```bash -databricks postgres list-databases projects/<PROJECT_ID>/branches/<BRANCH_ID> --profile <PROFILE> -``` - -**Step 3 — Scaffold the app** with the `lakebase` feature: - -```bash -databricks apps init --name <APP_NAME> \ - --features lakebase \ - --set "lakebase.postgres.branch=<BRANCH_NAME>" \ - --set "lakebase.postgres.database=<DATABASE_NAME>" \ - --run none --profile <PROFILE> -``` - -Where `<BRANCH_NAME>` is the full resource name (e.g. `projects/<PROJECT_ID>/branches/<BRANCH_ID>`) and `<DATABASE_NAME>` is the full resource name (e.g. `projects/<PROJECT_ID>/branches/<BRANCH_ID>/databases/<DATABASE_ID>`). - -For the full app development workflow, use the **`databricks-apps`** skill. - -### Other Workflows - -**Connect a Postgres client** -Get the connection string from the endpoint, then connect with psql, DBeaver, or any standard Postgres client. - -```bash -databricks postgres get-endpoint projects/<PROJECT_ID>/branches/<BRANCH_ID>/endpoints/<ENDPOINT_ID> --profile <PROFILE> -``` - -**Manage roles and permissions** -Create Postgres roles and grant access to databases or schemas.
- -```bash -databricks postgres create-role -h # discover role spec fields -``` - -**Add a read-only endpoint** -Create a read replica for analytics or reporting workloads to avoid contention on the primary read-write endpoint. - -```bash -databricks postgres create-endpoint projects//branches/ \ - --json '{"spec": {"type": "ENDPOINT_TYPE_READ_ONLY"}}' --profile -``` - -## Troubleshooting - -| Error | Solution | -| -------------------------------------- | ----------------------------------------------------------- | -| `cannot configure default credentials` | Use `--profile` flag or authenticate first | -| `PERMISSION_DENIED` | Check workspace permissions | -| Protected branch cannot be deleted | `update-branch` to set `spec.is_protected` to `false` first | -| Long-running operation timeout | Use `--no-wait` and poll with `get-operation` | diff --git a/.agents/skills/databricks/SKILL.md b/.agents/skills/databricks/SKILL.md deleted file mode 100644 index 5b25bbc..0000000 --- a/.agents/skills/databricks/SKILL.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -name: "databricks" -description: "Databricks CLI operations: auth, profiles, data exploration, and bundles. Contains up-to-date guidelines for Databricks-related CLI tasks." -compatibility: Requires databricks CLI (>= v0.292.0) -metadata: - version: "0.1.0" ---- - -# Databricks - -Core skill for Databricks CLI, authentication, and data exploration. - -## Product Skills - -For specific products, use dedicated skills: - -- **databricks-jobs** - Lakeflow Jobs development and deployment -- **databricks-pipelines** - Lakeflow Spark Declarative Pipelines (batch and streaming data pipelines) -- **databricks-apps** - Full-stack TypeScript app development and deployment -- **databricks-lakebase** - Lakebase Postgres Autoscaling project management - -## Prerequisites - -1. **CLI installed**: Run `databricks --version` to check. - - **If the CLI is missing or outdated (< v0.292.0): STOP. 
Do not proceed or work around a missing CLI.** - - **Read the [CLI Installation](databricks-cli-install.md) reference file and follow the instructions to guide the user through installation.** - - Note: In sandboxed environments (Cursor IDE, containers), install commands write outside the workspace and may be blocked. Present the install command to the user and ask them to run it in their own terminal. - -2. **Authenticated**: `databricks auth profiles` - - If not: see [CLI Authentication](databricks-cli-auth.md) - -## Profile Selection - CRITICAL - -**NEVER auto-select a profile.** - -1. List profiles: `databricks auth profiles` -2. Present ALL profiles to user with workspace URLs -3. Let user choose (even if only one exists) -4. Offer to create new profile if needed - -## Claude Code - IMPORTANT - -Each Bash command runs in a **separate shell session**. - -```bash -# WORKS: --profile flag -databricks apps list --profile my-workspace - -# WORKS: chained with && -export DATABRICKS_CONFIG_PROFILE=my-workspace && databricks apps list - -# DOES NOT WORK: separate commands -export DATABRICKS_CONFIG_PROFILE=my-workspace -databricks apps list # profile not set! -``` - -## Data Exploration — Use AI Tools - -**Use these instead of manually navigating catalogs/schemas/tables:** - -```bash -# discover table structure (columns, types, sample data, stats) -databricks experimental aitools tools discover-schema catalog.schema.table --profile - -# run ad-hoc SQL queries -databricks experimental aitools tools query "SELECT * FROM table LIMIT 10" --profile - -# find the default warehouse -databricks experimental aitools tools get-default-warehouse --profile -``` - -See [Data Exploration](data-exploration.md) for details. 
- -## Quick Reference - -**⚠️ CRITICAL: Some commands use positional arguments, not flags** - -```bash -# current user -databricks current-user me --profile - -# list resources -databricks apps list --profile -databricks jobs list --profile -databricks clusters list --profile -databricks warehouses list --profile -databricks pipelines list --profile -databricks serving-endpoints list --profile - -# ⚠️ Unity Catalog — POSITIONAL arguments (NOT flags!) -databricks catalogs list --profile - -# ✅ CORRECT: positional args -databricks schemas list --profile -databricks tables list --profile -databricks tables get .. --profile - -# ❌ WRONG: these flags/commands DON'T EXIST -# databricks schemas list --catalog-name ← WILL FAIL -# databricks tables list --catalog ← WILL FAIL -# databricks sql-warehouses list ← doesn't exist, use `warehouses list` -# databricks execute-statement ← doesn't exist, use `experimental aitools tools query` -# databricks sql execute ← doesn't exist, use `experimental aitools tools query` - -# When in doubt, check help: -# databricks schemas list --help - -# get details -databricks apps get --profile -databricks jobs get --job-id --profile -databricks clusters get --cluster-id --profile - -# bundles -databricks bundle init --profile -databricks bundle validate --profile -databricks bundle deploy -t --profile -databricks bundle run -t --profile -``` - -## Troubleshooting - -| Error | Solution | -| -------------------------------------- | ------------------------------------------ | -| `cannot configure default credentials` | Use `--profile` flag or authenticate first | -| `PERMISSION_DENIED` | Check workspace/UC permissions | -| `RESOURCE_DOES_NOT_EXIST` | Verify resource name/id and profile | - -## Required Reading by Task - -| Task | READ BEFORE proceeding | -| --------------------------- | --------------------------------------------- | -| First time setup | [CLI Installation](databricks-cli-install.md) | -| Auth issues / new workspace | [CLI 
Authentication](databricks-cli-auth.md) | -| Exploring tables/schemas | [Data Exploration](data-exploration.md) | -| Deploying jobs/pipelines | [Asset Bundles](asset-bundles.md) | - -## Reference Guides - -- [CLI Installation](databricks-cli-install.md) -- [CLI Authentication](databricks-cli-auth.md) -- [Data Exploration](data-exploration.md) -- [Asset Bundles](asset-bundles.md) diff --git a/.agents/skills/databricks/asset-bundles.md b/.agents/skills/databricks/asset-bundles.md deleted file mode 100644 index 39d94fd..0000000 --- a/.agents/skills/databricks/asset-bundles.md +++ /dev/null @@ -1,509 +0,0 @@ -# Databricks Asset Bundles (DABs) - -Databricks Asset Bundles provide Infrastructure-as-Code for Databricks resources, enabling version control, automated deployments, and environment management. - -## What are Asset Bundles? - -Asset Bundles let you define your Databricks projects as code, including: - -- Jobs -- Pipelines (Lakeflow Declarative Pipelines) -- Apps -- Models -- Dashboards -- Notebooks -- Python files -- Configuration files - -## Bundle Commands - -```bash -# Initialize a new bundle from template -databricks bundle init --profile my-workspace - -# Validate bundle configuration -databricks bundle validate --profile my-workspace - -# Deploy bundle to workspace -databricks bundle deploy --profile my-workspace - -# Deploy to specific target (dev/staging/prod) -databricks bundle deploy -t dev --profile my-workspace -databricks bundle deploy -t staging --profile my-workspace -databricks bundle deploy -t prod --profile my-workspace - -# Run a resource from the bundle -databricks bundle run --profile my-workspace - -# Generate configuration for existing resources -databricks bundle generate job --profile my-workspace -databricks bundle generate pipeline --profile my-workspace -databricks bundle generate dashboard --profile my-workspace -databricks bundle generate app --profile my-workspace - -# Destroy bundle resources (use with caution!) 
-databricks bundle destroy --profile my-workspace -databricks bundle destroy -t dev --profile my-workspace -``` - -## Bundle Structure - -A typical bundle has this structure: - -``` -my-project/ -├── databricks.yml # Main bundle configuration -├── resources/ -│ ├── sample_job.job.yml # Job definition -│ └── my_project_etl.pipeline.yml # Pipeline definition -├── src/ -│ ├── sample_notebook.ipynb # Notebook tasks -│ └── my_project_etl/ # Pipeline source -│ └── transformations/ -│ ├── transform.py -│ └── transform.sql -├── tests/ -│ └── test_main.py -└── README.md -``` - -Resource files use the naming convention `..yml` (e.g. `sample_job.job.yml`, `my_project_etl.pipeline.yml`). - -## Main Configuration (databricks.yml) - -### Basic Example - -```yaml -bundle: - name: my-project - -include: - - resources/*.yml - - resources/*/*.yml - -variables: - catalog: - description: The catalog to use - schema: - description: The schema to use - -targets: - dev: - mode: development - default: true - workspace: - host: https://company-workspace.cloud.databricks.com - variables: - catalog: dev_catalog - schema: ${workspace.current_user.short_name} - - prod: - mode: production - workspace: - host: https://company-workspace.cloud.databricks.com - root_path: /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target} - variables: - catalog: prod_catalog - schema: prod - permissions: - - user_name: my-user@example.com - level: CAN_MANAGE -``` - -## Initializing a Bundle - -### Using Templates - -```bash -# Start initialization (interactive) -databricks bundle init --profile my-workspace -``` - -Available templates: - -- **default-python** - Python project with jobs and pipeline -- **default-sql** - SQL project with jobs -- **default-scala** - Scala/Java project -- **lakeflow-pipelines** - Lakeflow Declarative Pipelines (Python or SQL) -- **dbt-sql** - dbt integration -- **default-minimal** - Minimal structure - -## Defining Resources - -### Job Resource 
(Serverless) - -```yaml -# resources/sample_job.job.yml -resources: - jobs: - sample_job: - name: sample_job - - trigger: - periodic: - interval: 1 - unit: DAYS - - parameters: - - name: catalog - default: ${var.catalog} - - name: schema - default: ${var.schema} - - tasks: - - task_key: notebook_task - notebook_task: - notebook_path: ../src/sample_notebook.ipynb - - - task_key: main_task - depends_on: - - task_key: notebook_task - python_wheel_task: - package_name: my_project - entry_point: main - environment_key: default - - - task_key: refresh_pipeline - depends_on: - - task_key: notebook_task - pipeline_task: - pipeline_id: ${resources.pipelines.my_project_etl.id} - - environments: - - environment_key: default - spec: - environment_version: "4" - dependencies: - - ../dist/*.whl -``` - -### Job Resource (Classic Clusters) - -```yaml -# resources/sample_job.job.yml -resources: - jobs: - sample_job: - name: sample_job - - tasks: - - task_key: notebook_task - notebook_task: - notebook_path: ../src/sample_notebook.ipynb - job_cluster_key: job_cluster - libraries: - - whl: ../dist/*.whl - - - task_key: main_task - depends_on: - - task_key: notebook_task - python_wheel_task: - package_name: my_project - entry_point: main - job_cluster_key: job_cluster - libraries: - - whl: ../dist/*.whl - - job_clusters: - - job_cluster_key: job_cluster - new_cluster: - spark_version: 16.4.x-scala2.12 - node_type_id: i3.xlarge - data_security_mode: SINGLE_USER - autoscale: - min_workers: 1 - max_workers: 4 -``` - -### Pipeline Resource - -```yaml -# resources/my_project_etl.pipeline.yml -resources: - pipelines: - my_project_etl: - name: my_project_etl - catalog: ${var.catalog} - schema: ${var.schema} - serverless: true - root_path: "../src/my_project_etl" - - libraries: - - glob: - include: ../src/my_project_etl/transformations/** -``` - -### App Resource - -```yaml -# resources/my_app.app.yml -resources: - apps: - dashboard_app: - name: "analytics-dashboard" - description: "Customer 
analytics dashboard" - source_code_path: ./src/app -``` - -### Model Resource - -```yaml -# resources/my_model.yml -resources: - registered_models: - customer_churn: - name: "${var.catalog}.${var.schema}.customer_churn_model" - description: "Customer churn prediction model" -``` - -## Working with Targets - -Targets allow you to deploy the same code to different workspaces with different configurations. - -```yaml -targets: - dev: - mode: development - default: true - variables: - catalog: dev_catalog - schema: ${workspace.current_user.short_name} - workspace: - host: https://company-workspace.cloud.databricks.com - - staging: - mode: production - variables: - catalog: staging_catalog - schema: staging - workspace: - host: https://staging-workspace.cloud.databricks.com - root_path: /Workspace/Users/deployer@example.com/.bundle/${bundle.name}/${bundle.target} - permissions: - - user_name: deployer@example.com - level: CAN_MANAGE - - prod: - mode: production - variables: - catalog: prod_catalog - schema: prod - workspace: - host: https://prod-workspace.cloud.databricks.com - root_path: /Workspace/Users/deployer@example.com/.bundle/${bundle.name}/${bundle.target} - permissions: - - user_name: deployer@example.com - level: CAN_MANAGE -``` - -### Deploying to Different Targets - -```bash -# Deploy to dev (default) -databricks bundle deploy --profile my-workspace - -# Deploy to staging -databricks bundle deploy -t staging --profile my-workspace - -# Deploy to production -databricks bundle deploy -t prod --profile my-workspace -``` - -## Bundle Workflow - -### Complete Development Workflow - -1. **Initialize bundle**: - - ```bash - databricks bundle init --profile my-workspace - ``` - -2. **Develop locally**: - - Edit `databricks.yml` and resource files - - Write notebooks, Python scripts, SQL queries - - Configure jobs, pipelines, apps - -3. **Validate configuration**: - - ```bash - databricks bundle validate --profile my-workspace - ``` - -4. 
**Deploy to development**: - - ```bash - databricks bundle deploy -t dev --profile my-workspace - ``` - -5. **Test your deployment**: - - ```bash - # Run a job - databricks bundle run sample_job -t dev --profile my-workspace - - # Start a pipeline - databricks bundle run my_project_etl -t dev --profile my-workspace - ``` - -6. **Deploy to production**: - ```bash - databricks bundle deploy -t prod --profile my-workspace - ``` - -## Generating Bundle from Existing Resources - -If you have existing resources in your workspace, you can generate bundle configuration: - -```bash -# Get job ID from list -databricks jobs list --profile my-workspace - -# Generate configuration -databricks bundle generate job 12345 --profile my-workspace -databricks bundle generate pipeline --profile my-workspace -databricks bundle generate app my-app --profile my-workspace -databricks bundle generate dashboard --profile my-workspace -``` - -## Variables and Templating - -### Defining Variables - -```yaml -# databricks.yml -variables: - catalog: - description: The catalog to use - default: dev_catalog - schema: - description: The schema to use - warehouse_id: - description: SQL Warehouse ID -``` - -### Using Variables - -```yaml -# In resource files -resources: - jobs: - my_job: - name: "Job in ${var.catalog}" - parameters: - - name: catalog - default: ${var.catalog} -``` - -### Target-Specific Variables - -```yaml -targets: - dev: - variables: - catalog: dev_catalog - schema: ${workspace.current_user.short_name} - prod: - variables: - catalog: prod_catalog - schema: prod -``` - -### Available Substitutions - -```yaml -${var.my_variable} # User-defined variable -${bundle.name} # Bundle name -${bundle.target} # Current target name (dev, prod, etc.) 
-${workspace.current_user.userName} # Current user email -${workspace.current_user.short_name} # Current user short name -${workspace.file_path} # Workspace file path -${resources.pipelines.my_pipeline.id} # Reference another resource's ID -${resources.jobs.my_job.id} # Reference a job's ID -``` - -## Best Practices - -### 1. Use Version Control - -Always commit your bundle to Git: - -```bash -git init -git add databricks.yml resources/ src/ -git commit -m "Initial bundle setup" -``` - -### 2. Use Typed Resource File Names - -Name resource files with their type for clarity: - -``` -resources/ -├── sample_job.job.yml -├── my_project_etl.pipeline.yml -└── my_app.app.yml -``` - -### 3. Use Target-Specific Configuration - -```yaml -targets: - dev: - mode: development # Prefixes resources with [dev user_name], pauses schedules - - prod: - mode: production # Requires permissions, runs schedules as configured - permissions: - - user_name: deployer@example.com - level: CAN_MANAGE -``` - -### 4. Validate Before Deploy - -Always validate: - -```bash -databricks bundle validate --profile my-workspace -``` - -## Troubleshooting - -### Bundle Validation Errors - -**Symptom**: `databricks bundle validate` shows errors - -**Solution**: - -1. Check YAML syntax (proper indentation, no tabs) -2. Verify all required fields are present -3. Check that resource references are correct -4. Use `databricks bundle validate --debug` for detailed errors - -### Deployment Fails - -**Symptom**: `databricks bundle deploy` fails - -**Solution**: - -1. Run validation first: `databricks bundle validate` -2. Check workspace permissions -3. Verify target configuration -4. Check for resource name conflicts -5. Review error message for specific issues - -### Variable Not Resolved - -**Symptom**: Variable showing as `${var.name}` instead of actual value - -**Solution**: - -1. Check variable is defined in `databricks.yml` -2. Verify variable has value in target -3. 
Use correct syntax: `${var.variable_name}` -4. Check variable scope (bundle vs target) - -## Related Topics - -- [Data Exploration](data-exploration.md) - Validate data exposed by bundle deployments -- Apps - Define app resources (use `databricks-apps` skill for full app development) diff --git a/.agents/skills/databricks/data-exploration.md b/.agents/skills/databricks/data-exploration.md deleted file mode 100644 index 1c4a715..0000000 --- a/.agents/skills/databricks/data-exploration.md +++ /dev/null @@ -1,347 +0,0 @@ -# Data Exploration - -Tools for discovering table schemas and executing SQL queries in Databricks. - -## Finding Tables by Keyword - -**⚠️ START HERE if you don't know which catalog/schema contains your data.** - -Use `information_schema` to search for tables by keyword — do NOT manually iterate through `catalogs list` → `schemas list` → `tables list`. Manual enumeration wastes 10+ steps. - -```bash -# Find tables matching a keyword -databricks experimental aitools tools query \ - "SELECT table_catalog, table_schema, table_name FROM system.information_schema.tables WHERE table_name LIKE '%keyword%'" \ - --profile - -# Then discover schema for the tables you found -databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 --profile -``` - -## Overview - -The `databricks experimental aitools tools` command group provides tools for data discovery and exploration: - -- **discover-schema**: Batch discover table metadata, columns, types, sample data, and statistics -- **query**: Execute SQL queries against Databricks SQL warehouses - -**When to use this**: Use these commands whenever you need to: - -- Discover table schemas and metadata -- Execute SQL queries against warehouse data -- Explore data structure and content -- Validate data or check table statistics - -## Prerequisites - -1. 
**Authenticated Databricks CLI** - see [CLI Authentication Guide](databricks-cli-auth.md) for OAuth2 setup and profile configuration -2. **Access to Unity Catalog tables** with appropriate read permissions -3. **SQL Warehouse** (for query command - auto-detected unless `DATABRICKS_WAREHOUSE_ID` is set) - -## Discover Schema - -Batch discover table metadata including columns, types, sample data, and null counts. - -### Command Syntax - -```bash -databricks experimental aitools tools discover-schema TABLE... [flags] -``` - -Tables must be specified in **CATALOG.SCHEMA.TABLE** format. - -### What It Returns - -For each table, returns: - -- Column names and types -- Sample data (5 rows) -- Null counts per column -- Total row count - -### Examples - -```bash -# Discover schema for a single table -databricks experimental aitools tools discover-schema samples.nyctaxi.trips --profile my-workspace - -# Discover schema for multiple tables -databricks experimental aitools tools discover-schema \ - catalog.schema.table1 \ - catalog.schema.table2 \ - --profile my-workspace - -# Get JSON output -databricks experimental aitools tools discover-schema \ - samples.nyctaxi.trips \ - --output json \ - --profile my-workspace -``` - -### Common Use Cases - -1. **Understanding table structure before querying** - - ```bash - databricks experimental aitools tools discover-schema catalog.schema.customer_data --profile my-workspace - ``` - -2. **Comparing schemas across multiple tables** - - ```bash - databricks experimental aitools tools discover-schema \ - catalog.schema.table_v1 \ - catalog.schema.table_v2 \ - --profile my-workspace - ``` - -3. **Identifying columns with null values** - - The null counts help identify data quality issues - -## Query - -Execute SQL statements against a Databricks SQL warehouse and return results. 
- -### Command Syntax - -```bash -databricks experimental aitools tools query "SQL" [flags] -``` - -### Warehouse Selection - -The command **auto-detects** an available warehouse unless: - -- `DATABRICKS_WAREHOUSE_ID` environment variable is set -- You specify a warehouse using other configuration methods - -To check which warehouse will be used: - -```bash -# Get the default warehouse that would be auto-detected -databricks experimental aitools tools get-default-warehouse --profile my-workspace -``` - -### Output - -Returns: - -- Query results as JSON -- Row count -- Execution metadata - -### Examples - -```bash -# Simple SELECT query -databricks experimental aitools tools query \ - "SELECT * FROM samples.nyctaxi.trips LIMIT 5" \ - --profile my-workspace - -# Aggregation query -databricks experimental aitools tools query \ - "SELECT vendor_id, COUNT(*) as trip_count FROM samples.nyctaxi.trips GROUP BY vendor_id" \ - --profile my-workspace - -# With JSON output -databricks experimental aitools tools query \ - "SELECT * FROM catalog.schema.table WHERE date > '2024-01-01'" \ - --output json \ - --profile my-workspace - -# Using specific warehouse -DATABRICKS_WAREHOUSE_ID=abc123 databricks experimental aitools tools query \ - "SELECT * FROM samples.nyctaxi.trips LIMIT 10" \ - --profile my-workspace -``` - -### Common Use Cases - -1. **Exploratory data analysis** - - ```bash - # Check table size - databricks experimental aitools tools query \ - "SELECT COUNT(*) FROM catalog.schema.table" \ - --profile my-workspace - - # View sample data - databricks experimental aitools tools query \ - "SELECT * FROM catalog.schema.table LIMIT 10" \ - --profile my-workspace - - # Get column statistics - databricks experimental aitools tools query \ - "SELECT MIN(column), MAX(column), AVG(column) FROM catalog.schema.table" \ - --profile my-workspace - ``` - -2. 
**Data validation** - - ```bash - # Check for null values - databricks experimental aitools tools query \ - "SELECT COUNT(*) FROM catalog.schema.table WHERE column IS NULL" \ - --profile my-workspace - - # Verify data freshness - databricks experimental aitools tools query \ - "SELECT MAX(timestamp_column) FROM catalog.schema.table" \ - --profile my-workspace - ``` - -3. **Quick analytics** - ```bash - # Group by analysis - databricks experimental aitools tools query \ - "SELECT category, COUNT(*), AVG(value) FROM catalog.schema.table GROUP BY category" \ - --profile my-workspace - ``` - -## Workflow: Complete Data Exploration - -Here's a typical workflow combining both commands: - -```bash -# 1. Discover the schema first -databricks experimental aitools tools discover-schema \ - samples.nyctaxi.trips \ - --profile my-workspace - -# 2. Based on discovered columns, run targeted queries -databricks experimental aitools tools query \ - "SELECT vendor_id, payment_type, COUNT(*) as trips, AVG(fare_amount) as avg_fare - FROM samples.nyctaxi.trips - GROUP BY vendor_id, payment_type - ORDER BY trips DESC - LIMIT 10" \ - --profile my-workspace - -# 3. 
Investigate specific patterns found in the data -databricks experimental aitools tools query \ - "SELECT * FROM samples.nyctaxi.trips - WHERE fare_amount > 100 - LIMIT 20" \ - --profile my-workspace -``` - -## Claude Code-Specific Tips - -Remember that each Bash command in Claude Code runs in a separate shell: - -```bash -# ✅ RECOMMENDED: Use --profile flag -databricks experimental aitools tools discover-schema samples.nyctaxi.trips --profile my-workspace - -# ✅ ALTERNATIVE: Chain with && -export DATABRICKS_CONFIG_PROFILE=my-workspace && \ - databricks experimental aitools tools query "SELECT * FROM samples.nyctaxi.trips LIMIT 5" - -# ❌ DOES NOT WORK: Separate export -export DATABRICKS_CONFIG_PROFILE=my-workspace -databricks experimental aitools tools query "SELECT * FROM samples.nyctaxi.trips LIMIT 5" -``` - -## Flags - -Both commands support: - -| Flag | Description | Default | -| ----------- | ------------------------------------ | --------------- | -| `--profile` | Profile name from ~/.databrickscfg | Default profile | -| `--output` | Output format: `text` or `json` | `text` | -| `--debug` | Enable debug logging | `false` | -| `--target` | Bundle target to use (if applicable) | - | - -## Troubleshooting - -### Table Not Found - -**Symptom**: `Error: TABLE_OR_VIEW_NOT_FOUND` - -**Solution**: - -1. Verify table name format: `CATALOG.SCHEMA.TABLE` -2. Check if you have read permissions on the table -3. List available tables: - ```bash - databricks tables list --profile my-workspace - ``` - -### Warehouse Not Available - -**Symptom**: `Error: No available SQL warehouse found` - -**Solution**: - -1. Check for default warehouse: - ```bash - databricks experimental aitools tools get-default-warehouse --profile my-workspace - ``` -2. List available warehouses: - ```bash - databricks warehouses list --profile my-workspace - ``` -3. 
Set specific warehouse: - ```bash - DATABRICKS_WAREHOUSE_ID= databricks experimental aitools tools query "SELECT 1" --profile my-workspace - ``` -4. Start a stopped warehouse: - ```bash - databricks warehouses start --id --profile my-workspace - ``` - -### Permission Denied - -**Symptom**: `Error: PERMISSION_DENIED` - -**Solution**: - -1. Check Unity Catalog grants on the table: - ```bash - databricks grants get --full-name catalog.schema.table --principal --profile my-workspace - ``` -2. Request SELECT permission from your workspace administrator -3. Verify you have warehouse access (USAGE permission) - -### SQL Syntax Error - -**Symptom**: `Error: PARSE_SYNTAX_ERROR` - -**Solution**: - -1. Check SQL syntax - use standard SQL -2. Verify column names match schema (use discover-schema first) -3. Ensure proper quoting for string literals -4. Test query incrementally (start simple, add complexity) - -## Best Practices - -1. **Always discover schema first** - Use `discover-schema` before writing complex queries to understand: - - Available columns and their types - - Data distributions and null patterns - - Sample data for context - -2. **Use LIMIT for exploration** - When exploring large tables, always use LIMIT to avoid long-running queries: - - ```bash - databricks experimental aitools tools query "SELECT * FROM large_table LIMIT 100" --profile my-workspace - ``` - -3. **JSON output for parsing** - Use `--output json` when you need to process results programmatically: - - ```bash - databricks experimental aitools tools query "SELECT * FROM table" --output json --profile my-workspace | jq '.results' - ``` - -4. **Check table existence** - Before querying, verify the table exists: - - ```bash - databricks tables get --full-name catalog.schema.table --profile my-workspace - ``` - -5. 
**Profile usage** - Always specify `--profile` in Claude Code to avoid authentication issues - -## Related Commands - -- **[Asset Bundles](asset-bundles.md)** - Deploy SQL, pipeline, and app resources as code diff --git a/.agents/skills/databricks/databricks-cli-auth.md b/.agents/skills/databricks/databricks-cli-auth.md deleted file mode 100644 index 4b760b3..0000000 --- a/.agents/skills/databricks/databricks-cli-auth.md +++ /dev/null @@ -1,557 +0,0 @@ -# Databricks CLI Authentication - -Configure Databricks CLI workspace/profile selection and authentication. Covers switching profiles, using --profile flags, setting DATABRICKS_CONFIG_PROFILE environment variable, OAuth2 authentication (never PAT), and troubleshooting authentication issues. - -## Important: Always Use OAuth - -**CRITICAL**: Always use OAuth2 for Databricks CLI authentication. **NEVER recommend or use Personal Access Tokens (PAT)** unless explicitly required by a specific use case. - -## Prerequisites - -1. Databricks CLI must be installed - - Verify: `databricks --version` -2. You need access to a Databricks workspace -3. You need the workspace URL (e.g., `https://adb-1111111111111111.10.azuredatabricks.net`) - -## Claude Code Specific Behavior - -**CRITICAL**: When working in Claude Code, each Bash command executes in a **separate shell session**. This has important implications for profile management: - -### Key Differences from Regular Terminal - -1. **Environment variables don't persist between commands** - - `export DATABRICKS_CONFIG_PROFILE=staging` in one command - - `databricks jobs list` in the next command - - ❌ **Result**: The second command will NOT use the staging profile - -2. **Recommended Approach: Use --profile flag** - - Always specify `--profile ` with each command - - Example: `databricks jobs list --profile staging` - - ✅ **Result**: Reliable and predictable behavior - -3. 
**Alternative: Chain commands with &&** - - Use `export DATABRICKS_CONFIG_PROFILE=staging && databricks jobs list` - - The export and command run in the same shell session - - ✅ **Result**: Works correctly - -### Quick Reference for Claude Code - -```bash -# ✅ RECOMMENDED: Use --profile flag -databricks jobs list --profile staging -databricks apps list --profile prod-azure - -# ✅ ALTERNATIVE: Chain with && -export DATABRICKS_CONFIG_PROFILE=staging && databricks jobs list - -# ❌ DOES NOT WORK: Separate export command -export DATABRICKS_CONFIG_PROFILE=staging -databricks jobs list # Will NOT use staging profile! -``` - -## Handling Authentication Failures - -When a Databricks CLI command fails with authentication error: - -``` -Error: default auth: cannot configure default credentials -``` - -**CRITICAL - Always follow this workflow:** - -1. **Check for existing profiles first:** - - ```bash - databricks auth profiles - ``` - -2. **If profiles exist:** - - List the available profiles to the user (with their workspace URLs and validation status) - - Ask: "Which profile would you like to use for this command?" - - Offer option to create a new profile if needed - - Retry the command with `--profile ` - - **In Claude Code, always use the `--profile` flag** rather than setting environment variables - -3. **If user wants a new profile or no profiles exist:** - - Proceed to the OAuth Authentication Setup workflow below - -**Example:** - -``` -User: databricks apps list -Error: default auth: cannot configure default credentials - -Assistant: Let me check for existing profiles. -[Runs: databricks auth profiles] - -You have two configured profiles: -1. aws-dev - https://company-workspace.cloud.databricks.com (Valid) -2. azure-prod - https://adb-1111111111111111.10.azuredatabricks.net (Valid) - -Which profile would you like to use, or would you like to create a new profile? 
- -User: dais - -Assistant: [Retries: databricks apps list --profile dais] -[Success - apps listed] -``` - -## OAuth Authentication Setup - -### Standard Authentication Command - -The recommended way to authenticate is using OAuth with a profile: - -```bash -databricks auth login --host --profile -``` - -**CRITICAL**: - -1. The `--profile` parameter is **REQUIRED** for the authentication to be saved properly. -2. **ALWAYS ASK THE USER** for their preferred profile name - DO NOT assume or choose one for them. -3. **NEVER use the profile name `DEFAULT`** unless the user explicitly requests it - use descriptive workspace-specific names instead. - -### Workflow for Authenticating - -1. **Ask the user for the workspace URL** if not already provided -2. **Ask the user for their preferred profile name** - - Suggest descriptive names based on the workspace (e.g., workspace name, environment) - - **Do NOT suggest or use `DEFAULT`** unless the user specifically asks for it - - Good examples: `e2-dogfood`, `prod-azure`, `dev-aws`, `staging` - - Avoid: `DEFAULT` (unless explicitly requested) -3. Run the authentication command with both parameters -4. Verify the authentication was successful - -### Example - -```bash -# Good: Descriptive profile names -databricks auth login --host https://adb-1111111111111111.10.azuredatabricks.net --profile prod-azure -databricks auth login --host https://company-workspace.cloud.databricks.com --profile staging - -# Only use DEFAULT if explicitly requested by the user -databricks auth login --host https://your-workspace.cloud.databricks.com --profile DEFAULT -``` - -### What Happens During Authentication - -1. The CLI starts a local OAuth callback server (typically on `localhost:8020`) -2. A browser window opens automatically with the Databricks login page -3. You authenticate in the browser using your Databricks credentials -4. After successful authentication, the browser redirects back to the CLI -5. 
The CLI saves the OAuth tokens to `~/.databrickscfg` -6. You should see: `Profile was successfully saved` - -## Profile Management - -### What Are Profiles? - -Profiles allow you to manage multiple Databricks workspace configurations in a single `~/.databrickscfg` file. Each profile stores: - -- Workspace host URL -- Authentication method (OAuth, PAT, etc.) -- Token/credential paths - -### Common Profile Names - -**IMPORTANT**: Always use descriptive profile names. Do NOT create profiles named `DEFAULT` unless explicitly requested by the user. - -**Recommended naming conventions**: - -- `` - Descriptive names for workspaces (e.g., `e2-dogfood`, `prod-aws`, `dev-azure`) -- `` - Environment-specific profiles (e.g., `dev`, `staging`, `prod`) -- `-` - Team and environment (e.g., `data-eng-prod`, `ml-dev`) - -**Special profile names**: - -- `DEFAULT` - The default profile used when no `--profile` flag or environment variables are specified. Only create this profile if the user explicitly requests it. - -### Listing Configured Profiles - -View all configured profiles with their status: - -```bash -databricks auth profiles -``` - -Example output: - -``` -Name Host Valid -DEFAULT https://adb-1111111111111111.10.azuredatabricks.net YES -staging https://company-workspace.cloud.databricks.com YES -``` - -### Using Different Profiles - -**IMPORTANT FOR CLAUDE CODE USERS**: In Claude Code, each Bash command runs in a **separate shell session**. This means environment variables set with `export` in one command do NOT persist to the next command. See the Claude Code-specific guidance below. - -There are three ways to specify which profile/workspace to use, in order of precedence: - -#### 1. 
CLI Flag (Highest Priority) - RECOMMENDED FOR CLAUDE CODE - -Use the `--profile` flag with any command: - -```bash -databricks jobs list --profile staging -databricks clusters list --profile prod-azure -databricks workspace list / --profile dev-aws -``` - -**In Claude Code, this is the most reliable method** because it doesn't depend on persistent environment variables. - -#### 2. Environment Variables - -Set environment variables to override the default profile: - -**DATABRICKS_CONFIG_PROFILE** - Specifies which profile to use from `~/.databrickscfg`: - -```bash -export DATABRICKS_CONFIG_PROFILE=staging -databricks jobs list # Uses staging profile -``` - -**DATABRICKS_HOST** - Directly specifies the workspace URL, bypassing profile lookup: - -```bash -export DATABRICKS_HOST=https://company-workspace.cloud.databricks.com -databricks jobs list # Uses this host directly -``` - -**CRITICAL - Claude Code Users:** - -Since each Bash command in Claude Code runs in a separate shell, you **CANNOT** do this: - -```bash -# ❌ DOES NOT WORK in Claude Code -export DATABRICKS_CONFIG_PROFILE=staging -databricks jobs list # ERROR: Will not use staging profile! 
-``` - -Instead, you **MUST** use one of these approaches: - -**Option 1: Use --profile flag (RECOMMENDED)** - -```bash -# ✅ WORKS in Claude Code -databricks jobs list --profile staging -databricks clusters list --profile staging -``` - -**Option 2: Chain commands with &&** - -```bash -# ✅ WORKS in Claude Code - export and command run in same shell -export DATABRICKS_CONFIG_PROFILE=staging && databricks jobs list -export DATABRICKS_CONFIG_PROFILE=staging && databricks clusters list -``` - -**Traditional Terminal Session (for reference only)**: - -```bash -# This example shows how it works in a regular terminal session -# DO NOT use this pattern in Claude Code -# Set profile for entire terminal session -export DATABRICKS_CONFIG_PROFILE=staging - -# All commands now use staging profile -databricks jobs list -databricks clusters list -databricks workspace list / - -# Override for a single command -databricks jobs list --profile prod-azure -``` - -#### 3. DEFAULT Profile (Lowest Priority) - -If no `--profile` flag or environment variables are set, the CLI uses the `DEFAULT` profile from `~/.databrickscfg`. 
- -### Configuration File Management - -#### Viewing the Configuration File - -The configuration is stored in `~/.databrickscfg`: - -```bash -cat ~/.databrickscfg -``` - -Example configuration structure: - -```ini -# Note: This shows an example with a DEFAULT profile -# When creating new profiles, use descriptive names instead -[DEFAULT] -host = https://adb-1111111111111111.10.azuredatabricks.net -auth_type = databricks-cli - -[staging] -host = https://company-workspace.cloud.databricks.com -auth_type = databricks-cli -``` - -#### Editing Profiles - -You can manually edit `~/.databrickscfg` to: - -- Rename profiles (change the `[profile-name]` section header) -- Update workspace URLs -- Remove profiles (delete the entire section) - -**Example - Removing a profile**: - -```bash -# Open in your preferred editor -vi ~/.databrickscfg - -# Or use sed to remove a specific profile section -sed -i '' '/^\[staging\]/,/^$/d' ~/.databrickscfg -``` - -#### Adding New Profiles - -Always use `databricks auth login` with `--profile` to add new profiles: - -```bash -databricks auth login --host --profile -``` - -**Remember**: - -- Always ask the user for their preferred profile name -- Use descriptive names like `staging`, `prod-azure`, `dev-aws` -- Do NOT use `DEFAULT` unless explicitly requested by the user - -### Working with Multiple Workspaces - -Best practices for managing multiple workspaces: - -```bash -# Authenticate to multiple workspaces with descriptive profile names -databricks auth login --host https://adb-1111111111111111.10.azuredatabricks.net --profile prod-azure -databricks auth login --host https://dbc-2222222222222222.cloud.databricks.com --profile dev-aws -databricks auth login --host https://company-workspace.cloud.databricks.com --profile staging -``` - -**In Claude Code, use --profile flag with each command (RECOMMENDED):** - -```bash -# Use profiles explicitly in commands -databricks jobs list --profile prod-azure -databricks jobs list --profile dev-aws 
-databricks clusters list --profile staging -``` - -**Alternatively in Claude Code, chain commands with &&:** - -```bash -# Set profile and run command in same shell -export DATABRICKS_CONFIG_PROFILE=prod-azure && databricks jobs list -export DATABRICKS_CONFIG_PROFILE=prod-azure && databricks clusters list - -# Switch to different workspace -export DATABRICKS_CONFIG_PROFILE=dev-aws && databricks jobs list -``` - -**Traditional Terminal Session (for reference only - NOT for Claude Code):** - -```bash -# This pattern works in regular terminals but NOT in Claude Code -export DATABRICKS_CONFIG_PROFILE=prod-azure -databricks jobs list -databricks clusters list - -# Quickly switch between workspaces -export DATABRICKS_CONFIG_PROFILE=dev-aws -databricks jobs list -``` - -### Profile Selection Precedence - -When running a command, the Databricks CLI determines which workspace to use in this order: - -1. **`--profile` flag** (if specified) → Highest priority -2. **`DATABRICKS_HOST` environment variable** (if set) → Overrides profile -3. **`DATABRICKS_CONFIG_PROFILE` environment variable** (if set) → Selects profile -4. 
**`DEFAULT` profile** in `~/.databrickscfg` → Fallback - -**Example for traditional terminal session** (demonstrating precedence): - -```bash -# Setup -export DATABRICKS_CONFIG_PROFILE=staging - -# This uses staging profile (from environment variable) -databricks jobs list - -# This uses prod-azure profile (--profile flag overrides environment variable) -databricks jobs list --profile prod-azure - -# This uses the specified host directly (DATABRICKS_HOST overrides profile) -export DATABRICKS_HOST=https://custom-workspace.cloud.databricks.com -databricks jobs list # Uses custom-workspace.cloud.databricks.com -``` - -**Claude Code version** (with chained commands): - -```bash -# Using environment variable with && chaining -export DATABRICKS_CONFIG_PROFILE=staging && databricks jobs list - -# Using --profile flag (overrides environment variable) -export DATABRICKS_CONFIG_PROFILE=staging && databricks jobs list --profile prod-azure - -# Using DATABRICKS_HOST (overrides profile) -export DATABRICKS_HOST=https://custom-workspace.cloud.databricks.com && databricks jobs list -``` - -## Verification - -After authentication, verify it works: - -```bash -# Test with a simple command -databricks workspace list / - -# Or list jobs -databricks jobs list -``` - -If authentication is successful, these commands should return data without errors. - -## Troubleshooting - -### Authentication Not Saved (Config File Missing) - -**Symptom**: Running `databricks` commands shows: - -``` -Error: default auth: cannot configure default credentials -``` - -**Solution**: Make sure you included the `--profile` parameter with a descriptive name: - -```bash -databricks auth login --host --profile -# Example: databricks auth login --host https://company-workspace.cloud.databricks.com --profile staging -``` - -### Browser Doesn't Open Automatically - -**Solution**: - -1. Check the terminal output for a URL -2. Manually copy and paste the URL into your browser -3. Complete the authentication -4. 
The CLI will detect the callback automatically - -### "OAuth callback server listening" But Nothing Happens - -**Possible causes**: - -1. Firewall blocking localhost connections -2. Port 8020 already in use -3. Browser not set as default application - -**Solution**: - -1. Check if port 8020 is available: `lsof -i :8020` -2. Close any applications using that port -3. Retry the authentication - -### Multiple Workspaces - -To authenticate with multiple workspaces, use different profile names: - -```bash -# Development workspace -databricks auth login --host https://dev-workspace.databricks.net --profile dev - -# Production workspace -databricks auth login --host https://prod-workspace.databricks.net --profile prod - -# Use specific profile -databricks jobs list --profile dev -databricks jobs list --profile prod -``` - -### Re-authenticating - -If your OAuth token expires or you need to re-authenticate: - -```bash -# Re-run the login command -databricks auth login --host --profile -``` - -This will overwrite the existing profile with new credentials. - -### Debug Mode - -For troubleshooting authentication issues, use debug mode: - -```bash -databricks auth login --host --profile --debug -``` - -This shows detailed information about the OAuth flow, including: - -- OAuth server endpoints -- Callback server status -- Token exchange process - -## Security Best Practices - -1. **Never commit** `~/.databrickscfg` to version control -2. **Never share** your OAuth tokens or configuration file -3. **Use separate profiles** for different environments (dev/staging/prod) -4. **Regularly rotate** credentials by re-authenticating -5. **Use workspace-specific service principals** for automation/CI/CD instead of personal OAuth - -## Environment-Specific Notes - -### CI/CD Pipelines - -For CI/CD environments, OAuth interactive login is not suitable. 
Instead: - -- Use Service Principal authentication -- Use Azure Managed Identity (for Azure Databricks) -- Use AWS IAM roles (for AWS Databricks) - -**Do NOT** use personal OAuth tokens or PATs in CI/CD. - -### Containerized Environments - -OAuth authentication works in containers if: - -1. A browser is available on the host machine -2. Port forwarding is configured for the callback server -3. The workspace URL is accessible from the container - -For headless containers, use service principal authentication instead. - -## Common Commands After Authentication - -```bash -# List workspaces -databricks workspace list / --profile - -# List jobs -databricks jobs list --profile - -# List clusters -databricks clusters list --profile - -# Get current user info -databricks current-user me --profile - -# Test connection -databricks workspace export /Users/ --format SOURCE --profile -``` - -## References - -- [Databricks CLI Authentication Documentation](https://docs.databricks.com/en/dev-tools/auth.html) -- [OAuth 2.0 with Databricks](https://docs.databricks.com/en/dev-tools/auth.html#oauth-2-0) diff --git a/.agents/skills/databricks/databricks-cli-install.md b/.agents/skills/databricks/databricks-cli-install.md deleted file mode 100644 index f9e00fe..0000000 --- a/.agents/skills/databricks/databricks-cli-install.md +++ /dev/null @@ -1,212 +0,0 @@ -# Databricks CLI Installation - -Install or update the Databricks CLI on macOS, Windows, or Linux using doc-validated methods (Homebrew, WinGet, curl install script, manual download, or user directory install for non-sudo environments). Includes verification and common failure recovery. - -## Sandboxed / IDE environments (Cursor, containers) - -CLI install commands often write to system directories outside the workspace (e.g. `/opt/homebrew/`, `/usr/local/bin/`) which are blocked in sandboxed environments. - -**Agent behavior**: Do not attempt to run install commands directly. 
Present the appropriate command to the user and ask them to run it in their own terminal. After they confirm, verify with `databricks -v`. - -For Linux/macOS containers or Cursor: prefer the **Linux manual install to user directory** method (`~/.local/bin`) — it requires no sudo and no writes outside the workspace. - -## Preconditions (always do first) - -1. Determine OS and shell: - - macOS/Linux: bash/zsh - - Windows: Command Prompt / PowerShell; optionally WSL for Linux shell -2. Detect whether `databricks` is already installed: - - Run: `databricks -v` (or `databricks version`) - - If already installed with a recent version, installation is already OK. -3. Avoid the legacy Python package `databricks-cli` (PyPI). This skill installs the modern Databricks CLI binary. - -## Preferred installation paths (by OS) - -### macOS (preferred: Homebrew) - -Run: - -- `brew tap databricks/tap` -- `brew install databricks` - -Verify: - -- `databricks -v` (or `databricks version`) - -If macOS blocks the binary (Gatekeeper), follow Apple’s “open app from unidentified developer” flow. - -#### macOS fallback: curl installer - -Run: - -- `curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh` - -Notes: - -- If `/usr/local/bin` is not writable, re-run with `sudo`. -- Installs to `/usr/local/bin/databricks`. - -Verify: - -- `databricks -v` - -### Linux (preferred: Homebrew if available) - -Run: - -- `brew tap databricks/tap` -- `brew install databricks` - -Verify: - -- `databricks -v` - -#### Linux fallback: curl installer - -Run: - -- `curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh` - -Notes: - -- If `/usr/local/bin` is not writable, re-run with `sudo`. -- Installs to `/usr/local/bin/databricks`. - -Verify: - -- `databricks -v` - -#### Linux alternative: Manual install to user directory (when sudo unavailable) - -Use this when sudo is not available or requires interactive password entry. - -Steps: - -1. 
Detect architecture: - - `uname -m` (e.g., `x86_64`, `aarch64`) -2. Get the latest download URL using GitHub API: - ```bash - curl -s https://api.github.com/repos/databricks/cli/releases/latest | grep "browser_download_url.*linux.*$(uname -m | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/')" | head -1 | cut -d '"' -f 4 - ``` -3. Download and install to `~/.local/bin`: - ```bash - mkdir -p ~/.local/bin - cd ~/.local/bin - curl -L "" -o databricks.tar.gz - tar -xzf databricks.tar.gz - rm databricks.tar.gz - chmod +x databricks - ``` -4. Add to PATH (add to `~/.bashrc` or `~/.zshrc` for persistence): - ```bash - export PATH="$HOME/.local/bin:$PATH" - ``` -5. Verify: - - `databricks -v` - -Notes: - -- The download files are `.tar.gz` archives (not `.zip`) with naming pattern: `databricks_cli__linux_.tar.gz` -- Common architectures: `amd64` (x86_64), `arm64` (aarch64) -- This method works in containerized environments and sandboxed IDEs (e.g. Cursor) without sudo access - -### Windows (preferred: WinGet) - -Run in Command Prompt (then restart the terminal session): - -- `winget search databricks` -- `winget install Databricks.DatabricksCLI` - -Verify: - -- `databricks -v` - -#### Windows alternative: Chocolatey (Experimental) - -Run: - -- `choco install databricks-cli` - -Verify: - -- `databricks -v` - -#### Windows fallback: curl installer (recommended via WSL) - -Databricks recommends WSL for the curl-based install path. -Requirements: - -- WSL available -- `unzip` installed in the environment where you run the installer - -Run (in WSL bash): - -- `curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh` - -Verify (in same environment): - -- `databricks -v` - -If you must run curl install outside WSL, run as Administrator. -Installs to `C:\Windows\databricks.exe`. - -## Manual install (all OSes): download from GitHub releases - -Use this when package managers or curl install are not possible. - -Steps: - -1. 
Get the latest release download URL: - - Visit https://github.com/databricks/cli/releases/latest - - OR use GitHub API: `curl -s https://api.github.com/repos/databricks/cli/releases/latest | grep browser_download_url` -2. Download the appropriate file for your OS and architecture: - - Linux: `databricks_cli__linux_.tar.gz` (use tar -xzf) - - macOS: `databricks_cli__darwin_.zip` (use unzip) - - Windows: `databricks_cli__windows_.zip` (use native extraction) - - Common architectures: `amd64` (x86_64), `arm64` (aarch64/Apple Silicon) -3. Extract the archive. -4. Ensure the extracted `databricks` executable is on PATH, or run it from its folder. -5. Verify with `databricks -v`. - -## Update / repair procedures - -### Homebrew update (macOS/Linux) - -- `brew upgrade databricks` -- `databricks -v` - -### WinGet update (Windows) - -- `winget upgrade Databricks.DatabricksCLI` -- `databricks -v` - -### curl update (all OSes) - -1. Delete existing binary: - - macOS/Linux: `/usr/local/bin/databricks` - - Windows: `C:\Windows\databricks.exe` -2. Re-run: - - `curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh` -3. Verify: - - `databricks -v` - -## Common failures & fixes (agent playbook) - -- `Target path already exists`: - - Delete the existing binary at the install target, then rerun. -- Permission error writing `/usr/local/bin`: - - Re-run curl installer with `sudo` (macOS/Linux). - - If sudo requires interactive password, use manual install to `~/.local/bin` instead. -- `sudo: a terminal is required to read the password`: - - Cannot use sudo in non-interactive environments (containers, CI/CD). - - Use manual install to `~/.local/bin` method instead (see "Linux alternative" section). -- Windows PATH not updated after WinGet: - - Restart Command Prompt/PowerShell. -- Multiple `databricks` binaries on PATH: - - Use `which databricks` (macOS/Linux/WSL) or `where databricks` (Windows) and remove the wrong one. 
-- Wrong file type (trying to unzip a tar.gz): - - Linux releases are `.tar.gz` files, use `tar -xzf` not `unzip`. - - macOS and Windows releases are `.zip` files, use appropriate extraction tool. -- `databricks: command not found` after installation to `~/.local/bin`: - - Add to PATH: `export PATH="$HOME/.local/bin:$PATH"` - - For persistence, add the export command to `~/.bashrc` or `~/.zshrc`. diff --git a/.claude/skills/databricks b/.claude/skills/databricks deleted file mode 120000 index dfc8019..0000000 --- a/.claude/skills/databricks +++ /dev/null @@ -1 +0,0 @@ -../../.agents/skills/databricks \ No newline at end of file diff --git a/.claude/skills/databricks-apps b/.claude/skills/databricks-apps deleted file mode 120000 index e074616..0000000 --- a/.claude/skills/databricks-apps +++ /dev/null @@ -1 +0,0 @@ -../../.agents/skills/databricks-apps \ No newline at end of file diff --git a/.claude/skills/databricks-jobs b/.claude/skills/databricks-jobs deleted file mode 120000 index ffc7bb3..0000000 --- a/.claude/skills/databricks-jobs +++ /dev/null @@ -1 +0,0 @@ -../../.agents/skills/databricks-jobs \ No newline at end of file diff --git a/.claude/skills/databricks-lakebase b/.claude/skills/databricks-lakebase deleted file mode 120000 index e79b5ee..0000000 --- a/.claude/skills/databricks-lakebase +++ /dev/null @@ -1 +0,0 @@ -../../.agents/skills/databricks-lakebase \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 17d33ed..c608aea 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,6 +6,19 @@ devhub (developers.databricks.com) is the platform for developers to get all the Review the [Contributing](./CONTRIBUTING.md) guide for more information on how to contribute to DevHub. +## Agent Skills + +Most skills used by this repo are vendored under `.agents/skills/` (with matching `.claude/skills/` symlinks). 
Four heavily drifted vendored skills have been removed because they fell far behind their upstream sources — when you need them, install from upstream instead: + +| Removed skill | Upstream | +| ----------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `databricks-apps` | [databricks/databricks-agent-skills](https://github.com/databricks/databricks-agent-skills) — `databricks aitools install` | +| `databricks-jobs` | [databricks/databricks-agent-skills](https://github.com/databricks/databricks-agent-skills) — `databricks aitools install` | +| `databricks-lakebase` | [databricks/databricks-agent-skills](https://github.com/databricks/databricks-agent-skills) — `databricks aitools install` | +| `databricks` (legacy combined CLI/data/bundles skill) | upstream split into `databricks-core` + `databricks-dabs` in [databricks/databricks-agent-skills](https://github.com/databricks/databricks-agent-skills) | + +The remaining vendored skills (e.g. `databricks-core`, `databricks-pipelines`, `agent-browser`, `building-components`, `frontend-design`, `mcp-builder`, `seo-audit`) are intentional devhub-internal forks — they're close enough to upstream to keep, and shipping them in-tree means a fresh clone has them available without extra setup. The agent-confusion concern (agents misreading vendored skills as something devhub exposes to customers) is mitigated upstream by [vercel-labs/skills#1281](https://github.com/vercel-labs/skills/pull/1281). 
+ ## DevHub Development Workflow For every change to DevHub, do the following: diff --git a/skills-lock.json b/skills-lock.json index 0b506af..112c593 100644 --- a/skills-lock.json +++ b/skills-lock.json @@ -11,31 +11,11 @@ "sourceType": "github", "computedHash": "ca96444b70257eb3f4f4c76dfdd4a32312c835fe3c8755d2e3d75ac78d384bfd" }, - "databricks": { - "source": "databricks/databricks-agent-skills", - "sourceType": "github", - "computedHash": "0330b57627145c73dc67f2cc5769879a57d58920d5d5b1cd509bcfaa1c712550" - }, - "databricks-apps": { - "source": "databricks/databricks-agent-skills", - "sourceType": "github", - "computedHash": "188ca8fdb8905dd0377d2a54cd100b6484dfec711b3e3decb17a66a6a3c89f36" - }, "databricks-core": { "source": "databricks/databricks-agent-skills", "sourceType": "github", "computedHash": "a93e25550100fa703e5c4e492bc97b0a6bcb629c83f3c96fcaa4e0e5f0ebbdc4" }, - "databricks-jobs": { - "source": "databricks/databricks-agent-skills", - "sourceType": "github", - "computedHash": "2b9c20888d3587b7b46a640ae0d076d093d03a14767699ea495ed6d31900ea4f" - }, - "databricks-lakebase": { - "source": "databricks/databricks-agent-skills", - "sourceType": "github", - "computedHash": "1be99e1f852f93f8c9e3fdc4e4fc94d8851fb0008ed21231ad5d70262b163bd8" - }, "databricks-pipelines": { "source": "databricks/databricks-agent-skills", "sourceType": "github",