# feat: add report state tracking for workflow management (#13)

---
name: "SDK Diff Analyzer"
# Detects REST API changes in the gdc-nas repository that may require SDK updates.
# gdc-nas is read through the GitHub API (`gh api`), so it is never cloned.
# Reports are uploaded as workflow artifacts for use by the sdk-analyze and
# jira-sync skills.
# Triggers: a 6-hourly schedule, pushes to the temporary test branch, and
# manual runs with optional overrides.
on:
  schedule:
    - cron: '0 */6 * * *'  # Every 6 hours
  push:
    branches:
      - feature/sdk-diff-analyzer  # Temporary: for testing
  workflow_dispatch:
    inputs:
      commits_to_analyze:
        description: 'Number of commits to analyze (default: 200)'
        required: false
        default: '200'
        type: string
      since_commit:
        # The stored state is a file in this repo (see STATE_FILE), not a tag.
        description: 'Analyze since this commit SHA (overrides state file)'
        required: false
        default: ''
        type: string
      enable_clustering:
        description: 'Enable report clustering (groups similar changes)'
        required: false
        default: true
        type: boolean
      use_claude_clustering:
        description: 'Use Claude for enhanced clustering (requires ANTHROPIC_API_KEY)'
        required: false
        default: false
        type: boolean
      debug:
        # NOTE(review): no step visible in this workflow reads `inputs.debug`;
        # confirm whether it is still needed or should gate the debug step.
        description: 'Enable verbose debug logging'
        required: false
        default: true
        type: boolean
# Workflow-wide settings; these are also exported to every step's shell.
env:
  # Repository to analyze: gooddata/gdc-nas when this workflow runs under the
  # `gooddata` owner, otherwise the tychtjan/gdc-nas fork (testing).
  GDC_NAS_REPO: >-
    ${{ github.repository_owner == 'gooddata' && 'gooddata/gdc-nas' || 'tychtjan/gdc-nas' }}
  # Path, inside this repo, of the file that records the last analyzed
  # gdc-nas commit SHA.
  STATE_FILE: .github/gdc-nas-last-analyzed.txt
jobs:
  analyze:
    name: Analyze gdc-nas changes
    # Use self-hosted runners in production, GitHub-hosted for forks/testing
    runs-on: ${{ github.repository == 'gooddata/gooddata-python-sdk' && 'infra1-runners-arc' || 'ubuntu-latest' }}
    # Serialize runs per ref. The job commits and pushes state files, so two
    # overlapping runs (e.g. schedule + manual dispatch) would race on the push.
    concurrency:
      group: sdk-diff-analyzer-${{ github.ref }}
      cancel-in-progress: false
    permissions:
      # Required by the "Update state files" step, which pushes a commit.
      contents: write
      # NOTE(review): no visible step obviously needs this scope - confirm.
      actions: write
    env:
      # Use PAT for gdc-nas access, falls back to GITHUB_TOKEN
      GH_TOKEN: ${{ secrets.GDC_NAS_READ_TOKEN || secrets.GITHUB_TOKEN }}
    steps:
# Prints runner/event context and the effective manual inputs.
- name: Debug - Environment info
  env:
    # User-controlled inputs are passed through the environment instead of
    # being expanded into the script text, so their content cannot be
    # interpreted as shell code.
    COMMITS_TO_ANALYZE: ${{ inputs.commits_to_analyze || '200' }}
    SINCE_COMMIT: ${{ inputs.since_commit || '(not set)' }}
  run: |
    echo "=== Environment ==="
    echo "Repository: ${{ github.repository }}"
    echo "Event: ${{ github.event_name }}"
    echo "Runner: ${{ runner.os }}"
    echo "GH CLI version: $(gh --version | head -n 1)"
    echo ""
    echo "=== Inputs ==="
    echo "commits_to_analyze: $COMMITS_TO_ANALYZE"
    echo "since_commit: $SINCE_COMMIT"
# Fails fast when the configured token cannot read the analyzed repository.
- name: Verify gdc-nas access
  run: |
    # GDC_NAS_REPO comes from the workflow-level `env` block.
    repo="$GDC_NAS_REPO"
    echo "=== Testing API access to $repo ==="
    if ! gh api "repos/$repo" --jq '.full_name' 2>/dev/null; then
      echo "❌ Cannot access $repo"
      echo ""
      echo "To fix: Add a Personal Access Token with 'repo' scope as secret GDC_NAS_READ_TOKEN"
      exit 1
    fi
    echo "✅ Access verified"
# Shallow checkout of this repository; later steps run scripts/ from it and
# commit the state files back.
- uses: actions/checkout@v4
  name: Checkout SDK repository
  with: {fetch-depth: 1}
# Interpreter used by the python3 scripts invoked in later steps.
- uses: actions/setup-python@v5
  name: Setup Python
  with:
    # Quoted so the version stays a string rather than a float.
    python-version: "3.12"
# Records the commit that HEAD resolves to in the analyzed repository; it is
# exposed as `steps.gdc_nas.outputs.head_sha`.
- name: Get current gdc-nas HEAD
  id: gdc_nas
  run: |
    head_sha="$(gh api "repos/${GDC_NAS_REPO}/commits/HEAD" --jq '.sha')"
    printf 'head_sha=%s\n' "$head_sha" >> "$GITHUB_OUTPUT"
    printf 'gdc-nas HEAD: %s\n' "$head_sha"
# Chooses what to analyze and publishes it as step outputs:
#   mode    - manual | incremental | fallback | initial
#   since   - commit to start from (empty for fallback/initial)
#   commits - size of the "last N commits" window (fallback/initial only)
- name: Determine analysis range
  id: range
  shell: bash
  env:
    # Inputs travel through the environment so their content is never parsed
    # as part of the script.
    COMMITS: ${{ inputs.commits_to_analyze || '200' }}
    SINCE_INPUT: ${{ inputs.since_commit }}
  run: |
    echo "=== Determining range ==="

    # Reject values that cannot be a commit count / commit reference. This
    # also keeps stray newlines out of $GITHUB_OUTPUT.
    if ! [[ "$COMMITS" =~ ^[1-9][0-9]*$ ]]; then
      echo "::error::commits_to_analyze must be a positive integer, got '$COMMITS'"
      exit 1
    fi
    ref_pattern='^[A-Za-z0-9._/~^-]+$'
    if [ -n "$SINCE_INPUT" ] && ! [[ "$SINCE_INPUT" =~ $ref_pattern ]]; then
      echo "::error::since_commit contains unsupported characters"
      exit 1
    fi

    # Publish outputs for the modes that analyze the last N commits.
    use_commit_window() {
      {
        echo "since="
        echo "mode=$1"
        echo "commits=$COMMITS"
      } >> "$GITHUB_OUTPUT"
    }

    if [ -n "$SINCE_INPUT" ]; then
      # User provided explicit since commit
      echo "Using user-provided since commit: $SINCE_INPUT"
      {
        echo "since=$SINCE_INPUT"
        echo "mode=manual"
      } >> "$GITHUB_OUTPUT"
    elif [ -f "$STATE_FILE" ]; then
      # Read from state file
      LAST_SHA=$(tr -d '[:space:]' < "$STATE_FILE")
      echo "Found state file with SHA: $LAST_SHA"
      # Verify the commit exists; an empty state file is treated as "not found".
      if [ -n "$LAST_SHA" ] \
        && gh api "repos/$GDC_NAS_REPO/commits/$LAST_SHA" --jq '.sha' >/dev/null 2>&1; then
        echo "Commit verified, using incremental mode"
        {
          echo "since=$LAST_SHA"
          echo "mode=incremental"
        } >> "$GITHUB_OUTPUT"
      else
        echo "Stored commit not found, falling back to last $COMMITS commits"
        use_commit_window fallback
      fi
    else
      echo "No state file, analyzing last $COMMITS commits"
      use_commit_window initial
    fi
# Runs the analyzer over the selected range and publishes `sdk_reports`, the
# number of generated report files excluding the summary.
- name: Run API-based analyzer
  id: analyze
  # An explicit `shell: bash` runs the script with `-eo pipefail`. With the
  # default shell (`bash -e`, no pipefail) the `| tee` below would hide an
  # analyzer failure and the job would still advance the stored state.
  shell: bash
  env:
    # Step outputs can carry user input (`since_commit`), so they are passed
    # as environment variables rather than expanded into the script.
    RANGE_MODE: ${{ steps.range.outputs.mode }}
    RANGE_SINCE: ${{ steps.range.outputs.since }}
    RANGE_COMMITS: ${{ steps.range.outputs.commits || '200' }}
  run: |
    mkdir -p reports
    echo "=== Running Analyzer ==="
    echo "Mode: $RANGE_MODE"

    # Build the argument list as an array so every value stays one argument.
    args=(--repo "$GDC_NAS_REPO" --output-dir ./reports)
    if [ -n "$RANGE_SINCE" ]; then
      args+=(--since "$RANGE_SINCE")
    else
      args+=(--commits "$RANGE_COMMITS")
    fi

    echo "Running: python3 scripts/gdc_nas_api_analyzer.py ${args[*]}"
    python3 scripts/gdc_nas_api_analyzer.py "${args[@]}" 2>&1 | tee analyzer.log

    echo ""
    echo "=== Reports ==="
    ls -la reports/

    # Count SDK-relevant reports (excluding summary). `find` prints nothing
    # and still exits 0 when there are no reports, so this is pipefail-safe.
    REPORT_COUNT=$(find reports -maxdepth 1 -type f -name '*.md' ! -name '*00-summary*' | wc -l | tr -d '[:space:]')
    echo "sdk_reports=$REPORT_COUNT" >> "$GITHUB_OUTPUT"
# Groups similar reports into clusters and publishes `clusters_created`.
- name: Cluster similar reports
  id: cluster
  # `inputs.enable_clustering` is a real boolean on workflow_dispatch and is
  # empty on schedule/push. Comparing it with the string 'false' coerces both
  # sides to numbers (0 vs NaN), which is always "not equal", so that form can
  # never disable clustering. Test the boolean itself and keep clustering on
  # for non-manual events.
  if: >-
    steps.analyze.outputs.sdk_reports > 1 &&
    (github.event_name != 'workflow_dispatch' || inputs.enable_clustering)
  # Explicit bash => `-eo pipefail`, so a clustering failure is not hidden by
  # the `| tee` pipeline.
  shell: bash
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    REPORT_COUNT: ${{ steps.analyze.outputs.sdk_reports }}
    # Renders as 'true' / 'false' on manual runs, empty otherwise.
    USE_CLAUDE: ${{ inputs.use_claude_clustering }}
  run: |
    echo "=== Clustering Reports ==="
    echo "Reports to cluster: $REPORT_COUNT"
    mkdir -p clustered

    # Build clustering arguments
    cluster_args=(--input-dir ./reports --output-dir ./clustered)

    # Check if Claude enhancement is requested and API key is available
    if [ "$USE_CLAUDE" = "true" ] && [ -n "$ANTHROPIC_API_KEY" ]; then
      echo "Claude enhancement enabled"
      python3 -m pip install --quiet anthropic
      cluster_args+=(--use-claude)
    elif [ "$USE_CLAUDE" = "true" ]; then
      echo "Warning: Claude clustering requested but ANTHROPIC_API_KEY not set"
      echo "Falling back to heuristic clustering"
    fi

    echo "Running: python3 scripts/cluster_sdk_reports.py ${cluster_args[*]}"
    python3 scripts/cluster_sdk_reports.py "${cluster_args[@]}" 2>&1 | tee clustering.log

    # Count clusters (`find` exits 0 with no matches, so this is pipefail-safe)
    CLUSTER_COUNT=$(find clustered -maxdepth 1 -type f -name 'cluster-*.md' | wc -l | tr -d '[:space:]')
    echo "clusters_created=$CLUSTER_COUNT" >> "$GITHUB_OUTPUT"

    # Guard on the file that is actually printed; a missing summary must not
    # fail the step.
    if [ -f clustered/00-clusters.md ]; then
      echo ""
      echo "=== Cluster Summary ==="
      cat clustered/00-clusters.md
    fi
# Syncs the report-state store with the fresh reports/clusters and publishes
# `new_reports` / `new_clusters` counts parsed from the state summary.
- name: Sync report state
  id: state
  if: steps.analyze.outputs.sdk_reports > 0
  shell: bash
  run: |
    echo "=== Syncing Report State ==="

    # Sync state with new reports and clusters
    sync_args=(--reports-dir ./reports)
    if [ -d ./clustered ]; then
      sync_args+=(--clusters-dir ./clustered)
    fi
    python3 scripts/sdk_report_state.py sync "${sync_args[@]}"

    # Show what needs attention
    echo ""
    python3 scripts/sdk_report_state.py needs-attention

    # Output counts for summary
    SUMMARY=$(python3 scripts/sdk_report_state.py summary 2>&1)

    # Print the last field of the first "new" line found within 10 lines of
    # the given section header, or 0 when there is no numeric match. Taking
    # only the first line keeps the value single-line, which $GITHUB_OUTPUT
    # requires.
    # NOTE(review): assumes the summary lists a "new <count>" line under each
    # section header - confirm against sdk_report_state.py.
    count_new() {
      local value
      value=$(printf '%s\n' "$SUMMARY" | grep -A10 "$1" | grep "new" | head -n 1 | awk '{print $NF}' || true)
      if [[ "$value" =~ ^[0-9]+$ ]]; then
        echo "$value"
      else
        echo "0"
      fi
    }

    {
      echo "new_reports=$(count_new 'REPORTS:')"
      echo "new_clusters=$(count_new 'CLUSTERS:')"
    } >> "$GITHUB_OUTPUT"
# Publishes everything under reports/ as a per-run artifact, kept for 30 days.
- uses: actions/upload-artifact@v4
  name: Upload analysis reports
  with:
    name: "sdk-diff-reports-${{ github.run_number }}"
    path: 'reports/'
    retention-days: 30
# Publishes clustered/ as a per-run artifact, only when the cluster step
# reported at least one cluster.
- uses: actions/upload-artifact@v4
  name: Upload clustered reports
  if: ${{ steps.cluster.outputs.clusters_created > 0 }}
  with:
    name: "sdk-clustered-reports-${{ github.run_number }}"
    path: 'clustered/'
    retention-days: 30
# Always publishes the analyzer/clustering logs (7-day retention), even after
# a failure; missing log files are ignored.
- uses: actions/upload-artifact@v4
  name: Upload analyzer log
  if: ${{ always() }}
  with:
    name: "analyzer-log-${{ github.run_number }}"
    retention-days: 7
    if-no-files-found: ignore
    path: |
      analyzer.log
      clustering.log
# Records the analyzed gdc-nas HEAD in STATE_FILE and commits/pushes it
# together with the report-state JSON (when that file exists).
- name: Update state files
  shell: bash
  env:
    # Step outputs are passed through the environment rather than expanded
    # into the script text.
    HEAD_SHA: ${{ steps.gdc_nas.outputs.head_sha }}
    NEW_REPORTS: ${{ steps.state.outputs.new_reports || '0' }}
    NEW_CLUSTERS: ${{ steps.state.outputs.new_clusters || '0' }}
  run: |
    echo "=== Updating state ==="

    # Never replace the stored SHA with an empty value.
    if [ -z "$HEAD_SHA" ]; then
      echo "::error::gdc-nas HEAD SHA is empty; refusing to overwrite $STATE_FILE"
      exit 1
    fi

    # Update gdc-nas commit state
    mkdir -p "$(dirname "$STATE_FILE")"
    echo "$HEAD_SHA" > "$STATE_FILE"
    echo "Stored gdc-nas SHA: $HEAD_SHA"

    # Commit and push state files
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"

    # Add both state files; the report-state JSON is optional.
    git add "$STATE_FILE"
    git add ".github/sdk-report-state.json" 2>/dev/null || true

    if git diff --staged --quiet; then
      echo "No changes to state files"
    else
      # Two -m flags make the subject and the details separate paragraphs,
      # so the subject line stays a single line.
      details=$(printf 'gdc-nas HEAD: %s\nNew reports: %s\nNew clusters: %s' \
        "$HEAD_SHA" "$NEW_REPORTS" "$NEW_CLUSTERS")
      git commit \
        -m "chore: update analyzer state to ${HEAD_SHA:0:12}" \
        -m "$details"
      git push
      echo "State files updated and pushed"
    fi
# Writes the run overview to the job summary; runs even when earlier steps
# failed.
- name: Job summary
  if: always()
  env:
    # The heredoc below is unquoted, so text expanded directly into it would
    # be subject to command substitution. Values from the environment are
    # inserted verbatim instead.
    RANGE_MODE: ${{ steps.range.outputs.mode }}
    RANGE_SINCE: ${{ steps.range.outputs.since || 'N/A' }}
    HEAD_SHA: ${{ steps.gdc_nas.outputs.head_sha }}
    SDK_REPORTS: ${{ steps.analyze.outputs.sdk_reports }}
    CLUSTERS_CREATED: ${{ steps.cluster.outputs.clusters_created || '0' }}
    NEW_REPORTS: ${{ steps.state.outputs.new_reports || '0' }}
    NEW_CLUSTERS: ${{ steps.state.outputs.new_clusters || '0' }}
  run: |
    cat >> "$GITHUB_STEP_SUMMARY" << EOF
    ## SDK Diff Analyzer Results

    | Parameter | Value |
    |-----------|-------|
    | Mode | ${RANGE_MODE} |
    | Since | \`${RANGE_SINCE}\` |
    | gdc-nas HEAD | \`${HEAD_SHA}\` |
    | SDK-relevant commits | ${SDK_REPORTS} |
    | Clusters created | ${CLUSTERS_CREATED} |
    | **New reports** | **${NEW_REPORTS}** |
    | **New clusters** | **${NEW_CLUSTERS}** |
    EOF

    # Append a titled section containing the given markdown file, if present.
    append_section() {
      if [ -f "$2" ]; then
        {
          echo ""
          echo "### $1"
          echo ""
          cat "$2"
        } >> "$GITHUB_STEP_SUMMARY"
      fi
    }

    # Show items needing attention
    if [ -f ".github/sdk-report-state.json" ]; then
      {
        echo ""
        echo "### 🔔 Items Needing Attention"
        echo ""
        echo '```'
      } >> "$GITHUB_STEP_SUMMARY"
      # Redirect stdout first, then stderr onto it, so both streams reach the
      # summary; a failure is reported inside the fenced block and the log.
      python3 scripts/sdk_report_state.py needs-attention >> "$GITHUB_STEP_SUMMARY" 2>&1 \
        || echo "State check failed" | tee -a "$GITHUB_STEP_SUMMARY"
      echo '```' >> "$GITHUB_STEP_SUMMARY"
    fi

    append_section "Report Summary" reports/00-summary.md
    append_section "Clustering Summary" clustered/00-clusters.md