Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"features": {
"ghcr.io/devcontainers/features/azure-cli:1": {}
},
"postCreateCommand": "python3 -m pip install -r task-1/requirements.txt",
"postCreateCommand": "python3 -m pip install -r requirements.txt",
"customizations": {
"vscode": {
"extensions": [
Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ env/
!.env.example

# Generated pipeline output (committed templates stay; generated files do not)
task-1/output/error_report.json
task-1/output/azure_resource_groups.json
task-1/weather.db
output/error_report.json
output/azure_resource_groups.json
weather.db

# Editor and IDE settings
.vscode/
Expand Down
102 changes: 54 additions & 48 deletions .hyf/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ cat > "$SCRIPT_DIR/score.json" <<'INIT'
{"score": 0, "pass": false, "passingScore": 60}
INIT

# Remove runtime-generated files so each grader run starts from a clean state.
# In CI the checkout is always clean; locally this prevents stale artifacts from
# a previous successful run inflating the score on a broken re-submission.
rm -f weather.db output/error_report.json

# --- Tasks 1-6: Ingestion Pipeline (70 points) ---
#
# Scoring ladder (each level requires all previous levels to pass):
Expand All @@ -29,16 +34,16 @@ INIT
# 70 code uses required patterns (@field_validator, parameterized queries,
# ON CONFLICT upsert, time.sleep backoff)
task16=0
task16_msg="missing required files in task-1/"
task16_msg="missing required files"

required_files=(
"task-1/models.py"
"task-1/ingest_api.py"
"task-1/ingest_files.py"
"task-1/validate.py"
"task-1/database.py"
"task-1/pipeline.py"
"task-1/.env.example"
"models.py"
"ingest_api.py"
"ingest_files.py"
"validate.py"
"database.py"
"pipeline.py"
".env.example"
)

all_present=true
Expand All @@ -53,13 +58,13 @@ if [ "$all_present" = true ]; then
task16=10
task16_msg="files exist but pipeline failed to run"

if [ -f task-1/requirements.txt ]; then
python3 -m pip install -q -r task-1/requirements.txt || \
if [ -f requirements.txt ]; then
python3 -m pip install -q -r requirements.txt || \
echo "WARN: pip install failed; pipeline may crash with ModuleNotFoundError" >&2
fi

Comment thread
lassebenni marked this conversation as resolved.
PIPELINE_ERR=$(mktemp)
if ( cd task-1 && python3 -m pipeline ) >/dev/null 2>"$PIPELINE_ERR"; then
if python3 -m pipeline >/dev/null 2>"$PIPELINE_ERR"; then
task16=20
task16_msg="pipeline ran but output checks failed"

Expand All @@ -69,7 +74,7 @@ import json, sqlite3
from pathlib import Path

# error_report.json must exist and be a non-empty list of error objects
rpt = Path("task-1/output/error_report.json")
rpt = Path("output/error_report.json")
assert rpt.exists(), "output/error_report.json was not created"
errors = json.loads(rpt.read_text())
assert isinstance(errors, list), "error_report.json must be a JSON list"
Expand All @@ -81,7 +86,7 @@ for i, e in enumerate(errors[:3]):
assert not missing, f"error object {i} missing fields: {missing}"

# weather.db must exist and have rows
db = Path("task-1/weather.db")
db = Path("weather.db")
assert db.exists(), "weather.db was not created"
conn = sqlite3.connect(db)
count = conn.execute("SELECT COUNT(*) FROM weather_readings").fetchone()[0]
Expand All @@ -95,33 +100,34 @@ PY
# Idempotency: run a second time, row count must stay the same
count_before=$(python3 -c "
import sqlite3
conn = sqlite3.connect('task-1/weather.db')
conn = sqlite3.connect('weather.db')
print(conn.execute('SELECT COUNT(*) FROM weather_readings').fetchone()[0])
")
if ( cd task-1 && python3 -m pipeline ) >/dev/null 2>&1; then
if python3 -m pipeline >/dev/null 2>&1; then
count_after=$(python3 -c "
import sqlite3
conn = sqlite3.connect('task-1/weather.db')
conn = sqlite3.connect('weather.db')
print(conn.execute('SELECT COUNT(*) FROM weather_readings').fetchone()[0])
")
if [ "$count_before" = "$count_after" ]; then
task16=50
task16_msg="output + idempotency pass; checking code patterns"

# Code introspection for the final 20 points.
# These greps target actual code constructs, not docstrings:
# @field_validator / @classmethod — present in scaffold but only
# safe to match after pipeline passes (NotImplementedError in the
# validator would crash the pipeline before we get here)
# execute.*? — SQL parameterized placeholder in a call
# ON CONFLICT — upsert keyword in actual SQL string
# time\.sleep — stdlib sleep call (avoids matching the
# function name "fetch_with_retry" or docstring words)
has_field_validator=$(grep -cE "@field_validator" task-1/models.py || true)
has_classmethod=$(grep -cE "@classmethod" task-1/models.py || true)
has_param_queries=$(grep -cE "execute[a-z]*\(.*\?" task-1/database.py || true)
has_on_conflict=$(grep -ciE "ON CONFLICT" task-1/database.py || true)
has_sleep=$(grep -cE "time\.sleep" task-1/ingest_api.py || true)
# Patterns target actual code constructs, not docstrings:
# ? + .execute — a ? placeholder and an .execute call both appear in the file
# ON CONFLICT — upsert keyword in actual SQL string
# time\.sleep — stdlib sleep call (avoids matching the function
# name "fetch_with_retry" or docstring words)
has_field_validator=$(grep -cE "@field_validator" models.py || true)
has_classmethod=$(grep -cE "@classmethod" models.py || true)
Comment thread
lassebenni marked this conversation as resolved.
# Parameterized queries: check for ? placeholder anywhere in the file
# AND an execute call — handles both inline and multi-line SQL forms.
has_q=$(grep -c "?" database.py || true)
has_exec=$(grep -cE "\.execute" database.py || true)
if [ "$has_q" -gt 0 ] && [ "$has_exec" -gt 0 ]; then has_param_queries=1; else has_param_queries=0; fi
has_on_conflict=$(grep -ciE "ON CONFLICT" database.py || true)
Comment thread
lassebenni marked this conversation as resolved.
has_sleep=$(grep -cE "time\.sleep" ingest_api.py || true)

if [ "$has_field_validator" -gt 0 ] && \
[ "$has_classmethod" -gt 0 ] && \
Expand Down Expand Up @@ -157,51 +163,51 @@ fi

# --- Task 7: Azure CLI + Portal (15 points) ---
#
# 5 pts azure_resource_groups.json exists and is valid JSON
# 10 pts azure_compare.md exists
# 15 pts azure_compare.md has all 3 sections and is filled in (>1200 chars,
# which is above the committed template's ~310 chars of non-comment text)
# 5 pts output/azure_resource_groups.json exists and is valid JSON
# 10 pts output/azure_compare.md exists
# 15 pts output/azure_compare.md has all 3 sections and is filled in (>1200 chars,
# which is above the committed template's ~233 bytes)
task7=0
task7_msg="missing task-1/output/azure_resource_groups.json"
task7_msg="missing output/azure_resource_groups.json"

if [ -s "task-1/output/azure_resource_groups.json" ]; then
if python3 -c "import json; json.load(open('task-1/output/azure_resource_groups.json'))" 2>/dev/null; then
if [ -s "output/azure_resource_groups.json" ]; then
if python3 -c "import json; json.load(open('output/azure_resource_groups.json'))" 2>/dev/null; then
Comment thread
lassebenni marked this conversation as resolved.
task7=5
task7_msg="azure_resource_groups.json is valid JSON; azure_compare.md missing or not filled in"

if [ -s "task-1/output/azure_compare.md" ]; then
if [ -s "output/azure_compare.md" ]; then
task7=10
task7_msg="azure_compare.md exists but looks too short or missing sections"
section_count=$(grep -cE "^## " task-1/output/azure_compare.md || true)
char_count=$(wc -c < task-1/output/azure_compare.md)
section_count=$(grep -cE "^## " output/azure_compare.md || true)
char_count=$(wc -c < output/azure_compare.md)
if [ "$section_count" -ge 3 ] && [ "$char_count" -gt 1200 ]; then
Comment thread
lassebenni marked this conversation as resolved.
task7=15
task7_msg="azure_resource_groups.json and azure_compare.md both present and filled in"
fi
fi
else
task7_msg="task-1/output/azure_resource_groups.json is not valid JSON"
task7_msg="output/azure_resource_groups.json is not valid JSON"
fi
fi

# --- Task 8: AI Debug Report (15 points) ---
#
# 5 pts task-2/AI_DEBUG.md exists
# 5 pts AI_DEBUG.md exists
# 10 pts all four sections present (## The Error, ## The Prompt, ## The Solution, ## Reflection)
# 15 pts file is meaningfully filled in (>1800 chars)
task8=0
task8_msg="missing task-2/AI_DEBUG.md"
task8_msg="missing AI_DEBUG.md"

if [ -s "task-2/AI_DEBUG.md" ]; then
if [ -s "AI_DEBUG.md" ]; then
task8=5
task8_msg="AI_DEBUG.md exists but missing required sections"
if grep -q "^## The Error" task-2/AI_DEBUG.md && \
grep -q "^## The Prompt" task-2/AI_DEBUG.md && \
grep -q "^## The Solution" task-2/AI_DEBUG.md && \
grep -q "^## Reflection" task-2/AI_DEBUG.md; then
if grep -q "^## The Error" AI_DEBUG.md && \
grep -q "^## The Prompt" AI_DEBUG.md && \
grep -q "^## The Solution" AI_DEBUG.md && \
grep -q "^## Reflection" AI_DEBUG.md; then
task8=10
task8_msg="all sections present but file looks too short to be filled in"
if [ "$(wc -c < task-2/AI_DEBUG.md)" -gt 1800 ]; then
if [ "$(wc -c < AI_DEBUG.md)" -gt 1800 ]; then
task8=15
task8_msg="AI_DEBUG.md is filled in"
fi
Expand Down
File renamed without changes.
128 changes: 76 additions & 52 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,80 +1,104 @@
# Data Track — Week 3 Assignment (Template)
# Data Track — Week 3 Assignment

The HackYourFuture Data Track Week 3 assignment: **Build a Validated Ingestion Pipeline**.
**Build a Validated Ingestion Pipeline** · Total: 100 points · Passing: 60

> 👩‍🎓 **Students:** you are in the wrong place. Do **not** fork or use this template.
> Go to your cohort's assignment repo under
> [`HackYourAssignment`](https://github.com/HackYourAssignment) (e.g. `c55-data-week3`,
> `c56-data-week3`, …). Your teacher posts the exact link in your cohort channel.
> Fork the cohort repo, branch, and open a PR back to it. Full instructions live in the
> Week 3 Assignment chapter in the learning platform.
---

## For instructors / track maintainers
## Why no task folders?

Previous assignments split work across `task-1/`, `task-2/`, etc. This assignment drops that structure intentionally. Real Python projects keep all related modules at the root — you navigate by reading the code, not by opening numbered folders.

Every file you need to touch is listed below, in the order you should work through them.

This repo is the **upstream template** for the Week 3 assignment. At the start of each
cohort, generate a cohort-specific repo under the `HackYourAssignment` org from this
template (GitHub: **Use this template → Create a new repository**, owner =
`HackYourAssignment`, name = `c<NN>-data-week3`). Students then fork *that* cohort repo
and open PRs back to it; the auto-grader runs on every push.
---

Edits to the assignment, dataset, or grader belong here on the template, not on the
cohort copies.
## Where to start

## Tasks at a glance
Work through the files in this order. Each one maps to a task in the assignment chapter.

| Task | Folder | Points | What you build |
| Step | File | Task in the chapter | Points |
|---|---|---|---|
| **Tasks 1-6** — Ingestion Pipeline | `task-1/` | 70 | A modular pipeline: `fetch_with_retry` with exponential backoff, Open-Meteo API ingestion, CSV file ingestion, Pydantic validation with `@field_validator`, SQLite upsert storage, and a `pipeline.py` orchestrator that produces an error report and pipeline summary. |
| **Task 7** — Azure CLI + Portal | `task-1/output/` | 15 | Run three `az` CLI commands, call the ARM API with a Bearer token, save `azure_resource_groups.json`, and fill in `azure_compare.md` with three comparison points. |
| **Task 8** — AI Debug Report | `task-2/` | 15 | Document one LLM-assisted debugging session. Fill in the four sections of `AI_DEBUG.md`. |
| 1 | `models.py` | Task 4: Pydantic Validation | — |
| 2 | `ingest_api.py` | Task 1: Error Handling + Task 2: API Ingestion | — |
| 3 | `ingest_files.py` | Task 3: File Reading | — |
| 4 | `validate.py` | Task 4: Pydantic Validation | — |
| 5 | `database.py` | Task 5: Database Storage | — |
| 6 | `pipeline.py` | Task 6: Pipeline Orchestration | 70 (steps 1–6 combined) |
| 7 | `output/azure_compare.md` | Task 7: Azure CLI + Portal | 15 |
| 8 | `AI_DEBUG.md` | Task 8: AI Debug Report | 15 |

Total: 100 · Passing: 60.
Open each file and read the docstrings and TODO comments — they explain exactly what to implement. Start with `models.py` and `ingest_api.py`; `pipeline.py` is the last thing you wire together.

---

## Repository layout

```text
.
├── task-1/
│ ├── data/
│ │ └── weather_stations.csv # messy input dataset (committed; do not edit)
│ ├── output/ # pipeline writes here (gitignored except templates)
│ │ ├── error_report.json # generated by pipeline.py
│ │ ├── azure_resource_groups.json # Task 7: save ARM API response here
│ │ └── azure_compare.md # Task 7: fill in 3 comparison points
│ ├── models.py # Pydantic WeatherReading model — fill in TODOs
│ ├── ingest_api.py # fetch_with_retry + API ingestion — fill in TODOs
│ ├── ingest_files.py # CSV reader — fill in TODOs
│ ├── validate.py # batch validation — fill in TODOs
│ ├── database.py # SQLite create, upsert, query — fill in TODOs
│ ├── pipeline.py # orchestrator — fill in TODOs
│ ├── .env.example # no secrets needed; copy to .env if you extend it
│ └── requirements.txt
├── task-2/
│ └── AI_DEBUG.md # Task 8: fill in the four sections
├── data/
│ └── weather_stations.csv # input dataset — do not edit
├── output/
│ ├── azure_compare.md # Task 7: fill in your 3 comparison sentences
│ └── azure_resource_groups.json # Task 7: generated by your Python script
├── models.py # Step 1 — Pydantic model (Task 4)
├── ingest_api.py # Step 2 — fetch_with_retry + API call (Tasks 1–2)
├── ingest_files.py # Step 3 — CSV reader (Task 3)
├── validate.py # Step 4 — batch validation (Task 4)
├── database.py # Step 5 — SQLite tables + upsert (Task 5)
├── pipeline.py # Step 6 — orchestrator that calls everything (Task 6)
├── AI_DEBUG.md # Step 8 — your debugging log (Task 8)
├── requirements.txt
├── .env.example
├── .hyf/
│ └── test.sh # auto-grader (read it to see exactly what it checks)
│ └── test.sh # auto-grader — read this to see exactly what is checked
└── .github/workflows/
└── grade-assignment.yml # runs .hyf/test.sh on every PR
└── grade-assignment.yml
```

## Run the grader locally
Files the pipeline generates at runtime (gitignored):
- `weather.db` — SQLite database
- `output/error_report.json` — invalid records from validation

---

Before opening a PR, run the same checks the auto-grader runs:
## Run the pipeline

Comment thread
lassebenni marked this conversation as resolved.
```bash
cd task-1
python3 -m pip install -r requirements.txt
cd ..
python3 -m pipeline
```

---

## Check your score locally

Run the same checks the auto-grader runs on every PR push:

```bash
bash .hyf/test.sh
cat .hyf/score.json
```

## Scoring ladder (Tasks 1-6)
---

## Scoring ladder (Tasks 1–6)

Points are awarded incrementally so partial work earns partial credit:

| Score | What the grader checks |
|---|---|
| 10/70 | All required files exist |
| 20/70 | `python3 -m pipeline` runs without crashing |
| 40/70 | `output/error_report.json` is a valid list with the right fields; `weather.db` has rows |
| 50/70 | Pipeline is idempotent: a second run leaves the same row count (upsert working) |
| 70/70 | Code uses: `@field_validator` + `@classmethod` in `models.py`, `?` placeholders in `database.py`, `ON CONFLICT` upsert in `database.py`, `time.sleep` backoff in `ingest_api.py` |

---

## For instructors / track maintainers

This repo is the upstream template. At the start of each cohort, generate a cohort repo under `HackYourAssignment` (**Use this template → Create a new repository**, owner = `HackYourAssignment`, name = `c<NN>-data-week3`). Students fork that cohort repo and open PRs back to it; the auto-grader runs on every push.

The grader awards points incrementally so partial credit is meaningful:
Edits to the assignment, dataset, or grader belong here on the template — not on cohort copies.

- **10/70** — required files all exist (`models.py`, `ingest_api.py`, `ingest_files.py`, `validate.py`, `database.py`, `pipeline.py`, `.env.example`).
- **20/70** — `python3 -m pipeline` runs from `task-1/` without crashing.
- **40/70** — `output/error_report.json` exists, is a valid JSON list, and contains objects with `index`, `source`, `raw_record`, and `error_details` fields; `weather.db` has rows in `weather_readings`.
- **50/70** — pipeline is idempotent: running it twice leaves the same row count in `weather_readings` (upsert working correctly).
- **70/70** — code uses the required patterns: `@field_validator` + `@classmethod` in `models.py`, parameterized queries (`?` placeholders) in `database.py`, `ON CONFLICT ... DO UPDATE SET` in `database.py`, retry/backoff logic in `ingest_api.py`.
> 👩‍🎓 **Students:** if you landed here, you are in the wrong place. Go to your cohort repo under [`HackYourAssignment`](https://github.com/HackYourAssignment). Your teacher posts the exact link in your cohort channel.
File renamed without changes.
5 changes: 5 additions & 0 deletions task-1/database.py → database.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Step 5 — Task 5: Database Storage
# create_tables() — run once at startup to set up raw_weather and weather_readings.
# insert_raw() — store every record before validation so nothing is lost.
# upsert_readings() — insert valid records; ON CONFLICT updates instead of duplicating.
# count_readings() — query the final row count for the pipeline summary.
import sqlite3
from pathlib import Path

Expand Down
3 changes: 3 additions & 0 deletions task-1/ingest_api.py → ingest_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Step 2 — Tasks 1 & 2: Error Handling + API Ingestion
# fetch_with_retry handles transient network errors (Task 1).
# fetch_api_records calls it and shapes the response into flat dicts (Task 2).
import logging
import time

Expand Down
3 changes: 3 additions & 0 deletions task-1/ingest_files.py → ingest_files.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Step 3 — Task 3: File Reading
# Read the messy CSV and normalize each row into the same dict format
# that fetch_api_records() produces, so validate_records() can handle both sources.
import csv
from pathlib import Path

Expand Down
Loading