diff --git a/.hyf/test.sh b/.hyf/test.sh old mode 100644 new mode 100755 index ee037fc..1d04ac9 --- a/.hyf/test.sh +++ b/.hyf/test.sh @@ -1,13 +1,74 @@ #!/usr/bin/env bash -set -euo pipefail +set -uo pipefail -# Run your test scripts here. -# Auto grade tool will execute this file within the .hyf working directory. -# The result should be stored in score.json file with the format shown below. -cat << EOF > score.json +# Week 12 autograder. +# Ladder: the untouched scaffold must FAIL; a completed Minimum solution passes. +# Work-verifying checks (not mere file presence): +# - metric Names filled with real (non-placeholder) values +# - Panel 1 implemented (no NotImplementedError / TODO: implement left) +# - AI_ASSIST.md filled with real content + +PASSING=6 +SCORE=0 + +check() { + if [ "$2" = "true" ]; then echo " PASS $1"; SCORE=$((SCORE + 1)); + else echo " FAIL $1"; fi +} + +METRIC_FILE="../task-1/metric_definitions.md" +APP_FILE="../task-2/app.py" +AI_FILE="../AI_ASSIST.md" + +# 1. metric_definitions.md exists +[ -f "$METRIC_FILE" ] && check "metric_definitions.md exists" "true" || check "metric_definitions.md exists" "false" + +# 2. >=3 metric Name fields filled with a REAL value (not empty, not _(...)_ placeholder) +FILLED=$(grep -E '^\| \*\*Name\*\* \|' "$METRIC_FILE" 2>/dev/null \ + | sed -E 's/^\| \*\*Name\*\* \|([^|]*)\|.*/\1/' \ + | grep -vE '^[[:space:]]*$' | grep -vF '_(' | wc -l | tr -d ' ') +[ "${FILLED:-0}" -ge 3 ] && check "3+ metric Name fields filled" "true" \ + || check "3+ metric Name fields filled (found ${FILLED:-0})" "false" + +# 3. task-2/app.py exists +[ -f "$APP_FILE" ] && check "task-2/app.py exists" "true" || check "task-2/app.py exists" "false" + +# 4. Panel 1 implemented: no NotImplementedError / TODO: implement remaining +if grep -qE 'raise NotImplementedError|TODO: implement' "$APP_FILE" 2>/dev/null; then + check "Panel 1 implemented (get_dag_runs)" "false" +else + check "Panel 1 implemented (get_dag_runs)" "true" +fi + +# 5. 
app.py uses @st.cache_data (quality: do not strip caching) +grep -q "st.cache_data" "$APP_FILE" 2>/dev/null \ + && check "app.py uses @st.cache_data" "true" || check "app.py uses @st.cache_data" "false" + +# 6. app.py has no hardcoded password +if grep -qE 'password[[:space:]]*=[[:space:]]*"[^"]+"' "$APP_FILE" 2>/dev/null; then + check "app.py has no hardcoded password" "false" +else + check "app.py has no hardcoded password" "true" +fi + +# 7. AI_ASSIST.md filled: >=3 real content lines (exclude blanks, headings, placeholders) +if [ -f "$AI_FILE" ]; then + AI_LINES=$(grep -vE '^[[:space:]]*$|^#|^⚠️|^Document' "$AI_FILE" | grep -vF '_(' | wc -l | tr -d ' ') + [ "${AI_LINES:-0}" -ge 3 ] && check "AI_ASSIST.md documents LLM usage" "true" \ + || check "AI_ASSIST.md documents LLM usage (needs 3+ entries)" "false" +else + check "AI_ASSIST.md exists" "false" +fi + +if [ "$SCORE" -ge "$PASSING" ]; then PASS=true; else PASS=false; fi + +cat > score.json < week X assignment -The Week X assignment for the HackYourFuture can be found at the following link: [TODO: Assignment url in the learning platform] +# Data Track: Week 12 Assignment +Week 12 assignment for the [HackYourFuture Data Track](https://www.notion.so/hackyourfuture/Data-Track-Overview). -## Implementation Instructions +The assignment for this week can be found at: [Week 12: Assignment](https://www.notion.so/hackyourfuture/Assignment-Build-Two-Dashboards) -Provide clear instructions on how trainees should implement the tasks. +## What you will build -### Task 1 -Instructions for Task 1 +- **Task 1: Metabase dashboard** — three analytical Questions on your dbt mart tables, arranged into a dashboard with a date filter, and a `metric_definitions.md` file documenting each panel. +- **Task 2: Streamlit engineering dashboard** — a Python app that shows Airflow DAG run status and data freshness from Azure Postgres. -### Task 2 -Instructions for Task 2 +## Getting started -... +### Task 1: Metabase +1. 
Log in to the HYF Metabase instance (URL in `task-1/README.md`). +2. Build three Questions in SQL mode on your `dev_yourname.fct_trips` table (replace `yourname` with your own schema suffix). +3. Arrange them into a Dashboard and add a date-range filter. +4. Fill in `task-1/metric_definitions.md` for each panel. +5. Copy the dashboard URL (or take screenshots) into `task-1/README.md`. + +### Task 2: Streamlit + +1. Open `task-2/app.py` and follow the `TODO` comments. +2. Copy `.env.example` to `.env` and fill in your credentials (never commit `.env`). +3. Run `pip install -r task-2/requirements.txt`. +4. Run `streamlit run task-2/app.py` and verify the panels load with real data. +5. Push `task-2/` to GitHub. + +## Autograder + +```bash +bash .hyf/test.sh +``` + +The scaffold returns `pass: false`. It returns `pass: true` once at least 6 of the 7 checks in `.hyf/test.sh` pass. Besides checking that the files exist, the script verifies that: +- `task-1/metric_definitions.md` has at least 3 metric **Name** fields filled in with real (non-placeholder) values, and `AI_ASSIST.md` has at least 3 lines of real content +- `task-2/app.py` implements Panel 1 (no `raise NotImplementedError` or `TODO: implement` left), still uses `st.cache_data`, and contains no hardcoded passwords + +## Submission + +Submit the following in the class assignment tracker: + +| Item | Required for | +|---|---| +| Metabase dashboard link / screenshots | Minimum | +| `task-2/` GitHub link | Minimum | +| `task-1/metric_definitions.md` | Minimum | +| Date filter on ≥2 Questions | Target | +| Streamlit freshness panel | Target | +| 5-minute presentation recording link | Target | diff --git a/task-1/README.md b/task-1/README.md new file mode 100644 index 0000000..b0b39e9 --- /dev/null +++ b/task-1/README.md @@ -0,0 +1,27 @@ +# Task 1: Metabase Analytical Dashboard + +## Metabase instance + +```text +https://metabase-hyf.politepebble-abd3ebc2.westeurope.azurecontainerapps.io +``` + +Log in with the credentials your teacher provided. + +## Your dashboard URL + + +**Dashboard URL:** _(paste here after building)_ + + + +## Screenshots (optional) + +Add screenshots here if the public link is not shareable. 
+ +## Checklist + +- [ ] 3 Questions built in SQL mode on `dev_yourname.fct_trips` or `fct_daily_borough_stats` +- [ ] Questions arranged in one Dashboard named "NYC Taxi Analytics: [Your Name]" +- [ ] Date-range filter connected to at least 2 Questions (Target) +- [ ] `metric_definitions.md` filled in for all panels diff --git a/task-1/metric_definitions.md b/task-1/metric_definitions.md new file mode 100644 index 0000000..bfe1acb --- /dev/null +++ b/task-1/metric_definitions.md @@ -0,0 +1,40 @@ +# Metric Definitions + +Document every Metabase panel here using all five fields. +A metric without a complete definition will not pass the grading criteria. + +--- + +## Metric 1: _(name)_ + +| Field | Value | +|---|---| +| **Name** | _(e.g. `trip_count_by_borough`)_ | +| **Description** | _(one sentence: what does this number measure, for whom?)_ | +| **Calculation** | _(the SQL logic or formula: e.g. `COUNT(*) FROM fct_trips GROUP BY pickup_borough`)_ | +| **Data source** | _(table + schema: e.g. `dev_yourname.fct_trips`)_ | +| **Refresh frequency** | _(e.g. Daily, after 03:00 UTC Airflow run)_ | + +--- + +## Metric 2: _(name)_ + +| Field | Value | +|---|---| +| **Name** | | +| **Description** | | +| **Calculation** | | +| **Data source** | | +| **Refresh frequency** | | + +--- + +## Metric 3: _(name)_ + +| Field | Value | +|---|---| +| **Name** | | +| **Description** | | +| **Calculation** | | +| **Data source** | | +| **Refresh frequency** | | diff --git a/task-2/.env.example b/task-2/.env.example new file mode 100644 index 0000000..affce79 --- /dev/null +++ b/task-2/.env.example @@ -0,0 +1,9 @@ +# Copy this file to .env and fill in your values. +# .env is in .gitignore — never commit it. 
+ +AIRFLOW_URL=https://your-airflow-instance.example.com +AIRFLOW_USER=admin +AIRFLOW_PASS=your-airflow-password + +# PostgreSQL connection string (format: postgresql://user:password@host:5432/dbname) +PG_URL=postgresql://pipeline_user:your-pg-password@your-pg-host:5432/team1 diff --git a/task-2/app.py b/task-2/app.py new file mode 100644 index 0000000..7df7c3d --- /dev/null +++ b/task-2/app.py @@ -0,0 +1,81 @@ +""" +Week 12 Engineering Dashboard , Streamlit app + +Run: streamlit run app.py + +Credentials: copy .env.example to .env and fill in values. + Load with: pip install python-dotenv, then call load_dotenv() below. + Never hardcode credentials in this file. +""" + +import os + +import requests +import streamlit as st + +# TODO: uncomment these two lines after installing python-dotenv +# from dotenv import load_dotenv +# load_dotenv() + +AIRFLOW_URL = os.environ.get("AIRFLOW_URL", "") +AIRFLOW_USER = os.environ.get("AIRFLOW_USER", "") +AIRFLOW_PASS = os.environ.get("AIRFLOW_PASS", "") +PG_URL = os.environ.get("PG_URL", "") # postgresql://user:pass@host/db + +st.set_page_config(page_title="Pipeline Health", layout="wide") +st.title("Pipeline Health Dashboard") + + +# ── Panel 1: Last DAG run status ───────────────────────────────────────────── + + +@st.cache_data(ttl=60) +def get_dag_runs(dag_id: str, limit: int = 10) -> list: + """Return recent DAG runs from the Airflow REST API.""" + # TODO: implement this function + # Endpoint: GET {AIRFLOW_URL}/api/v2/dags/{dag_id}/dagRuns + # Auth (Airflow 3): POST {AIRFLOW_URL}/auth/token with username/password to + # get an access_token, then send headers={"Authorization": f"Bearer {token}"}. + # Basic Auth (auth=(user, pass)) returns 401 on Airflow 3. Cache the token. 
+ # Return: list of run dicts, each with "state", "start_date", "end_date" + raise NotImplementedError("TODO: implement get_dag_runs") + + +dag_id = "ingest_taxi_month" # TODO: on the shared Airflow your DAG id is +# prefixed: PREFIX_ingest_taxi_month + +st.subheader("Last DAG run") +try: + runs = get_dag_runs(dag_id, limit=1) + if runs: + last = runs[0] + state = last["state"] + if state == "success": + st.success(f"Last run: **{state}**, started {last['start_date']}") + elif state == "failed": + st.error(f"Last run: **{state}**, check Airflow logs") + else: + st.warning(f"Last run: **{state}**") + else: + st.info("No runs found for this DAG.") +except NotImplementedError: + st.warning("Panel 1: implement `get_dag_runs` to show live data.") +except Exception as exc: + st.error(f"Could not reach Airflow: {exc}") + + +# ── Panel 2 (Target): Run duration trend ───────────────────────────────────── + +st.subheader("Run duration trend (last 30 runs)") +# TODO (Target): call get_dag_runs(dag_id, limit=30), compute duration from +# start_date and end_date, and plot with st.line_chart. +st.info("TODO (Target): add a line chart of run durations.") + + +# ── Panel 3 (Target): Data freshness from Postgres ─────────────────────────── + +st.subheader("Data freshness") +# TODO (Target): query MAX(pickup_datetime) and COUNT(*) from +# dev_yourname.fct_trips using psycopg2 or sqlalchemy + PG_URL. +# Display as st.metric widgets. +st.info("TODO (Target): add freshness metrics from Postgres.") diff --git a/task-2/requirements.txt b/task-2/requirements.txt new file mode 100644 index 0000000..2cf1fd3 --- /dev/null +++ b/task-2/requirements.txt @@ -0,0 +1,4 @@ +streamlit>=1.35.0 +requests>=2.31.0 +psycopg2-binary>=2.9.9 +python-dotenv>=1.0.0