Skip to content

Commit 7c053eb

Browse files
committed
feat: implement API concurrency fix strategy by converting async route handlers to sync and enhancing error handling
1 parent c3c7648 commit 7c053eb

20 files changed

Lines changed: 451 additions & 182 deletions

.github/workflows/CD_production.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ jobs:
8888
run: |
8989
export MAX_INSTANCES="10"
9090
export SERVICE_NAME="ocotillo-api"
91-
export ENTRYPOINT="gunicorn -w 1 -k uvicorn.workers.UvicornWorker main:app"
91+
export ENTRYPOINT="gunicorn -w 4 -k uvicorn.workers.UvicornWorker main:app"
9292
export MIN_INSTANCES="0"
9393
envsubst < .github/app.template.yaml > app.yaml
9494

.github/workflows/CD_staging.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ jobs:
8888
run: |
8989
export MAX_INSTANCES="10"
9090
export SERVICE_NAME="ocotillo-api-staging"
91-
export ENTRYPOINT="gunicorn -w 1 -k uvicorn.workers.UvicornWorker main:app"
91+
export ENTRYPOINT="gunicorn -w 4 -k uvicorn.workers.UvicornWorker main:app"
9292
export MIN_INSTANCES="0"
9393
envsubst < .github/app.template.yaml > app.yaml
9494

.github/workflows/CD_testing.yml

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
name: CD (Testing)
2+
3+
on:
4+
push:
5+
branches: [jir*]
6+
7+
permissions:
8+
contents: write
9+
10+
jobs:
11+
testing-deploy:
12+
13+
runs-on: ubuntu-latest
14+
environment: staging
15+
16+
steps:
17+
- name: Check out source repository
18+
uses: actions/checkout@v6.0.2
19+
with:
20+
fetch-depth: 0
21+
22+
- name: Install uv in container
23+
uses: astral-sh/setup-uv@v8.0.0
24+
with:
25+
version: "latest"
26+
27+
- name: Generate requirements.txt
28+
run: |
29+
uv export \
30+
--format requirements-txt \
31+
--no-emit-project \
32+
--no-dev \
33+
--output-file requirements.txt
34+
35+
- name: Authenticate to Google Cloud
36+
uses: 'google-github-actions/auth@v3'
37+
with:
38+
credentials_json: ${{ secrets.CLOUD_DEPLOY_SERVICE_ACCOUNT_KEY }}
39+
40+
- name: Run Alembic migrations on staging database
41+
env:
42+
DB_DRIVER: "cloudsql"
43+
CLOUD_SQL_INSTANCE_NAME: "${{ secrets.CLOUD_SQL_INSTANCE_NAME }}"
44+
CLOUD_SQL_DATABASE: "${{ vars.CLOUD_SQL_DATABASE }}"
45+
CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}"
46+
CLOUD_SQL_IAM_AUTH: true
47+
run: |
48+
uv run alembic upgrade head
49+
50+
- name: Refresh materialized views on staging database
51+
env:
52+
DB_DRIVER: "cloudsql"
53+
CLOUD_SQL_INSTANCE_NAME: "${{ secrets.CLOUD_SQL_INSTANCE_NAME }}"
54+
CLOUD_SQL_DATABASE: "${{ vars.CLOUD_SQL_DATABASE }}"
55+
CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}"
56+
CLOUD_SQL_IAM_AUTH: true
57+
run: |
58+
uv run python -m cli.cli refresh-pygeoapi-materialized-views
59+
60+
- name: Ensure envsubst is available
61+
run: |
62+
if ! command -v envsubst >/dev/null 2>&1; then
63+
sudo apt-get update
64+
sudo apt-get install -y gettext-base
65+
fi
66+
67+
- name: Render App Engine configs
68+
env:
69+
ENVIRONMENT: "staging"
70+
CLOUD_SQL_INSTANCE_NAME: "${{ secrets.CLOUD_SQL_INSTANCE_NAME }}"
71+
CLOUD_SQL_DATABASE: "${{ vars.CLOUD_SQL_DATABASE }}"
72+
CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}"
73+
PYGEOAPI_POSTGRES_DB: "${{ vars.CLOUD_SQL_DATABASE }}"
74+
PYGEOAPI_POSTGRES_USER: "${{ secrets.PYGEOAPI_POSTGRES_USER }}"
75+
PYGEOAPI_POSTGRES_HOST: "${{ vars.PYGEOAPI_POSTGRES_HOST || '127.0.0.1' }}"
76+
PYGEOAPI_POSTGRES_PORT: "${{ vars.PYGEOAPI_POSTGRES_PORT || '5432' }}"
77+
PYGEOAPI_POSTGRES_PASSWORD: "${{ secrets.PYGEOAPI_POSTGRES_PASSWORD }}"
78+
PYGEOAPI_SERVER_URL: "${{ vars.PYGEOAPI_SERVER_URL }}"
79+
CLOUD_SQL_IAM_AUTH: "true"
80+
GCS_SERVICE_ACCOUNT_KEY: "${{ secrets.GCS_SERVICE_ACCOUNT_KEY }}"
81+
GCS_BUCKET_NAME: "${{ vars.GCS_BUCKET_NAME }}"
82+
AUTHENTIK_URL: "${{ vars.AUTHENTIK_URL }}"
83+
AUTHENTIK_CLIENT_ID: "${{ vars.AUTHENTIK_CLIENT_ID }}"
84+
AUTHENTIK_AUTHORIZE_URL: "${{ vars.AUTHENTIK_AUTHORIZE_URL }}"
85+
AUTHENTIK_TOKEN_URL: "${{ vars.AUTHENTIK_TOKEN_URL }}"
86+
SESSION_SECRET_KEY: "${{ secrets.SESSION_SECRET_KEY }}"
87+
APITALLY_CLIENT_ID: "${{ vars.APITALLY_CLIENT_ID }}"
88+
run: |
89+
export MAX_INSTANCES="10"
90+
export SERVICE_NAME="ocotillo-api-testing"
91+
export ENTRYPOINT="gunicorn -w 4 -k uvicorn.workers.UvicornWorker main:app"
92+
export MIN_INSTANCES="0"
93+
envsubst < .github/app.template.yaml > app.yaml
94+
95+
- name: Deploy to Google Cloud
96+
run: |
97+
gcloud app deploy \
98+
app.yaml \
99+
--quiet \
100+
--project ${{ vars.GCP_PROJECT_ID }}
101+
102+
- name: Clean up oldest versions
103+
run: |
104+
SERVICE="ocotillo-api-testing"
105+
VERSIONS_JSON="$(gcloud app versions list --service="$SERVICE" --project=${{ vars.GCP_PROJECT_ID }} --format=json --sort-by="version.createTime" 2>/dev/null || printf '[]')"
106+
export VERSIONS_JSON
107+
DELETE_VERSION="$(python - <<'PY'
108+
import json
109+
import os
110+
111+
versions = json.loads(os.environ.get("VERSIONS_JSON", "[]") or "[]")
112+
if len(versions) <= 1:
113+
print("")
114+
raise SystemExit(0)
115+
116+
def traffic_split(version):
117+
for key in ("traffic_split", "trafficSplit"):
118+
value = version.get(key)
119+
if value is not None:
120+
try:
121+
return float(value)
122+
except (TypeError, ValueError):
123+
return 0.0
124+
return 0.0
125+
126+
for version in versions:
127+
if traffic_split(version) == 0.0:
128+
print(version.get("id", ""))
129+
break
130+
else:
131+
print("")
132+
PY
133+
)"
134+
if [ -n "$DELETE_VERSION" ]; then
135+
echo "Deleting old non-serving version for $SERVICE: $DELETE_VERSION"
136+
gcloud app versions delete "$DELETE_VERSION" --service="$SERVICE" --project=${{ vars.GCP_PROJECT_ID }} --quiet
137+
else
138+
echo "No old non-serving versions to delete for $SERVICE"
139+
fi
140+
141+
- name: Remove rendered configs
142+
run: |
143+
rm app.yaml
144+
145+
# Use PR author's username as git user name
146+
- name: Set up git user
147+
run: |
148+
git config --global user.name "${{ github.actor }}"
149+
git config --global user.email "${{ github.actor }}@users.noreply.github.com"
150+
151+
# ":" are not alloed in git tags, so replace with "-"
152+
- name: Tag commit
153+
run: |
154+
git tag -a "testing-deploy-$(date -u +%Y-%m-%d)T$(date -u +%H-%M-%S%z)" -m "testing gcloud deployment: $
155+
(date
156+
-u +%Y-%m-%d)T$(date -u +%H:%M:%S%z)"
157+
git push origin --tags

ADR2.md

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# ADR2: API Concurrency Fix Strategy
2+
3+
## Summary
4+
5+
This document describes a verified FastAPI concurrency issue in the API stack and recommends a two-phase remediation plan for maintainers.
6+
7+
The API uses synchronous SQLAlchemy sessions backed by `psycopg`. When those sessions are consumed from `async def` route handlers, blocking database work runs on the event loop thread if the handlers call synchronous ORM helpers directly. The lowest-risk immediate fix is to convert database-bound route handlers that do not perform asynchronous work into plain `def`. The longer-term fix is to introduce a real async SQLAlchemy stack and migrate the affected handlers and helpers incrementally.
8+
9+
## Problem
10+
11+
FastAPI supports synchronous generator dependencies such as `get_db_session()`. The issue is not the dependency shape itself. The issue is that the injected object is a synchronous SQLAlchemy `Session`, and any `async def` route that consumes it while executing synchronous ORM queries directly will block the event loop thread.
12+
13+
In this configuration, FastAPI runs the `async def` route body on the event loop thread. If that body performs blocking database I/O through the synchronous session, the worker cannot make progress on other requests assigned to that event loop until the database call returns. A slow well query can therefore delay unrelated lightweight requests handled by the same worker.
14+
15+
This is a concurrency problem, not a correctness problem. The endpoints can still return correct data while reducing throughput and responsiveness under load.
16+
17+
## Evidence In This Repo
18+
19+
- [`db/engine.py`](db/engine.py) creates `database_sessionmaker = sessionmaker(engine, expire_on_commit=False)` and `get_db_session()` yields a regular synchronous `Session`.
20+
- [`db/engine.py`](db/engine.py) builds synchronous `postgresql+psycopg` engines for both the default PostgreSQL path and the Cloud SQL path, confirming that the active database layer is synchronous.
21+
- [`core/dependencies.py`](core/dependencies.py) injects that session through `session_dependency`.
22+
- [`services/well_details_helper.py`](services/well_details_helper.py) performs synchronous ORM operations such as `session.scalars(...).all()` and related query chains.
23+
- [`api/thing.py`](api/thing.py) contains representative database-backed routes that pass the synchronous session into helper functions such as `get_db_things(...)` and `get_well_details_payload(...)`.
24+
- [`api/asset.py`](api/asset.py) shows a contrasting safe pattern for non-database blocking work by wrapping synchronous GCS calls in `run_in_threadpool(...)`.
25+
- The short-term fix described in this ADR converts database-bound routes from `async def` to `def` where they do not need `await`, but the helper/query layer remains synchronous until a real async session stack is introduced.
26+
27+
## Short-Term Fix
28+
29+
The short-term fix is to convert database-bound route handlers from `async def` to `def` when they do not actually perform asynchronous work.
30+
31+
This lets FastAPI offload the entire route function to a worker thread instead of running its synchronous database calls on the event loop thread. It does not require changing the current database engine, dependency, query helpers, or response schemas.
32+
33+
### Short-term implementation guidance
34+
35+
- Convert any route handler that:
36+
- receives `session: session_dependency`,
37+
- performs synchronous ORM work directly or through helpers, and
38+
- does not require `await` for other operations in the route body.
39+
- Prioritize the highest-value endpoints first:
40+
- high-traffic list and detail endpoints,
41+
- endpoints known to run expensive joins or eager-loads,
42+
- endpoints that affect warmup or perceived application responsiveness.
43+
- Keep route behavior unchanged:
44+
- do not change paths, status codes, payloads, or auth dependencies as part of this phase.
45+
- Avoid mixed patterns:
46+
- do not leave a route as `async def` if it still calls synchronous SQLAlchemy code directly.
47+
- Use `run_in_threadpool(...)` only when a route must remain `async def` for a separate reason, such as mixing in another async operation, and only for isolated blocking helpers rather than as a blanket wrapper for all DB access.
48+
49+
### Expected impact
50+
51+
- Lower risk than a full async migration.
52+
- No intended HTTP contract changes.
53+
- Better worker responsiveness because blocking DB work moves off the event loop thread.
54+
55+
## Long-Term Fix
56+
57+
The long-term fix is to add a real async database stack and migrate selected API areas to it incrementally.
58+
59+
This phase should introduce an explicit async path rather than trying to reuse the current synchronous dependency. Importing async SQLAlchemy primitives is not enough; the repo needs a working async engine, async sessionmaker, async dependency, and async query/helper layer for migrated endpoints.
60+
61+
### Long-term target architecture
62+
63+
- Add an `AsyncEngine` configured for the intended async driver.
64+
- Add an `async_sessionmaker` that yields `AsyncSession` instances.
65+
- Add a dedicated async dependency such as `get_async_db_session()` rather than overloading `get_db_session()`.
66+
- Update migrated handlers and helper functions to use async database access:
67+
- `await session.execute(...)`
68+
- `await session.scalars(...)`
69+
- other `AsyncSession`-compatible patterns as needed
70+
71+
### Long-term migration guidance
72+
73+
- Migrate by subsystem, not all at once.
74+
- Start with a bounded route/helper cluster where the query patterns are understood.
75+
- Keep sync and async paths separate during migration to avoid ambiguous dependencies and accidental sync calls from async routes.
76+
- Treat helper-layer migration as part of the work. Converting route signatures alone is insufficient if the helper functions still expect synchronous sessions.
77+
78+
### Non-goals and cautions
79+
80+
- Do not claim the repo already has a working async DB session path unless one is actually implemented and used.
81+
- Do not treat “switch everything to async” as a trivial refactor.
82+
- Do not mix `AsyncSession` route code with synchronous helper/query internals.
83+
84+
## Recommended Path
85+
86+
The recommended order is:
87+
88+
1. Convert database-bound `async def` routes that do not use `await` into plain `def`.
89+
2. Validate behavior and measure the effect on responsiveness.
90+
3. Introduce a dedicated async DB stack.
91+
4. Migrate selected route/helper subsystems incrementally to `AsyncSession`.
92+
93+
This sequence delivers immediate concurrency improvement with limited risk, while preserving a clear path to a full async architecture later.
94+
95+
## Acceptance Criteria
96+
97+
### Short-term acceptance criteria
98+
99+
- Targeted API tests continue to pass after `async def` to `def` conversions.
100+
- HTTP behavior is unchanged:
101+
- same routes,
102+
- same auth requirements,
103+
- same status codes,
104+
- same payload shapes.
105+
- Concurrency smoke checks or request-timing instrumentation show that DB-heavy requests no longer block the event loop thread for that worker in the same way they do today.
106+
107+
### Long-term acceptance criteria
108+
109+
- Migrated endpoints pass the existing API test coverage for their subsystem.
110+
- The async session lifecycle is correct for successful and failing requests.
111+
- Migrated `async def` routes do not call synchronous session helpers.
112+
- Before/after measurements are captured for latency and concurrency so the migration can be evaluated against real behavior rather than assumptions.
113+
114+
## Defaults And Assumptions
115+
116+
- This document is written for maintainers and assumes familiarity with FastAPI and SQLAlchemy internals.
117+
- The document is self-contained and does not require code changes to be useful.
118+
- The recommended short-term action is intentionally conservative and does not prescribe a file-by-file rollout sequence.
119+
- The recommended long-term action is a staged migration, not a flag-day rewrite.

api/author.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
"/{author_id}/publications",
3131
response_model=list[PublicationResponse],
3232
)
33-
async def get_author_publications(
33+
def get_author_publications(
3434
user: viewer_dependency, author_id: int, session: session_dependency
3535
):
3636
"""

0 commit comments

Comments
 (0)