Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions components/debug/limit-tester.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ export function LimitTester() {
const [naturalDimensions, setNaturalDimensions] = useState<{ w: number; h: number } | null>(null)

const startGeneration = useMutation(api.singleGeneration.startGeneration)
const dispatchGeneration = useAction(api.singleGeneration.dispatchGeneration)
const processGeneration = useAction(api.singleGenerationProcessor.processGeneration)

// BYOP context for API key
const apiKey = usePollenApiKey()
Expand Down Expand Up @@ -81,7 +81,7 @@ export function LimitTester() {
apiKey,
})
setCurrentGenId(id)
void Promise.resolve(dispatchGeneration({ generationId: id, apiKey })).catch((dispatchError) => {
void Promise.resolve(processGeneration({ generationId: id, apiKey })).catch((dispatchError) => {
console.error("Immediate dispatch failed; recovery path will retry:", dispatchError)
})
} catch (error) {
Expand Down
22 changes: 21 additions & 1 deletion components/studio/batch/batch-action-button.test.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import "@testing-library/jest-dom/vitest"
import { render, screen, fireEvent } from "@testing-library/react"
import { describe, it, expect, vi } from "vitest"
import { beforeEach, describe, expect, it, vi } from "vitest"
import { BatchActionButton } from "./batch-action-button"

describe("BatchActionButton", () => {
Expand Down Expand Up @@ -53,6 +54,25 @@ describe("BatchActionButton", () => {
expect(onResume).toHaveBeenCalledTimes(1)
})

it("disables resume while paused items are still in flight", () => {
const onResume = vi.fn()
render(
<BatchActionButton
{...defaultProps}
isPaused={true}
inFlightCount={2}
onResume={onResume}
/>
)

const resumeButton = screen.getByRole("button", { name: /resume/i })
expect(resumeButton).toBeDisabled()

fireEvent.click(resumeButton)

expect(onResume).not.toHaveBeenCalled()
})

it("calls onCancel when cancel button is clicked", () => {
const onCancel = vi.fn()
render(<BatchActionButton {...defaultProps} onCancel={onCancel} />)
Expand Down
4 changes: 4 additions & 0 deletions components/studio/batch/batch-action-button.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ export const BatchActionButton = React.memo(function BatchActionButton({
className,
}: BatchActionButtonProps) {
const progress = totalCount > 0 ? (completedCount / totalCount) * 100 : 0
const isResumeDisabled = isPaused && inFlightCount > 0

// Show in-flight indicator when paused and items are still processing
const showInFlightIndicator = isPaused && inFlightCount > 0
Expand Down Expand Up @@ -94,6 +95,7 @@ export const BatchActionButton = React.memo(function BatchActionButton({
{/* The actual button - positioned above progress */}
<Button
onClick={isPaused ? onResume : onPause}
disabled={isResumeDisabled}
variant="ghost"
size="lg"
className={cn(
Expand All @@ -102,11 +104,13 @@ export const BatchActionButton = React.memo(function BatchActionButton({
"border border-border/40 hover:border-border/60",
"rounded-lg overflow-hidden",
"transition-all duration-300",
isResumeDisabled && "cursor-not-allowed opacity-70",
// Text color based on state
isPaused
? "text-amber-600 dark:text-amber-400"
: "text-emerald-700 dark:text-emerald-300"
)}
title={isResumeDisabled ? "Wait for in-flight images to finish before resuming" : undefined}
>
{/* Icon with subtle animation */}
<span className={cn(
Expand Down
2 changes: 1 addition & 1 deletion components/studio/upgrade-modal.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ describe("UpgradeModal", () => {
expect(
screen.getByText("generations / day"),
).toBeInTheDocument();
expect(screen.getByText("9")).toBeInTheDocument();
expect(screen.getByText("10")).toBeInTheDocument();
expect(screen.getByText("AI models included")).toBeInTheDocument();
expect(screen.getByText("Daily")).toBeInTheDocument();
});
Expand Down
215 changes: 215 additions & 0 deletions convex-optimization-branch-review.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
# Convex Optimization Branch Review

## Scope

I reviewed the branch against `origin/main` with focus on the changes tied to `convex-optimiziations.md`, especially:

- `convex/batchGeneration.ts`
- `convex/batchProcessor.ts`
- `convex/singleGeneration.ts`
- `convex/singleGenerationProcessor.ts`
- `hooks/queries/use-generate-image.ts`
- `components/debug/limit-tester.tsx`
- related tests and UI surfaces

I also cross-checked Convex-specific judgments against current official guidance:

- `https://docs.convex.dev/functions/actions`
- `https://docs.convex.dev/understanding/best-practices/`
- `https://docs.convex.dev/scheduling/scheduled-functions`
- `@convex-dev/workpool` README

I ran the relevant test files as well:

```bash
bun run test hooks/queries/use-generate-image.test.tsx hooks/queries/use-batch-generation.test.ts hooks/use-batch-mode.test.ts
```

Those tests all passed, which is important context for the testing-gap finding below.

## High-level take

The overall direction of the branch is good and largely aligned with current Convex guidance:

- moving retry waiting out of long-running actions is directionally correct
- adding hard fetch timeouts is good
- removing the extra single-generation action wrapper is directionally correct
- adaptive throttling is a reasonable next step

The main problem is that the new batch state machine has a correctness bug around `pause` / `resume` under pipelined scheduling.

## Findings

## P1 - Resuming a paused batch can duplicate item processing

- **Severity**: P1
- **Type**: clear bug, business logic regression, UX regression
- **Where**:
- `convex/batchGeneration.ts:276-335`
- `convex/batchGeneration.ts:700-738`
- `convex/batchProcessor.ts:192-225`
- `components/studio/batch/batch-action-button.tsx:49-50`
- `components/studio/batch/batch-action-button.tsx:95-125`

### Why this is a bug

`currentIndex` is advanced when the next item is **scheduled**, not when it is **completed**.

That means a paused batch can legitimately be in this state:

- item `N` already scheduled / still in flight
- `currentIndex === N`
- UI shows `"N images finishing..."`

`resumeBatchJob` then immediately schedules `currentIndex` again:

- `convex/batchGeneration.ts:731-735`

So if the user resumes before the already-scheduled item drains, the same item index can be processed twice.

Because each first-attempt batch item also pipelines the next item at the start of processing:

- `convex/batchProcessor.ts:192-204`

this can cascade into:

- duplicate generated images
- inflated `completedCount` / `failedCount`
- premature `totalProcessed >= totalCount`
- early deletion of the batch job record

### Why the UX contributes to it

The UI explicitly surfaces that items are still finishing while paused:

- `showInFlightIndicator = isPaused && inFlightCount > 0`

but the same button still exposes `Resume` immediately. So the UI currently invites the exact unsafe transition that corrupts the batch state.

### Recommendation

Resume should not schedule a new item while any prior scheduled work is still outstanding.

At minimum, resume needs one of these invariants:

- only resume when `inFlightCount === 0`
- track a separate `nextUnprocessedIndex` instead of reusing `currentIndex`
- persist scheduled item identities so resume can tell whether the next index is already queued

## P2 - `inFlightCount` becomes wrong after resume

- **Severity**: P2
- **Type**: clear bug, UI/UX regression
- **Where**:
- `convex/batchGeneration.ts:151-175`
- `convex/batchGeneration.ts:303-318`
- `convex/batchGeneration.ts:438-450`
- `convex/batchGeneration.ts:724-735`

### Why this is a bug

The system treats `inFlightCount` as including already-scheduled outstanding work:

- initial batch starts with `inFlightCount: 1`
- scheduling the next item increments it
- recording a result decrements it

But `resumeBatchJob` schedules work without incrementing `inFlightCount`.

That means after a clean paused state with `inFlightCount = 0`, resuming undercounts by one immediately. From there the counter can hit zero even though work is still running or scheduled.

### Impact

- the paused UI can claim everything has drained when one item is still outstanding
- users can think it is safe to resume / pause again when it is not
- any future logic that relies on `inFlightCount` as a drain signal will be unreliable

This also makes the P1 duplication bug easier to trigger repeatedly.

### Recommendation

Either:

- increment `inFlightCount` when resume schedules the restart item

or better:

- redesign the batch state model so `inFlightCount` is derived from durable queued/running state instead of manual increments/decrements

## P3 - The new batch lifecycle is not protected by regression tests

- **Severity**: P3
- **Type**: high-value testing gap
- **Where**:
- `hooks/use-batch-mode.test.ts:306-442`
- `hooks/queries/use-batch-generation.test.ts:46-107`
- `hooks/queries/use-batch-generation.test.ts:141-171`

### Why this matters

I ran the relevant tests and they all passed, but the current coverage only checks:

- basic hook return shapes
- simple paused/processing flag mapping
- simple progress mapping

It does **not** cover the dangerous transitions introduced by the optimization work:

- pause while scheduled items are still outstanding
- resume before in-flight work drains
- `currentIndex` / `inFlightCount` invariants under pipelining
- duplicate scheduling protection

This is why the P1/P2 bug can exist while the branch still looks green.

### Recommendation

Add targeted tests around the batch state machine, not just hook shape tests. The minimum valuable cases are:

- paused batch with `inFlightCount > 0` cannot safely schedule the same `currentIndex` twice
- resume preserves correct `inFlightCount`
- total processed count cannot be incremented twice for the same item index

## P4 - Old public single-generation wrapper is now stale API surface

- **Severity**: P4
- **Type**: maintainability / API hygiene
- **Where**:
- `convex/singleGeneration.ts:106-134`
- current consumers now use `api.singleGenerationProcessor.processGeneration`

### Why this matters

This branch correctly moved the hot path to the public `singleGenerationProcessor.processGeneration` action, which removes the extra wrapper layer and is more in line with current Convex guidance about avoiding unnecessary `runAction` overhead.

However, `singleGeneration.dispatchGeneration` is still exported as a public action even though current consumers no longer use it.

That leaves two public entrypoints for the same behavior:

- the new direct path
- the old wrapper path

Low severity, but it increases confusion and makes it easy for a future caller to accidentally reintroduce the extra wrapper overhead.

### Recommendation

Remove the stale public wrapper or make the public API converge on one entrypoint only.

## Non-findings worth calling out

These branch decisions looked good after doc validation and I would keep them:

- **Retry rescheduling instead of sleeping in actions**
- aligned with the cost problem described in the spec
- **Hard provider fetch timeout**
- good protection against long-tail provider stalls
- **Removing the extra single-generation action wrapper from the hot path**
- aligned with current Convex guidance that `runAction` should generally only be used when crossing runtimes
- **Adaptive throttling as a scheduler-level control**
- reasonable, and Workpool is not obviously required yet if the batch lifecycle bugs are fixed first

## Recommended fix order

- **First**: fix P1 and P2 together in the batch state machine
- **Second**: add the regression tests from P3 before making further scheduler tweaks
- **Third**: clean up the stale single-generation wrapper from P4
45 changes: 45 additions & 0 deletions convex-optimiziations.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
Yes — there’s a clear path to reduce this **a lot** while keeping the same UX.

## What’s likely driving cost

Based on your code + latest Convex docs:

- `singleGenerationProcessor.processGeneration` and `batchProcessor.processBatchItem` are both Node actions (`"use node"`), and Node actions are billed at **512MB** of runtime memory ([limits](https://docs.convex.dev/production/state/limits.md)).
- Both actions spend significant wall-time waiting on external work (Pollinations + retries), and in `fetchWithRetry` you currently **sleep inside the action** between retries (`convex/lib/retry.ts`).
- `singleGeneration.dispatchGeneration` wraps `processGeneration` via `ctx.runAction(...)` (`convex/singleGeneration.ts`), which Convex docs call out as overhead-heavy unless crossing runtimes ([actions best practices](https://docs.convex.dev/functions/actions)).
- Batch scheduling is aggressively pipelined (`100ms + jitter`) in `convex/batchGeneration.ts`, which can increase 429/5xx retry pressure and burn more GBh.

## Highest-impact fixes (without UX loss)

1. **Move retry backoff out of running actions**
- Instead of `sleep(...)` in `fetchWithRetry`, schedule the next retry attempt via `ctx.scheduler.runAfter(...)` and exit.
- Keeps persistence, keeps reactive UX, cuts billed “idle waiting” time.

2. **Add hard fetch timeout + retry by reschedule**
- Abort slow provider calls (e.g. 45-90s depending on image/video), mark attempt, reschedule.
- Prevents long-tail stuck calls from eating GBh.

3. **Adaptive batch throttling**
- Dynamically increase inter-item delay when 429/5xx spike, decrease when healthy.
- Usually lowers retries and total GBh while keeping throughput stable.

4. **Remove extra action wrapper overhead in single flow**
- Current: client -> `dispatchGeneration` action -> `runAction(processGeneration)`.
- Better: reduce one layer (or switch back to a mutation-scheduled processor if acceptable).
- This is a smaller win than #1/#2 but still worthwhile.

5. **Stop burning real generation compute in dev by default**
- Your `dev` usage is huge too. Add a dev-only mock/placeholder path unless explicitly enabled.

## Convex-idiomatic guidance (2026 docs)

- Mutation that records intent + schedules background work is still the canonical pattern ([scheduled functions](https://docs.convex.dev/scheduling/scheduled-functions), [actions](https://docs.convex.dev/functions/actions)).
- Free tier has tight scheduled concurrency; if you need controlled parallelism/retries, `@convex-dev/workpool` is now a strong fit ([workpool README](https://raw.githubusercontent.com/get-convex/workpool/main/README.md)).

## Recommended order for you

- **Phase 1 (fast, biggest ROI):** #1 + #2 + #5
- **Phase 2:** #3
- **Phase 3:** #4 / workpool migration if needed

If you want, I can implement **Phase 1 now** directly in your codebase (retry rescheduling + timeout + dev guard) and keep your current UX contract intact.
2 changes: 2 additions & 0 deletions convex/_generated/api.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import type * as favorites from "../favorites.js";
import type * as follows from "../follows.js";
import type * as generatedImages from "../generatedImages.js";
import type * as http from "../http.js";
import type * as lib_batchGenerationState from "../lib/batchGenerationState.js";
import type * as lib_crypto from "../lib/crypto.js";
import type * as lib_dirtberryCrop from "../lib/dirtberryCrop.js";
import type * as lib_groq from "../lib/groq.js";
Expand Down Expand Up @@ -73,6 +74,7 @@ declare const fullApi: ApiFromModules<{
follows: typeof follows;
generatedImages: typeof generatedImages;
http: typeof http;
"lib/batchGenerationState": typeof lib_batchGenerationState;
"lib/crypto": typeof lib_crypto;
"lib/dirtberryCrop": typeof lib_dirtberryCrop;
"lib/groq": typeof lib_groq;
Expand Down
Loading
Loading