diff --git a/services/ws-modules/face-detection/src/lib.rs b/services/ws-modules/face-detection/src/lib.rs index aec6cbf..82fa4e1 100644 --- a/services/ws-modules/face-detection/src/lib.rs +++ b/services/ws-modules/face-detection/src/lib.rs @@ -823,10 +823,10 @@ fn face_video_element() -> Result { .and_then(|window| window.document()) .ok_or_else(|| JsValue::from_str("No document available"))?; document - .get_element_by_id("face-video-preview") - .ok_or_else(|| JsValue::from_str("Missing #face-video-preview element"))? + .get_element_by_id("video-preview") + .ok_or_else(|| JsValue::from_str("Missing #video-preview element"))? .dyn_into::() - .map_err(|_| JsValue::from_str("#face-video-preview was not a video element")) + .map_err(|_| JsValue::from_str("#video-preview was not a video element")) } fn face_output_canvas_element() -> Result { @@ -834,10 +834,10 @@ fn face_output_canvas_element() -> Result { .and_then(|window| window.document()) .ok_or_else(|| JsValue::from_str("No document available"))?; document - .get_element_by_id("face-video-output-canvas") - .ok_or_else(|| JsValue::from_str("Missing #face-video-output-canvas element"))? + .get_element_by_id("video-output-canvas") + .ok_or_else(|| JsValue::from_str("Missing #video-output-canvas element"))? .dyn_into::() - .map_err(|_| JsValue::from_str("#face-video-output-canvas was not a canvas element")) + .map_err(|_| JsValue::from_str("#video-output-canvas was not a canvas element")) } fn face_preprocess_canvas() -> Result { diff --git a/services/ws-server/static/app.js b/services/ws-server/static/app.js index 7108695..2f9e258 100644 --- a/services/ws-server/static/app.js +++ b/services/ws-server/static/app.js @@ -28,13 +28,10 @@ const gpuInfoButton = document.getElementById("gpu-info-button"); const speechButton = document.getElementById("speech-button"); const nfcButton = document.getElementById("nfc-button"); const sensorsButton = document.getElementById("sensors-button"); -const videoOutputButton = document.getElementById("video-output-button"); const agentStatusEl = document.getElementById("agent-status"); const agentIdEl = document.getElementById("agent-id"); const sensorOutputEl = document.getElementById("sensor-output"); -const videoOutputEl = document.getElementById("ml-debug-output"); const videoPreview = document.getElementById("video-preview"); -const videoOutputCanvas = document.getElementById("video-output-canvas"); let microphone = null; let videoCapture = null; @@ -44,25 +41,7 @@ let speechListening = false; let sensorsActive = false; let orientationState = null; let motionState = null; -let videoCvSession = null; -let videoCvInputName = null; -let videoCvOutputName = null; -let videoCvLoopId = null; -let videoCvInferencePending = false; -let lastVideoInferenceAt = 0; -let lastVideoCvLabel = null; -let videoCvCanvas = null; -let videoCvContext = null; -let videoOverlayContext = videoOutputCanvas.getContext("2d"); -let videoOutputVisible = false; -let videoRenderFrameId = null; -let lastVideoInferenceSummary = null; -const loadedWorkflowModules = new Map(); let sendClientEvent = () => {}; -const VIDEO_INFERENCE_INTERVAL_MS = 750; -const VIDEO_RENDER_SCORE_THRESHOLD = 0.35; -const VIDEO_MODEL_PATH = "/static/models/video_cv.onnx"; -const VIDEO_FALLBACK_INPUT_SIZE = 224; const STORED_AGENT_ID_KEY = "ws_wasm_agent.agent_id"; let currentAgentId = null; @@ -178,11 +157,22 @@ const runSelectedWorkflowModule = async () => { } const loadedModule = await loadWorkflowModule(moduleKey); + if ( + typeof loadedModule.is_running === "function" + && loadedModule.is_running() + && typeof loadedModule.stop === "function" + ) { + append(`${moduleConfig.label} module: calling stop()`); + loadedModule.stop(); + append(`${moduleConfig.label} module stopped`); + return; + } + append(`${moduleConfig.label} module: calling run()`); const runPromise = loadedModule.run(); append(`${moduleConfig.label} module: run() started`); await runPromise; - append(`${moduleConfig.label} module completed`); + append(`${moduleConfig.label} module run() returned`); }; const handleProtocolMessage = (message) => { @@ -257,31 +247,6 @@ const renderSensorOutput = () => { sensorOutputEl.value = lines.join("\n"); }; -const setVideoOutput = (lines) => { - videoOutputEl.value = Array.isArray(lines) ? lines.join("\n") : String(lines); -}; - -const updateVideoStatus = (extraLines = []) => { - const inputMetadata = videoCvInputName - ? videoCvSession?.inputMetadata?.[videoCvInputName] - : null; - const outputMetadata = videoCvOutputName - ? videoCvSession?.outputMetadata?.[videoCvOutputName] - : null; - const lines = [ - `model: ${videoCvSession ? "loaded" : "not loaded"}`, - `video: ${videoCapture ? "active" : "inactive"}`, - `input: ${videoCvInputName ?? "n/a"}`, - `output: ${videoCvOutputName ?? "n/a"}`, - `input dims: ${JSON.stringify(inputMetadata?.dimensions ?? [])}`, - `output dims: ${JSON.stringify(outputMetadata?.dimensions ?? [])}`, - `loop: ${videoCvLoopId === null ? "idle" : "running"}`, - `display: ${videoOutputVisible ? "visible" : "hidden"}`, - `mode: ${lastVideoInferenceSummary?.mode ?? "unknown"}`, - ]; - setVideoOutput(lines.concat("", extraLines)); -}; - const handleOrientation = (event) => { orientationState = { alpha: event.alpha, @@ -370,765 +335,7 @@ const startSensorsFlow = async () => { append("device sensors started; streaming locally to textbox"); }; -const getTopK = (values, limit = 3) => { - return values - .map((value, index) => ({ value, index })) - .sort((left, right) => right.value - left.value) - .slice(0, limit); -}; - -const ensureVideoCvCanvas = () => { - if (!videoCvCanvas) { - videoCvCanvas = document.createElement("canvas"); - videoCvContext = videoCvCanvas.getContext("2d", { willReadFrequently: true }); - } - - if (!videoCvContext) { - throw new Error("Unable to create 2D canvas context for video preprocessing."); - } - - return videoCvContext; -}; - -const ensureVideoOverlayContext = () => { - if (!videoOverlayContext) { - videoOverlayContext = videoOutputCanvas.getContext("2d"); - } - - if (!videoOverlayContext) { - throw new Error("Unable to create video output canvas context."); - } - - return videoOverlayContext; -}; - -const selectVideoModelInputName = (session) => { - const inputNames = Array.isArray(session?.inputNames) ? session.inputNames : []; - if (!inputNames.length) { - return null; - } - - const ranked = inputNames - .map((name) => { - const metadata = session?.inputMetadata?.[name]; - const dimensions = Array.isArray(metadata?.dimensions) ? metadata.dimensions : []; - const normalizedName = String(name).toLowerCase(); - let score = 0; - - if (dimensions.length === 4) { - score += 100; - } else if (dimensions.length === 3) { - score += 40; - } - - if ( - normalizedName.includes("pixel") - || normalizedName.includes("image") - || normalizedName.includes("images") - || normalizedName.includes("input") - ) { - score += 25; - } - - if (normalizedName.includes("mask") || normalizedName.includes("token")) { - score -= 50; - } - - return { name, score }; - }) - .sort((left, right) => right.score - left.score); - - return ranked[0]?.name ?? inputNames[0]; -}; - -const selectVideoModelOutputName = (session) => { - const outputNames = Array.isArray(session?.outputNames) ? session.outputNames : []; - if (!outputNames.length) { - return null; - } - - const ranked = outputNames - .map((name) => { - const normalizedName = String(name).toLowerCase(); - let score = 0; - if (normalizedName.includes("box")) { - score += 100; - } - if (normalizedName.includes("logit") || normalizedName.includes("score")) { - score += 40; - } - return { name, score }; - }) - .sort((left, right) => right.score - left.score); - - return ranked[0]?.name ?? outputNames[0]; -}; - -const resolveVideoModelLayout = () => { - if (!videoCvSession || !videoCvInputName) { - throw new Error("Video CV model is not loaded."); - } - - const metadata = videoCvSession.inputMetadata?.[videoCvInputName]; - const dataType = metadata?.type ?? "float32"; - if (dataType !== "float32" && dataType !== "uint8") { - throw new Error(`Unsupported video model input type: ${dataType}`); - } - - const rawDimensions = Array.isArray(metadata?.dimensions) - ? metadata.dimensions - : []; - const dimensions = rawDimensions.length === 4 - ? rawDimensions - : rawDimensions.length === 3 - ? [1, ...rawDimensions] - : [1, 3, VIDEO_FALLBACK_INPUT_SIZE, VIDEO_FALLBACK_INPUT_SIZE]; - - const resolved = dimensions.map((dimension, index) => { - if (typeof dimension === "number" && Number.isFinite(dimension) && dimension > 0) { - return dimension; - } - - if (index === 0) { - return 1; - } - - if (index === 1 && dimensions.length === 4) { - const inputName = String(videoCvInputName).toLowerCase(); - if (!inputName.includes("nhwc")) { - return 3; - } - } - - return VIDEO_FALLBACK_INPUT_SIZE; - }); - - const secondDimension = resolved[1]; - const lastDimension = resolved[3]; - const inputName = String(videoCvInputName).toLowerCase(); - const channelsFirst = inputName.includes("nhwc") - ? false - : secondDimension === 1 - || secondDimension === 3 - || ((lastDimension !== 1 && lastDimension !== 3) && !inputName.includes("image_embeddings")); - if (channelsFirst) { - const [, channels, height, width] = resolved; - if (channels !== 1 && channels !== 3) { - throw new Error(`Unsupported channel count for NCHW image input: ${channels}`); - } - - return { - dataType, - channels, - width, - height, - tensorDimensions: [1, channels, height, width], - layout: "nchw", - }; - } - - const [, height, width, channels] = resolved; - if (channels !== 1 && channels !== 3) { - throw new Error(`Unsupported channel count for NHWC image input: ${channels}`); - } - - return { - dataType, - channels, - width, - height, - tensorDimensions: [1, height, width, channels], - layout: "nhwc", - }; -}; - -const buildVideoInputTensor = () => { - if (!videoCapture || !videoCvSession || !videoCvInputName) { - throw new Error("Video capture or model session is unavailable."); - } - - if (!videoPreview.videoWidth || !videoPreview.videoHeight) { - throw new Error("Video stream is not ready yet."); - } - - const { - dataType, - channels, - width, - height, - tensorDimensions, - layout, - } = resolveVideoModelLayout(); - const context = ensureVideoCvCanvas(); - videoCvCanvas.width = width; - videoCvCanvas.height = height; - context.drawImage(videoPreview, 0, 0, width, height); - - const rgba = context.getImageData(0, 0, width, height).data; - const elementCount = width * height * channels; - const tensorData = dataType === "uint8" - ? new Uint8Array(elementCount) - : new Float32Array(elementCount); - - for (let pixelIndex = 0; pixelIndex < width * height; pixelIndex += 1) { - const rgbaIndex = pixelIndex * 4; - const red = rgba[rgbaIndex]; - const green = rgba[rgbaIndex + 1]; - const blue = rgba[rgbaIndex + 2]; - - if (channels === 1) { - const grayscale = Math.round(0.299 * red + 0.587 * green + 0.114 * blue); - tensorData[pixelIndex] = dataType === "uint8" ? grayscale : grayscale / 255; - continue; - } - - if (layout === "nchw") { - const planeSize = width * height; - if (dataType === "uint8") { - tensorData[pixelIndex] = red; - tensorData[pixelIndex + planeSize] = green; - tensorData[pixelIndex + 2 * planeSize] = blue; - } else { - tensorData[pixelIndex] = red / 255; - tensorData[pixelIndex + planeSize] = green / 255; - tensorData[pixelIndex + 2 * planeSize] = blue / 255; - } - continue; - } - - const tensorIndex = pixelIndex * channels; - if (dataType === "uint8") { - tensorData[tensorIndex] = red; - tensorData[tensorIndex + 1] = green; - tensorData[tensorIndex + 2] = blue; - } else { - tensorData[tensorIndex] = red / 255; - tensorData[tensorIndex + 1] = green / 255; - tensorData[tensorIndex + 2] = blue / 255; - } - } - - return new window.ort.Tensor(dataType, tensorData, tensorDimensions); -}; - -const looksLikeBoxes = (tensor) => { - if (!tensor?.dims || !tensor?.data) { - return false; - } - - const dims = tensor.dims.filter((dimension) => Number.isFinite(dimension)); - const values = Array.from(tensor.data ?? []); - const lastDimension = dims[dims.length - 1]; - return values.length >= 4 && (lastDimension === 4 || lastDimension === 6 || lastDimension === 7); -}; - -const flattenFinite = (tensor) => { - return Array.from(tensor?.data ?? []).map(Number).filter((value) => Number.isFinite(value)); -}; - -const normalizeBox = (boxValues, format = "xyxy") => { - if (boxValues.length < 4) { - return null; - } - - let x1; - let y1; - let x2; - let y2; - if (format === "cxcywh") { - const [centerX, centerY, width, height] = boxValues; - x1 = centerX - width / 2; - y1 = centerY - height / 2; - x2 = centerX + width / 2; - y2 = centerY + height / 2; - } else { - [x1, y1, x2, y2] = boxValues; - } - - if (x2 < x1) { - [x1, x2] = [x2, x1]; - } - if (y2 < y1) { - [y1, y2] = [y2, y1]; - } - - const normalized = [x1, y1, x2, y2].map((value) => ( - value > 1.5 ? value : Math.max(0, Math.min(1, value)) - )); - - return normalized; -}; - -const softmax = (logits) => { - const maxLogit = Math.max(...logits); - const scores = logits.map((l) => Math.exp(l - maxLogit)); - const sumScores = scores.reduce((a, b) => a + b, 0); - return scores.map((s) => s / sumScores); -}; - -const findDetectionTensor = (entries, patterns, predicate = () => true) => { - return entries.find(([name, tensor]) => { - const normalizedName = String(name).toLowerCase(); - return patterns.some((pattern) => pattern.test(normalizedName)) && predicate(tensor); - }) ?? null; -}; - -const decodeHuggingFaceDetectionOutputs = (entries) => { - const boxesEntry = findDetectionTensor( - entries, - [/pred_boxes/, /boxes?/, /bbox/], - (tensor) => (Array.isArray(tensor?.dims) ? tensor.dims[tensor.dims.length - 1] : null) === 4, - ); - const logitsEntry = findDetectionTensor( - entries, - [/logits/, /scores?/, /class/], - (tensor) => (Array.isArray(tensor?.dims) ? tensor.dims[tensor.dims.length - 1] : 0) > 1, - ); - - if (!boxesEntry || !logitsEntry) { - return null; - } - - const [boxesName, boxesTensor] = boxesEntry; - const [, logitsTensor] = logitsEntry; - const rawBoxes = flattenFinite(boxesTensor); - const rawLogits = flattenFinite(logitsTensor); - const boxCount = Math.floor(rawBoxes.length / 4); - const classCount = boxCount > 0 ? Math.floor(rawLogits.length / boxCount) : 0; - if (boxCount <= 0 || classCount <= 1) { - return null; - } - - const usesCenterBoxes = /pred_boxes/.test(String(boxesName).toLowerCase()); - const detections = []; - for (let index = 0; index < boxCount; index += 1) { - const box = rawBoxes.slice(index * 4, index * 4 + 4); - const logits = rawLogits.slice(index * classCount, index * classCount + classCount); - const candidateLogits = logits.length > 1 ? logits.slice(0, -1) : logits; - const probabilities = softmax(candidateLogits); - const best = getTopK(probabilities, 1)[0]; - if (!best || best.value < VIDEO_RENDER_SCORE_THRESHOLD) { - continue; - } - - const normalizedBox = normalizeBox(box, usesCenterBoxes ? "cxcywh" : "xyxy"); - if (!normalizedBox) { - continue; - } - - detections.push({ - label: `class_${best.index}`, - class_index: best.index, - score: best.value, - box: normalizedBox, - }); - } - - if (!detections.length) { - return { - mode: "detection", - detections: [], - detected_class: "no_detection", - class_index: -1, - confidence: 0, - probabilities: [], - top_classes: [], - }; - } - - detections.sort((left, right) => right.score - left.score); - const best = detections[0]; - return { - mode: "detection", - detections, - detected_class: best.label, - class_index: best.class_index, - confidence: best.score, - probabilities: detections.map((entry) => entry.score), - top_classes: detections.slice(0, 3).map((entry) => ({ - label: entry.label, - index: entry.class_index, - probability: entry.score, - })), - }; -}; - -const decodeDetectionOutputs = (outputs) => { - const entries = Object.entries(outputs); - const huggingFaceSummary = decodeHuggingFaceDetectionOutputs(entries); - if (huggingFaceSummary) { - return huggingFaceSummary; - } - - const boxesEntry = entries.find(([, tensor]) => looksLikeBoxes(tensor)); - - if (!boxesEntry) { - return null; - } - - const [boxesName, boxesTensor] = boxesEntry; - const boxDims = Array.isArray(boxesTensor.dims) ? boxesTensor.dims : []; - const rawBoxes = flattenFinite(boxesTensor); - const boxWidth = boxDims[boxDims.length - 1] ?? 4; - const detectionCount = Math.floor(rawBoxes.length / boxWidth); - if (detectionCount <= 0) { - return null; - } - - const scoresEntry = entries.find(([name, tensor]) => - name !== boxesName && flattenFinite(tensor).length >= detectionCount - ); - const classEntry = entries.find(([name, tensor]) => - name !== boxesName && name !== scoresEntry?.[0] && flattenFinite(tensor).length >= detectionCount - ); - const detections = []; - const scoreValues = scoresEntry ? flattenFinite(scoresEntry[1]) : []; - const classValues = classEntry ? flattenFinite(classEntry[1]) : []; - - for (let index = 0; index < detectionCount; index += 1) { - const start = index * boxWidth; - const row = rawBoxes.slice(start, start + boxWidth); - const normalizedBox = normalizeBox(row); - if (!normalizedBox) { - continue; - } - - let score = Number(scoreValues[index] ?? row[4] ?? row[5] ?? 1); - if (!Number.isFinite(score)) { - score = 1; - } - - let classIndex = classValues[index]; - if (!Number.isFinite(classIndex)) { - classIndex = row.length >= 6 ? row[5] : row.length >= 7 ? row[6] : index; - } - - if (score < VIDEO_RENDER_SCORE_THRESHOLD) { - continue; - } - - detections.push({ - label: `class_${Math.round(classIndex)}`, - class_index: Math.round(classIndex), - score, - box: normalizedBox, - }); - } - - if (!detections.length) { - return { - mode: "detection", - detections: [], - detected_class: "no_detection", - class_index: -1, - confidence: 0, - probabilities: [], - top_classes: [], - }; - } - - detections.sort((left, right) => right.score - left.score); - const best = detections[0]; - return { - mode: "detection", - detections, - detected_class: best.label, - class_index: best.class_index, - confidence: best.score, - probabilities: detections.map((entry) => entry.score), - top_classes: detections.slice(0, 3).map((entry) => ({ - label: entry.label, - index: entry.class_index, - probability: entry.score, - })), - }; -}; - -const decodeClassificationOutputs = (output) => { - const values = Array.from(output?.data ?? []); - if (values.length === 0) { - throw new Error("Video model returned an empty output tensor."); - } - - if (values.length === 1) { - return { - mode: "classification", - detections: [], - detected_class: "scalar_output", - class_index: 0, - confidence: Number(values[0]), - probabilities: values, - top_classes: [{ label: "scalar_output", index: 0, probability: Number(values[0]) }], - }; - } - - const probabilities = softmax(values); - const ranked = getTopK(probabilities, 3); - const best = ranked[0]; - - return { - mode: "classification", - detections: [], - detected_class: `class_${best.index}`, - class_index: best.index, - confidence: best.value, - probabilities, - top_classes: ranked.map(({ index, value }) => ({ - label: `class_${index}`, - index, - probability: value, - logit: values[index], - })), - }; -}; - -const summarizeVideoOutput = (outputMap) => { - const detectionSummary = decodeDetectionOutputs(outputMap); - if (detectionSummary) { - return detectionSummary; - } - - const primaryOutput = outputMap[videoCvOutputName]; - const primaryValues = Array.from(primaryOutput?.data ?? []); - if (primaryValues.length > 0 && primaryValues.length <= 4096) { - return decodeClassificationOutputs(primaryOutput); - } - - return { - mode: "passthrough", - detections: [], - detected_class: "unrecognized_output", - class_index: -1, - confidence: 0, - probabilities: [], - top_classes: [], - }; -}; - -const drawOverlayText = (context, lines) => { - if (!lines.length) { - return; - } - - context.font = "18px ui-monospace, monospace"; - const lineHeight = 24; - const width = Math.max(...lines.map((line) => context.measureText(line).width), 0) + 20; - const height = lines.length * lineHeight + 12; - context.fillStyle = "rgba(24, 32, 40, 0.72)"; - context.fillRect(12, 12, width, height); - context.fillStyle = "#fffdfa"; - lines.forEach((line, index) => { - context.fillText(line, 22, 36 + index * lineHeight); - }); -}; - -const renderVideoOutputFrame = () => { - videoRenderFrameId = null; - - if (!videoOutputVisible || !videoCapture || !videoPreview.videoWidth || !videoPreview.videoHeight) { - return; - } - - const context = ensureVideoOverlayContext(); - const width = videoPreview.videoWidth; - const height = videoPreview.videoHeight; - if (videoOutputCanvas.width !== width || videoOutputCanvas.height !== height) { - videoOutputCanvas.width = width; - videoOutputCanvas.height = height; - } - - context.drawImage(videoPreview, 0, 0, width, height); - - if (lastVideoInferenceSummary?.mode === "detection") { - context.lineWidth = 3; - context.font = "16px ui-monospace, monospace"; - lastVideoInferenceSummary.detections.forEach((entry) => { - const [x1, y1, x2, y2] = entry.box; - const left = x1 <= 1 ? x1 * width : x1; - const top = y1 <= 1 ? y1 * height : y1; - const right = x2 <= 1 ? x2 * width : x2; - const bottom = y2 <= 1 ? y2 * height : y2; - const boxWidth = Math.max(1, right - left); - const boxHeight = Math.max(1, bottom - top); - - context.strokeStyle = "#ef8f35"; - context.strokeRect(left, top, boxWidth, boxHeight); - - const label = `${entry.label} ${(entry.score * 100).toFixed(1)}%`; - const textWidth = context.measureText(label).width + 10; - context.fillStyle = "#182028"; - context.fillRect(left, Math.max(0, top - 24), textWidth, 22); - context.fillStyle = "#fffdfa"; - context.fillText(label, left + 5, Math.max(16, top - 8)); - }); - } else if (lastVideoInferenceSummary?.mode === "classification") { - drawOverlayText(context, [ - `classification: ${lastVideoInferenceSummary.detected_class}`, - `confidence: ${(lastVideoInferenceSummary.confidence * 100).toFixed(1)}%`, - ]); - } else if (lastVideoInferenceSummary?.mode === "passthrough") { - drawOverlayText(context, [ - "output mode: passthrough", - "model output not recognized as detection or classification", - ]); - } - - videoRenderFrameId = window.requestAnimationFrame(renderVideoOutputFrame); -}; - -const syncVideoOutputView = () => { - videoOutputCanvas.hidden = !videoOutputVisible || !videoCapture; - videoOutputButton.textContent = videoOutputVisible ? "Hide video output" : "Show video output"; - - if (!videoOutputVisible || !videoCapture) { - if (videoRenderFrameId !== null) { - window.cancelAnimationFrame(videoRenderFrameId); - videoRenderFrameId = null; - } - updateVideoStatus(); - return; - } - - if (videoRenderFrameId === null) { - videoRenderFrameId = window.requestAnimationFrame(renderVideoOutputFrame); - } - updateVideoStatus(); -}; - -const stopVideoCvLoop = () => { - if (videoCvLoopId !== null) { - window.clearInterval(videoCvLoopId); - videoCvLoopId = null; - } - lastVideoCvLabel = null; - updateVideoStatus(); -}; - -const inferVideoPrediction = async () => { - if ( - !videoCapture - || !videoCvSession - || !videoCvInputName - || !videoCvOutputName - || videoCvInferencePending - ) { - return; - } - - const now = Date.now(); - if (now - lastVideoInferenceAt < VIDEO_INFERENCE_INTERVAL_MS) { - return; - } - - videoCvInferencePending = true; - lastVideoInferenceAt = now; - - try { - const input = buildVideoInputTensor(); - const outputMap = await videoCvSession.run({ [videoCvInputName]: input }); - const output = outputMap[videoCvOutputName]; - const summary = summarizeVideoOutput(outputMap); - const labelChanged = summary.detected_class !== lastVideoCvLabel; - lastVideoCvLabel = summary.detected_class; - lastVideoInferenceSummary = summary; - - updateVideoStatus([ - `output mode: ${summary.mode}`, - `prediction: ${summary.detected_class}`, - `confidence: ${summary.confidence.toFixed(4)}`, - ...( - summary.mode === "detection" - ? [ - `detections: ${summary.detections.length}`, - ...summary.detections.slice(0, 3).map( - (entry) => - `${entry.label}: score=${entry.score.toFixed(4)} box=${ - entry.box.map((value) => value.toFixed(3)).join(",") - }`, - ), - ] - : [ - "top classes:", - ...summary.top_classes.map( - (entry) => - `${entry.label}: p=${entry.probability.toFixed(4)} logit=${ - Number(entry.logit ?? entry.probability).toFixed(4) - }`, - ), - ] - ), - `frame: ${videoPreview.videoWidth}x${videoPreview.videoHeight}`, - `processed at: ${new Date().toLocaleTimeString()}`, - ]); - syncVideoOutputView(); - - sendClientEvent("video_cv", "inference", { - mode: summary.mode, - detected_class: summary.detected_class, - class_index: summary.class_index, - confidence: summary.confidence, - probabilities: summary.probabilities, - top_classes: summary.top_classes, - detections: summary.detections, - changed: labelChanged, - processed_at: new Date().toISOString(), - model_path: VIDEO_MODEL_PATH, - input_name: videoCvInputName, - output_name: videoCvOutputName, - input_dimensions: videoCvSession.inputMetadata?.[videoCvInputName]?.dimensions ?? [], - output_dimensions: Array.isArray(output?.dims) ? output.dims : [], - source_resolution: { - width: videoPreview.videoWidth, - height: videoPreview.videoHeight, - }, - }); - } catch (error) { - lastVideoInferenceSummary = { - mode: "passthrough", - detections: [], - detected_class: "inference_error", - class_index: -1, - confidence: 0, - probabilities: [], - top_classes: [], - }; - updateVideoStatus([ - `inference error: ${error instanceof Error ? error.message : String(error)}`, - ]); - console.error(error); - } finally { - videoCvInferencePending = false; - } -}; - -const syncVideoCvLoop = () => { - if (videoCapture && videoCvSession) { - if (videoCvLoopId === null) { - videoCvLoopId = window.setInterval(() => { - void inferVideoPrediction(); - }, VIDEO_INFERENCE_INTERVAL_MS); - } - updateVideoStatus([ - "browser-side webcam inference active", - "results are sent to the backend over the websocket.", - ]); - return; - } - - stopVideoCvLoop(); - lastVideoInferenceSummary = null; - updateVideoStatus([ - videoCvSession - ? "model loaded; start video capture to begin inference." - : `model file: ${VIDEO_MODEL_PATH}`, - ]); -}; - renderSensorOutput(); -updateVideoStatus([ - `model file: ${VIDEO_MODEL_PATH}`, - "load the model, then start video capture to process frames in-browser.", -]); const wsProtocol = window.location.protocol === "https:" ? "wss:" : "ws:"; const wsUrl = `${wsProtocol}//${window.location.host}/ws`; @@ -1244,8 +451,6 @@ try { videoPreview.hidden = true; videoButton.textContent = "Start video"; delete window.videoCapture; - syncVideoCvLoop(); - syncVideoOutputView(); append("video stopped"); sendClientEvent("video", "stopped", { track_count: 0 }); return; @@ -1257,8 +462,6 @@ try { videoButton.textContent = "Stop video"; append(`video granted: ${videoCapture.trackCount()} video track(s)`); window.videoCapture = videoCapture; - syncVideoCvLoop(); - syncVideoOutputView(); sendClientEvent("video", "started", { track_count: videoCapture.trackCount(), }); @@ -1480,11 +683,6 @@ try { } }); - videoOutputButton.addEventListener("click", () => { - videoOutputVisible = !videoOutputVisible; - syncVideoOutputView(); - }); - window.client = client; window.sendAlive = () => client.send_alive(); window.runWorkflowModule = (moduleKey) => { diff --git a/services/ws-server/static/index.html b/services/ws-server/static/index.html index d2a936d..e100aa8 100644 --- a/services/ws-server/static/index.html +++ b/services/ws-server/static/index.html @@ -102,6 +102,23 @@ margin: 0; min-height: 180px; } + + video, + canvas { + box-sizing: border-box; + display: block; + width: auto; + height: auto; + max-width: 100%; + max-height: min(42vh, 420px); + margin: 12px auto 0; + background: #182028; + object-fit: contain; + } + + [hidden] { + display: none !important; + } @@ -125,8 +142,6 @@

WASM web agent

- -
@@ -156,9 +171,6 @@

WASM web agent

-