-
Notifications
You must be signed in to change notification settings - Fork 20
Browser Quickstart
Baurzhan Atinov edited this page May 14, 2026
·
1 revision
Embed FaceX detection + recognition into any web app in 30 lines of vanilla JavaScript. Inference is 100% client-side via onnxruntime-web; camera frames never leave the device.
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.21.0/dist/ort.min.js"></script>
<video id="v" autoplay playsinline muted style="display:none"></video>
<!-- type="module" is required: the code below uses top-level await, which is
     a syntax error inside a classic script. The `ort` global defined by the
     script above is still visible from the module. -->
<script type="module">
// Point onnxruntime-web at the CDN directory that hosts its .wasm binaries
// (same version as the ort.min.js loaded above).
ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.21.0/dist/';
/**
 * Download a model file and return its raw bytes.
 *
 * @param {string} url - Location of the model file.
 * @returns {Promise<Uint8Array>} The response body as a byte array.
 * @throws {Error} If the server responds with a non-2xx status.
 */
async function fetchModel(url) {
  const response = await fetch(url);
  // fetch() only rejects on network failure; without this check the body of
  // a 404/500 page would be returned as if it were model weights.
  if (!response.ok) {
    throw new Error(`Failed to fetch model ${url}: HTTP ${response.status}`);
  }
  return new Uint8Array(await response.arrayBuffer());
}
// Load the detector and recognition models (use the plain .onnx if you've
// decrypted, or your own WebCrypto pipeline — see "Encrypted Weights").
// The two downloads are independent, so they are fetched in parallel;
// the sessions are then created one after the other.
const [detBytes, recBytes] = await Promise.all([
  fetchModel('facex_detect.onnx'),
  fetchModel('facex_tiny.onnx'),
]);
const detSess = await ort.InferenceSession.create(detBytes, { executionProviders: ['wasm'] });
const recSess = await ort.InferenceSession.create(recBytes, { executionProviders: ['wasm'] });
// Attach the camera stream to the hidden <video id="v"> element and wait
// until its metadata has loaded before continuing.
const video = document.getElementById('v');
video.srcObject = await navigator.mediaDevices.getUserMedia({ video: true });
await new Promise((resolve) => {
  video.onloadedmetadata = resolve;
});
// Off-screen canvases: 320×320 for the detector input, 112×112 for the
// recognition crop. Both are read back with getImageData() on every frame,
// so pass the willReadFrequently hint to get a readback-optimized context.
const detCv = new OffscreenCanvas(320, 320);
const detCtx = detCv.getContext('2d', { willReadFrequently: true });
const recCv = new OffscreenCanvas(112, 112);
const recCtx = recCv.getContext('2d', { willReadFrequently: true });
/**
 * Paint the current video frame into the 320×320 detector canvas.
 *
 * The canvas is first filled with black, then the frame is drawn from the
 * top-left corner, uniformly scaled so that it fits inside 320×320.
 *
 * @returns {{ scale: number }} The scale factor applied to the frame.
 */
function letterboxDet() {
  const { videoWidth, videoHeight } = video;
  const ratio = Math.min(320 / videoWidth, 320 / videoHeight);

  detCtx.fillStyle = '#000';
  detCtx.fillRect(0, 0, 320, 320);
  detCtx.drawImage(video, 0, 0, videoWidth * ratio, videoHeight * ratio);

  return { scale: ratio };
}
/**
 * Convert interleaved RGBA pixels into a planar (channel-major) float array.
 *
 * The output holds W*H red values, then W*H green values, then W*H blue
 * values; the alpha byte of every pixel is skipped.
 *
 * @param {ArrayLike<number>} data - Interleaved RGBA samples, 4 per pixel.
 * @param {number} W - Image width in pixels.
 * @param {number} H - Image height in pixels.
 * @param {(value: number) => number} norm - Applied to each colour sample.
 * @returns {Float32Array} Planar R, G, B data of length 3 * W * H.
 */
function packCHW(data, W, H, norm) {
  const planeSize = W * H;
  const planes = new Float32Array(3 * planeSize);
  for (let px = 0; px < planeSize; px += 1) {
    const base = px * 4; // 4 bytes per pixel; base + 3 is alpha and is ignored
    planes[px] = norm(data[base]);
    planes[planeSize + px] = norm(data[base + 1]);
    planes[2 * planeSize + px] = norm(data[base + 2]);
  }
  return planes;
}
/**
 * Run the detector on the current video frame.
 *
 * Letterboxes the frame into the 320×320 canvas, packs it as a normalized
 * [1, 3, 320, 320] float32 tensor, runs the detection session and passes the
 * raw outputs to decodeFacex().
 *
 * @returns {Promise<*>} Whatever decodeFacex() returns; per the demo this is
 *   a list of { x1, y1, x2, y2, score } boxes.
 */
async function detectFaces() {
  const { scale: letterboxScale } = letterboxDet();
  const pixels = detCtx.getImageData(0, 0, 320, 320).data;
  const toModelRange = (value) => (value - 127.5) / 128;
  const chw = packCHW(pixels, 320, 320, toModelRange);
  const outputs = await detSess.run({
    input: new ort.Tensor('float32', chw, [1, 3, 320, 320]),
  });
  // Decode FCOS-style outputs (cls_p3/4/5 + box_p3/4/5). See demo_mesh.html
  // decodeFacex() for a ready-to-use 80-line decoder with sigmoid + NMS.
  return decodeFacex(outputs, 320, 320, letterboxScale);
}
/**
 * Compute the recognition embedding for one detected face.
 *
 * A square region, 1.3× the longer side of the box and centred on it, is
 * cropped from the current video frame, resized to 112×112, normalized and
 * fed to the recognition session.
 *
 * @param {{ x1: number, y1: number, x2: number, y2: number }} face - Face box
 *   in video pixel coordinates.
 * @returns {Promise<*>} The `data` of the session's 'embedding' output.
 * @throws {TypeError} If `face` is missing (e.g. the detector found nothing).
 */
async function getEmbedding(face) {
  if (!face) {
    throw new TypeError('getEmbedding: a face box { x1, y1, x2, y2 } is required');
  }
  const side = Math.max(face.x2 - face.x1, face.y2 - face.y1) * 1.3;
  const cx = (face.x1 + face.x2) / 2;
  const cy = (face.y1 + face.y2) / 2;
  // Reset the crop canvas first: when the square extends past the frame,
  // drawImage() clips the destination too, and the uncovered area would
  // otherwise still hold pixels from the previous crop.
  recCtx.fillStyle = '#000';
  recCtx.fillRect(0, 0, 112, 112);
  recCtx.drawImage(video, cx - side / 2, cy - side / 2, side, side, 0, 0, 112, 112);
  const img = recCtx.getImageData(0, 0, 112, 112).data;
  const x = packCHW(img, 112, 112, (v) => (v - 127.5) / 128);
  const o = await recSess.run({ input: new ort.Tensor('float32', x, [1, 3, 112, 112]) });
  return o['embedding'].data;
}
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * The dot product is divided by the product of the vector norms, so the
 * result is a true cosine even when the inputs are not unit length. For
 * unit-length inputs this equals the plain dot product.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} Similarity in [-1, 1] (up to rounding); 0 if either
 *   vector has zero norm.
 */
function cosSim(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA * normB);
  // A zero vector has no direction; report "no similarity" instead of NaN.
  return denom === 0 ? 0 : dot / denom;
}
// Usage: capture a reference embedding once, then compare on every tick.
const faces = await detectFaces();
if (!faces.length) {
  throw new Error('No face detected in the reference frame');
}
const ref = await getEmbedding(faces[0]);
// Guards against overlapping ticks: if one detect+embed pass takes longer
// than the 200 ms interval, the next tick is skipped rather than run
// concurrently on the same canvases and sessions.
let busy = false;
setInterval(async () => {
  if (busy) return;
  busy = true;
  try {
    const now = await detectFaces();
    if (!now.length) return;
    const cur = await getEmbedding(now[0]);
    console.log('match similarity:', cosSim(ref, cur).toFixed(3));
  } catch (err) {
    // Report instead of leaving an unhandled rejection inside the timer.
    console.error('face comparison failed:', err);
  } finally {
    busy = false;
  }
}, 200);
</script>
decodeFacex() and the full preprocessing helpers live in the demo source: wasm/demo_mesh.html.
Copy the function as-is; it has no dependencies.
- Set `ort.env.wasm.numThreads = 4` if your host serves the COOP+COEP headers required for SharedArrayBuffer. The default of 1 thread is the safest choice — it still hits ~25 ms for the full pipeline on a 2020-era laptop.
- The 4 recognition variants trade size for accuracy. Use `nano` (800 KB, 95.6% LFW) for low-bandwidth sites; `xs` (8.4 MB, 99.07%) for KYC.
- `executionProviders: ['webgpu', 'wasm']` lights up the WebGPU backend where supported and falls back to WASM otherwise. WebGPU is 3–5× faster on supported devices.
The above runs in any framework. For React, lift the sessions into a
top-level useEffect. For Vue/Svelte, do the equivalent: load once and share
via context/store. The main cost is the one-time decryption plus first-frame
warmup (~600 ms); after that, steady-state throughput is limited by the camera frame rate.