Browser Quickstart

Embed FaceX detection + recognition into any web app in 30 lines of vanilla JavaScript. Inference is 100% client-side via onnxruntime-web; camera frames never leave the device.

Minimal: detect → 512-dim embedding → cosine match

<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.21.0/dist/ort.min.js"></script>
<video id="v" autoplay playsinline muted style="display:none"></video>
<script type="module">
// Point onnxruntime-web at the CDN directory holding its runtime files. The
// version in this URL (1.21.0) matches the ort.min.js <script> tag above;
// keep the two in sync when upgrading.
ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.21.0/dist/';

/**
 * Downloads a model file and returns its raw bytes.
 *
 * @param {string} url - Location of the .onnx file.
 * @returns {Promise<Uint8Array>} The response body as bytes.
 * @throws {Error} If the server answers with a non-2xx status. Without this
 *   check an HTML error page would be handed on as "model bytes" and only
 *   fail later with an opaque parse error.
 */
async function fetchModel(url) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(
      `Failed to fetch model ${url}: HTTP ${response.status} ${response.statusText}`,
    );
  }
  return new Uint8Array(await response.arrayBuffer());
}

// Load detector and recognition (use the plain .onnx if you've decrypted,
// or your own WebCrypto pipeline — see "Encrypted Weights").
// The two downloads are independent, so fetch them in parallel; the sessions
// themselves are still created one after the other.
const [detBytes, recBytes] = await Promise.all([
  fetchModel('facex_detect.onnx'),
  fetchModel('facex_tiny.onnx'),
]);
const detSess = await ort.InferenceSession.create(detBytes, { executionProviders: ['wasm'] });
const recSess = await ort.InferenceSession.create(recBytes, { executionProviders: ['wasm'] });

// Attach the camera stream and wait until the video reports its dimensions;
// letterboxDet() reads videoWidth/videoHeight.
const video = document.getElementById('v');
video.srcObject = await navigator.mediaDevices.getUserMedia({ video: true });
await new Promise((resolve) => {
  video.onloadedmetadata = resolve;
});

// Off-screen canvases. Both are read back with getImageData() on every frame,
// so ask for a context optimized for frequent readback.
const detCv = new OffscreenCanvas(320, 320);
const detCtx = detCv.getContext('2d', { willReadFrequently: true });
const recCv = new OffscreenCanvas(112, 112);
const recCtx = recCv.getContext('2d', { willReadFrequently: true });

/**
 * Paints the current video frame into the 320x320 detector canvas, keeping
 * the aspect ratio. The frame is anchored at the top-left corner; whatever
 * area it does not cover stays black.
 *
 * @returns {{ scale: number }} The factor the video frame was scaled by.
 */
function letterboxDet() {
  const SIZE = 320;
  const { videoWidth, videoHeight } = video;
  const ratio = Math.min(SIZE / videoWidth, SIZE / videoHeight);

  // Black background first, then the scaled frame on top.
  detCtx.fillStyle = '#000';
  detCtx.fillRect(0, 0, SIZE, SIZE);
  detCtx.drawImage(video, 0, 0, videoWidth * ratio, videoHeight * ratio);

  return { scale: ratio };
}

/**
 * Repacks interleaved 4-byte-per-pixel data (as returned by getImageData)
 * into three consecutive planes of W*H floats, dropping the fourth (alpha)
 * byte of every pixel.
 *
 * @param {ArrayLike<number>} data - Interleaved pixel data, 4 values per pixel.
 * @param {number} W - Image width in pixels.
 * @param {number} H - Image height in pixels.
 * @param {(v: number) => number} norm - Applied to every channel value.
 * @returns {Float32Array} Planar output of length 3 * W * H.
 */
function packCHW(data, W, H, norm) {
  const planeSize = W * H;
  const planes = new Float32Array(planeSize * 3);

  for (let px = 0; px < planeSize; px += 1) {
    const base = px * 4; // 4 values per pixel; index base + 3 is alpha and is skipped
    planes[px] = norm(data[base]);
    planes[planeSize + px] = norm(data[base + 1]);
    planes[2 * planeSize + px] = norm(data[base + 2]);
  }

  return planes;
}

/**
 * Runs the detector on the current video frame: letterbox into the 320x320
 * canvas, normalize to planar float32, run the session, decode the outputs.
 *
 * @returns {Promise<*>} Whatever decodeFacex() returns; per the demo decoder
 *   that is a list of { x1, y1, x2, y2, score } boxes.
 */
async function detectFaces() {
  const SIZE = 320;
  const letterbox = letterboxDet();
  const rgba = detCtx.getImageData(0, 0, SIZE, SIZE).data;
  const normalize = (value) => (value - 127.5) / 128;
  const chw = packCHW(rgba, SIZE, SIZE, normalize);
  const feeds = { input: new ort.Tensor('float32', chw, [1, 3, SIZE, SIZE]) };
  const outputs = await detSess.run(feeds);

  // Decode FCOS-style outputs (cls_p3/4/5 + box_p3/4/5). See demo_mesh.html
  // decodeFacex() for a ready-to-use 80-line decoder with sigmoid + NMS.
  return decodeFacex(outputs, SIZE, SIZE, letterbox.scale);
}

/**
 * Crops a square region around a detected face from the current video frame,
 * resizes it to 112x112 and runs the recognition model on it.
 *
 * @param {{ x1: number, y1: number, x2: number, y2: number }} face - Face box
 *   in video pixel coordinates (it is used directly as the drawImage source
 *   rectangle on the video).
 * @returns {Promise<*>} The `data` of the session's 'embedding' output.
 */
async function getEmbedding(face) {
  // Square crop, 1.3x the longer box edge, centered on the box.
  const side = Math.max(face.x2 - face.x1, face.y2 - face.y1) * 1.3;
  const cx = (face.x1 + face.x2) / 2;
  const cy = (face.y1 + face.y2) / 2;

  // Clear first: when the crop reaches past the video edge, drawImage clips
  // the destination rectangle too, which would otherwise leave pixels from
  // the previous crop in the uncovered area.
  recCtx.fillStyle = '#000';
  recCtx.fillRect(0, 0, 112, 112);
  recCtx.drawImage(video, cx - side / 2, cy - side / 2, side, side, 0, 0, 112, 112);

  const img = recCtx.getImageData(0, 0, 112, 112).data;
  const x = packCHW(img, 112, 112, (v) => (v - 127.5) / 128);
  const o = await recSess.run({ input: new ort.Tensor('float32', x, [1, 3, 112, 112]) });
  return o['embedding'].data;
}

/**
 * Cosine similarity between two equal-length numeric vectors.
 *
 * The dot product is divided by both vector norms, so the result is a true
 * cosine in [-1, 1] even when the inputs are not unit-length. For inputs that
 * are already L2-normalized this equals the plain dot product.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector, same length as `a`.
 * @returns {number} Cosine similarity, or 0 if either vector has zero norm.
 */
function cosSim(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero vector has no direction; report "no similarity" instead of NaN.
  return denom === 0 ? 0 : dot / denom;
}

// Usage: capture once, compare on every frame.
// Keep polling until a face is actually in view; enrolling from an empty
// detection list would pass `undefined` to getEmbedding() and throw.
let faces = await detectFaces();
while (faces.length === 0) {
  await new Promise((resolve) => setTimeout(resolve, 200));
  faces = await detectFaces();
}
const ref = await getEmbedding(faces[0]);

// Skip a tick while the previous comparison is still running so slow devices
// do not pile up overlapping inference calls, and report failures instead of
// leaving unhandled promise rejections.
let isComparing = false;
setInterval(async () => {
  if (isComparing) return;
  isComparing = true;
  try {
    const now = await detectFaces();
    if (!now.length) return;
    const cur = await getEmbedding(now[0]);
    console.log('match similarity:', cosSim(ref, cur).toFixed(3));
  } catch (err) {
    console.error('match failed:', err);
  } finally {
    isComparing = false;
  }
}, 200);
</script>

decodeFacex() and the full preprocessing helpers live in the demo source: wasm/demo_mesh.html. Copy the function as-is; it has no dependencies.

Performance tips

Set ort.env.wasm.numThreads = 4 if your host serves the COOP+COEP headers required for SharedArrayBuffer. The default of 1 thread is the safest choice and still achieves ~25 ms for the full pipeline on a 2020-era laptop.
The 4 recognition variants trade size for accuracy. Use nano (800 KB, 95.6% LFW) for low-bandwidth sites; xs (8.4 MB, 99.07%) for KYC.
executionProviders: ['webgpu', 'wasm'] lights up the WebGPU backend where supported and falls back to WASM otherwise. WebGPU is 3–5× faster on supported devices.

Frameworks

The above runs in any framework. For React, lift the sessions into a top-level useEffect. For Vue/Svelte, do the equivalent: load the sessions once and share them via context or a store. The main startup cost is the one-time decryption plus first-frame warmup (~600 ms); after that, steady-state throughput is limited by the camera frame rate.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Browser Quickstart

Browser Quickstart

Minimal: detect → 512-dim embedding → cosine match

Performance tips

Frameworks

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally