diff --git a/apps/desktop/src-tauri/src/captions.rs b/apps/desktop/src-tauri/src/captions.rs index 0b8d6fe06ae..3c800bfdb38 100644 --- a/apps/desktop/src-tauri/src/captions.rs +++ b/apps/desktop/src-tauri/src/captions.rs @@ -782,6 +782,15 @@ fn process_with_whisper( log::info!(" Segment {i} has {num_tokens} tokens"); + let cap_word_end = |text: &str, start: f32, end: f32| -> f32 { + let max_duration = (text.len() as f32 * 0.1).clamp(0.5, 1.5); + if end - start > max_duration + 0.3 { + start + max_duration + } else { + end + } + }; + let mut current_word = String::new(); let mut word_start: Option = None; let mut word_end: f32 = start_time; @@ -812,16 +821,17 @@ fn process_with_whisper( if !current_word.is_empty() && let Some(ws) = word_start { + let text = current_word.trim().to_string(); + word_end = cap_word_end(&text, ws, word_end); + log::info!( - " -> Completing word: '{}' ({:.2}s - {:.2}s)", - current_word.trim(), - ws, - word_end + " -> Completing word: '{text}' ({ws:.2}s - {word_end:.2}s)" ); words.push(CaptionWord { - text: current_word.trim().to_string(), + text, start: ws, end: word_end, + ..Default::default() }); } current_word = token_text.trim().to_string(); @@ -844,16 +854,16 @@ fn process_with_whisper( if !current_word.trim().is_empty() && let Some(ws) = word_start { + let text = current_word.trim().to_string(); + word_end = cap_word_end(&text, ws, word_end); log::info!( - " -> Final word: '{}' ({:.2}s - {:.2}s)", - current_word.trim(), - ws, - word_end + " -> Final word: '{text}' ({ws:.2}s - {word_end:.2}s)" ); words.push(CaptionWord { - text: current_word.trim().to_string(), + text, start: ws, end: word_end, + ..Default::default() }); } @@ -1000,6 +1010,7 @@ fn process_with_parakeet( text: t.text.trim().to_string(), start: t.start, end: t.end, + ..Default::default() }) .collect(); @@ -1393,6 +1404,7 @@ pub fn parse_captions_json(json: &str) -> Result { const i = segmentIndex; const segment = () => segments()[i()]; + if (!segment()) return null; + const [startHandleDrag, setStartHandleDrag] = createSignal maxDuration + 0.3) { + // Parakeet TDT attaches trailing silence to the END of the word. + // We must cap w.end so the spoken word is preserved at the beginning of the timestamp block, + // exposing the silence AFTER the word. + end = w.start + maxDuration; + } + } + result.push({ text: w.text, - start: w.start, - end: w.end, + start, + end, + storedEnd: w.end, segmentIndex: segIdx, wordIndex: wordIdx, + deleted: w.deleted ?? false, + isFiller: w.isFiller || isFillerWord(w.text), + isPause: w.isPause ?? false, + bufferStart: w.bufferStart ?? 0, + bufferEnd: w.bufferEnd ?? 0, }); } } return result; }); + const fillerCount = createMemo( + () => allWords().filter((w) => w.isFiller && !w.deleted).length, + ); + + const pauseCount = createMemo( + () => allWords().filter((w) => w.isPause && !w.deleted).length, + ); + const segmentGroups = createMemo((): TranscriptSegmentGroup[] => { const words = allWords(); const groups: TranscriptSegmentGroup[] = []; @@ -107,21 +150,9 @@ export function TranscriptPanel() { const activeWordIndex = createMemo(() => { const time = editorState.playbackTime; const words = allWords(); - if (words.length === 0) return -1; - - let lo = 0; - let hi = words.length - 1; - while (lo <= hi) { - const mid = (lo + hi) >>> 1; - if (time >= words[mid].end) { - lo = mid + 1; - } else if (time < words[mid].start) { - hi = mid - 1; - } else { - return mid; - } - } - return -1; + return words.findIndex( + (w) => !w.deleted && time >= w.start && time < w.end, + ); }); const handleWordClick = async (word: FlatWord) => { @@ -148,24 +179,51 @@ export function TranscriptPanel() { const words = allWords(); const wordsToDelete = flatIndices .map((idx) => words[idx]) - .filter((w): w is FlatWord => !!w); + .filter((w): w is FlatWord => !!w && !w.deleted); if (wordsToDelete.length === 0) return; - const sorted = [...wordsToDelete].sort((a, b) => { - if (a.segmentIndex !== b.segmentIndex) - return b.segmentIndex - a.segmentIndex; - return b.wordIndex - a.wordIndex; - }); + const deletingSet = new Set( + wordsToDelete.map((w) => `${w.segmentIndex}:${w.wordIndex}`), + ); + + const orphanedPauses: FlatWord[] = []; + for (const w of words) { + if (!w.isPause || w.deleted) continue; + if (deletingSet.has(`${w.segmentIndex}:${w.wordIndex}`)) continue; + + const flatIdx = words.indexOf(w); + const prev = words[flatIdx - 1]; + const next = words[flatIdx + 1]; + + const prevGone = + !prev || + prev.deleted || + deletingSet.has(`${prev.segmentIndex}:${prev.wordIndex}`); + const nextGone = + !next || + next.deleted || + deletingSet.has(`${next.segmentIndex}:${next.wordIndex}`); + + if (prevGone && nextGone) { + orphanedPauses.push(w); + deletingSet.add(`${w.segmentIndex}:${w.wordIndex}`); + } + } + + const allToDelete = [...wordsToDelete, ...orphanedPauses]; - const timeRanges = wordsToDelete - .map((w) => ({ start: w.start, end: w.end })) + const timeRanges = allToDelete + .map((w) => ({ + start: Math.max(0, w.start - (w.bufferStart || 0)), + end: w.storedEnd + (w.bufferEnd || 0), + })) .sort((a, b) => a.start - b.start); const mergedRanges: { start: number; end: number }[] = []; for (const range of timeRanges) { const last = mergedRanges[mergedRanges.length - 1]; - if (last && range.start <= last.end) { + if (last && range.start <= last.end + 0.001) { last.end = Math.max(last.end, range.end); } else { mergedRanges.push({ ...range }); @@ -176,22 +234,13 @@ export function TranscriptPanel() { produce((p) => { if (!p.captions?.segments) return; - for (const word of sorted) { + for (const word of allToDelete) { const seg = p.captions.segments[word.segmentIndex]; - if (!seg?.words) continue; - if (word.wordIndex < seg.words.length) { - seg.words.splice(word.wordIndex, 1); - } - } - - for (let i = p.captions.segments.length - 1; i >= 0; i--) { - const seg = p.captions.segments[i]; - if (!seg.words || seg.words.length === 0) { - p.captions.segments.splice(i, 1); - } else { - seg.text = getCaptionTextFromWords(seg.words); - seg.start = seg.words[0].start; - seg.end = seg.words[seg.words.length - 1].end; + if (seg?.words) { + const w = seg.words[word.wordIndex] as CaptionWordExtended; + if (w) { + seg.words[word.wordIndex] = { ...w, deleted: true }; + } } } @@ -211,6 +260,28 @@ export function TranscriptPanel() { } } + if (p.timeline) { + cleanupDegenerateClipSegments(p.timeline.segments); + if (p.timeline.zoomSegments) + cleanupDegenerateSegments(p.timeline.zoomSegments); + if (p.timeline.sceneSegments) + cleanupDegenerateSegments(p.timeline.sceneSegments); + if (p.timeline.captionSegments) + cleanupDegenerateSegments(p.timeline.captionSegments); + } + + for (const seg of p.captions.segments) { + const extWords = (seg.words ?? []) as CaptionWordExtended[]; + seg.text = getCaptionTextFromWords(extWords); + if (seg.words && seg.words.length > 0) { + const visible = extWords.filter((w) => !w.deleted); + if (visible.length > 0) { + seg.start = visible[0].start; + seg.end = visible[visible.length - 1].end; + } + } + } + if (p.timeline && p.captions) { p.timeline.captionSegments = createCaptionTrackSegments( p.captions.segments, @@ -230,6 +301,98 @@ export function TranscriptPanel() { } }; + const restoreWords = (flatIndices: number[]) => { + const words = allWords(); + const wordsToRestore = flatIndices + .map((idx) => words[idx]) + .filter((w): w is FlatWord => !!w && w.deleted); + + if (wordsToRestore.length === 0) return; + + setProject( + produce((p) => { + if (!p.captions?.segments) return; + + const sortedByIndex = [...wordsToRestore].sort( + (a, b) => + b.segmentIndex - a.segmentIndex || b.wordIndex - a.wordIndex, + ); + + const chronologicalWords = [...sortedByIndex].reverse(); + + for (const word of chronologicalWords) { + const seg = p.captions.segments[word.segmentIndex]; + if (!seg?.words) continue; + const w = seg.words[word.wordIndex] as CaptionWordExtended; + + const insertDuration = w.end - w.start; + if (insertDuration <= 0.001) continue; + + for (let i = 0; i < p.captions.segments.length; i++) { + const s = p.captions.segments[i]; + if (!s.words) continue; + for (let j = 0; j < s.words.length; j++) { + if ( + i < word.segmentIndex || + (i === word.segmentIndex && j <= word.wordIndex) + ) { + continue; + } + const cw = s.words[j] as CaptionWordExtended; + cw.start += insertDuration; + cw.end += insertDuration; + } + } + + if (p.timeline) { + rippleInsertAllTracks(p.timeline, w.start, insertDuration); + } + } + + for (const word of sortedByIndex) { + const seg = p.captions.segments[word.segmentIndex]; + if (!seg?.words) continue; + const w = seg.words[word.wordIndex] as CaptionWordExtended; + if (w) { + w.deleted = false; + w.bufferStart = 0; + w.bufferEnd = 0; + } + } + + if (p.timeline) { + cleanupDegenerateClipSegments(p.timeline.segments); + if (p.timeline.zoomSegments) + cleanupDegenerateSegments(p.timeline.zoomSegments); + if (p.timeline.sceneSegments) + cleanupDegenerateSegments(p.timeline.sceneSegments); + if (p.timeline.captionSegments) + cleanupDegenerateSegments(p.timeline.captionSegments); + } + + for (const seg of p.captions.segments) { + const extWords = (seg.words ?? []) as CaptionWordExtended[]; + seg.text = getCaptionTextFromWords(extWords); + if (extWords.length > 0) { + const visible = extWords.filter((w) => !w.deleted); + if (visible.length > 0) { + seg.start = visible[0].start; + seg.end = visible[visible.length - 1].end; + } + } + } + + if (p.timeline && p.captions) { + p.timeline.captionSegments = createCaptionTrackSegments( + p.captions.segments, + ); + } + }), + ); + + setEditorState("captions", "isStale", false); + }; + const handleDeleteWord = (flatIndex: number) => { applyWordDeletions([flatIndex]); }; @@ -238,6 +401,207 @@ export function TranscriptPanel() { applyWordDeletions(flatIndices); }; + const handleRestoreWord = (flatIndex: number) => { + restoreWords([flatIndex]); + }; + + const handleRestoreWords = (flatIndices: number[]) => { + restoreWords(flatIndices); + }; + + const [silenceThreshold, setSilenceThreshold] = makePersisted( + createSignal(AUTO_CLEAN_SILENCE_THRESHOLD), + { name: "editorAutoCleanThreshold" }, + ); + + const cleanablePauseCount = createMemo( + () => + allWords().filter( + (w) => + w.isPause && + !w.deleted && + w.storedEnd - w.start >= silenceThreshold(), + ).length, + ); + + const autoClean = () => { + const words = allWords(); + const threshold = silenceThreshold(); + + const keeperWords = words.filter( + (w) => !w.deleted && !w.isFiller && !w.isPause, + ); + + if (keeperWords.length === 0) return; + + setProject( + produce((p) => { + if (!p.captions?.segments) return; + + for (let segIdx = 0; segIdx < p.captions.segments.length; segIdx++) { + const seg = p.captions.segments[segIdx]; + if (!seg.words) continue; + for (let wIdx = 0; wIdx < seg.words.length; wIdx++) { + const w = seg.words[wIdx] as CaptionWordExtended; + if ( + w && + !w.deleted && + (w.isFiller || + isFillerWord(w.text) || + (w.isPause && w.end - w.start >= threshold)) + ) { + seg.words[wIdx] = { ...w, deleted: true }; + } + } + } + + for (const kw of keeperWords) { + const seg = p.captions.segments[kw.segmentIndex]; + if (!seg?.words) continue; + const w = seg.words[kw.wordIndex]; + if (w && !w.deleted) { + const duration = w.end - w.start; + const maxDuration = Math.max( + 0.5, + Math.min(1.5, w.text.length * 0.1), + ); + if (duration > maxDuration + 0.3) { + w.end = w.start + maxDuration; + } + } + } + + const timeRanges: Array<{ start: number; end: number }> = []; + const pauseInsertions: Array<{ + segmentIndex: number; + insertIdx: number; + pauseWord: CaptionWordExtended; + }> = []; + + for (let i = -1; i < keeperWords.length - 1; i++) { + const curr = i >= 0 ? keeperWords[i] : null; + const next = keeperWords[i + 1]; + + const currSeg = curr ? p.captions.segments[curr.segmentIndex] : null; + const currWord = + currSeg?.words && curr ? currSeg.words[curr.wordIndex] : null; + + const nextSeg = p.captions.segments[next.segmentIndex]; + const nextWord = nextSeg?.words?.[next.wordIndex]; + + const gapStart = currWord ? currWord.end : curr ? curr.end : 0; + const gapEnd = nextWord ? nextWord.start : next.start; + const gap = gapEnd - gapStart; + + if (gap < 0.001) continue; + + let hasFillerInGap = false; + for (const w of words) { + if (w.deleted || w.isPause) continue; + if (!w.isFiller) continue; + if (w.start >= gapStart - 0.01 && w.end <= gapEnd + 0.01) { + hasFillerInGap = true; + break; + } + } + + const shouldCut = hasFillerInGap || gap >= threshold; + + if (shouldCut) { + timeRanges.push({ start: gapStart, end: gapEnd }); + + if (gap >= threshold) { + const targetSegIdx = curr ? curr.segmentIndex : next.segmentIndex; + const insertIdx = curr ? curr.wordIndex + 1 : 0; + pauseInsertions.push({ + segmentIndex: targetSegIdx, + insertIdx, + pauseWord: { + text: `[Pause ${gap.toFixed(1)}s]`, + start: gapStart, + end: gapEnd, + deleted: true, + isPause: true, + isFiller: false, + bufferStart: DEFAULT_PAUSE_BUFFER, + bufferEnd: DEFAULT_PAUSE_BUFFER, + }, + }); + } + } + } + + pauseInsertions.sort( + (a, b) => + b.segmentIndex - a.segmentIndex || b.insertIdx - a.insertIdx, + ); + + for (const ins of pauseInsertions) { + const targetSeg = p.captions.segments[ins.segmentIndex]; + if (targetSeg?.words) { + targetSeg.words.splice(ins.insertIdx, 0, ins.pauseWord); + } + } + + timeRanges.sort((a, b) => a.start - b.start); + const mergedRanges: { start: number; end: number }[] = []; + for (const range of timeRanges) { + const last = mergedRanges[mergedRanges.length - 1]; + if (last && range.start <= last.end + 0.001) { + last.end = Math.max(last.end, range.end); + } else { + mergedRanges.push({ ...range }); + } + } + + const reversedRanges = [...mergedRanges].reverse(); + for (const range of reversedRanges) { + const cutDuration = range.end - range.start; + if (cutDuration <= 0.001) continue; + + shiftCaptionTimesAfterCut( + p.captions.segments, + range.start, + cutDuration, + ); + + if (p.timeline) { + rippleDeleteAllTracks(p.timeline, range.start, range.end); + } + } + + if (p.timeline) { + cleanupDegenerateClipSegments(p.timeline.segments); + if (p.timeline.zoomSegments) + cleanupDegenerateSegments(p.timeline.zoomSegments); + if (p.timeline.sceneSegments) + cleanupDegenerateSegments(p.timeline.sceneSegments); + if (p.timeline.captionSegments) + cleanupDegenerateSegments(p.timeline.captionSegments); + } + + for (const seg of p.captions.segments) { + const extWords = (seg.words ?? []) as CaptionWordExtended[]; + seg.text = getCaptionTextFromWords(extWords); + if (seg.words && seg.words.length > 0) { + const visible = extWords.filter((w) => !w.deleted); + if (visible.length > 0) { + seg.start = visible[0].start; + seg.end = visible[visible.length - 1].end; + } + } + } + + if (p.timeline && p.captions) { + p.timeline.captionSegments = createCaptionTrackSegments( + p.captions.segments, + ); + } + }), + ); + setEditorState("captions", "isStale", false); + }; + const isAtEnd = () => { const total = totalDuration(); return total > 0 && total - editorState.playbackTime <= 0.1; @@ -291,11 +655,93 @@ export function TranscriptPanel() { handlePlayPause(); }); + const [showAutoCleanDropdown, setShowAutoCleanDropdown] = createSignal(false); + let autoCleanDropdownRef: HTMLDivElement | undefined; + + createEventListener(document, "mousedown", (e: MouseEvent) => { + if ( + showAutoCleanDropdown() && + autoCleanDropdownRef && + !autoCleanDropdownRef.contains(e.target as Node) + ) { + setShowAutoCleanDropdown(false); + } + }); + return (
Transcript
+ 0 || pauseCount() > 0}> + + {fillerCount() > 0 && + `${fillerCount()} filler${fillerCount() > 1 ? "s" : ""}`} + {fillerCount() > 0 && pauseCount() > 0 && ", "} + {pauseCount() > 0 && + `${pauseCount()} pause${pauseCount() > 1 ? "s" : ""}`} + + + +
+
+ + +
+ +
+
+ Silence Threshold +
+
+ + setSilenceThreshold( + Number.parseFloat(e.currentTarget.value), + ) + } + class="flex-1 h-1 accent-blue-9" + /> + + {silenceThreshold().toFixed(1)}s + +
+ +
+
+
); } +function BufferPopover(props: { + word: FlatWord; + position: { x: number; y: number }; + onClose: () => void; + onBufferChange: ( + segmentIndex: number, + wordIndex: number, + bufferStart: number, + bufferEnd: number, + ) => void; + onRestore: () => void; +}) { + const wordDuration = Math.max(0, props.word.end - props.word.start); + const minBuffer = Number(Math.max(-0.5, -(wordDuration / 2)).toFixed(2)); + + const [bufStart, setBufStart] = createSignal( + Math.max(minBuffer, props.word.bufferStart), + ); + const [bufEnd, setBufEnd] = createSignal( + Math.max(minBuffer, props.word.bufferEnd), + ); + let popoverRef: HTMLDivElement | undefined; + + const handleClickOutside = (e: MouseEvent) => { + if (popoverRef && !popoverRef.contains(e.target as Node)) { + props.onClose(); + } + }; + + const handleEscape = (e: KeyboardEvent) => { + if (e.key === "Escape") props.onClose(); + }; + + createEventListener(document, "mousedown", handleClickOutside); + createEventListener(window, "keydown", handleEscape); + + const updateBuffer = (start: number, end: number) => { + setBufStart(start); + setBufEnd(end); + props.onBufferChange( + props.word.segmentIndex, + props.word.wordIndex, + start, + end, + ); + }; + + const popoverStyle = () => { + const x = Math.min(props.position.x, window.innerWidth - 220); + const y = Math.min(props.position.y, window.innerHeight - 200); + return { + position: "fixed" as const, + left: `${x}px`, + top: `${y}px`, + "z-index": "9999", + }; + }; + + return ( +
+
+
+ + Adjust Buffer + + +
+

+ Buffer around deleted word to preserve pronunciations. +

+ +
+
+
+ Start Buffer + + {bufStart().toFixed(2)}s + +
+ + setBufStart(Number.parseFloat(e.currentTarget.value)) + } + onChange={(e) => + updateBuffer(Number.parseFloat(e.currentTarget.value), bufEnd()) + } + class="w-full h-1 accent-blue-9" + /> +
+
+
+ End Buffer + + {bufEnd().toFixed(2)}s + +
+ + setBufEnd(Number.parseFloat(e.currentTarget.value)) + } + onChange={(e) => + updateBuffer( + bufStart(), + Number.parseFloat(e.currentTarget.value), + ) + } + class="w-full h-1 accent-blue-9" + /> +
+
+ + + + +
+
+ ); +} + function WordWithTooltip(props: { word: FlatWord; isActive: boolean; @@ -341,6 +931,8 @@ function WordWithTooltip(props: { ref: (el: HTMLSpanElement) => void; onClick: (e: MouseEvent) => void; onDelete: () => void; + onRestore: () => void; + onContextMenu: (e: MouseEvent) => void; }) { const [hovering, setHovering] = createSignal(false); let hoverTimer: number | undefined; @@ -361,14 +953,25 @@ function WordWithTooltip(props: { ref={props.ref} class={cx( "cursor-pointer transition-colors duration-100 rounded-xs relative", - props.isSelected && "bg-blue-4/50", - props.isActive - ? "text-blue-11" - : props.isSelected + props.word.deleted + ? "line-through opacity-40 text-red-9 bg-red-3/30" + : props.word.isFiller + ? "border-b-2 border-dotted border-amber-8/80 bg-amber-3/15" + : "", + !props.word.deleted && props.isSelected && "bg-blue-4/50", + props.word.deleted + ? "hover:opacity-60" + : props.isActive ? "text-blue-11" - : "text-gray-9 hover:text-gray-12", + : props.isSelected + ? "text-blue-11" + : "text-gray-9 hover:text-gray-12", )} onClick={(e) => props.onClick(e)} + onContextMenu={(e) => { + e.preventDefault(); + props.onContextMenu(e); + }} onMouseEnter={onEnter} onMouseLeave={onLeave} > @@ -382,16 +985,32 @@ function WordWithTooltip(props: { {formatTimePrecise(props.word.start)} - + } > - - + + @@ -399,6 +1018,46 @@ function WordWithTooltip(props: { ); } +function PauseBadge(props: { + word: FlatWord; + onDelete: () => void; + onContextMenu: (e: MouseEvent) => void; +}) { + const duration = props.word.storedEnd - props.word.start; + return ( + { + e.preventDefault(); + props.onContextMenu(e); + }} + > + ⏸ {duration.toFixed(1)}s + + + ); +} + function TranscriptEditor(props: { segmentGroups: TranscriptSegmentGroup[]; allWords: FlatWord[]; @@ -407,11 +1066,19 @@ function TranscriptEditor(props: { onWordClick: (word: FlatWord) => void; onDeleteWord: (flatIndex: number) => void; onDeleteWords: (flatIndices: number[]) => void; + onRestoreWord: (flatIndex: number) => void; + onRestoreWords: (flatIndices: number[]) => void; }) { + const { editorState, setProject, setEditorState } = useEditorContext(); const [selectedIndices, setSelectedIndices] = createSignal>( new Set(), ); const [anchorIndex, setAnchorIndex] = createSignal(-1); + const [bufferPopover, setBufferPopover] = createSignal<{ + word: FlatWord; + flatIndex: number; + position: { x: number; y: number }; + } | null>(null); let scrollContainerRef: HTMLDivElement | undefined; let activeWordRef: HTMLSpanElement | undefined; @@ -460,10 +1127,21 @@ function TranscriptEditor(props: { if (e.key === "Backspace" || e.key === "Delete") { e.preventDefault(); const indices = [...selected]; - if (indices.length === 1) { - props.onDeleteWord(indices[0]); - } else { - props.onDeleteWords(indices); + const toDelete = indices.filter((i) => !props.allWords[i]?.deleted); + const toRestore = indices.filter((i) => props.allWords[i]?.deleted); + + if (toDelete.length > 0) { + if (toDelete.length === 1) { + props.onDeleteWord(toDelete[0]); + } else { + props.onDeleteWords(toDelete); + } + } else if (toRestore.length > 0) { + if (toRestore.length === 1) { + props.onRestoreWord(toRestore[0]); + } else { + props.onRestoreWords(toRestore); + } } setSelectedIndices(new Set()); setAnchorIndex(-1); @@ -548,6 +1226,157 @@ function TranscriptEditor(props: { setAnchorIndex(-1); }; + const handleWordRestore = (word: FlatWord) => { + const selected = selectedIndices(); + if (selected.size > 1) { + props.onRestoreWords([...selected]); + } else { + props.onRestoreWord(flatIndexOf(word)); + } + setSelectedIndices(new Set()); + setAnchorIndex(-1); + }; + + const handleContextMenu = (word: FlatWord, e: MouseEvent) => { + if (word.deleted || word.isPause) { + setBufferPopover({ + word, + flatIndex: flatIndexOf(word), + position: { x: e.clientX, y: e.clientY }, + }); + } + }; + + const handleBufferChange = async ( + segmentIndex: number, + wordIndex: number, + bufferStart: number, + bufferEnd: number, + ) => { + if (editorState.playing) { + await commands.stopPlayback(); + setEditorState("playing", false); + } + + let appliedDelta = 0; + const currentTime = editorState.playbackTime; + + setProject( + produce((p) => { + if (!p.captions?.segments) return; + const seg = p.captions.segments[segmentIndex]; + if (!seg?.words) return; + const w = seg.words[wordIndex] as CaptionWordExtended; + if (!w) return; + + if (w.deleted) { + const oldCutStart = Math.max(0, w.start - (w.bufferStart || 0)); + const oldCutEnd = w.end + (w.bufferEnd || 0); + const oldDuration = Math.max(0, oldCutEnd - oldCutStart); + + const newCutStart = Math.max(0, w.start - bufferStart); + const newCutEnd = w.end + bufferEnd; + const newDuration = Math.max(0, newCutEnd - newCutStart); + + if (Math.abs(oldDuration - newDuration) < 0.001) { + w.bufferStart = bufferStart; + w.bufferEnd = bufferEnd; + return; + } + + w.bufferStart = bufferStart; + w.bufferEnd = bufferEnd; + if (oldDuration > 0.001) { + for (let i = 0; i < p.captions.segments.length; i++) { + const s = p.captions.segments[i]; + if (!s.words) continue; + for (let j = 0; j < s.words.length; j++) { + if ( + i < segmentIndex || + (i === segmentIndex && j <= wordIndex) + ) { + continue; + } + const cw = s.words[j] as CaptionWordExtended; + cw.start += oldDuration; + cw.end += oldDuration; + } + } + + if (p.timeline) { + rippleInsertAllTracks(p.timeline, oldCutStart, oldDuration); + } + if (currentTime > oldCutStart) { + appliedDelta += oldDuration; + } + } + + if (newDuration > 0.001) { + for (let i = 0; i < p.captions.segments.length; i++) { + const s = p.captions.segments[i]; + if (!s.words) continue; + for (let j = 0; j < s.words.length; j++) { + if ( + i < segmentIndex || + (i === segmentIndex && j <= wordIndex) + ) { + continue; + } + const cw = s.words[j] as CaptionWordExtended; + + cw.start -= newDuration; + cw.end -= newDuration; + } + } + + if (p.timeline) { + rippleDeleteAllTracks(p.timeline, newCutStart, newCutEnd); + } + if (currentTime > newCutStart) { + appliedDelta -= newDuration; + } + } + + if (p.timeline) { + cleanupDegenerateClipSegments(p.timeline.segments); + if (p.timeline.zoomSegments) + cleanupDegenerateSegments(p.timeline.zoomSegments); + if (p.timeline.sceneSegments) + cleanupDegenerateSegments(p.timeline.sceneSegments); + if (p.timeline.captionSegments) + cleanupDegenerateSegments(p.timeline.captionSegments); + } + + for (const s of p.captions.segments) { + const extWords = (s.words ?? []) as CaptionWordExtended[]; + const visible = extWords.filter((vw) => !vw.deleted); + if (visible.length > 0) { + s.start = visible[0].start; + s.end = visible[visible.length - 1].end; + } + } + + if (p.timeline) { + p.timeline.captionSegments = createCaptionTrackSegments( + p.captions.segments, + ); + } + } else { + w.bufferStart = bufferStart; + w.bufferEnd = bufferEnd; + } + }), + ); + setEditorState("captions", "isStale", false); + + if (Math.abs(appliedDelta) > 0.001) { + const newTime = Math.max(0, currentTime + appliedDelta); + setEditorState("playbackTime", newTime); + const frame = Math.max(Math.floor(newTime * FPS), 0); + await commands.seekTo(frame); + } + }; + return (
props.activeWordIndex === flatIdx(); const isSelected = () => selectedIndices().has(flatIdx()); + if (word.isPause) { + return ( + { + if (word.deleted) { + handleWordRestore(word); + } else { + handleWordDelete(word); + } + }} + onContextMenu={(e: MouseEvent) => + handleContextMenu(word, e) + } + /> + ); + } + return ( handleWordSelect(word, e)} onDelete={() => handleWordDelete(word)} + onRestore={() => handleWordRestore(word)} + onContextMenu={(e: MouseEvent) => + handleContextMenu(word, e) + } /> ); }} @@ -598,6 +1449,20 @@ function TranscriptEditor(props: {
+ + {(popover) => ( + setBufferPopover(null)} + onBufferChange={handleBufferChange} + onRestore={() => { + handleWordRestore(popover().word); + setBufferPopover(null); + }} + /> + )} +
); } diff --git a/apps/desktop/src/routes/editor/caption-types.ts b/apps/desktop/src/routes/editor/caption-types.ts new file mode 100644 index 00000000000..4ef410e417b --- /dev/null +++ b/apps/desktop/src/routes/editor/caption-types.ts @@ -0,0 +1,9 @@ +import type { CaptionWord as BaseCaptionWord } from "~/utils/tauri"; + +export interface CaptionWordExtended extends BaseCaptionWord { + deleted?: boolean; + isFiller?: boolean; + isPause?: boolean; + bufferStart?: number; + bufferEnd?: number; +} diff --git a/apps/desktop/src/routes/editor/captions.ts b/apps/desktop/src/routes/editor/captions.ts index 1af027145ca..14ba3004620 100644 --- a/apps/desktop/src/routes/editor/captions.ts +++ b/apps/desktop/src/routes/editor/captions.ts @@ -11,6 +11,8 @@ import { type SegmentRecordings, type TimelineSegment, } from "~/utils/tauri"; +import type { CaptionWordExtended } from "./caption-types"; +import { isFillerWord, PAUSE_DETECTION_THRESHOLD } from "./filler-detection"; export const DEFAULT_CAPTION_MODEL = "best"; export const DEFAULT_WHISPER_CAPTION_MODEL = "small"; export const DEFAULT_CAPTION_LANGUAGE = "auto"; @@ -138,18 +140,20 @@ export function mapCaptionsToEditedTimeline( const mappedCaptionSegments = mappings.flatMap((mapping) => { if (caption.words && caption.words.length > 0) { const mappedWords = caption.words.flatMap((word) => { - const wordMapped = mapTimeRangeWithinMapping( - word.start, - word.end, - mapping, - ); + const w = word as CaptionWordExtended; + const wordMapped = mapTimeRangeWithinMapping(w.start, w.end, mapping); return wordMapped ? [ { - text: word.text, + text: w.text, start: wordMapped.start, end: wordMapped.end, + deleted: w.deleted ?? false, + isFiller: w.isFiller || isFillerWord(w.text), + isPause: w.isPause ?? false, + bufferStart: w.bufferStart ?? 0, + bufferEnd: w.bufferEnd ?? 0, }, ] : []; @@ -209,19 +213,116 @@ export function mapCaptionsToEditedTimeline( export function createCaptionTrackSegments( segments: CaptionSegment[], ): CaptionTrackSegment[] { - return segments.map((segment) => ({ - id: segment.id, - start: segment.start, - end: segment.end, - text: segment.text, - words: segment.words ?? [], - fadeDurationOverride: null, - lingerDurationOverride: null, - positionOverride: null, - colorOverride: null, - backgroundColorOverride: null, - fontSizeOverride: null, - })); + return segments.map((segment) => { + const words = (segment.words ?? []) as CaptionWordExtended[]; + const visibleText = words.some((w) => w.deleted || w.isPause) + ? words + .filter((w) => !w.deleted && !w.isPause) + .map((w) => w.text.trim()) + .filter((t) => t.length > 0) + .join(" ") + : segment.text; + return { + id: segment.id, + start: segment.start, + end: segment.end, + text: visibleText, + words, + fadeDurationOverride: null, + lingerDurationOverride: null, + positionOverride: null, + colorOverride: null, + backgroundColorOverride: null, + fontSizeOverride: null, + }; + }); +} + +function cappedWordEnd(word: CaptionWord): number { + const duration = word.end - word.start; + const maxDuration = Math.max(0.5, Math.min(1.5, word.text.length * 0.1)); + if (duration > maxDuration + 0.3) { + return word.start + maxDuration; + } + return word.end; +} + +function insertPauseWordsIntoSegments(segments: CaptionSegment[]): void { + const allWords: { segIdx: number; wIdx: number; word: CaptionWord }[] = []; + for (let s = 0; s < segments.length; s++) { + const ws = (segments[s].words ?? []) as CaptionWordExtended[]; + for (let w = 0; w < ws.length; w++) { + if (ws[w].isPause) continue; + allWords.push({ segIdx: s, wIdx: w, word: ws[w] }); + } + } + + const insertions: { + segIdx: number; + afterWIdx: number; + pause: CaptionWordExtended; + }[] = []; + + if (allWords.length > 0) { + const first = allWords[0]; + const hasPauseBefore = + first.wIdx > 0 && + (segments[first.segIdx].words?.[first.wIdx - 1] as CaptionWordExtended) + ?.isPause; + if (first.word.start >= PAUSE_DETECTION_THRESHOLD && !hasPauseBefore) { + insertions.push({ + segIdx: first.segIdx, + afterWIdx: first.wIdx - 1, + pause: { + text: `[Pause ${first.word.start.toFixed(1)}s]`, + start: 0, + end: first.word.start, + isPause: true, + isFiller: false, + }, + }); + } + } + + for (let i = 1; i < allWords.length; i++) { + const prev = allWords[i - 1]; + const curr = allWords[i]; + const prevEnd = cappedWordEnd(prev.word); + const gap = curr.word.start - prevEnd; + if (gap < PAUSE_DETECTION_THRESHOLD) continue; + + let alreadyHasPause = false; + if (prev.segIdx === curr.segIdx) { + const ws = (segments[prev.segIdx].words ?? []) as CaptionWordExtended[]; + for (let j = prev.wIdx + 1; j < curr.wIdx; j++) { + if (ws[j]?.isPause) { + alreadyHasPause = true; + break; + } + } + } + if (alreadyHasPause) continue; + + insertions.push({ + segIdx: prev.segIdx, + afterWIdx: prev.wIdx, + pause: { + text: `[Pause ${gap.toFixed(1)}s]`, + start: prevEnd, + end: curr.word.start, + isPause: true, + isFiller: false, + }, + }); + } + + for (let i = insertions.length - 1; i >= 0; i--) { + const ins = insertions[i]; + const seg = segments[ins.segIdx]; + if (seg.words) { + seg.words.splice(ins.afterWIdx + 1, 0, ins.pause); + } + } } export function applyCaptionResultToProject< @@ -276,6 +377,8 @@ export function applyCaptionResultToProject< recordingSegments, ); + insertPauseWordsIntoSegments(mappedSegments); + captions.segments = mappedSegments; timeline.captionSegments = createCaptionTrackSegments(mappedSegments); } @@ -301,8 +404,9 @@ export async function transcribeEditorCaptions( return await commands.transcribeAudio(videoPath, modelPath, language, engine); } -export function getCaptionTextFromWords(words: CaptionWord[]) { +export function getCaptionTextFromWords(words: CaptionWordExtended[]) { return words + .filter((word) => !word.deleted && !word.isPause) .map((word) => word.text.trim()) .filter((word) => word.length > 0) .join(" "); @@ -410,7 +514,18 @@ if (import.meta.vitest) { start: 0.4, end: 0.6, text: "hello", - words: [{ text: "hello", start: 0.4, end: 0.6 }], + words: [ + { + text: "hello", + start: 0.4, + end: 0.6, + deleted: false, + isFiller: false, + isPause: false, + bufferStart: 0, + bufferEnd: 0, + }, + ], }); expect(result[1]?.id).toBe("caption-1"); expect(result[1]?.text).toBe("world"); diff --git a/apps/desktop/src/routes/editor/filler-detection.ts b/apps/desktop/src/routes/editor/filler-detection.ts new file mode 100644 index 00000000000..f1d4b613265 --- /dev/null +++ b/apps/desktop/src/routes/editor/filler-detection.ts @@ -0,0 +1,106 @@ +const FILLER_WORDS = new Set(["uh", "um", "ah", "er", "hmm", "mhm"]); + +export function isFillerWord(text: string): boolean { + const normalized = text.toLowerCase().replace(/[^a-z]/g, ""); + return FILLER_WORDS.has(normalized); +} + +export const PAUSE_DETECTION_THRESHOLD = 0.5; +export const AUTO_CLEAN_SILENCE_THRESHOLD = 1.5; +export const DEFAULT_PAUSE_BUFFER = 0.0; + +export interface PauseEntry { + text: string; + start: number; + end: number; + duration: number; + deleted: boolean; + isPause: true; + isFiller: false; + bufferStart: number; + bufferEnd: number; + segmentIndex: number; + afterWordIndex: number; +} + +export function detectPauses( + words: Array<{ + start: number; + end: number; + segmentIndex: number; + wordIndex: number; + }>, + threshold = PAUSE_DETECTION_THRESHOLD, +): PauseEntry[] { + const pauses: PauseEntry[] = []; + for (let i = 1; i < words.length; i++) { + const prev = words[i - 1]; + const curr = words[i]; + const gap = curr.start - prev.end; + if (gap >= threshold) { + pauses.push({ + text: `[Pause ${gap.toFixed(1)}s]`, + start: prev.end, + end: curr.start, + duration: gap, + deleted: false, + isPause: true, + isFiller: false, + bufferStart: DEFAULT_PAUSE_BUFFER, + bufferEnd: DEFAULT_PAUSE_BUFFER, + segmentIndex: prev.segmentIndex, + afterWordIndex: prev.wordIndex, + }); + } + } + return pauses; +} + +if (import.meta.vitest) { + const { describe, expect, it } = import.meta.vitest; + + describe("isFillerWord", () => { + it("detects basic filler words", () => { + expect(isFillerWord("uh")).toBe(true); + expect(isFillerWord("um")).toBe(true); + expect(isFillerWord("hmm")).toBe(true); + }); + + it("ignores punctuation and capitalization", () => { + expect(isFillerWord("Uh,")).toBe(true); + expect(isFillerWord("UM...")).toBe(true); + expect(isFillerWord("er?")).toBe(true); + }); + + it("returns false for non-filler words", () => { + expect(isFillerWord("hello")).toBe(false); + expect(isFillerWord("the")).toBe(false); + expect(isFillerWord("under")).toBe(false); + }); + }); + + describe("detectPauses", () => { + it("detects pauses above threshold", () => { + const words = [ + { start: 0, end: 1.0, segmentIndex: 0, wordIndex: 0 }, + { start: 1.6, end: 2.0, segmentIndex: 0, wordIndex: 1 }, + ]; + const pauses = detectPauses(words, 0.5); + expect(pauses).toHaveLength(1); + expect(pauses[0].duration).toBeCloseTo(0.6); + expect(pauses[0].start).toBe(1.0); + expect(pauses[0].end).toBe(1.6); + expect(pauses[0].isPause).toBe(true); + expect(pauses[0].text).toBe("[Pause 0.6s]"); + }); + + it("ignores pauses below threshold", () => { + const words = [ + { start: 0, end: 1.0, segmentIndex: 0, wordIndex: 0 }, + { start: 1.2, end: 2.0, segmentIndex: 0, wordIndex: 1 }, + ]; + const pauses = detectPauses(words, 0.5); + expect(pauses).toHaveLength(0); + }); + }); +} diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index b1c1ecd8bd6..1253e50731c 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -11,25 +11,71 @@ export function shiftCaptionTimesAfterCut( segments: Array<{ start: number; end: number; - words?: Array<{ start: number; end: number }>; + words?: Array<{ + start: number; + end: number; + deleted?: boolean; + isPause?: boolean; + }>; }>, cutStart: number, cutDuration: number, ) { - for (const seg of segments) { + for (let i = 0; i < segments.length; i++) { + const seg = segments[i]; if (seg.words) { - for (const w of seg.words) { - w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); - w.end = shiftTimeAfterCut(w.end, cutStart, cutDuration); + for (let j = 0; j < seg.words.length; j++) { + const w = seg.words[j]; + + if (w.deleted || w.isPause) { + if ( + w.start >= cutStart - 0.001 && + w.end <= cutStart + cutDuration + 0.001 + ) { + continue; + } + const duration = w.end - w.start; + w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); + w.end = w.start + duration; + } else { + w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); + w.end = shiftTimeAfterCut(w.end, cutStart, cutDuration); + } } if (seg.words.length > 0) { - seg.start = seg.words[0].start; - seg.end = seg.words[seg.words.length - 1].end; + const visible = seg.words.filter((w) => !w.deleted); + if (visible.length > 0) { + seg.start = visible[0].start; + seg.end = visible[visible.length - 1].end; + } } } } } +const SEGMENT_EPSILON = 0.001; + +export function cleanupDegenerateSegments( + segments: Array<{ start: number; end: number }>, +) { + for (let i = segments.length - 1; i >= 0; i--) { + if (segments[i].end - segments[i].start < SEGMENT_EPSILON) { + segments.splice(i, 1); + } + } +} + +export function cleanupDegenerateClipSegments( + segments: Array<{ timescale: number; start: number; end: number }>, +) { + for (let i = segments.length - 1; i >= 0; i--) { + const seg = segments[i]; + if ((seg.end - seg.start) / seg.timescale < SEGMENT_EPSILON) { + segments.splice(i, 1); + } + } +} + export function rippleDeleteFromTrack( segments: Array<{ start: number; end: number }>, cutStart: number, @@ -55,6 +101,7 @@ export function rippleDeleteFromTrack( seg.end -= cutDuration; } } + cleanupDegenerateSegments(segments); } export function cutClipSegmentsForRange( @@ -90,7 +137,12 @@ export function cutClipSegmentsForRange( editedOffset += duration; } - if (startSegIdx === -1 || endSegIdx === -1) return; + if (startSegIdx === -1) return; + + if (endSegIdx === -1) { + endSegIdx = segments.length - 1; + endRelative = segments[endSegIdx].end - segments[endSegIdx].start; + } if (startSegIdx === endSegIdx) { const seg = segments[startSegIdx]; @@ -122,6 +174,7 @@ export function cutClipSegmentsForRange( segments.splice(idx, 1); } } + cleanupDegenerateClipSegments(segments); } export function rippleDeleteAllTracks( @@ -151,3 +204,315 @@ export function rippleDeleteAllTracks( if (timeline.keyboardSegments) rippleDeleteFromTrack(timeline.keyboardSegments, cutStart, cutEnd); } + +export function shiftTimeAfterInsert( + time: number, + insertPoint: number, + duration: number, +): number { + if (time < insertPoint - 0.001) return time; + return time + duration; +} + +export function shiftCaptionTimesAfterInsert( + segments: Array<{ + start: number; + end: number; + words?: Array<{ start: number; end: number; deleted?: boolean }>; + }>, + insertPoint: number, + duration_arg: number, +) { + for (let i = 0; i < segments.length; i++) { + const seg = segments[i]; + if (seg.words) { + for (let j = 0; j < seg.words.length; j++) { + const w = seg.words[j]; + + if (w.deleted) { + if ( + w.start >= insertPoint - 0.001 && + w.end <= insertPoint + duration_arg + 0.001 + ) { + continue; + } + } + const duration = w.end - w.start; + w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); + w.end = w.start + duration; + } + if (seg.words.length > 0) { + const visible = seg.words.filter((w) => !w.deleted); + if (visible.length > 0) { + seg.start = visible[0].start; + seg.end = visible[visible.length - 1].end; + } + } + } + } +} + +export function rippleInsertIntoTrack( + segments: Array<{ start: number; end: number }>, + insertPoint: number, + duration: number, +) { + for (const seg of segments) { + if (seg.start >= insertPoint) { + seg.start += duration; + seg.end += duration; + } else if (seg.end > insertPoint) { + seg.end += duration; + } + } + cleanupDegenerateSegments(segments); +} + +export function insertClipSegmentForRange( + segments: Array<{ timescale: number; start: number; end: number }>, + insertPoint: number, + duration: number, +) { + let editedOffset = 0; + for (let i = 0; i < segments.length; i++) { + const seg = segments[i]; + const segDuration = (seg.end - seg.start) / seg.timescale; + const segEditedEnd = editedOffset + segDuration; + + if (insertPoint <= segEditedEnd) { + const relativeInSeg = (insertPoint - editedOffset) * seg.timescale; + const splitPoint = seg.start + relativeInSeg; + const insertAmount = duration * seg.timescale; + + if (splitPoint <= seg.start + 0.001) { + seg.start = Math.max(0, seg.start - insertAmount); + } else if (splitPoint >= seg.end - 0.001) { + seg.end += insertAmount; + } else { + const originalEnd = seg.end; + seg.end = splitPoint; + const insertedSeg = { + timescale: seg.timescale, + start: splitPoint, + end: splitPoint + insertAmount, + }; + const afterSeg = { + timescale: seg.timescale, + start: splitPoint + insertAmount, + end: originalEnd + insertAmount, + }; + segments.splice(i + 1, 0, insertedSeg, afterSeg); + } + return; + } + editedOffset += segDuration; + } + + if (segments.length > 0) { + const lastSeg = segments[segments.length - 1]; + lastSeg.end += duration * lastSeg.timescale; + } + cleanupDegenerateClipSegments(segments); +} + +export function rippleInsertAllTracks( + timeline: { + segments: Array<{ timescale: number; start: number; end: number }>; + zoomSegments?: Array<{ start: number; end: number }> | null; + sceneSegments?: Array<{ start: number; end: number }> | null; + maskSegments?: Array<{ start: number; end: number }> | null; + textSegments?: Array<{ start: number; end: number }> | null; + captionSegments?: Array<{ start: number; end: number }> | null; + keyboardSegments?: Array<{ start: number; end: number }> | null; + }, + insertPoint: number, + duration: number, +) { + insertClipSegmentForRange(timeline.segments, insertPoint, duration); + if (timeline.zoomSegments) + rippleInsertIntoTrack(timeline.zoomSegments, insertPoint, duration); + if (timeline.sceneSegments) + rippleInsertIntoTrack(timeline.sceneSegments, insertPoint, duration); + if (timeline.maskSegments) + rippleInsertIntoTrack(timeline.maskSegments, insertPoint, duration); + if (timeline.textSegments) + rippleInsertIntoTrack(timeline.textSegments, insertPoint, duration); + if (timeline.captionSegments) + rippleInsertIntoTrack(timeline.captionSegments, insertPoint, duration); + if (timeline.keyboardSegments) + rippleInsertIntoTrack(timeline.keyboardSegments, insertPoint, duration); +} + +if (import.meta.vitest) { + const { describe, expect, it } = import.meta.vitest; + + describe("shiftTimeAfterCut", () => { + it("does not shift time before the cut", () => { + expect(shiftTimeAfterCut(1, 2, 1)).toBe(1); + }); + it("snaps time inside the cut to the start of the cut", () => { + expect(shiftTimeAfterCut(2.5, 2, 1)).toBe(1.5); + }); + it("shifts time after the cut by the cut duration", () => { + expect(shiftTimeAfterCut(4, 2, 1)).toBe(3); + }); + }); + + describe("shiftCaptionTimesAfterCut", () => { + it("shifts regular words and adjusts segment bounds", () => { + const segments = [ + { + start: 2, + end: 4, + words: [ + { start: 2, end: 3, deleted: false }, + { start: 3, end: 4, deleted: false }, + ], + }, + ]; + // Cut from 1 to 2 (duration 1). Both words start at >= 2, so they shift by -1. + shiftCaptionTimesAfterCut(segments, 1, 1); + expect(segments[0].words?.[0].start).toBe(1); + expect(segments[0].words?.[0].end).toBe(2); + expect(segments[0].words?.[1].start).toBe(2); + expect(segments[0].words?.[1].end).toBe(3); + expect(segments[0].start).toBe(1); + expect(segments[0].end).toBe(3); + }); + + it("handles deleted words by keeping their timings if they fall strictly inside the cut", () => { + const segments = [ + { + start: 0, + end: 4, + words: [ + { start: 1, end: 2, deleted: true }, + { start: 2, end: 3, deleted: true }, + { start: 3, end: 4, deleted: false }, + ], + }, + ]; + // Cut from 1 to 3 (duration 2). + shiftCaptionTimesAfterCut(segments, 1, 2); + + // Word 1: inside cut -> continues without shifting + expect(segments[0].words?.[0].start).toBe(1); + expect(segments[0].words?.[0].end).toBe(2); + + // Word 2: inside cut -> continues without shifting + expect(segments[0].words?.[1].start).toBe(2); + expect(segments[0].words?.[1].end).toBe(3); + + // Word 3: after cut -> shifts left by 2 + expect(segments[0].words?.[2].start).toBe(1); + expect(segments[0].words?.[2].end).toBe(2); + + // visible start/end based on undeleted words (Word 3 is the only one) + expect(segments[0].start).toBe(1); + expect(segments[0].end).toBe(2); + }); + }); + + describe("shiftTimeAfterInsert", () => { + it("shifts time after insertion point", () => { + expect(shiftTimeAfterInsert(3, 2, 1)).toBe(4); + expect(shiftTimeAfterInsert(1, 2, 1)).toBe(1); + }); + }); + + describe("cleanupDegenerateSegments", () => { + it("removes zero-duration segments", () => { + const segments = [ + { start: 0, end: 1 }, + { start: 1, end: 1 }, + { start: 1, end: 2 }, + ]; + cleanupDegenerateSegments(segments); + expect(segments).toEqual([ + { start: 0, end: 1 }, + { start: 1, end: 2 }, + ]); + }); + + it("removes near-zero segments below epsilon", () => { + const segments = [ + { start: 0, end: 1 }, + { start: 1, end: 1.0005 }, + { start: 1.0005, end: 2 }, + ]; + cleanupDegenerateSegments(segments); + expect(segments).toEqual([ + { start: 0, end: 1 }, + { start: 1.0005, end: 2 }, + ]); + }); + }); + + describe("cleanupDegenerateClipSegments", () => { + it("removes zero-duration clip segments", () => { + const segments = [ + { timescale: 1, start: 0, end: 1 }, + { timescale: 1, start: 1, end: 1 }, + { timescale: 1, start: 1, end: 2 }, + ]; + cleanupDegenerateClipSegments(segments); + expect(segments).toEqual([ + { timescale: 1, start: 0, end: 1 }, + { timescale: 1, start: 1, end: 2 }, + ]); + }); + + it("accounts for timescale when checking duration", () => { + const segments = [ + { timescale: 2, start: 0, end: 0.001 }, + { timescale: 1, start: 0, end: 0.002 }, + ]; + cleanupDegenerateClipSegments(segments); + expect(segments).toEqual([{ timescale: 1, start: 0, end: 0.002 }]); + }); + }); + + describe("rippleDeleteFromTrack cleanup", () => { + it("removes segments that become zero-duration after trimming", () => { + const segments = [ + { start: 0, end: 1 }, + { start: 1, end: 1.5 }, + { start: 1.5, end: 3 }, + ]; + rippleDeleteFromTrack(segments, 1, 1.5); + const hasDegenerateSegments = segments.some( + (s) => s.end - s.start < 0.001, + ); + expect(hasDegenerateSegments).toBe(false); + expect(segments.length).toBe(2); + }); + }); + + describe("cutClipSegmentsForRange cleanup", () => { + it("does not leave zero-duration segments after cutting", () => { + const segments = [ + { timescale: 1, start: 0, end: 2 }, + { timescale: 1, start: 2, end: 4 }, + ]; + cutClipSegmentsForRange(segments, 1.999, 2.001); + const hasDegenerateSegments = segments.some( + (s) => (s.end - s.start) / s.timescale < 0.001, + ); + expect(hasDegenerateSegments).toBe(false); + }); + + it("handles cutting at exact segment boundaries", () => { + const segments = [ + { timescale: 1, start: 0, end: 1 }, + { timescale: 1, start: 1, end: 2 }, + { timescale: 1, start: 2, end: 3 }, + ]; + cutClipSegmentsForRange(segments, 1, 2); + const hasDegenerateSegments = segments.some( + (s) => (s.end - s.start) / s.timescale < 0.001, + ); + expect(hasDegenerateSegments).toBe(false); + expect(segments.length).toBe(2); + }); + }); +} diff --git a/apps/desktop/src/utils/tauri.ts b/apps/desktop/src/utils/tauri.ts index 2160787f708..3e8906a468b 100644 --- a/apps/desktop/src/utils/tauri.ts +++ b/apps/desktop/src/utils/tauri.ts @@ -527,7 +527,7 @@ export type CaptionData = { segments: CaptionSegment[]; settings: CaptionSetting export type CaptionSegment = { id: string; start: number; end: number; text: string; words?: CaptionWord[] } export type CaptionSettings = { enabled: boolean; font: string; size: number; color: string; backgroundColor: string; backgroundOpacity: number; position: string; italic: boolean; fontWeight: number; outline: boolean; outlineColor: string; exportWithSubtitles: boolean; highlightColor: string; fadeDuration: number; lingerDuration: number; wordTransitionDuration: number; activeWordHighlight: boolean } export type CaptionTrackSegment = { id: string; start: number; end: number; text: string; words?: CaptionWord[]; fadeDurationOverride?: number | null; lingerDurationOverride?: number | null; positionOverride?: string | null; colorOverride?: string | null; backgroundColorOverride?: string | null; fontSizeOverride?: number | null } -export type CaptionWord = { text: string; start: number; end: number } +export type CaptionWord = { text: string; start: number; end: number; deleted?: boolean; isFiller?: boolean; isPause?: boolean; bufferStart?: number; bufferEnd?: number } export type CaptionsData = { segments: CaptionSegment[]; settings: CaptionSettings } export type CaptureDisplay = { id: DisplayId; name: string; refresh_rate: number } export type CaptureDisplayWithThumbnail = { id: DisplayId; name: string; refresh_rate: number; thumbnail: string | null } diff --git a/crates/project/src/configuration.rs b/crates/project/src/configuration.rs index de23b80c91a..040941f8477 100644 --- a/crates/project/src/configuration.rs +++ b/crates/project/src/configuration.rs @@ -933,6 +933,16 @@ pub struct CaptionWord { pub text: String, pub start: f32, pub end: f32, + #[serde(default)] + pub deleted: bool, + #[serde(default)] + pub is_filler: bool, + #[serde(default)] + pub is_pause: bool, + #[serde(default)] + pub buffer_start: f32, + #[serde(default)] + pub buffer_end: f32, } #[derive(Type, Serialize, Deserialize, Clone, Debug, Default)] diff --git a/crates/rendering/src/layers/captions.rs b/crates/rendering/src/layers/captions.rs index 15ec581e9d8..94055420f4b 100644 --- a/crates/rendering/src/layers/captions.rs +++ b/crates/rendering/src/layers/captions.rs @@ -396,12 +396,24 @@ impl CaptionsLayer { active.segment.end as f32, ); - let raw_caption_text = self.current_text.clone().unwrap_or_default(); + let raw_caption_text = if active.segment.words.iter().any(|w| w.deleted) { + active + .segment + .words + .iter() + .filter(|w| !w.deleted) + .map(|w| w.text.as_str()) + .collect::>() + .join(" ") + } else { + self.current_text.clone().unwrap_or_default() + }; let caption_text = wrap_text_by_words(&raw_caption_text, MAX_WORDS_PER_LINE); let caption_words: Vec = active .segment .words .iter() + .filter(|w| !w.deleted) .map(|w| CaptionWord { text: w.text.clone(), start: w.start,