Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions internal/storage/qdrant.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package storage
import (
"context"
"fmt"
"strings"

"github.com/qdrant/go-client/qdrant"
)
Expand Down Expand Up @@ -515,6 +516,53 @@ func (c *QdrantClient) DeleteByFilter(ctx context.Context, key, value string) er
return nil
}

// DeleteByPrefix deletes all vectors where the payload field `key` starts with `prefix`.
// Uses Scroll to find matching points (client-side prefix filter) then batch deletes them.
// Returns the number of deleted points.
func (c *QdrantClient) DeleteByPrefix(ctx context.Context, key, prefix string) (int, error) {
// Single-pass scroll: request all points with only the target payload field.
// Workspace collections typically have <10k points, so a single pass is fine.
// If pagination is needed in the future, the Scroll Offset field can be used.
results, err := c.client.Scroll(ctx, &qdrant.ScrollPoints{
CollectionName: c.config.Collection,
Limit: qdrant.PtrOf(uint32(50000)),
WithPayload: qdrant.NewWithPayloadInclude(key),
})
Comment on lines +526 to +530
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The Scroll method in the Qdrant Go SDK returns three values: ([]*RetrievedPoint, *PointId, error). The current implementation only captures two, which will cause a compilation error. Additionally, the code should handle pagination by checking the returned nextPageOffset and looping until all matching points are retrieved, rather than assuming all points fit within a single 50,000-limit request.

var allMatchingIDs []*qdrant.PointId
var offset *qdrant.PointId

for {
	results, nextOffset, err := c.client.Scroll(ctx, &qdrant.ScrollPoints{
		CollectionName: c.config.Collection,
		Limit:          qdrant.PtrOf(uint32(10000)),
		WithPayload:    qdrant.NewWithPayloadInclude(key),
		Offset:         offset,
	})
	if err != nil {
		return 0, fmt.Errorf("scroll for prefix %q failed: %w", prefix, err)
	}

	for _, point := range results {
		if val, ok := point.Payload[key]; ok {
			if strings.HasPrefix(val.GetStringValue(), prefix) {
				allMatchingIDs = append(allMatchingIDs, point.Id)
			}
		}
	}

	if nextOffset == nil || len(results) == 0 {
		break
	}
	offset = nextOffset
}

if err != nil {
return 0, fmt.Errorf("scroll for prefix %q failed: %w", prefix, err)
}

var allMatchingIDs []*qdrant.PointId
for _, point := range results {
if val, ok := point.Payload[key]; ok {
if strings.HasPrefix(val.GetStringValue(), prefix) {
allMatchingIDs = append(allMatchingIDs, point.Id)
}
}
}

if len(allMatchingIDs) == 0 {
return 0, nil
}

// Batch delete all matching point IDs
_, err = c.client.Delete(ctx, &qdrant.DeletePoints{
CollectionName: c.config.Collection,
Points: &qdrant.PointsSelector{
PointsSelectorOneOf: &qdrant.PointsSelector_Points{
Points: &qdrant.PointsIdsList{
Ids: allMatchingIDs,
},
},
},
})
if err != nil {
return 0, fmt.Errorf("batch delete of %d points by prefix %q failed: %w", len(allMatchingIDs), prefix, err)
}

return len(allMatchingIDs), nil
Comment on lines +520 to +563
Copy link

Copilot AI Apr 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DeleteByPrefix only scrolls a single page (Limit=50000) and doesn't paginate, so collections larger than the limit will not be fully cleaned up even though the method claims to delete “all vectors” with the prefix. Consider iterating with scroll offsets until exhaustion (or lowering the claim in the docstring / returning partial-deletion info), and deleting in batches if the ID list grows large.

Suggested change
// Uses Scroll to find matching points (client-side prefix filter) then batch deletes them.
// Returns the number of deleted points.
func (c *QdrantClient) DeleteByPrefix(ctx context.Context, key, prefix string) (int, error) {
// Single-pass scroll: request all points with only the target payload field.
// Workspace collections typically have <10k points, so a single pass is fine.
// If pagination is needed in the future, the Scroll Offset field can be used.
results, err := c.client.Scroll(ctx, &qdrant.ScrollPoints{
CollectionName: c.config.Collection,
Limit: qdrant.PtrOf(uint32(50000)),
WithPayload: qdrant.NewWithPayloadInclude(key),
})
if err != nil {
return 0, fmt.Errorf("scroll for prefix %q failed: %w", prefix, err)
}
var allMatchingIDs []*qdrant.PointId
for _, point := range results {
if val, ok := point.Payload[key]; ok {
if strings.HasPrefix(val.GetStringValue(), prefix) {
allMatchingIDs = append(allMatchingIDs, point.Id)
}
}
}
if len(allMatchingIDs) == 0 {
return 0, nil
}
// Batch delete all matching point IDs
_, err = c.client.Delete(ctx, &qdrant.DeletePoints{
CollectionName: c.config.Collection,
Points: &qdrant.PointsSelector{
PointsSelectorOneOf: &qdrant.PointsSelector_Points{
Points: &qdrant.PointsIdsList{
Ids: allMatchingIDs,
},
},
},
})
if err != nil {
return 0, fmt.Errorf("batch delete of %d points by prefix %q failed: %w", len(allMatchingIDs), prefix, err)
}
return len(allMatchingIDs), nil
// Uses paginated Scroll requests to find matching points (client-side prefix filter)
// and deletes matches in batches. Returns the number of deleted points.
func (c *QdrantClient) DeleteByPrefix(ctx context.Context, key, prefix string) (int, error) {
const scrollPageSize uint32 = 1000
const deleteBatchSize = 1000
var totalDeleted int
var offset *qdrant.PointId
for {
results, err := c.client.Scroll(ctx, &qdrant.ScrollPoints{
CollectionName: c.config.Collection,
Limit: qdrant.PtrOf(scrollPageSize),
Offset: offset,
WithPayload: qdrant.NewWithPayloadInclude(key),
})
if err != nil {
return totalDeleted, fmt.Errorf("scroll for prefix %q failed: %w", prefix, err)
}
if len(results) == 0 {
break
}
var matchingIDs []*qdrant.PointId
for _, point := range results {
if point == nil || point.Id == nil {
continue
}
if val, ok := point.Payload[key]; ok && strings.HasPrefix(val.GetStringValue(), prefix) {
matchingIDs = append(matchingIDs, point.Id)
}
}
for start := 0; start < len(matchingIDs); start += deleteBatchSize {
end := start + deleteBatchSize
if end > len(matchingIDs) {
end = len(matchingIDs)
}
_, err = c.client.Delete(ctx, &qdrant.DeletePoints{
CollectionName: c.config.Collection,
Points: &qdrant.PointsSelector{
PointsSelectorOneOf: &qdrant.PointsSelector_Points{
Points: &qdrant.PointsIdsList{
Ids: matchingIDs[start:end],
},
},
},
})
if err != nil {
return totalDeleted, fmt.Errorf("batch delete of %d points by prefix %q failed: %w", end-start, prefix, err)
}
totalDeleted += end - start
}
if len(results) < int(scrollPageSize) {
break
}
lastPoint := results[len(results)-1]
if lastPoint == nil || lastPoint.Id == nil {
break
}
offset = lastPoint.Id
}
return totalDeleted, nil

Copilot uses AI. Check for mistakes.
}

// Close closes the Qdrant client connection
func (c *QdrantClient) Close() error {
if c.client != nil {
Expand Down
45 changes: 37 additions & 8 deletions internal/workspace/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -634,13 +634,43 @@ func (m *Manager) IndexLanguage(ctx context.Context, info *Info, language string
}

// Check for deleted files (both code and docs)
// We scan the state and check if files still exist in current scan
// But scan only has current files.
// Better: iterate state.Files and check if they exist on disk.
state.mu.RLock()
for path := range state.Files {
if _, err := os.Stat(path); os.IsNotExist(err) {
// It's deleted. Determine if it was code or doc based on extension
// Use directory-aware cleanup: detect the highest deleted directory and
// bulk-remove all vectors under it rather than deleting one by one.
staleFiles := CollectStaleFiles(state)

if len(staleFiles) > 0 {
dirPrefixes, individualStale := GroupByDeletedRoot(staleFiles, info.Root)

// --- Bulk delete for entire deleted directories ---
for prefix, files := range dirPrefixes {
log.Printf("🔍 Detected deleted directory: %s (%d tracked files)", prefix, len(files))

deleted, delErr := collectionClient.DeleteByPrefix(ctx, "file", prefix)
if delErr != nil {
log.Printf("⚠️ Prefix delete failed for %s: %v — falling back to per-file delete", prefix, delErr)
// Fallback: delete individually
for _, path := range files {
if err := ltm.DeleteByMetadata(ctx, "file", path); err != nil {
log.Printf("⚠️ Failed to delete chunks for %s: %v", path, err)
}
}
} else if deleted > 0 {
log.Printf("🧹 Bulk-deleted %d vectors under %s", deleted, prefix)
}

// Also remove any other state entries that match this prefix
// (files we haven't seen in staleFiles because they weren't in the iteration)
state.mu.Lock()
for path := range state.Files {
if strings.HasPrefix(path, prefix) {
delete(state.Files, path)
}
Comment on lines +649 to +667
Copy link

Copilot AI Apr 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Directory cleanup removes entries from state.Files for the prefix regardless of whether vector deletion actually succeeded (both when DeleteByPrefix errors and when it returns deleted==0). This can permanently prevent future cleanup attempts and leave orphaned vectors in Qdrant. Consider only deleting state entries after confirming deletion succeeded for that prefix (or keeping failed paths in state / re-queuing them), and treat deleted==0 as a signal to fall back to per-file deletes rather than wiping state.

Suggested change
if delErr != nil {
log.Printf("⚠️ Prefix delete failed for %s: %v — falling back to per-file delete", prefix, delErr)
// Fallback: delete individually
for _, path := range files {
if err := ltm.DeleteByMetadata(ctx, "file", path); err != nil {
log.Printf("⚠️ Failed to delete chunks for %s: %v", path, err)
}
}
} else if deleted > 0 {
log.Printf("🧹 Bulk-deleted %d vectors under %s", deleted, prefix)
}
// Also remove any other state entries that match this prefix
// (files we haven't seen in staleFiles because they weren't in the iteration)
state.mu.Lock()
for path := range state.Files {
if strings.HasPrefix(path, prefix) {
delete(state.Files, path)
}
if delErr == nil && deleted > 0 {
log.Printf("🧹 Bulk-deleted %d vectors under %s", deleted, prefix)
// Only remove all matching state entries after confirming prefix deletion succeeded.
// This also clears tracked files under the prefix that weren't present in staleFiles.
state.mu.Lock()
for path := range state.Files {
if strings.HasPrefix(path, prefix) {
delete(state.Files, path)
}
}
state.mu.Unlock()
continue
}
if delErr != nil {
log.Printf("⚠️ Prefix delete failed for %s: %v — falling back to per-file delete", prefix, delErr)
} else {
log.Printf("⚠️ Prefix delete removed 0 vectors for %s — falling back to per-file delete", prefix)
}
// Fallback: delete individually and only remove state for paths confirmed deleted.
state.mu.Lock()
for _, path := range files {
if err := ltm.DeleteByMetadata(ctx, "file", path); err != nil {
log.Printf("⚠️ Failed to delete chunks for %s: %v", path, err)
continue
}
delete(state.Files, path)

Copilot uses AI. Check for mistakes.
}
state.mu.Unlock()
Comment on lines +663 to +669
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Iterating over the entire state.Files map inside a loop for each prefix is inefficient ($O(N_{prefixes} * N_{files})$). Since dirPrefixes[prefix] already contains the list of stale files under that directory, you can directly remove those specific paths from the state. This reduces the complexity to $O(N_{stale_files})$ and avoids redundant map scans.

state.mu.Lock()
for _, path := range files {
	delete(state.Files, path)
}
state.mu.Unlock()

}

// --- Individual file deletions (parent dir still exists) ---
for _, path := range individualStale {
ext := strings.ToLower(filepath.Ext(path))
if ext == ".md" {
docsToDelete = append(docsToDelete, path)
Expand All @@ -649,7 +679,6 @@ func (m *Manager) IndexLanguage(ctx context.Context, info *Info, language string
}
}
}
state.mu.RUnlock()

// Process deletions (Code)
if len(filesToDelete) > 0 {
Expand Down
86 changes: 86 additions & 0 deletions internal/workspace/stale.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package workspace

import (
"os"
"path/filepath"
"strings"
)

// FindDeletedRoot walks up from filePath to find the highest deleted directory.
// If only the file itself is missing but its parent directory exists, returns filePath.
// If a parent directory is also missing, walks further up to find the topmost deleted ancestor
// below workspaceRoot.
//
// Example:
//
// Given: filePath="/project/tmp/vendor/src/foo.php", workspaceRoot="/project"
// If /project/tmp/ was deleted entirely:
// Returns "/project/tmp" (the highest missing ancestor)
func FindDeletedRoot(filePath, workspaceRoot string) string {
highestDeleted := filePath
dir := filepath.Dir(filePath)
wsClean := filepath.Clean(workspaceRoot)

for {
dirClean := filepath.Clean(dir)
// Stop if we've reached the workspace root or filesystem root
if dirClean == wsClean || dirClean == "/" || dirClean == "." {
break
}

if _, err := os.Stat(dir); os.IsNotExist(err) {
// This directory is also deleted — record it and keep going up
highestDeleted = dir
dir = filepath.Dir(dir)
continue
}
// Directory exists — stop here
break
}

return filepath.Clean(highestDeleted)
}

// IsDirectoryDeletion reports whether deletedRoot names a parent directory of
// originalFile rather than the file itself — i.e. a whole directory, not just
// one file, was removed.
func IsDirectoryDeletion(deletedRoot, originalFile string) bool {
	cleanRoot := filepath.Clean(deletedRoot)
	cleanFile := filepath.Clean(originalFile)
	return cleanRoot != cleanFile
}

// CollectStaleFiles returns every tracked path in the workspace state that no
// longer exists on disk. The state map is read under its RLock.
func CollectStaleFiles(state *WorkspaceState) []string {
	state.mu.RLock()
	defer state.mu.RUnlock()

	var missing []string
	for tracked := range state.Files {
		_, statErr := os.Stat(tracked)
		if os.IsNotExist(statErr) {
			missing = append(missing, tracked)
		}
	}
	return missing
}

// GroupByDeletedRoot partitions stale file paths by their highest deleted
// ancestor directory.
//
// Returns two collections:
//   - dirPrefixes: map[prefix][]filePath — files living under a deleted
//     directory, keyed by that directory's path with a trailing separator
//   - individualFiles: files whose parent directory still exists
func GroupByDeletedRoot(staleFiles []string, workspaceRoot string) (dirPrefixes map[string][]string, individualFiles []string) {
	dirPrefixes = make(map[string][]string)
	sep := string(os.PathSeparator)

	for _, stale := range staleFiles {
		deletedRoot := FindDeletedRoot(stale, workspaceRoot)
		if !IsDirectoryDeletion(deletedRoot, stale) {
			// Only the file itself vanished; its directory is intact.
			individualFiles = append(individualFiles, stale)
			continue
		}
		key := deletedRoot
		if !strings.HasSuffix(key, sep) {
			key += sep
		}
		dirPrefixes[key] = append(dirPrefixes[key], stale)
	}

	return dirPrefixes, individualFiles
}
151 changes: 151 additions & 0 deletions internal/workspace/stale_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
package workspace

import (
"os"
"path/filepath"
"sort"
"testing"
)

// TestFindDeletedRoot_FileOnlyDeleted: the parent directory exists but the
// file does not, so the file path itself must be returned.
func TestFindDeletedRoot_FileOnlyDeleted(t *testing.T) {
	wsRoot := t.TempDir()
	pkgDir := filepath.Join(wsRoot, "src", "pkg")
	if err := os.MkdirAll(pkgDir, 0755); err != nil {
		t.Fatal(err)
	}

	// Never created on disk; only its directory exists.
	gone := filepath.Join(pkgDir, "deleted.go")

	if got := FindDeletedRoot(gone, wsRoot); got != gone {
		t.Errorf("expected %q, got %q — when only the file is missing, should return the file path itself", gone, got)
	}
}

// TestFindDeletedRoot_DirectoryDeleted: an entire subtree under the workspace
// is missing; the highest missing ancestor must be reported.
func TestFindDeletedRoot_DirectoryDeleted(t *testing.T) {
	wsRoot := t.TempDir()

	// tmp/ and everything beneath it was never created.
	gone := filepath.Join(wsRoot, "tmp", "vendor", "src", "foo.php")

	want := filepath.Join(wsRoot, "tmp")
	if got := FindDeletedRoot(gone, wsRoot); got != want {
		t.Errorf("expected %q, got %q — should return the highest deleted directory", want, got)
	}
}

// TestFindDeletedRoot_IntermediateDirectoryDeleted: src/ exists but src/old/
// does not; the walk must stop at the missing ancestor directly below an
// existing directory.
func TestFindDeletedRoot_IntermediateDirectoryDeleted(t *testing.T) {
	wsRoot := t.TempDir()
	if err := os.MkdirAll(filepath.Join(wsRoot, "src"), 0755); err != nil {
		t.Fatal(err)
	}

	// src/ is real; src/old/ and everything deeper is not.
	gone := filepath.Join(wsRoot, "src", "old", "legacy", "file.go")

	want := filepath.Join(wsRoot, "src", "old")
	if got := FindDeletedRoot(gone, wsRoot); got != want {
		t.Errorf("expected %q, got %q — should return the highest deleted directory below existing parent", want, got)
	}
}

// TestFindDeletedRoot_WorkspaceRootBoundary: the walk must never climb above
// the workspace root, even when every ancestor below it is missing.
func TestFindDeletedRoot_WorkspaceRootBoundary(t *testing.T) {
	wsRoot := t.TempDir()

	// Nothing under the workspace root exists.
	gone := filepath.Join(wsRoot, "a", "b", "c", "d.txt")

	want := filepath.Join(wsRoot, "a")
	if got := FindDeletedRoot(gone, wsRoot); got != want {
		t.Errorf("expected %q, got %q — should stop at direct child of workspace root", want, got)
	}
}

// TestIsDirectoryDeletion exercises classification of deleted roots as whole
// directories versus single files.
func TestIsDirectoryDeletion(t *testing.T) {
	cases := []struct {
		root string
		file string
		want bool
	}{
		{"/project/tmp", "/project/tmp/foo.php", true},
		{"/project/foo.go", "/project/foo.go", false},
		{"/project/src/old", "/project/src/old/deep/file.go", true},
	}

	for _, c := range cases {
		if got := IsDirectoryDeletion(c.root, c.file); got != c.want {
			t.Errorf("IsDirectoryDeletion(%q, %q) = %v, want %v",
				c.root, c.file, got, c.want)
		}
	}
}

// TestGroupByDeletedRoot verifies that stale files are partitioned into
// deleted-directory prefixes versus individually deleted files.
func TestGroupByDeletedRoot(t *testing.T) {
	wsRoot := t.TempDir()

	// src/ exists on disk; tmp/ does not (simulating a deleted directory).
	if err := os.MkdirAll(filepath.Join(wsRoot, "src"), 0755); err != nil {
		t.Fatal(err)
	}

	stale := []string{
		filepath.Join(wsRoot, "tmp", "a", "file1.go"),
		filepath.Join(wsRoot, "tmp", "b", "file2.go"),
		filepath.Join(wsRoot, "src", "deleted.go"), // parent exists, only file missing
	}

	dirPrefixes, individualFiles := GroupByDeletedRoot(stale, wsRoot)

	// Both tmp files should be grouped under the tmp/ prefix.
	wantPrefix := filepath.Join(wsRoot, "tmp") + string(os.PathSeparator)
	grouped, ok := dirPrefixes[wantPrefix]
	if !ok {
		t.Errorf("expected prefix %q in dirPrefixes, got: %v", wantPrefix, dirPrefixes)
	}
	if len(grouped) != 2 {
		t.Errorf("expected 2 files under prefix %q, got %d", wantPrefix, len(grouped))
	}

	sort.Strings(individualFiles)
	wantFile := filepath.Join(wsRoot, "src", "deleted.go")
	if len(individualFiles) != 1 || individualFiles[0] != wantFile {
		t.Errorf("expected individualFiles = [%q], got %v", wantFile, individualFiles)
	}
}

// TestCollectStaleFiles ensures only tracked files missing from disk are
// reported as stale.
func TestCollectStaleFiles(t *testing.T) {
	wsRoot := t.TempDir()

	// One file that genuinely exists on disk...
	present := filepath.Join(wsRoot, "exists.go")
	if err := os.WriteFile(present, []byte("package main"), 0644); err != nil {
		t.Fatal(err)
	}

	// ...and one tracked entry whose file was never created.
	state := NewWorkspaceState()
	info, _ := os.Stat(present)
	state.UpdateFile(present, info)
	missing := filepath.Join(wsRoot, "deleted.go")
	state.Files[missing] = FileState{}

	stale := CollectStaleFiles(state)

	if len(stale) != 1 {
		t.Fatalf("expected 1 stale file, got %d: %v", len(stale), stale)
	}
	if stale[0] != missing {
		t.Errorf("expected stale file %q, got %q", missing, stale[0])
	}
}
Loading