diff --git a/CHANGELOG.md b/CHANGELOG.md index 731f398..0d7d347 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,17 @@ versioning (`YY.MM.VV`). ## [Unreleased] +### Fixed +- **Searching for a site or tool by name now keeps its official page on the first screen.** Two + ranking steps could bury the official site even after it had been ranked first: the freshness + blend was reordering by list position instead of the real relevance score, so a single dated page + (a news story, an encyclopedia entry) could jump ahead of the undated official homepage; and the + built-in low-quality filter, which is on by default, was sourced from community lists that include + the official sites of many well-known companies and developer hubs, so those sites were quietly + pushed down. Freshness now rides on top of the real score and can only reorder results of similar + relevance, and a curated allowlist keeps well-known destinations out of the filter. All on-device, + no new requests. + ## 26.06.05 — 2026-06-10 ### Fixed diff --git a/app/src/main/assets/blocklist/allowlist.txt b/app/src/main/assets/blocklist/allowlist.txt new file mode 100644 index 0000000..bac052a --- /dev/null +++ b/app/src/main/assets/blocklist/allowlist.txt @@ -0,0 +1,47 @@ +# Slop-filter allowlist: domains the AI-slop blocklist must NEVER downrank or hide. +# +# The bundled blocklist is merged from community "hide AI from my browser" lists (see NOTICE). Those +# lists deliberately include the OFFICIAL sites of AI companies and major developer hubs, because +# their goal is to keep AI tools out of a browser's results. A search engine has the opposite job: +# when someone searches "huggingface" or "github", the official site is exactly what they want. So +# every domain here is subtracted from the effective blocklist at load and build time, so a +# navigational search for one of these names returns its real site at the top instead of burying it. 
+# +# Keep this list to widely-known, unambiguously-legitimate destinations. One bare domain or host name +# per line; subdomains of each entry are covered automatically. Lines starting with "#" are comments. + +# AI labs and their official products / hubs +openai.com +anthropic.com +claude.ai +huggingface.co +perplexity.ai +mistral.ai +cohere.com +deepmind.com +stability.ai +midjourney.com +civitai.com +replicate.com +character.ai +poe.com +runwayml.com +leonardo.ai +suno.com +udio.com +elevenlabs.io +copilot.microsoft.com +gemini.google.com + +# Developer, package, and reference hubs +github.com +gitlab.com +stackoverflow.com +stackexchange.com +pytorch.org +tensorflow.org +kaggle.com +npmjs.com +pypi.org +crates.io +arxiv.org diff --git a/app/src/main/java/org/searchmob/engine/MetaSearchResultProvider.kt b/app/src/main/java/org/searchmob/engine/MetaSearchResultProvider.kt index 03588cd..e23dc7e 100644 --- a/app/src/main/java/org/searchmob/engine/MetaSearchResultProvider.kt +++ b/app/src/main/java/org/searchmob/engine/MetaSearchResultProvider.kt @@ -207,6 +207,7 @@ class MetaSearchResultProvider( sortMode, query, System.currentTimeMillis(), + relevanceOf = { it.score }, publishedOf = { it.publishedMillis }, ) val personalizedBase = diff --git a/app/src/main/java/org/searchmob/engine/aggregate/Aggregator.kt b/app/src/main/java/org/searchmob/engine/aggregate/Aggregator.kt index c401966..82edc6f 100644 --- a/app/src/main/java/org/searchmob/engine/aggregate/Aggregator.kt +++ b/app/src/main/java/org/searchmob/engine/aggregate/Aggregator.kt @@ -20,6 +20,11 @@ data class AggregatedResult( val url: String, val snippet: String, val engines: List, + /** + * Final ranking score: RRF fused, lexical-blended, and navigationally boosted. The freshness + * sort multiplies it by a recency factor, so a strong match (the official site a navigational + * query named) keeps its lead instead of being flattened to its list position. 
+ */ val score: Double, /** Best-known publication time (epoch millis), or null. Drives freshness sorting. */ val publishedMillis: Long? = null, @@ -155,25 +160,32 @@ class Aggregator( // and language-agnostic; the existing tie-breakers keep ordering deterministic. See Relevance. val terms = Relevance.contentTerms(query) return buckets.values - .map { AggregatedResult(it.title, it.url, it.snippet, it.engines.toList(), it.score, it.publishedMillis) } - .map { + .map { bucket -> // Navigational promotion: when the squished query names this result's domain (query // "threejs" -> threejs.org), float it to the top past the demotion-only relevance // blend, so the official site is not buried under forum posts that merely contain it. - val nav = Relevance.navigationalFactor(terms, DomainRanker.host(it.url) ?: "") - it to + // The final score is stored on the result so the freshness sort scales it directly. + val nav = Relevance.navigationalFactor(terms, DomainRanker.host(bucket.url) ?: "") + val finalScore = Relevance.blendedScore( - it.score, - Relevance.lexicalScore(it.title, it.snippet, terms), - Relevance.languageAffinity(query, it.title, it.snippet), + bucket.score, + Relevance.lexicalScore(bucket.title, bucket.snippet, terms), + Relevance.languageAffinity(query, bucket.title, bucket.snippet), ) * nav + AggregatedResult( + bucket.title, + bucket.url, + bucket.snippet, + bucket.engines.toList(), + finalScore, + bucket.publishedMillis, + ) } .sortedWith( - compareByDescending> { it.second } - .thenBy { UrlNormalizer.normalize(it.first.url) } - .thenBy { it.first.engines.joinToString(",") }, + compareByDescending { it.score } + .thenBy { UrlNormalizer.normalize(it.url) } + .thenBy { it.engines.joinToString(",") }, ) - .map { it.first } } /** A structured date from the engine wins; else parse snippet/title. Weak (bare-year) -> null. 
*/ diff --git a/app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklist.kt b/app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklist.kt index d5e617c..d3a905c 100644 --- a/app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklist.kt +++ b/app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklist.kt @@ -23,4 +23,19 @@ object AiSlopBlocklist { } return false } + + /** + * The effective blocklist: [raw] minus every [allow]listed domain and any subdomain of one. The + * community lists include the official sites of AI companies and major dev hubs, which a search + * ranker must never bury, so a search for one of those names returns its real site at the top. + */ + fun effectiveBlocklist( + raw: Set, + allow: Set, + ): Set { + if (allow.isEmpty()) return raw + return raw.filterTo(HashSet()) { domain -> + allow.none { domain == it || domain.endsWith(".$it") } + } + } } diff --git a/app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklistLoader.kt b/app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklistLoader.kt index 2d0841e..3ac9bc5 100644 --- a/app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklistLoader.kt +++ b/app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklistLoader.kt @@ -14,10 +14,16 @@ import java.util.zip.GZIPInputStream * returns the cached set or an empty set before the first load completes, which is what the ranker * uses so a search before loading simply applies no slop filter. Fail-soft: any read error yields an * empty set rather than failing a search. + * + * The community lists are "hide AI from my browser" lists, so they include the official sites of AI + * companies and major developer hubs (github.com, huggingface.co, openai.com, ...). A search ranker + * must not bury those, so the plain-text `blocklist/allowlist.txt` asset names the legitimate + * destinations to keep, and they (with any subdomains) are subtracted from the effective blocklist. 
*/ class AiSlopBlocklistLoader( private val context: Context, private val assetPath: String = "blocklist/ai-slop-domains.txt.gz", + private val allowlistPath: String = "blocklist/allowlist.txt", ) { @Volatile private var cached: Set? = null @@ -27,6 +33,7 @@ class AiSlopBlocklistLoader( suspend fun load(): Set { cached?.let { return it } + val allow = loadAllowlist() val domains = runCatching { val out = HashSet() @@ -44,11 +51,26 @@ class AiSlopBlocklistLoader( } } } - out as Set + // Subtract the allowlist so a search for a legitimate destination (huggingface.co, + // github.com) is never downranked, even though the community lists include them. + AiSlopBlocklist.effectiveBlocklist(out, allow) }.getOrDefault(emptySet()) return domains.also { cached = it } } + private fun loadAllowlist(): Set = + runCatching { + val out = HashSet() + context.assets.open(allowlistPath).bufferedReader().use { reader -> + while (true) { + val line = reader.readLine() ?: break + val domain = line.trim().lowercase() + if (domain.isNotEmpty() && !domain.startsWith("#")) out.add(domain) + } + } + out as Set + }.getOrDefault(emptySet()) + private companion object { /** Upper bound on the decompressed blocklist; the real list is ~1k short lines (well under). */ const val MAX_DECOMPRESSED_BYTES = 8L * 1024 * 1024 diff --git a/app/src/main/java/org/searchmob/engine/sort/ResultSorter.kt b/app/src/main/java/org/searchmob/engine/sort/ResultSorter.kt index da6dfed..0c87ada 100644 --- a/app/src/main/java/org/searchmob/engine/sort/ResultSorter.kt +++ b/app/src/main/java/org/searchmob/engine/sort/ResultSorter.kt @@ -34,6 +34,13 @@ object ResultSorter { mode: SortMode, query: String, nowMillis: Long, + // The aggregator's final relevance score for each item (RRF fused, lexical-blended, and + // navigationally boosted). 
The freshness blend multiplies a recency boost into THIS, so a + // strong match (the official site a navigational query named) keeps its lead and freshness + // only reorders results of comparable relevance. Defaults to 0.0 ("unscored"), in which case + // the blend falls back to a positional `1/(60+index)` proxy. `publishedOf` stays last so the + // existing trailing-lambda callers (which pass only `publishedOf`) keep working unchanged. + relevanceOf: (T) -> Double = { 0.0 }, publishedOf: (T) -> Long?, ): List { if (mode == SortMode.RELEVANCE || items.size < 2) return items @@ -50,19 +57,19 @@ object ResultSorter { } val weight = QdfHeuristic.weightFor(query, nowMillis) + // Earlier this scaled a positional `1/(60+index)` proxy, which flattened every rank gap to a + // hair and let a single dated result leapfrog an undated #1 (a news/wiki page over the + // queried site itself). Scale the real score when present, else the positional proxy. + val hasScores = items.any { relevanceOf(it) > 0.0 } return items - .mapIndexed { index, item -> Triple(score(index, publishedOf(item), nowMillis, weight), index, item) } + .mapIndexed { index, item -> + val base = if (hasScores) relevanceOf(item) else 1.0 / (RRF_K + index) + Triple(base * recency(publishedOf(item), nowMillis, weight), index, item) + } .sortedWith(compareByDescending> { it.first }.thenBy { it.second }) .map { it.third } } - private fun score( - index: Int, - published: Long?, - nowMillis: Long, - weight: Double, - ): Double = 1.0 / (RRF_K + index) * recency(published, nowMillis, weight) - private fun recency( published: Long?, nowMillis: Long, diff --git a/app/src/test/java/org/searchmob/engine/ResultSorterTest.kt b/app/src/test/java/org/searchmob/engine/ResultSorterTest.kt index c26ea44..da23ade 100644 --- a/app/src/test/java/org/searchmob/engine/ResultSorterTest.kt +++ b/app/src/test/java/org/searchmob/engine/ResultSorterTest.kt @@ -11,7 +11,7 @@ class ResultSorterTest { private val now = 1_900_000_000_000L 
private val day = 86_400_000L - private data class R(val name: String, val published: Long?) + private data class R(val name: String, val published: Long?, val relevance: Double = 0.0) private fun sort( items: List, @@ -19,6 +19,13 @@ class ResultSorterTest { query: String = "q", ) = ResultSorter.sort(items, mode, query, now) { it.published }.map { it.name } + private fun sortScored( + items: List, + mode: SortMode, + query: String = "q", + ) = ResultSorter.sort(items, mode, query, now, relevanceOf = { it.relevance }, publishedOf = { it.published }) + .map { it.name } + @Test fun fromValueDefaultsToFresh() { assertEquals(SortMode.FRESH_RELEVANT, SortMode.fromValue(null)) @@ -51,6 +58,33 @@ class ResultSorterTest { assertEquals("fresh", sort(items, SortMode.FRESH_RELEVANT, "the matrix 5 release date").first()) } + @Test + fun freshBlendDoesNotLetDatedResultDisplaceStrongUndatedMatch() { + // Regression: a navigational query ("huggingface") nav-boosts the official site to a high + // aggregator score, but its homepage is undated. A dated wiki/news page must NOT leapfrog it + // under the default freshness sort. Earlier the blend scaled a positional 1/(60+index) proxy, + // which flattened the nav boost and let any dated result overtake the queried site itself. + val items = + listOf( + R("huggingface.co", null, relevance = 0.199), + R("wikipedia", now - 3 * day, relevance = 0.049), + R("techcrunch", now - day, relevance = 0.016), + ) + assertEquals("huggingface.co", sortScored(items, SortMode.FRESH_RELEVANT, "huggingface").first()) + } + + @Test + fun freshBlendStillReordersComparableRelevance() { + // Freshness must still reorder peers of similar relevance: a fresh dated result rises above a + // stale one just above it. This is the QDF behavior the blend is meant to provide. 
+ val items = + listOf( + R("stale", now - 300 * day, relevance = 0.050), + R("fresh", now - day, relevance = 0.048), + ) + assertEquals("fresh", sortScored(items, SortMode.FRESH_RELEVANT, "the matrix 5 release date").first()) + } + @Test fun qdfBoostsTimeSensitiveQueries() { val base = QdfHeuristic.weightFor("best laptops", now) diff --git a/app/src/test/java/org/searchmob/engine/rank/AiSlopBlocklistTest.kt b/app/src/test/java/org/searchmob/engine/rank/AiSlopBlocklistTest.kt index 7a6014a..9fcc1d6 100644 --- a/app/src/test/java/org/searchmob/engine/rank/AiSlopBlocklistTest.kt +++ b/app/src/test/java/org/searchmob/engine/rank/AiSlopBlocklistTest.kt @@ -33,4 +33,19 @@ class AiSlopBlocklistTest { fun emptyBlocklistNeverMatches() { assertFalse(AiSlopBlocklist.matches("anything.test", emptySet())) } + + @Test + fun effectiveBlocklistDropsAllowlistedDomainsAndSubdomains() { + // The community lists include the official sites of AI companies and major dev hubs. A search + // for "huggingface"/"github" must surface those, so the allowlist (and any subdomain of one) + // is subtracted. Regression for the "official site missing from page one" report. + val raw = setOf("huggingface.co", "discuss.huggingface.co", "github.com", "slopfarm.example") + val allow = setOf("huggingface.co", "github.com") + val effective = AiSlopBlocklist.effectiveBlocklist(raw, allow) + assertFalse(AiSlopBlocklist.matches("huggingface.co", effective)) + assertFalse(AiSlopBlocklist.matches("discuss.huggingface.co", effective)) + assertFalse(AiSlopBlocklist.matches("github.com", effective)) + // A genuine low-quality domain is still blocked. + assertTrue(AiSlopBlocklist.matches("slopfarm.example", effective)) + } }