Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@ versioning (`YY.MM.VV`).

## [Unreleased]

### Fixed
- **Searching for a site or tool by name now keeps its official page on the first screen.** Two
ranking steps could bury the official site even after it had been ranked first: the freshness
blend was reordering by list position instead of the real relevance score, so a single dated page
(a news story, an encyclopedia entry) could jump ahead of the undated official homepage; and the
built-in low-quality filter, which is on by default, was sourced from community lists that include
the official sites of many well-known companies and developer hubs, so those sites were quietly
pushed down. Freshness now rides on top of the real score and can only reorder results of similar
relevance, and a curated allowlist keeps well-known destinations out of the filter. All on-device,
no new requests.

## 26.06.05 — 2026-06-10

### Fixed
Expand Down
47 changes: 47 additions & 0 deletions app/src/main/assets/blocklist/allowlist.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Slop-filter allowlist: domains the AI-slop blocklist must NEVER downrank or hide.
#
# The bundled blocklist is merged from community "hide AI from my browser" lists (see NOTICE). Those
# lists deliberately include the OFFICIAL sites of AI companies and major developer hubs, because
# their goal is to keep AI tools out of a browser's results. A search engine has the opposite job:
# when someone searches "huggingface" or "github", the official site is exactly what they want.
# Every domain here is therefore subtracted from the raw blocklist at load and build time, so a
# navigational search for one of these names puts its real site at the top instead of burying it.
#
# Keep this list to widely known, unambiguously legitimate destinations. One bare domain per line,
# normally the registrable domain (a specific host such as gemini.google.com is also accepted);
# subdomains of each entry are covered automatically. Lines starting with "#" are comments.

# AI labs and their official products / hubs
openai.com
anthropic.com
claude.ai
huggingface.co
perplexity.ai
mistral.ai
cohere.com
deepmind.com
stability.ai
midjourney.com
civitai.com
replicate.com
character.ai
poe.com
runwayml.com
leonardo.ai
suno.com
udio.com
elevenlabs.io
copilot.microsoft.com
gemini.google.com

# Developer, package, and reference hubs
github.com
gitlab.com
stackoverflow.com
stackexchange.com
pytorch.org
tensorflow.org
kaggle.com
npmjs.com
pypi.org
crates.io
arxiv.org
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ class MetaSearchResultProvider(
sortMode,
query,
System.currentTimeMillis(),
relevanceOf = { it.score },
publishedOf = { it.publishedMillis },
)
val personalizedBase =
Expand Down
34 changes: 23 additions & 11 deletions app/src/main/java/org/searchmob/engine/aggregate/Aggregator.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ data class AggregatedResult(
val url: String,
val snippet: String,
val engines: List<String>,
/**
* Final ranking score: RRF fused, lexical-blended, and navigationally boosted. The freshness
* sort multiplies it by a recency factor, so a strong match (the official site a navigational
* query named) keeps its lead instead of being flattened to its list position.
*/
val score: Double,
/** Best-known publication time (epoch millis), or null. Drives freshness sorting. */
val publishedMillis: Long? = null,
Expand Down Expand Up @@ -155,25 +160,32 @@ class Aggregator(
// and language-agnostic; the existing tie-breakers keep ordering deterministic. See Relevance.
val terms = Relevance.contentTerms(query)
return buckets.values
.map { AggregatedResult(it.title, it.url, it.snippet, it.engines.toList(), it.score, it.publishedMillis) }
.map {
.map { bucket ->
// Navigational promotion: when the squished query names this result's domain (query
// "threejs" -> threejs.org), float it to the top past the demotion-only relevance
// blend, so the official site is not buried under forum posts that merely contain it.
val nav = Relevance.navigationalFactor(terms, DomainRanker.host(it.url) ?: "")
it to
// The final score is stored on the result so the freshness sort scales it directly.
val nav = Relevance.navigationalFactor(terms, DomainRanker.host(bucket.url) ?: "")
val finalScore =
Relevance.blendedScore(
it.score,
Relevance.lexicalScore(it.title, it.snippet, terms),
Relevance.languageAffinity(query, it.title, it.snippet),
bucket.score,
Relevance.lexicalScore(bucket.title, bucket.snippet, terms),
Relevance.languageAffinity(query, bucket.title, bucket.snippet),
) * nav
AggregatedResult(
bucket.title,
bucket.url,
bucket.snippet,
bucket.engines.toList(),
finalScore,
bucket.publishedMillis,
)
}
.sortedWith(
compareByDescending<Pair<AggregatedResult, Double>> { it.second }
.thenBy { UrlNormalizer.normalize(it.first.url) }
.thenBy { it.first.engines.joinToString(",") },
compareByDescending<AggregatedResult> { it.score }
.thenBy { UrlNormalizer.normalize(it.url) }
.thenBy { it.engines.joinToString(",") },
)
.map { it.first }
}

/** A structured date from the engine wins; else parse snippet/title. Weak (bare-year) -> null. */
Expand Down
15 changes: 15 additions & 0 deletions app/src/main/java/org/searchmob/engine/rank/AiSlopBlocklist.kt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,19 @@ object AiSlopBlocklist {
}
return false
}

/**
 * The effective blocklist: [raw] minus every [allow]listed domain and any subdomain of one. The
 * community lists include the official sites of AI companies and major dev hubs, which a search
 * ranker must never bury, so a search for one of those names returns its real site at the top.
 *
 * A raw entry is dropped when it equals an allowlist entry or ends with `"." + entry`; a
 * look-alike such as `nothuggingface.co` is kept. Comparison is exact and case-sensitive, so both
 * sets are expected to use the same casing.
 *
 * NOTE(review): this only removes entries from [raw]. If an allowlisted host's parent domain stays
 * in [raw] (e.g. allow has `gemini.google.com` while raw has `google.com`) and matching is
 * suffix-based, the host would presumably still be treated as blocked. Confirm against `matches`.
 *
 * @param raw the merged community blocklist.
 * @param allow domains that must never be blocked; when empty, [raw] is returned as-is.
 * @return [raw] itself when [allow] is empty, otherwise a new set without the allowlisted entries.
 */
fun effectiveBlocklist(
    raw: Set<String>,
    allow: Set<String>,
): Set<String> {
    if (allow.isEmpty()) return raw
    return raw.filterTo(HashSet<String>()) { domain -> !isAllowlisted(domain, allow) }
}

/**
 * True when [domain] is in [allow] or is a subdomain of an entry. Walks the dot-delimited suffixes
 * of [domain] and looks each one up in the set, instead of scanning every allowlist entry.
 */
private fun isAllowlisted(
    domain: String,
    allow: Set<String>,
): Boolean {
    if (domain in allow) return true
    var dot = domain.indexOf('.')
    while (dot >= 0) {
        // The text after this dot equals an entry exactly when domain ends with "." + entry.
        if (domain.substring(dot + 1) in allow) return true
        dot = domain.indexOf('.', dot + 1)
    }
    return false
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,16 @@ import java.util.zip.GZIPInputStream
* returns the cached set or an empty set before the first load completes, which is what the ranker
* uses so a search before loading simply applies no slop filter. Fail-soft: any read error yields an
* empty set rather than failing a search.
*
* The community lists are "hide AI from my browser" lists, so they include the official sites of AI
* companies and major developer hubs (github.com, huggingface.co, openai.com, ...). A search ranker
* must not bury those, so the plain-text `blocklist/allowlist.txt` asset names the legitimate
* destinations to keep, and they (with any subdomains) are subtracted from the effective blocklist.
*/
class AiSlopBlocklistLoader(
private val context: Context,
private val assetPath: String = "blocklist/ai-slop-domains.txt.gz",
private val allowlistPath: String = "blocklist/allowlist.txt",
) {
@Volatile
private var cached: Set<String>? = null
Expand All @@ -27,6 +33,7 @@ class AiSlopBlocklistLoader(

suspend fun load(): Set<String> {
cached?.let { return it }
val allow = loadAllowlist()
val domains =
runCatching {
val out = HashSet<String>()
Expand All @@ -44,11 +51,26 @@ class AiSlopBlocklistLoader(
}
}
}
out as Set<String>
// Subtract the allowlist so a search for a legitimate destination (huggingface.co,
// github.com) is never downranked, even though the community lists include them.
AiSlopBlocklist.effectiveBlocklist(out, allow)
}.getOrDefault(emptySet())
return domains.also { cached = it }
}

/**
 * Reads the plain-text allowlist asset at [allowlistPath]: one entry per line, trimmed and
 * lowercased, skipping blank lines and lines that start with "#". Fail-soft: any error while
 * opening or reading the asset yields an empty set.
 */
private fun loadAllowlist(): Set<String> =
    runCatching {
        val entries: Set<String> =
            context.assets.open(allowlistPath).bufferedReader().useLines { lines ->
                lines
                    .map { it.trim().lowercase() }
                    .filterTo(HashSet<String>()) { it.isNotEmpty() && !it.startsWith("#") }
            }
        entries
    }.getOrDefault(emptySet())

private companion object {
/** Upper bound on the decompressed blocklist; the real list is ~1k short lines (well under). */
const val MAX_DECOMPRESSED_BYTES = 8L * 1024 * 1024
Expand Down
23 changes: 15 additions & 8 deletions app/src/main/java/org/searchmob/engine/sort/ResultSorter.kt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ object ResultSorter {
mode: SortMode,
query: String,
nowMillis: Long,
// The aggregator's final relevance score for each item (RRF fused, lexical-blended, and
// navigationally boosted). The freshness blend multiplies a recency boost into THIS, so a
// strong match (the official site a navigational query named) keeps its lead and freshness
// only reorders results of comparable relevance. Defaults to 0.0 ("unscored"), in which case
// the blend falls back to a positional `1/(60+index)` proxy. `publishedOf` stays last so the
// existing trailing-lambda callers (which pass only `publishedOf`) keep working unchanged.
relevanceOf: (T) -> Double = { 0.0 },
publishedOf: (T) -> Long?,
): List<T> {
if (mode == SortMode.RELEVANCE || items.size < 2) return items
Expand All @@ -50,19 +57,19 @@ object ResultSorter {
}

val weight = QdfHeuristic.weightFor(query, nowMillis)
// Earlier this scaled a positional `1/(60+index)` proxy, which flattened every rank gap to a
// hair and let a single dated result leapfrog an undated #1 (a news/wiki page over the
// queried site itself). Scale the real score when present, else the positional proxy.
val hasScores = items.any { relevanceOf(it) > 0.0 }
return items
.mapIndexed { index, item -> Triple(score(index, publishedOf(item), nowMillis, weight), index, item) }
.mapIndexed { index, item ->
val base = if (hasScores) relevanceOf(item) else 1.0 / (RRF_K + index)
Triple(base * recency(publishedOf(item), nowMillis, weight), index, item)
}
.sortedWith(compareByDescending<Triple<Double, Int, T>> { it.first }.thenBy { it.second })
.map { it.third }
}

private fun score(
index: Int,
published: Long?,
nowMillis: Long,
weight: Double,
): Double = 1.0 / (RRF_K + index) * recency(published, nowMillis, weight)

private fun recency(
published: Long?,
nowMillis: Long,
Expand Down
36 changes: 35 additions & 1 deletion app/src/test/java/org/searchmob/engine/ResultSorterTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,21 @@ class ResultSorterTest {
private val now = 1_900_000_000_000L
private val day = 86_400_000L

private data class R(val name: String, val published: Long?)
private data class R(val name: String, val published: Long?, val relevance: Double = 0.0)

private fun sort(
items: List<R>,
mode: SortMode,
query: String = "q",
) = ResultSorter.sort(items, mode, query, now) { it.published }.map { it.name }

/** Sorts [items] with each item's own relevance score supplied, and returns the names in order. */
private fun sortScored(
    items: List<R>,
    mode: SortMode,
    query: String = "q",
): List<String> {
    val ranked =
        ResultSorter.sort(
            items,
            mode,
            query,
            now,
            relevanceOf = { item -> item.relevance },
            publishedOf = { item -> item.published },
        )
    return ranked.map { item -> item.name }
}

@Test
fun fromValueDefaultsToFresh() {
assertEquals(SortMode.FRESH_RELEVANT, SortMode.fromValue(null))
Expand Down Expand Up @@ -51,6 +58,33 @@ class ResultSorterTest {
assertEquals("fresh", sort(items, SortMode.FRESH_RELEVANT, "the matrix 5 release date").first())
}

@Test
fun freshBlendDoesNotLetDatedResultDisplaceStrongUndatedMatch() {
    // Regression: for a navigational query ("huggingface") the official site carries a high
    // aggregator score but its homepage has no date. Under the default freshness sort a dated
    // wiki/news page must NOT overtake it. The old blend scaled a positional 1/(60+index) proxy,
    // which flattened that lead and let any dated result jump ahead of the queried site.
    val official = R("huggingface.co", null, relevance = 0.199)
    val wiki = R("wikipedia", now - 3 * day, relevance = 0.049)
    val news = R("techcrunch", now - day, relevance = 0.016)

    val ranked = sortScored(listOf(official, wiki, news), SortMode.FRESH_RELEVANT, "huggingface")

    assertEquals("huggingface.co", ranked.first())
}

@Test
fun freshBlendStillReordersComparableRelevance() {
    // Peers of similar relevance must still be reordered by freshness: a fresh dated result
    // rises above a stale one ranked just above it. This is the QDF behavior the blend provides.
    val stale = R("stale", now - 300 * day, relevance = 0.050)
    val fresh = R("fresh", now - day, relevance = 0.048)

    val ranked = sortScored(listOf(stale, fresh), SortMode.FRESH_RELEVANT, "the matrix 5 release date")

    assertEquals("fresh", ranked.first())
}

@Test
fun qdfBoostsTimeSensitiveQueries() {
val base = QdfHeuristic.weightFor("best laptops", now)
Expand Down
15 changes: 15 additions & 0 deletions app/src/test/java/org/searchmob/engine/rank/AiSlopBlocklistTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,19 @@ class AiSlopBlocklistTest {
fun emptyBlocklistNeverMatches() {
assertFalse(AiSlopBlocklist.matches("anything.test", emptySet()))
}

@Test
fun effectiveBlocklistDropsAllowlistedDomainsAndSubdomains() {
    // The community lists include the official sites of AI companies and major dev hubs. A search
    // for "huggingface"/"github" must surface those, so the allowlist (and any subdomain of one)
    // is subtracted. Regression for the "official site missing from page one" report.
    val raw =
        setOf(
            "huggingface.co",
            "discuss.huggingface.co",
            "github.com",
            "slopfarm.example",
            "nothuggingface.co",
        )
    val allow = setOf("huggingface.co", "github.com")
    val effective = AiSlopBlocklist.effectiveBlocklist(raw, allow)
    assertFalse(AiSlopBlocklist.matches("huggingface.co", effective))
    assertFalse(AiSlopBlocklist.matches("discuss.huggingface.co", effective))
    assertFalse(AiSlopBlocklist.matches("github.com", effective))
    // A genuine low-quality domain is still blocked.
    assertTrue(AiSlopBlocklist.matches("slopfarm.example", effective))
    // A look-alike that merely ends with an allowlisted name, without a dot boundary, is not a
    // subdomain and must stay blocked.
    assertTrue(AiSlopBlocklist.matches("nothuggingface.co", effective))
    // An empty allowlist leaves the blocklist untouched.
    assertEquals(raw, AiSlopBlocklist.effectiveBlocklist(raw, emptySet()))
}
}