Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion src/main/kotlin/net/dankito/readability4j/Readability4J.kt
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,23 @@ import net.dankito.readability4j.util.RegExUtil
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import org.jsoup.select.Evaluator
import org.slf4j.LoggerFactory
import kotlin.math.sqrt


open class Readability4J {

companion object {
    private val log = LoggerFactory.getLogger(Readability4J::class.java)

    /**
     * Class-name / id patterns used to decide whether an element is worth scoring.
     *
     * The patterns are searched for anywhere inside a `"<class> <id>"` string, so they
     * are compiled without anchors and with [RegexOption.IGNORE_CASE]. JavaScript
     * literal syntax (`/pattern/i`) must not be copied verbatim: Kotlin would treat
     * the slashes and the trailing `i` as literal characters of the pattern.
     *
     * Kept as `var` so existing callers that replace the map keep compiling.
     */
    var REGEXPS = mapOf(
        // NOTE: These two regular expressions are duplicated in
        // Readability.js. Please keep both copies in sync.
        "unlikelyCandidates" to Regex(
            "-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote",
            RegexOption.IGNORE_CASE
        ),
        "okMaybeItsACandidate" to Regex(
            "and|article|body|column|content|main|shadow",
            RegexOption.IGNORE_CASE
        )
    )
}


Expand Down Expand Up @@ -106,12 +116,55 @@ open class Readability4J {

article.articleContent = articleContent
}

setArticleMetadata(article, metadata, articleContent)

return article
}

/**
 * Cheap heuristic that estimates whether the document contains enough article-like
 * text to be worth running the full extraction on.
 *
 * Candidate nodes are all `<p>` and `<pre>` elements plus every `<div>` that has a
 * direct `<br>` child. A candidate is skipped (it does not abort the check) when:
 *  - its `"<class> <id>"` string contains an "unlikelyCandidates" pattern and no
 *    "okMaybeItsACandidate" pattern,
 *  - it matches the selector `li p`, or
 *  - its trimmed text is shorter than `options.characterThreshold`.
 *
 * Every remaining candidate adds `sqrt(textLength - characterThreshold)` to a
 * running score.
 *
 * @return `true` when the accumulated score reaches `options.minAccumulatedScore`.
 */
fun isProbablyReaderable(): Boolean {
    val unlikelyCandidates = REGEXPS["unlikelyCandidates"]
    val okMaybeItsACandidate = REGEXPS["okMaybeItsACandidate"]

    val candidates = Elements()
    candidates.addAll(document.select("p"))
    candidates.addAll(document.select("pre"))

    // A <div> with several <br> children must only be scored once, so collect the
    // distinct parents instead of adding one entry per <br>.
    document.select("div > br")
        .mapNotNull { it.parent() }
        .distinct()
        .forEach { candidates.add(it) }

    var score = 0.0
    for (node in candidates) {
        // Search inside the string (containsMatchIn); Regex.matches would demand that
        // the whole "<class> <id>" string equals one of the alternatives.
        val matchString = node.className() + " " + node.id()
        if (unlikelyCandidates?.containsMatchIn(matchString) == true &&
            okMaybeItsACandidate?.containsMatchIn(matchString) == false) {
            continue
        }

        // Only the current node is tested against the selector.
        if (node.`is`("li p")) {
            continue
        }

        val textContentLength = node.text().trim().length
        if (textContentLength < options.characterThreshold) {
            continue
        }

        score += sqrt((textContentLength - options.characterThreshold).toDouble())

        // The score only ever grows, so stop as soon as the threshold is reached.
        if (score >= options.minAccumulatedScore) {
            return true
        }
    }

    return score >= options.minAccumulatedScore
}

private fun setArticleMetadata(article: Article, metadata: ArticleMetadata, articleContent: Element?) {
// If we haven't found an excerpt in the article's metadata, use the article's
// first paragraph as the excerpt. This is used for displaying a preview of
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ package net.dankito.readability4j.model
open class ReadabilityOptions(val maxElemsToParse: Int = DEFAULT_MAX_ELEMS_TO_PARSE,
val nbTopCandidates: Int = DEFAULT_N_TOP_CANDIDATES,
val wordThreshold: Int = DEFAULT_WORD_THRESHOLD,
val characterThreshold : Int = DEFAULT_MIN_LENGTH,
val minAccumulatedScore : Int = DEFAULT_ACCUMULATED_SCORE,
val additionalClassesToPreserve: Collection<String> = ArrayList()) {

companion object {
Expand All @@ -16,6 +18,14 @@ open class ReadabilityOptions(val maxElemsToParse: Int = DEFAULT_MAX_ELEMS_TO_PA

// The default number of words an article must have in order to return a result
const val DEFAULT_WORD_THRESHOLD = 500

// The minimum character length an article must have in order to be readerable.
// The default value set by Readability.js is 140; we use 50 for our convenience.
const val DEFAULT_MIN_LENGTH = 50

// The minimum accumulated score an article needs to have in order to be readerable
const val DEFAULT_ACCUMULATED_SCORE = 20

}

}