diff --git a/src/main/kotlin/net/dankito/readability4j/Readability4J.kt b/src/main/kotlin/net/dankito/readability4j/Readability4J.kt
index b971dee..d6b935d 100644
--- a/src/main/kotlin/net/dankito/readability4j/Readability4J.kt
+++ b/src/main/kotlin/net/dankito/readability4j/Readability4J.kt
@@ -10,13 +10,23 @@ import net.dankito.readability4j.util.RegExUtil
 import org.jsoup.Jsoup
 import org.jsoup.nodes.Document
 import org.jsoup.nodes.Element
+import org.jsoup.select.Elements
+import org.jsoup.select.Evaluator
 import org.slf4j.LoggerFactory
+import kotlin.math.sqrt
 
 
 open class Readability4J {
 
     companion object {
         private val log = LoggerFactory.getLogger(Readability4J::class.java)
+
+        // NOTE: These two regular expressions are duplicated in Readability.js. Please keep both copies in sync.
+        // JavaScript's /.../i literal is spelled here as a plain pattern plus RegexOption.IGNORE_CASE.
+        var REGEXPS = mapOf(
+            "unlikelyCandidates" to Regex("-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote", RegexOption.IGNORE_CASE),
+            "okMaybeItsACandidate" to Regex("and|article|body|column|content|main|shadow", RegexOption.IGNORE_CASE)
+        )
     }
 
 
@@ -106,12 +116,66 @@ open class Readability4J {
 
             article.articleContent = articleContent
         }
-
+
         setArticleMetadata(article, metadata, articleContent)
 
         return article
     }
 
+    /**
+     * Cheap pre-check that estimates whether [document] holds enough article-like text to be
+     * worth running the full extraction on.
+     *
+     * Candidates are every `<p>` and `<pre>` element plus every `<div>` with a direct `<br>`
+     * child. A candidate is skipped, without aborting the check, when its class/id matches
+     * `unlikelyCandidates` but not `okMaybeItsACandidate`, when it is a paragraph nested in a
+     * list item, or when its trimmed text is shorter than `options.characterThreshold`. Every
+     * other candidate adds the square root of its surplus length to the score.
+     *
+     * @return `true` if the accumulated score reaches `options.minAccumulatedScore`.
+     */
+    fun isProbablyReaderable(): Boolean {
+        val candidates = ArrayList<Element>(document.select("p, pre"))
+
+        // A div with several <br> children must be scored only once, so remember (by identity)
+        // which parents were already queued.
+        val seenParents = java.util.Collections.newSetFromMap(java.util.IdentityHashMap<Element, Boolean>())
+        document.select("div > br").forEach { br ->
+            br.parent()?.let { parent ->
+                if (seenParents.add(parent)) {
+                    candidates.add(parent)
+                }
+            }
+        }
+
+        var score = 0.0
+        for (node in candidates) {
+            val matchString = node.className() + " " + node.id()
+            if (REGEXPS["unlikelyCandidates"]?.containsMatchIn(matchString) == true &&
+                REGEXPS["okMaybeItsACandidate"]?.containsMatchIn(matchString) == false) {
+                continue
+            }
+
+            // Test this node itself; querying the whole candidate list would reject the page
+            // as soon as any paragraph anywhere sits inside a list item.
+            if (node.`is`("li p")) {
+                continue
+            }
+
+            val textContentLength = node.text().trim().length
+            if (textContentLength < options.characterThreshold) {
+                continue
+            }
+
+            score += sqrt((textContentLength - options.characterThreshold).toDouble())
+            if (score >= options.minAccumulatedScore) {
+                return true
+            }
+        }
+
+        return score >= options.minAccumulatedScore
+    }
+
     private fun setArticleMetadata(article: Article, metadata: ArticleMetadata, articleContent: Element?) {
         // If we haven't found an excerpt in the article's metadata, use the article's
         // first paragraph as the excerpt. This is used for displaying a preview of
diff --git a/src/main/kotlin/net/dankito/readability4j/model/ReadabilityOptions.kt b/src/main/kotlin/net/dankito/readability4j/model/ReadabilityOptions.kt
index e3b3e55..3b1b25c 100644
--- a/src/main/kotlin/net/dankito/readability4j/model/ReadabilityOptions.kt
+++ b/src/main/kotlin/net/dankito/readability4j/model/ReadabilityOptions.kt
@@ -4,6 +4,8 @@ package net.dankito.readability4j.model
 open class ReadabilityOptions(val maxElemsToParse: Int = DEFAULT_MAX_ELEMS_TO_PARSE,
                               val nbTopCandidates: Int = DEFAULT_N_TOP_CANDIDATES,
                               val wordThreshold: Int = DEFAULT_WORD_THRESHOLD,
+                              val characterThreshold: Int = DEFAULT_MIN_LENGTH,
+                              val minAccumulatedScore: Int = DEFAULT_ACCUMULATED_SCORE,
                               val additionalClassesToPreserve: Collection<String> = ArrayList()) {
 
     companion object {
@@ -16,6 +18,14 @@ open class ReadabilityOptions(val maxElemsToParse: Int = DEFAULT_MAX_ELEMS_TO_PA
 
         // The default number of words an article must have in order to return a result
         const val DEFAULT_WORD_THRESHOLD = 500
+
+        // The minimum trimmed text length a node must have to count towards the readerable score.
+        // Default value set by Readability.js is 140. We are setting 50 for our convenience
+        const val DEFAULT_MIN_LENGTH = 50
+
+        // The minimum accumulated score an article needs to have in order to be readerable
+        const val DEFAULT_ACCUMULATED_SCORE = 20
+
     }
 
 }
\ No newline at end of file