From 0a853b7044f8f8a437bf0f93f9c4ca5b022d7705 Mon Sep 17 00:00:00 2001 From: Nat Baca <104159322+natbaca-wmf@users.noreply.github.com> Date: Sun, 23 Nov 2025 16:02:34 -0700 Subject: [PATCH] Update wikipedia.js fixing to use page content service --- wikipedia.js | 106 +++++++++++++++------------------------------------ 1 file changed, 30 insertions(+), 76 deletions(-) diff --git a/wikipedia.js b/wikipedia.js index b888bea..6c02be8 100644 --- a/wikipedia.js +++ b/wikipedia.js @@ -58,12 +58,11 @@ async function getWikipediaData(language, topic) { }; const wikipediaHTMLPromise = function() { - const requestConfig = { - baseURL: "https://" + language + ".wikipedia.org/api/rest_v1/", - url: "/page/mobile-sections/" + encodedTopic, + baseURL: "https://" + language + ".wikipedia.org/w/rest.php/v1/page/", + url: encodedTopic + "/html", method: "get", - responseType: "json", + responseType: "text", headers: { "Api-User-Agent": process.env.WIKIDOCUMENTARIES_API_USER_AGENT }, @@ -72,65 +71,38 @@ async function getWikipediaData(language, topic) { else return axios.request(requestConfig); }; - const [wikipediaSummaryResponse, wikipediaHTMLResponse] - = await axios.all([wikipediaSummaryPromise(), wikipediaHTMLPromise()]); + const [summaryRes, htmlRes] = await Promise.allSettled([ + wikipediaSummaryPromise(), + wikipediaHTMLPromise() + ]); - if (wikipediaHTMLResponse.data == undefined ) { - // No wikipedia article - excerptHTML=""; - remainingHTML=null; - } - else { - var origHTML = wikipediaHTMLResponse.data.lead.sections[0].text; - var remainingHTML = null; - - if (wikipediaHTMLResponse.data.lead.disambiguation != undefined && wikipediaHTMLResponse.data.lead.disambiguation == true) { - wikipediaHTMLResponse.data.remaining.sections.forEach(section => { - origHTML += section.text; - }); + const wikipediaSummaryResponse = summaryRes.status === "fulfilled" ? summaryRes.value : null; + const wikipediaHTMLResponse = htmlRes.status === "fulfilled" ? htmlRes.value : null; + + let excerptHTML = ""; + let remainingHTML = null; + + if (wikipediaHTMLResponse && wikipediaHTMLResponse.data != null && typeof wikipediaHTMLResponse.data === 'string') { + let rawHTML = wikipediaHTMLResponse.data; + + const bodyMatch = rawHTML.match(/]*>([\s\S]*?)<\/body>/i); + if (bodyMatch) { + rawHTML = bodyMatch[1]; } - else { - var remainingOrigHTML = ""; - - wikipediaHTMLResponse.data.remaining.sections.forEach(section => { - if (section.isReferenceSection == undefined) { - var sectionHeaderStartTag = ""; - var sectionHeaderEndTag = ""; - switch(section.toclevel) { - case 1: - sectionHeaderStartTag = "

"; - sectionHeaderEndTag = "

"; - break; - case 2: - sectionHeaderStartTag = "

"; - sectionHeaderEndTag = "

"; - break; - case 3: - sectionHeaderStartTag = "

"; - sectionHeaderEndTag = "

"; - break; - case 4: - sectionHeaderStartTag = "
"; - sectionHeaderEndTag = "
"; - break; - } - remainingOrigHTML += sectionHeaderStartTag + section.line + sectionHeaderEndTag; - remainingOrigHTML += section.text; - } - }); - -/* if (remainingOrigHTML.length > 3000) { */ // Small count of HTML should be with the leading section + + const splitIndex = rawHTML.search(/]/i); + const origHTML = splitIndex > -1 ? rawHTML.substring(0, splitIndex) : rawHTML; + + if (splitIndex > -1) { + const remainingOrigHTML = rawHTML.substring(splitIndex); remainingHTML = convertToWikidocumentariesHTML(remainingOrigHTML, topic, language); -/* } - else { - origHTML += remainingOrigHTML; - } */ } - var excerptHTML = convertToWikidocumentariesHTML(origHTML, topic, language); + + excerptHTML = convertToWikidocumentariesHTML(origHTML, topic, language); } return { - wikipedia: wikipediaSummaryResponse.data, + wikipedia: wikipediaSummaryResponse ? wikipediaSummaryResponse.data : null, excerptHTML, remainingHTML, }; @@ -172,25 +144,7 @@ const convertToWikidocumentariesHTML = function(origHTML, topic, language) { //$(this).replaceWith($(this).html()); } }); -/* $("table").each(function(index) { - $(this).remove(); - }); - $("figure").each(function(index) { - $(this).remove(); - }); - $("figure-inline").each(function(index) { - $(this).remove(); - }); - $("sup").each(function(index) { - $(this).remove(); - }); - - $("div").each(function(index) { - var div_class = $(this).attr('class'); - if (div_class == undefined || div_class != 'noprint') { - $(this).remove(); - } - }); */ + $("table").each(function(index) { //Remove English Wikipedia infobox var div_class = $(this).attr('class'); if (div_class != undefined && div_class.indexOf('infobox') != -1) { @@ -217,4 +171,4 @@ const convertToWikidocumentariesHTML = function(origHTML, topic, language) { }); return $.html(); -} +};