From 180eb233beb2dabc2a7944390e653f0fcf09235f Mon Sep 17 00:00:00 2001 From: Kimmo Virtanen Date: Tue, 7 Apr 2026 20:01:12 +0300 Subject: [PATCH 1/2] added sparql endpoint parameter where user can directly define sparql endpoint which will be used --- config/config.se.yml | 1 + config/sites/cawiki.yml | 1 + config/sites/enwiki.yml | 1 + config/sites/eswiki.yml | 194 ++++++++++++++++++++-------------------- config/sites/euwiki.yml | 1 + config/sites/fiwiki.yml | 1 + config/sites/glwiki.yml | 193 +++++++++++++++++++-------------------- config/sites/nowiki.yml | 1 + test/test_filters.py | 117 +++++++++++++++++++++++- ukbot/filters.py | 19 ++-- 10 files changed, 330 insertions(+), 199 deletions(-) diff --git a/config/config.se.yml b/config/config.se.yml index 8a427f0..30f327f 100644 --- a/config/config.se.yml +++ b/config/config.se.yml @@ -101,6 +101,7 @@ templates: ignore: ignore sparql: sparql # as in {{ ukb criterion | sparql }} query: query # as in {{ ukb criterion | sparql | query=... }} + endpoint: endpoint pages: catignore: Käyttäjä:UKBot/cat-ignore base: Wikipedia:Elokuun_kuvitustalkoot/ diff --git a/config/sites/cawiki.yml b/config/sites/cawiki.yml index 9035326..ff77b2f 100644 --- a/config/sites/cawiki.yml +++ b/config/sites/cawiki.yml @@ -99,6 +99,7 @@ templates: name: sparql # as in {{ ukb criterion | sparql }} params: query: query # as in {{ ukb criterion | sparql | query=... }} + endpoint: endpoint awards: blava: { file: Article blue.svg, winner: true } blau: { file: Article blue.svg, winner: true } diff --git a/config/sites/enwiki.yml b/config/sites/enwiki.yml index 9885d58..9a529d3 100644 --- a/config/sites/enwiki.yml +++ b/config/sites/enwiki.yml @@ -97,6 +97,7 @@ templates: name: sparql # as in {{ ukb criterion | sparql }} params: query: query # as in {{ ukb criterion | sparql | query=... }} + endpoint: endpoint awards: blå: { file: Article blue.svg, winner: true } rød: { file: Article red.svg, winner: true } diff --git a/config/sites/eswiki.yml b/config/sites/eswiki.yml index 15786e2..e945e1b 100644 --- a/config/sites/eswiki.yml +++ b/config/sites/eswiki.yml @@ -1,96 +1,98 @@ -_extends: default.yml -locale: [es_ES,en_US] -homesite: es.wikipedia.org -default_prefix: es -wikidata_languages: ['es', 'en'] -contestPages: - participantsSection: 'Participantes' - footer: "" -templates: - botinfo: UKB botinfo - status: UKB status - commonargs: - year: year - week: week - week2: week2 - yes: yes - no: no - infobox: - name: UKB infotabla - status: status - start: inicio - end: final - organizer: organizer - winner: winner - suspended: - name: UKB participante eliminado - disqualified: - name: UKB participante descalificado - penalty: - name: UKB quitar puntos - bonus: - name: UK bonus - rule: - name: UKB puntos - new: nuevo - redirect: redirección - regexp: regexp - section: sección - qualified: calificado - contrib: edición - byte: byte - word: palabra - maxpoints: maxpoints - site: site - image: imagen - external_link: enlace externo - ref: referencia - templateremoval: eliminarplantilla - categoryremoval: eliminarcategoría - bytebonus: bonusbyte - wordbonus: bonuspalabra - alias: alias - own: self-uploaded - ownwork: own-work - maxinitialcount: límite inicial - wikidata: wikidata - properties: propiedades - labels: labels - aliases: aliases - descriptions: descriptions - description: description - require_reference: require_reference - all: todos - filters: - name: UKB criterio - params: - new: - name: nuevo - params: - redirects: redirección - existing: - name: existía - template: - name: plantilla - bytes: - name: byte - category: - name: categoría - ignore_page: User:UKBot/ignored categories - params: - ignore: excluír - maxdepth: maxdepth - backlink: - name: linkado desde - forwardlink: - name: linka a - pages: - name: páginas - namespace: - name: espacio de nombres - params: - site: site - sparql: - name: sparql # as in {{ ukb criterion | sparql }} - params: - query: query # as in {{ ukb criterion | sparql | query=... }} +_extends: default.yml +locale: [es_ES,en_US] +homesite: es.wikipedia.org +default_prefix: es +wikidata_languages: ['es', 'en'] +contestPages: + participantsSection: 'Participantes' + footer: "" +templates: + botinfo: UKB botinfo + status: UKB status + commonargs: + year: year + week: week + week2: week2 + yes: yes + no: no + infobox: + name: UKB infotabla + status: status + start: inicio + end: final + organizer: organizer + winner: winner + suspended: + name: UKB participante eliminado + disqualified: + name: UKB participante descalificado + penalty: + name: UKB quitar puntos + bonus: + name: UK bonus + rule: + name: UKB puntos + new: nuevo + redirect: redirección + regexp: regexp + section: sección + qualified: calificado + contrib: edición + byte: byte + word: palabra + maxpoints: maxpoints + site: site + image: imagen + external_link: enlace externo + ref: referencia + templateremoval: eliminarplantilla + categoryremoval: eliminarcategoría + bytebonus: bonusbyte + wordbonus: bonuspalabra + alias: alias + own: self-uploaded + ownwork: own-work + maxinitialcount: límite inicial + wikidata: wikidata + properties: propiedades + labels: labels + aliases: aliases + descriptions: descriptions + description: description + require_reference: require_reference + all: todos + filters: + name: UKB criterio + params: + new: + name: nuevo + params: + redirects: redirección + existing: + name: existía + template: + name: plantilla + bytes: + name: byte + category: + name: categoría + ignore_page: User:UKBot/ignored categories + params: + ignore: excluír + maxdepth: maxdepth + backlink: + name: linkado desde + forwardlink: + name: linka a + pages: + name: páginas + namespace: + name: espacio de nombres + params: + site: site + sparql: + name: sparql # as in {{ ukb criterion | sparql }} + params: + query: query # as in {{ ukb criterion | sparql | query=... }} + endpoint: endpoint + diff --git a/config/sites/euwiki.yml b/config/sites/euwiki.yml index f57032f..3f040f8 100644 --- a/config/sites/euwiki.yml +++ b/config/sites/euwiki.yml @@ -97,3 +97,4 @@ templates: name: sparql # as in {{ ukb criterion | sparql }} params: query: query # as in {{ ukb criterion | sparql | query=... }} + endpoint: endpoint diff --git a/config/sites/fiwiki.yml b/config/sites/fiwiki.yml index 1cd6b9a..aeedbc0 100644 --- a/config/sites/fiwiki.yml +++ b/config/sites/fiwiki.yml @@ -98,3 +98,4 @@ templates: name: sparql # as in {{ ukb criterion | sparql }} params: query: query # as in {{ ukb criterion | sparql | query=... }} + endpoint: endpoint diff --git a/config/sites/glwiki.yml b/config/sites/glwiki.yml index 8ff2a7a..7a2df38 100644 --- a/config/sites/glwiki.yml +++ b/config/sites/glwiki.yml @@ -1,96 +1,97 @@ -_extends: default.yml -locale: [gl_ES,en_US] -homesite: gl.wikipedia.org -default_prefix: gl -wikidata_languages: ['gl', 'en'] -contestPages: - participantsSection: 'Participantes' - footer: "" -templates: - botinfo: UKB botinfo - status: UKB status - commonargs: - year: year - week: week - week2: week2 - yes: yes - no: no - infobox: - name: UKB infotabla - status: status - start: comezo - end: remate - organizer: organizador - winner: gañador - suspended: - name: UKB usuario eliminado - disqualified: - name: UKB usuario descualificado - penalty: - name: UKB penalización - bonus: - name: UK bonificación - rule: - name: UKB puntos - new: novo - redirect: redirección - regexp: regexp - section: sección - qualified: cualificado - contrib: edición - byte: byte - word: palabra - maxpoints: max - site: sitio - image: imaxe - external_link: ligazón externa - ref: referencia - templateremoval: eliminar modelo - categoryremoval: eliminar categoría - bytebonus: bytebonus - wordbonus: bonuspalabra - alias: alcume - own: propio - ownwork: propio - maxinitialcount: límite inicial - wikidata: wikidata - properties: propiedades - labels: etiquetas - aliases: alcumes - descriptions: descricións - description: descrición - require_reference: cómpren referencias - all: todos - filters: - name: UKB criterio - params: - new: - name: novo - params: - redirects: redirección - existing: - name: existía - template: - name: modelo - bytes: - name: byte - category: - name: categoría - ignore_page: User:UKBot/categorías ignoradas - params: - ignore: excluír - maxdepth: maxdepth - backlink: - name: ligado dende - forwardlink: - name: liga a - pages: - name: páxinas - namespace: - name: espazo de nomes - params: - site: site - sparql: - name: sparql # as in {{ ukb criterion | sparql }} - params: - query: query # as in {{ ukb criterion | sparql | query=... }} +_extends: default.yml +locale: [gl_ES,en_US] +homesite: gl.wikipedia.org +default_prefix: gl +wikidata_languages: ['gl', 'en'] +contestPages: + participantsSection: 'Participantes' + footer: "" +templates: + botinfo: UKB botinfo + status: UKB status + commonargs: + year: year + week: week + week2: week2 + yes: yes + no: no + infobox: + name: UKB infotabla + status: status + start: comezo + end: remate + organizer: organizador + winner: gañador + suspended: + name: UKB usuario eliminado + disqualified: + name: UKB usuario descualificado + penalty: + name: UKB penalización + bonus: + name: UK bonificación + rule: + name: UKB puntos + new: novo + redirect: redirección + regexp: regexp + section: sección + qualified: cualificado + contrib: edición + byte: byte + word: palabra + maxpoints: max + site: sitio + image: imaxe + external_link: ligazón externa + ref: referencia + templateremoval: eliminar modelo + categoryremoval: eliminar categoría + bytebonus: bytebonus + wordbonus: bonuspalabra + alias: alcume + own: propio + ownwork: propio + maxinitialcount: límite inicial + wikidata: wikidata + properties: propiedades + labels: etiquetas + aliases: alcumes + descriptions: descricións + description: descrición + require_reference: cómpren referencias + all: todos + filters: + name: UKB criterio + params: + new: + name: novo + params: + redirects: redirección + existing: + name: existía + template: + name: modelo + bytes: + name: byte + category: + name: categoría + ignore_page: User:UKBot/categorías ignoradas + params: + ignore: excluír + maxdepth: maxdepth + backlink: + name: ligado dende + forwardlink: + name: liga a + pages: + name: páxinas + namespace: + name: espazo de nomes + params: + site: site + sparql: + name: sparql # as in {{ ukb criterion | sparql }} + params: + query: query # as in {{ ukb criterion | sparql | query=... }} + endpoint: endpoint diff --git a/config/sites/nowiki.yml b/config/sites/nowiki.yml index 4f8a667..3619a3b 100644 --- a/config/sites/nowiki.yml +++ b/config/sites/nowiki.yml @@ -104,6 +104,7 @@ templates: name: sparql # as in {{ ukb criterion | sparql }} params: query: spørring # as in {{ ukb criterion | sparql | query=... }} + endpoint: endpoint awards: blå: { file: Article blue.svg, winner: true } rød: { file: Article red.svg, winner: true } diff --git a/test/test_filters.py b/test/test_filters.py index 2286867..d628d83 100644 --- a/test/test_filters.py +++ b/test/test_filters.py @@ -2,14 +2,14 @@ import re from collections import OrderedDict import unittest -from mock import Mock +from mock import Mock, patch from unittest import TestCase from faker import Faker from mwclient.page import Page from ukbot.article import Article -from ukbot.filters import CatFilter +from ukbot.filters import CatFilter, SparqlFilter from ukbot.site import Site from ukbot.sites import SiteManager @@ -121,6 +121,119 @@ def kwargs(maxdepth): assert self.filter_and_return_keys(**kwargs(2)) == [] assert self.filter_and_return_keys(**kwargs(3)) == [dummy.a_key(0)] +class TestSparqlFilter(TestCase): + + @patch('ukbot.filters.SparqlFilter.fetch') + def test_make_reads_endpoint_param(self, fetch_mock): + tpl = Mock() + tpl.sites = Mock() + tpl.has_param = lambda name: name in ['query', 'endpoint'] + tpl.get_raw_param = lambda name: { + 'query': 'SELECT ?item WHERE { ?item wdt:P31 wd:Q5 . }', + 'endpoint': 'https://example.org/sparql', + }[name] + + cfg = { + 'params': { + 'query': 'query', + 'endpoint': 'endpoint', + }, + } + + sparql_filter = SparqlFilter.make(tpl=tpl, cfg=cfg) + + assert sparql_filter.endpoint == 'https://example.org/sparql' + fetch_mock.assert_called_once() + + @patch('ukbot.filters.SparqlFilter.fetch') + def test_make_uses_default_endpoint_when_missing(self, fetch_mock): + tpl = Mock() + tpl.sites = Mock() + tpl.has_param = lambda name: name == 'query' + tpl.get_raw_param = lambda name: { + 'query': 'SELECT ?item WHERE { ?item wdt:P31 wd:Q5 . }', + }[name] + + cfg = { + 'params': { + 'query': 'query', + 'endpoint': 'endpoint', + }, + } + + sparql_filter = SparqlFilter.make(tpl=tpl, cfg=cfg) + + assert sparql_filter.endpoint == 'https://query.wikidata.org/sparql' + fetch_mock.assert_called_once() + + @patch('ukbot.filters.SparqlFilter.fetch') + def test_make_reads_mode_param(self, fetch_mock): + tpl = Mock() + tpl.sites = Mock() + tpl.has_param = lambda name: name in ['query', 'mode'] + tpl.get_raw_param = lambda name: { + 'query': 'SELECT ?article WHERE { ?article ?p ?o . }', + 'mode': 'pages', + }[name] + + cfg = { + 'params': { + 'query': 'query', + 'mode': 'mode', + }, + } + + sparql_filter = SparqlFilter.make(tpl=tpl, cfg=cfg) + + assert sparql_filter.mode == 'pages' + fetch_mock.assert_called_once() + + @patch('ukbot.filters.SparqlFilter.fetch') + def test_make_rejects_non_http_endpoint(self, fetch_mock): + tpl = Mock() + tpl.sites = Mock() + tpl.has_param = lambda name: name in ['query', 'endpoint'] + tpl.get_raw_param = lambda name: { + 'query': 'SELECT ?item WHERE { ?item wdt:P31 wd:Q5 . }', + 'endpoint': 'ftp://example.org/sparql', + }[name] + + cfg = { + 'params': { + 'query': 'query', + 'endpoint': 'endpoint', + }, + } + + with self.assertRaises(ValueError): + SparqlFilter.make(tpl=tpl, cfg=cfg) + fetch_mock.assert_not_called() + + @patch('ukbot.filters.requests_retry_session') + @patch('ukbot.filters.SparqlFilter.fetch') + def test_do_query_uses_custom_endpoint(self, fetch_mock, requests_retry_session_mock): + response = Mock() + response.ok = True + response.headers = {} + response.json.return_value = { + 'head': {'vars': ['item']}, + 'results': {'bindings': [{'item': {'value': 'http://www.wikidata.org/entity/Q1'}}]}, + } + requests_retry_session_mock.return_value.get.return_value = response + + sparql_filter = SparqlFilter( + sites=Mock(), + query='SELECT ?item WHERE { ?item wdt:P31 wd:Q5 . }', + endpoint='https://example.org/sparql', + ) + result = sparql_filter.do_query('SELECT ?item WHERE { ?item wdt:P31 wd:Q5 . }') + + requests_retry_session_mock.return_value.get.assert_called_once() + call_args = requests_retry_session_mock.return_value.get.call_args + assert call_args[0][0] == 'https://example.org/sparql' + assert result['var'] == 'item' + assert result['rows'] == ['http://www.wikidata.org/entity/Q1'] + if __name__ == '__main__': unittest.main() diff --git a/ukbot/filters.py b/ukbot/filters.py index 45ef82a..7ed26b8 100644 --- a/ukbot/filters.py +++ b/ukbot/filters.py @@ -740,29 +740,38 @@ class SparqlFilter(Filter): @classmethod def make(cls, tpl, cfg, **kwargs): - if not tpl.has_param('query'): + query_param = cfg['params']['query'] + if not tpl.has_param(query_param): raise RuntimeError(_('No "%s" parameter given') % cfg['params']['query']) + + endpoint_param = cfg['params'].get('endpoint') params = { - 'query': tpl.get_raw_param('query'), + 'query': tpl.get_raw_param(query_param), 'sites': tpl.sites, + 'endpoint': tpl.get_raw_param(endpoint_param) if endpoint_param and tpl.has_param(endpoint_param) else None, } return cls(**params) - def __init__(self, sites, query): + def __init__(self, sites, query, endpoint=None): """ Args: sites (SiteManager): References to the sites part of this contest query (str): The SPARQL query + endpoint (str): SPARQL endpoint URL. Defaults to Wikidata Query Service. """ Filter.__init__(self, sites) self.query = query + self.endpoint = endpoint or 'https://query.wikidata.org/sparql' + endpoint_scheme = urllib.parse.urlparse(self.endpoint).scheme.lower() + if endpoint_scheme not in ['http', 'https']: + raise ValueError('Invalid sparql endpoint scheme: %s' % endpoint_scheme) self.fetch() def do_query(self, querystring): - logger.info('Running SPARQL query: %s', querystring) + logger.info('Running SPARQL query at %s: %s', self.endpoint, querystring) try: response = requests_retry_session().get( - 'https://query.wikidata.org/sparql', + self.endpoint, params={ 'query': querystring, }, From 84789f59c5058cc272c2083de640176d10e3a337 Mon Sep 17 00:00:00 2001 From: Kimmo Virtanen Date: Tue, 7 Apr 2026 22:38:06 +0300 Subject: [PATCH 2/2] added SPARQL mode filter items (default) and pages where it directly fetches the page urls from SPARQL --- config/sites/cawiki.yml | 1 + config/sites/enwiki.yml | 1 + config/sites/eswiki.yml | 1 + config/sites/euwiki.yml | 1 + config/sites/fiwiki.yml | 2 ++ config/sites/glwiki.yml | 1 + config/sites/nowiki.yml | 1 + test/test_filters.py | 27 +++++++++++++++++++++++++++ ukbot/filters.py | 32 +++++++++++++++++++++++++++++++- 9 files changed, 66 insertions(+), 1 deletion(-) diff --git a/config/sites/cawiki.yml b/config/sites/cawiki.yml index ff77b2f..44012d7 100644 --- a/config/sites/cawiki.yml +++ b/config/sites/cawiki.yml @@ -100,6 +100,7 @@ templates: params: query: query # as in {{ ukb criterion | sparql | query=... }} endpoint: endpoint + mode: mode awards: blava: { file: Article blue.svg, winner: true } blau: { file: Article blue.svg, winner: true } diff --git a/config/sites/enwiki.yml b/config/sites/enwiki.yml index 9a529d3..08c5aa2 100644 --- a/config/sites/enwiki.yml +++ b/config/sites/enwiki.yml @@ -98,6 +98,7 @@ templates: params: query: query # as in {{ ukb criterion | sparql | query=... }} endpoint: endpoint + mode: mode awards: blå: { file: Article blue.svg, winner: true } rød: { file: Article red.svg, winner: true } diff --git a/config/sites/eswiki.yml b/config/sites/eswiki.yml index e945e1b..bc9d3a9 100644 --- a/config/sites/eswiki.yml +++ b/config/sites/eswiki.yml @@ -95,4 +95,5 @@ templates: params: query: query # as in {{ ukb criterion | sparql | query=... }} endpoint: endpoint + mode: mode diff --git a/config/sites/euwiki.yml b/config/sites/euwiki.yml index 3f040f8..2836eea 100644 --- a/config/sites/euwiki.yml +++ b/config/sites/euwiki.yml @@ -98,3 +98,4 @@ templates: params: query: query # as in {{ ukb criterion | sparql | query=... }} endpoint: endpoint + mode: mode diff --git a/config/sites/fiwiki.yml b/config/sites/fiwiki.yml index aeedbc0..9be3a4b 100644 --- a/config/sites/fiwiki.yml +++ b/config/sites/fiwiki.yml @@ -99,3 +99,5 @@ templates: params: query: query # as in {{ ukb criterion | sparql | query=... }} endpoint: endpoint + mode: mode + diff --git a/config/sites/glwiki.yml b/config/sites/glwiki.yml index 7a2df38..2e9772f 100644 --- a/config/sites/glwiki.yml +++ b/config/sites/glwiki.yml @@ -95,3 +95,4 @@ templates: params: query: query # as in {{ ukb criterion | sparql | query=... }} endpoint: endpoint + mode: mode diff --git a/config/sites/nowiki.yml b/config/sites/nowiki.yml index 3619a3b..6ed7721 100644 --- a/config/sites/nowiki.yml +++ b/config/sites/nowiki.yml @@ -105,6 +105,7 @@ templates: params: query: spørring # as in {{ ukb criterion | sparql | query=... }} endpoint: endpoint + mode: mode awards: blå: { file: Article blue.svg, winner: true } rød: { file: Article red.svg, winner: true } diff --git a/test/test_filters.py b/test/test_filters.py index d628d83..550b06a 100644 --- a/test/test_filters.py +++ b/test/test_filters.py @@ -234,6 +234,33 @@ def test_do_query_uses_custom_endpoint(self, fetch_mock, requests_retry_session_ assert result['var'] == 'item' assert result['rows'] == ['http://www.wikidata.org/entity/Q1'] + @patch('ukbot.filters.SparqlFilter.fetch') + def test_add_pages_filters_to_contest_wikis(self, fetch_mock): + sites = {'en.wikipedia.org': Mock(), 'fi.wikipedia.org': Mock(), '*.wikivoyage.org': Mock()} + sparql_filter = SparqlFilter( + sites=sites, + query='SELECT ?article WHERE { ?article ?p ?o . }', + mode='pages', + ) + sparql_filter.do_query = Mock(return_value={ + 'rows': [ + 'https://en.wikipedia.org/wiki/Foo_bar', + 'http://fi.wikipedia.org/wiki/Baz', + 'https://fi.wikivoyage.org/wiki/Helsinki', + 'https://en.wikipedia.org/w/index.php?title=Ignored', + 'https://example.org/wiki/Outside', + 'not a url', + ], + }) + + sparql_filter.add_pages() + + assert sparql_filter.page_keys == { + 'en.wikipedia.org:Foo bar', + 'fi.wikipedia.org:Baz', + 'fi.wikivoyage.org:Helsinki', + } + if __name__ == '__main__': unittest.main() diff --git a/ukbot/filters.py b/ukbot/filters.py index 7ed26b8..d899e5a 100644 --- a/ukbot/filters.py +++ b/ukbot/filters.py @@ -3,6 +3,7 @@ import sys import re from copy import copy +from fnmatch import fnmatch from more_itertools import first import logging @@ -745,19 +746,22 @@ def make(cls, tpl, cfg, **kwargs): raise RuntimeError(_('No "%s" parameter given') % cfg['params']['query']) endpoint_param = cfg['params'].get('endpoint') + mode_param = cfg['params'].get('mode', 'mode') params = { 'query': tpl.get_raw_param(query_param), 'sites': tpl.sites, 'endpoint': tpl.get_raw_param(endpoint_param) if endpoint_param and tpl.has_param(endpoint_param) else None, + 'mode': tpl.get_raw_param(mode_param) if tpl.has_param(mode_param) else 'items', } return cls(**params) - def __init__(self, sites, query, endpoint=None): + def __init__(self, sites, query, endpoint=None, mode='items'): """ Args: sites (SiteManager): References to the sites part of this contest query (str): The SPARQL query endpoint (str): SPARQL endpoint URL. Defaults to Wikidata Query Service. + mode (str): "items" (default) or "pages" """ Filter.__init__(self, sites) self.query = query @@ -765,6 +769,9 @@ def __init__(self, sites, query, endpoint=None): endpoint_scheme = urllib.parse.urlparse(self.endpoint).scheme.lower() if endpoint_scheme not in ['http', 'https']: raise ValueError('Invalid sparql endpoint scheme: %s' % endpoint_scheme) + if mode not in ['items', 'pages']: + raise ValueError('Invalid sparql mode: %s' % mode) + self.mode = mode self.fetch() def do_query(self, querystring): @@ -813,6 +820,10 @@ def fetch(self): logger.debug('SparqlFilter: %s', self.query) item_var = 'item' + if self.mode == 'pages': + self.add_pages() + logger.info('SparqlFilter: Initialized with %d articles', len(self.page_keys)) + return # Implementation notes: # - When the contest includes multiple sites, we do one query per site. I tried using @@ -835,6 +846,25 @@ def fetch(self): logger.info('SparqlFilter: Initialized with %d articles', len(self.page_keys)) + def add_pages(self): + allowed_hosts = list(self.sites.keys()) + + for res in self.do_query(self.query)['rows']: + parsed = urllib.parse.urlparse(res) + if parsed.scheme not in ['http', 'https']: + continue + hostname = parsed.hostname or '' + if not any(fnmatch(hostname, pattern) for pattern in allowed_hosts): + continue + if not parsed.path.startswith('/wiki/'): + continue + + article = urllib.parse.unquote(parsed.path[len('/wiki/'):]).replace('_', ' ') + if article == '': + continue + page_key = '%s:%s' % (hostname, article) + self.page_keys.add(page_key) + def add_linked_articles(self, site, item_var): article_var = 'article19472065' # "random string" to avoid matching anything in the subquery query = """