From aa4188c479e00bff842db170f3939257bf1cc7a6 Mon Sep 17 00:00:00 2001 From: Nazar Leush Date: Wed, 20 May 2026 20:34:53 +0300 Subject: [PATCH 1/3] fix reading `domainOptions` from config --- lib/core.js | 12 ++++++------ package.json | 3 ++- pnpm-lock.yaml | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/lib/core.js b/lib/core.js index 5e3f5af8c..1ec48746f 100644 --- a/lib/core.js +++ b/lib/core.js @@ -1692,6 +1692,11 @@ } } + // Read from CONFIG. + if (typeof value === 'undefined' && CONFIG.domainOptions) { + value = searchParamInObj(0, bits, CONFIG.domainOptions); + } + if ( typeof defaultValue !== "undefined" && typeof value !== typeof defaultValue @@ -1702,12 +1707,7 @@ } else { return defaultValue; } - } - - // Read from CONFIG. - if (typeof value === 'undefined' && CONFIG.domainOptions) { - value = searchParamInObj(0, bits, CONFIG.domainOptions); - } + } return typeof value !== 'undefined' ? value : defaultValue; }; diff --git a/package.json b/package.json index 2935f3b41..e8dcec8bf 100644 --- a/package.json +++ b/package.json @@ -46,7 +46,8 @@ "redis": "^4.6.14", "redis-clustr": "1.7.0", "sax": "^1.2.4", - "send": "^1.2.0" + "send": "^1.2.0", + "turndown": "^7.2.4" }, "devDependencies": { "chai": "^6.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index aa947b0c2..f97a82dd9 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -106,6 +106,9 @@ importers: send: specifier: ^1.2.0 version: 1.2.0 + turndown: + specifier: ^7.2.4 + version: 7.2.4 devDependencies: chai: specifier: ^6.2.0 @@ -132,6 +135,9 @@ packages: resolution: {integrity: sha512-Zak2kPJuIdg9UQQfUgNm848vRAg2pdOqYYU+7DkCYWO+SgZiMV+qy99BpO1geDiP2rQ2M7JH5oNXRTEvEWglRQ==} engines: {node: '>=14.16'} + '@mixmark-io/domino@2.2.0': + resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} + '@mongodb-js/saslprep@1.3.2': resolution: {integrity: sha512-QgA5AySqB27cGTXBFmnpifAi7HxoGUeezwo6p9dI03MuDB6Pp33zgclqVb6oVK3j6I9Vesg0+oojW2XxB59SGg==} @@ -1245,6 +1251,10 @@ packages: resolution: {integrity: sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==} engines: {node: '>=14'} + turndown@7.2.4: + resolution: {integrity: sha512-I8yFsfRzmzK0WV1pNNOA4A7y4RDfFxPRxb3t+e3ui14qSGOxGtiSP6GjeX+Y6CHb7HYaFj7ECUD7VE5kQMZWGQ==} + engines: {node: '>=18', npm: '>=9'} + type-fest@4.32.0: resolution: {integrity: sha512-rfgpoi08xagF3JSdtJlCwMq9DGNDE0IMh3Mkpc1wUypg9vPi786AiqeBBKcqvIkq42azsBM85N490fyZjeUftw==} engines: {node: '>=16'} @@ -1338,6 +1348,8 @@ snapshots: transitivePeerDependencies: - supports-color + '@mixmark-io/domino@2.2.0': {} + '@mongodb-js/saslprep@1.3.2': dependencies: sparse-bitfield: 3.0.3 @@ -2524,6 +2536,10 @@ snapshots: dependencies: punycode: 2.3.0 + turndown@7.2.4: + dependencies: + '@mixmark-io/domino': 2.2.0 + type-fest@4.32.0: {} type-is@2.0.1: From 8046f6fabaccfab8d365c8035f701cbc7675d170 Mon Sep 17 00:00:00 2001 From: Nazar Leush Date: Thu, 21 May 2026 15:22:56 +0300 Subject: [PATCH 2/3] update readability usage --- lib/core.js | 2 +- lib/plugins/system/readability.js | 6 +++--- plugins/links/article/$readability.js | 20 ++++++++++++++++++++ plugins/links/article/article.js | 7 +++---- 4 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 plugins/links/article/$readability.js diff --git a/lib/core.js b/lib/core.js index 1ec48746f..3a7eec565 100644 --- a/lib/core.js +++ b/lib/core.js @@ -1154,7 +1154,7 @@ return hasDomainData; } - const BIG_CONTEXT = ['readability', 'decode', 'cheerio']; + const BIG_CONTEXT = ['$readability', 'readabilitySAX', 'decode', 'cheerio']; function prepareResultData(uri, result, options) { diff --git a/lib/plugins/system/readability.js b/lib/plugins/system/readability.js index fdbd84d7e..843f8178c 100644 --- a/lib/plugins/system/readability.js +++ b/lib/plugins/system/readability.js @@ -2,7 +2,7 @@ import { Readability } from 'readabilitySAX'; export default { - provides: 'self', + provides: 'readabilitySAX', getData: function(url, meta, htmlparser, cb) { @@ -37,7 +37,7 @@ export default { } cb(null, { - readability: readability + readabilitySAX: readability }); } @@ -46,4 +46,4 @@ export default { htmlparser.addHandler(readability); } -}; \ No newline at end of file +}; diff --git a/plugins/links/article/$readability.js b/plugins/links/article/$readability.js new file mode 100644 index 000000000..0d78666c9 --- /dev/null +++ b/plugins/links/article/$readability.js @@ -0,0 +1,20 @@ +import * as cheerio from 'cheerio'; + +export default { + + provides: [ + '$readability', + 'articleHtml' + ], + + getData: function(__readabilityEnabled, readabilitySAX, meta, utils) { + + const articleHtml = utils.encodeText(meta.charset, readabilitySAX.getHTML()); + const $readability = cheerio.load(articleHtml); + + return { + $readability, + articleHtml + }; + } +}; diff --git a/plugins/links/article/article.js b/plugins/links/article/article.js index 505547520..b065cb086 100644 --- a/plugins/links/article/article.js +++ b/plugins/links/article/article.js @@ -4,10 +4,9 @@ export default { provides: 'articlebody', // if not yet provided from LD articlebody - getData: function(__readabilityEnabled, readability, meta, utils) { + getData: function(__readabilityEnabled, $readability, articleHtml, meta, utils) { - const articleHtml = utils.encodeText(meta.charset, readability.getHTML()); - const $p = cheerio.load(articleHtml)('p'); + const $p = $readability('p'); const articleText = $p.text(); if (articleText) { @@ -22,4 +21,4 @@ export default { articlebody: articlebody }; } -}; \ No newline at end of file +}; From a7463f4c51ac0b3a815a6e60d83638367c9d6432 Mon Sep 17 00:00:00 2001 From: Ivan Paramonau Date: Wed, 27 May 2026 16:33:50 -0400 Subject: [PATCH 3/3] clean up readability use --- lib/core.js | 2 +- lib/plugins/system/readability.js | 4 ++-- plugins/links/article/$readability.js | 20 ---------------- plugins/links/article/article.js | 24 ------------------- plugins/links/article/check-article.js | 33 -------------------------- plugins/links/article/reader.js | 10 -------- 6 files changed, 3 insertions(+), 90 deletions(-) delete mode 100644 plugins/links/article/$readability.js delete mode 100644 plugins/links/article/article.js delete mode 100644 plugins/links/article/check-article.js delete mode 100644 plugins/links/article/reader.js diff --git a/lib/core.js b/lib/core.js index 3a7eec565..4dd99bccb 100644 --- a/lib/core.js +++ b/lib/core.js @@ -1154,7 +1154,7 @@ return hasDomainData; } - const BIG_CONTEXT = ['$readability', 'readabilitySAX', 'decode', 'cheerio']; + const BIG_CONTEXT = ['readability', '$fulltext', 'decode', 'cheerio']; function prepareResultData(uri, result, options) { diff --git a/lib/plugins/system/readability.js b/lib/plugins/system/readability.js index 843f8178c..bf7842c65 100644 --- a/lib/plugins/system/readability.js +++ b/lib/plugins/system/readability.js @@ -2,7 +2,7 @@ import { Readability } from 'readabilitySAX'; export default { - provides: 'readabilitySAX', + provides: 'readability', getData: function(url, meta, htmlparser, cb) { @@ -37,7 +37,7 @@ export default { } cb(null, { - readabilitySAX: readability + readability: readability }); } diff --git a/plugins/links/article/$readability.js b/plugins/links/article/$readability.js deleted file mode 100644 index 0d78666c9..000000000 --- a/plugins/links/article/$readability.js +++ /dev/null @@ -1,20 +0,0 @@ -import * as cheerio from 'cheerio'; - -export default { - - provides: [ - '$readability', - 'articleHtml' - ], - - getData: function(__readabilityEnabled, readabilitySAX, meta, utils) { - - const articleHtml = utils.encodeText(meta.charset, readabilitySAX.getHTML()); - const $readability = cheerio.load(articleHtml); - - return { - $readability, - articleHtml - }; - } -}; diff --git a/plugins/links/article/article.js b/plugins/links/article/article.js deleted file mode 100644 index b065cb086..000000000 --- a/plugins/links/article/article.js +++ /dev/null @@ -1,24 +0,0 @@ -import * as cheerio from 'cheerio'; - -export default { - - provides: 'articlebody', // if not yet provided from LD articlebody - - getData: function(__readabilityEnabled, $readability, articleHtml, meta, utils) { - - const $p = $readability('p'); - const articleText = $p.text(); - - if (articleText) { - return { - articlebody: __readabilityEnabled === 'html' ? articleHtml : articleText.replace(/\.([^\.\d\s\n\r\'\"\”\)\]])/g, '. $1') - } - } - }, - - getVars: function(articlebody) { - return { - articlebody: articlebody - }; - } -}; diff --git a/plugins/links/article/check-article.js b/plugins/links/article/check-article.js deleted file mode 100644 index f7a0cdb87..000000000 --- a/plugins/links/article/check-article.js +++ /dev/null @@ -1,33 +0,0 @@ -export default { - - provides: [ - "__readabilityEnabled", - "articlebody" - ], - - getData: function(meta, options) { - - const ld = meta.ld?.newsarticle || meta.ld?.article || meta.ld?.blogposting || meta.ld?.reportagenewsarticle || meta.ld?.socialmediaposting; - - if ((ld - || (meta.og && (meta.og.type === "article" || meta.og.type === "blog" || meta.og.type === 'website') - || meta.twitter?.card === 'summary_large_image' - || meta.article)) - - && (options.getRequestOptions('readability.articlebody') || CONFIG.providerOptions?.app?.allow_readability === true)) { - - const article_format = (options.getRequestOptions('readability.articlebody') === 'html' - || CONFIG.providerOptions?.app?.allow_readability === true) ? 'html' : 'txt'; - - if (ld?.articlebody && (article_format !== 'html' || /\/>/.test(ld.articlebody))) { - return { - articlebody: ld.articlebody - } - } else if (options.getProviderOptions('app.allow_readability')) { - return { - __readabilityEnabled: article_format - } - } - } - } -}; \ No newline at end of file diff --git a/plugins/links/article/reader.js b/plugins/links/article/reader.js deleted file mode 100644 index 3d496baa7..000000000 --- a/plugins/links/article/reader.js +++ /dev/null @@ -1,10 +0,0 @@ -export default { - - getData: function(__readabilityEnabled, articlebody) { - if (__readabilityEnabled === 'html' && CONFIG.providerOptions?.app?.allow_readability === true && !CONFIG.SKIP_IFRAMELY_RENDERS) { - return { - safe_html: articlebody - } - } - } -}; \ No newline at end of file