var debug = false var path = require('path') var fs = require('fs') var prettyPrint = require('./utils').prettyPrint var htmltidy = require('htmltidy2').tidy var { Readability, isProbablyReaderable } = require('../index') const { parseHTML } = require('linkedom') const puppeteer = require('puppeteer-extra') // Add stealth plugin to hide puppeteer usage const StealthPlugin = require('puppeteer-extra-plugin-stealth') puppeteer.use(StealthPlugin()) // Add adblocker plugin to block all ads and trackers (saves bandwidth) const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker') puppeteer.use(AdblockerPlugin({ blockTrackers: true })) var testcaseRoot = path.join(__dirname, 'test-pages') var argURL = process.argv[3] // Could be undefined, we'll warn if it is if that is an issue. const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com'] const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com'] const userAgentForUrl = (url) => { try { const u = new URL(url) for (const host of NON_BOT_HOSTS) { if (u.hostname.endsWith(host)) { return NON_BOT_DESKTOP_USER_AGENT } } } catch (e) { console.log('error getting user agent for url', url, e) } return DESKTOP_USER_AGENT } const enableJavascriptForUrl = (url) => { try { const u = new URL(url) for (const host of NON_SCRIPT_HOSTS) { if (u.hostname.endsWith(host)) { return false } } } catch (e) { console.log('error getting hostname for url', url, e) } return true } function generateTestcase(slug) { const options = { debug, } if (slug.startsWith('newsletters/')) { // keep the newsletter content in tables options.keepTables = true options.ignoreLinkDensity = true } var destRoot = path.join(testcaseRoot, slug) fs.mkdir(destRoot, function (err) { if (err) { var sourceFile = path.join(destRoot, 'source.html') fs.exists(sourceFile, function (exists) { if (exists) { fs.readFile( sourceFile, { encoding: 'utf-8' }, function (readFileErr, data) { if (readFileErr) { console.error("Source existed but couldn't be read?") process.exit(1) } onResponseReceived(null, data, destRoot, options) } ) } else { fs.writeFile(path.join(destRoot, 'url.txt'), argURL, () => null) fetchSource(argURL, function (fetchErr, data) { onResponseReceived(fetchErr, data, destRoot, options) }) } }) return } fs.writeFile(path.join(destRoot, 'url.txt'), argURL, () => null) fetchSource(argURL, function (fetchErr, data) { onResponseReceived(fetchErr, data, destRoot, options) }) }) } async function fetchSource(url, callbackFn) { if (!url) { console.error("You should pass a URL if the source doesn't exist yet!") process.exit(1) } const browser = await puppeteer.launch({ args: [ '--allow-running-insecure-content', '--autoplay-policy=user-gesture-required', '--disable-component-update', '--disable-domain-reliability', '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process', '--disable-print-preview', '--disable-setuid-sandbox', '--disable-site-isolation-trials', '--disable-speech-api', '--disable-web-security', '--disk-cache-size=33554432', '--enable-features=SharedArrayBuffer', '--hide-scrollbars', '--disable-gpu', '--mute-audio', '--no-default-browser-check', '--no-pings', '--no-sandbox', '--no-zygote', '--window-size=1920,1080', '--disable-extensions', ], defaultViewport: { deviceScaleFactor: 1, hasTouch: false, height: 1080, isLandscape: true, isMobile: false, width: 1920, }, headless: true, executablePath: process.env.CHROMIUM_PATH || '/opt/homebrew/bin/chromium', }) const page = await browser.newPage() if (!enableJavascriptForUrl(url)) { await page.setJavaScriptEnabled(false) } await page.setUserAgent(userAgentForUrl(url)) try { /* * Disallow MathJax from running in Puppeteer and modifying the document, * we shall instead run it in our frontend application to transform any * mathjax content when present. */ await page.setRequestInterception(true) page.on('request', (request) => { if ( request.resourceType() === 'script' && request.url().toLowerCase().indexOf('mathjax') > -1 ) { request.abort() } else { request.continue() } }) await page.goto(url, { waitUntil: ['networkidle2'] }) /* scroll with a 5 second timeout */ await Promise.race([ new Promise((resolve) => { ;(async function () { try { await page.evaluate(`(async () => { /* credit: https://github.com/puppeteer/puppeteer/issues/305 */ return new Promise((resolve, reject) => { let scrollHeight = document.body.scrollHeight; let totalHeight = 0; let distance = 500; let timer = setInterval(() => { window.scrollBy(0, distance); totalHeight += distance; if(totalHeight >= scrollHeight){ clearInterval(timer); resolve(true); } }, 10); }); })()`) } catch (e) { console.error('error in scrolling url', { e, url }) } finally { resolve(true) } })() }), page.waitForTimeout(5000), //5 second timeout ]) // get document body with all hidden elements removed const domContent = await page.evaluate(() => { const BI_SRC_REGEXP = /url\("(.+?)"\)/gi Array.from(document.body.getElementsByTagName('*')).forEach((el) => { const style = window.getComputedStyle(el) try { // Removing blurred images since they are mostly the copies of lazy loaded ones if ( el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase()) ) { const filter = style.getPropertyValue('filter') if (filter && filter.startsWith('blur')) { el.parentNode && el.parentNode.removeChild(el) } } } catch (err) { // throw Error('error with element: ' + JSON.stringify(Array.from(document.body.getElementsByTagName('*')))) } // convert all nodes with background image to img nodes if ( !['', 'none'].includes(style.getPropertyValue('background-image')) ) { const filter = style.getPropertyValue('filter') // avoiding image nodes with a blur effect creation if (filter && filter.startsWith('blur')) { el && el.parentNode && el.parentNode.removeChild(el) } else { const matchedSRC = BI_SRC_REGEXP.exec( style.getPropertyValue('background-image') ) // Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage // More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results BI_SRC_REGEXP.lastIndex = 0 if (matchedSRC && matchedSRC[1] && !el.src) { // Replacing element only of there are no content inside, b/c might remove important div with content. // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image. if (!el.textContent) { const img = document.createElement('img') img.src = matchedSRC[1] el && el.parentNode && el.parentNode.replaceChild(img, el) } } } } }) return document.documentElement.innerHTML }) sanitizeSource(domContent, callbackFn) } catch (error) { console.error('Error occured while fetching content') console.error(error) } finally { await page.close() await browser.close() } } function sanitizeSource(html, callbackFn) { htmltidy( html, { indent: true, 'indent-spaces': 4, 'numeric-entities': true, 'output-xhtml': true, wrap: 0, }, callbackFn ) } function onResponseReceived(error, source, destRoot, options) { if (error) { console.error("Couldn't tidy source html!") console.error(error) return } if (debug) { console.log('writing') } var sourcePath = path.join(destRoot, 'source.html') fs.writeFile(sourcePath, source, async function (err) { if (err) { console.error("Couldn't write data to source.html!") console.error(err) return } if (debug) { console.log('Running readability stuff') } await runReadability( source, path.join(destRoot, 'expected.html'), path.join(destRoot, 'expected-metadata.json'), options ) }) } async function runReadability(source, destPath, metadataDestPath, options) { console.log('running readability') var uri = 'http://fakehost/test/page.html' var myReader, result, readerable try { // Use linkedom for isProbablyReaderable because it supports querySelectorAll var dom = parseHTML(source).document readerable = isProbablyReaderable(dom) // We pass `caption` as a class to check that passing in extra classes works, // given that it appears in some of the test documents. myReader = new Readability(dom, { classesToPreserve: ['caption'], url: uri, ...options, }) result = await myReader.parse() } catch (ex) { console.error(ex) ex.stack.forEach(console.log.bind(console)) } console.log('result', result) if (!result) { console.error( 'No content generated by readability, not going to write expected.html!' ) return } fs.writeFile(destPath, prettyPrint(result.content), function (fileWriteErr) { if (fileWriteErr) { console.error("Couldn't write data to expected.html!") console.error(fileWriteErr) } // Delete the result data we don't care about checking. delete result.content delete result.textContent delete result.length delete result.dom // Add isProbablyReaderable result result.readerable = readerable fs.writeFile( metadataDestPath, JSON.stringify(result, null, 2) + '\n', function (metadataWriteErr) { if (metadataWriteErr) { console.error("Couldn't write data to expected-metadata.json!") console.error(metadataWriteErr) } } ) }) } if (process.argv.length < 3) { console.error( "Need at least a destination slug and potentially a URL (if the slug doesn't have source)." ) process.exit(0) } if (process.argv[2] === 'all') { fs.readdir(testcaseRoot, function (err, files) { if (err) { console.error('error reading testcases') return } files.forEach(function (file) { generateTestcase(file) }) }) } else { generateTestcase(process.argv[2]) }