386 lines
12 KiB
JavaScript
386 lines
12 KiB
JavaScript
var debug = false
|
|
|
|
var path = require('path')
|
|
var fs = require('fs')
|
|
var prettyPrint = require('./utils').prettyPrint
|
|
var htmltidy = require('htmltidy2').tidy
|
|
|
|
var { Readability, isProbablyReaderable } = require('../index')
|
|
const { parseHTML } = require('linkedom')
|
|
|
|
const puppeteer = require('puppeteer-extra')
|
|
|
|
// Add stealth plugin to hide puppeteer usage
|
|
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
|
|
puppeteer.use(StealthPlugin())
|
|
|
|
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
|
|
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
|
|
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
|
|
|
|
// Directory under which each test case gets its own sub-directory.
const testcaseRoot = path.join(__dirname, 'test-pages')

// Second CLI argument: the URL to fetch. It may be undefined; that is only
// reported later, when a URL is actually needed.
const argURL = process.argv[3]

// User agent presented to most sites.
const DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'

// Alternative user agent presented to the hosts listed in NON_BOT_HOSTS.
const NON_BOT_DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'

// Hosts that are fetched with NON_BOT_DESKTOP_USER_AGENT.
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']

// Hosts that are fetched with page JavaScript disabled.
const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com']
|
|
|
|
/**
 * Picks the user agent string to present when fetching `url`.
 *
 * Hosts listed in NON_BOT_HOSTS, and their subdomains, get
 * NON_BOT_DESKTOP_USER_AGENT. Every other URL, including one that cannot be
 * parsed, gets DESKTOP_USER_AGENT.
 *
 * @param {string} url - Absolute URL that is about to be fetched.
 * @returns {string} The user agent string to use.
 */
const userAgentForUrl = (url) => {
  try {
    const { hostname } = new URL(url)
    // Match the listed host itself or a real subdomain of it. A bare
    // `endsWith(host)` would also match unrelated domains such as
    // "notforbes.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    )
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT
    }
  } catch (e) {
    // Best effort: an unparsable URL falls back to the default user agent.
    console.log('error getting user agent for url', url, e)
  }
  return DESKTOP_USER_AGENT
}
|
|
|
|
/**
 * Decides whether page JavaScript should stay enabled when fetching `url`.
 *
 * Returns false for hosts listed in NON_SCRIPT_HOSTS and their subdomains.
 * Returns true for every other URL, including one that cannot be parsed.
 *
 * @param {string} url - Absolute URL that is about to be fetched.
 * @returns {boolean} Whether JavaScript should be enabled for the page.
 */
const enableJavascriptForUrl = (url) => {
  try {
    const { hostname } = new URL(url)
    // Match the listed host itself or a real subdomain of it. A bare
    // `endsWith(host)` would also match unrelated domains such as
    // "notmedium.com".
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    )
    if (isNonScriptHost) {
      return false
    }
  } catch (e) {
    // Best effort: an unparsable URL keeps JavaScript enabled.
    console.log('error getting hostname for url', url, e)
  }
  return true
}
|
|
|
|
/**
 * Generates (or regenerates) the expected output files for one test case.
 *
 * The test case lives in `testcaseRoot/<slug>`. If that directory already
 * holds a readable `source.html`, it is reused. Otherwise the page at `argURL`
 * is fetched and the URL is recorded in `url.txt`. Either way the source is
 * handed to onResponseReceived.
 *
 * @param {string} slug - Test case directory, relative to testcaseRoot. Slugs
 *   starting with "newsletters/" are processed with `keepTables` and
 *   `ignoreLinkDensity` enabled.
 */
function generateTestcase(slug) {
  const options = { debug }
  if (slug.startsWith('newsletters/')) {
    // keep the newsletter content in tables
    options.keepTables = true
    options.ignoreLinkDensity = true
  }

  const destRoot = path.join(testcaseRoot, slug)
  const sourceFile = path.join(destRoot, 'source.html')

  const fetchAndProcess = () => {
    // Only record the URL when one was given. Passing undefined data to
    // fs.writeFile is rejected by current Node versions, which would pre-empt
    // the missing-URL message printed by fetchSource.
    if (argURL) {
      fs.writeFile(path.join(destRoot, 'url.txt'), argURL, (writeErr) => {
        if (writeErr) {
          console.error("Couldn't write url.txt!")
          console.error(writeErr)
        }
      })
    }
    fetchSource(argURL, (fetchErr, data) => {
      onResponseReceived(fetchErr, data, destRoot, options)
    })
  }

  // `recursive: true` creates missing parent directories (needed for nested
  // slugs such as "newsletters/foo") and succeeds when the directory already
  // exists, so any error here is a real failure.
  fs.mkdir(destRoot, { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      console.error("Couldn't create test case directory!")
      console.error(mkdirErr)
      return
    }

    // Read the source directly instead of checking for existence first; a
    // missing file shows up as ENOENT.
    fs.readFile(sourceFile, { encoding: 'utf-8' }, (readFileErr, data) => {
      if (!readFileErr) {
        onResponseReceived(null, data, destRoot, options)
        return
      }
      if (readFileErr.code === 'ENOENT') {
        fetchAndProcess()
        return
      }
      console.error("Source existed but couldn't be read?")
      process.exit(1)
    })
  })
}
|
|
|
|
/**
 * Scrolls the page downwards so that lazily loaded content gets requested,
 * giving up after `timeoutMs` milliseconds.
 *
 * The in-page script scrolls 500px every 10ms until the scrolled distance
 * reaches the body's scroll height measured at the start. Scrolling errors are
 * logged and otherwise ignored.
 *
 * @param {object} page - Puppeteer page to scroll.
 * @param {string} url - URL being fetched; only used in the error log.
 * @param {number} timeoutMs - Upper bound on how long to wait for scrolling.
 * @returns {Promise<void>}
 */
async function scrollPageWithTimeout(page, url, timeoutMs) {
  const scrolling = (async () => {
    try {
      await page.evaluate(`(async () => {
        /* credit: https://github.com/puppeteer/puppeteer/issues/305 */
        return new Promise((resolve, reject) => {
          let scrollHeight = document.body.scrollHeight;
          let totalHeight = 0;
          let distance = 500;
          let timer = setInterval(() => {
            window.scrollBy(0, distance);
            totalHeight += distance;
            if(totalHeight >= scrollHeight){
              clearInterval(timer);
              resolve(true);
            }
          }, 10);
        });
      })()`)
    } catch (e) {
      console.error('error in scrolling url', { e, url })
    }
  })()

  // A plain timer replaces page.waitForTimeout, which Puppeteer deprecated and
  // later removed.
  let timer
  const timeout = new Promise((resolve) => {
    timer = setTimeout(resolve, timeoutMs)
  })

  try {
    await Promise.race([scrolling, timeout])
  } finally {
    // Do not keep the process alive for a timer nobody is waiting on.
    clearTimeout(timer)
  }
}

/**
 * Cleans up images inside the page and returns the resulting markup.
 *
 * For every element under the body:
 *  - img/image elements whose computed `filter` starts with "blur" are
 *    removed;
 *  - elements with a background image are removed when blurred, otherwise an
 *    element without text content (and without a `src`) is replaced by an img
 *    element pointing at the background image URL.
 *
 * @param {object} page - Puppeteer page to process.
 * @returns {Promise<string>} `document.documentElement.innerHTML` after the
 *   clean-up.
 */
async function extractProcessedDom(page) {
  // The callback runs inside the browser, so it must not reference anything
  // from this file.
  return page.evaluate(() => {
    // No "g" flag: exec() on a non-global regex always starts at index 0, so
    // no lastIndex bookkeeping is needed between elements.
    const BI_SRC_REGEXP = /url\("(.+?)"\)/i

    Array.from(document.body.getElementsByTagName('*')).forEach((el) => {
      const style = window.getComputedStyle(el)

      try {
        // Removing blurred images since they are mostly the copies of lazy loaded ones
        if (
          el.tagName &&
          ['img', 'image'].includes(el.tagName.toLowerCase())
        ) {
          const filter = style.getPropertyValue('filter')
          if (filter && filter.startsWith('blur')) {
            el.parentNode && el.parentNode.removeChild(el)
          }
        }
      } catch (err) {
        // Best effort: a failure on one element must not abort the whole pass.
      }

      // convert all nodes with background image to img nodes
      if (
        !['', 'none'].includes(style.getPropertyValue('background-image'))
      ) {
        const filter = style.getPropertyValue('filter')
        // avoiding image nodes with a blur effect creation
        if (filter && filter.startsWith('blur')) {
          el && el.parentNode && el.parentNode.removeChild(el)
        } else {
          const matchedSRC = BI_SRC_REGEXP.exec(
            style.getPropertyValue('background-image')
          )

          if (matchedSRC && matchedSRC[1] && !el.src) {
            // Replace the element only if there is no content inside, because
            // otherwise an important div with content might be removed.
            // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
            // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
            if (!el.textContent) {
              const img = document.createElement('img')
              img.src = matchedSRC[1]
              el && el.parentNode && el.parentNode.replaceChild(img, el)
            }
          }
        }
      }
    })
    return document.documentElement.innerHTML
  })
}

/**
 * Fetches `url` in headless Chromium, post-processes the DOM and passes the
 * tidied HTML to `callbackFn` via sanitizeSource.
 *
 * Exits the process with status 1 when no URL is given. Any failure while
 * launching the browser or loading/processing the page is logged and reported
 * to `callbackFn` as its first argument, so the caller is always notified.
 *
 * @param {string} url - Page to fetch.
 * @param {function} callbackFn - Node-style callback `(error, html)`.
 * @returns {Promise<void>}
 */
async function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!")
    process.exit(1)
  }

  let browser
  let page
  let domContent
  let fetchError = null

  try {
    browser = await puppeteer.launch({
      args: [
        '--allow-running-insecure-content',
        '--autoplay-policy=user-gesture-required',
        '--disable-component-update',
        '--disable-domain-reliability',
        '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
        '--disable-print-preview',
        '--disable-setuid-sandbox',
        '--disable-site-isolation-trials',
        '--disable-speech-api',
        '--disable-web-security',
        '--disk-cache-size=33554432',
        '--enable-features=SharedArrayBuffer',
        '--hide-scrollbars',
        '--disable-gpu',
        '--mute-audio',
        '--no-default-browser-check',
        '--no-pings',
        '--no-sandbox',
        '--no-zygote',
        '--window-size=1920,1080',
        '--disable-extensions',
      ],
      defaultViewport: {
        deviceScaleFactor: 1,
        hasTouch: false,
        height: 1080,
        isLandscape: true,
        isMobile: false,
        width: 1920,
      },
      headless: true,
      executablePath: process.env.CHROMIUM_PATH || '/opt/homebrew/bin/chromium',
    })

    page = await browser.newPage()
    if (!enableJavascriptForUrl(url)) {
      await page.setJavaScriptEnabled(false)
    }
    await page.setUserAgent(userAgentForUrl(url))

    /*
     * Disallow MathJax from running in Puppeteer and modifying the document,
     * we shall instead run it in our frontend application to transform any
     * mathjax content when present.
     */
    await page.setRequestInterception(true)
    page.on('request', (request) => {
      if (
        request.resourceType() === 'script' &&
        request.url().toLowerCase().indexOf('mathjax') > -1
      ) {
        request.abort()
      } else {
        request.continue()
      }
    })

    await page.goto(url, { waitUntil: ['networkidle2'] })

    /* scroll with a 5 second timeout */
    await scrollPageWithTimeout(page, url, 5000)

    domContent = await extractProcessedDom(page)
  } catch (error) {
    console.error('Error occured while fetching content')
    console.error(error)
    fetchError = error
  } finally {
    // Close whatever was opened. A failure to close one resource must not
    // prevent closing the other.
    for (const resource of [page, browser]) {
      if (!resource) {
        continue
      }
      try {
        await resource.close()
      } catch (closeError) {
        console.error('error closing puppeteer resource', closeError)
      }
    }
  }

  if (fetchError) {
    // Report the failure instead of leaving the caller waiting forever.
    callbackFn(fetchError)
    return
  }

  sanitizeSource(domContent, callbackFn)
}
|
|
|
|
/**
 * Runs `html` through htmltidy and forwards the outcome to `callbackFn`.
 *
 * @param {string} html - Markup to tidy.
 * @param {function} callbackFn - Callback handed straight to htmltidy.
 */
function sanitizeSource(html, callbackFn) {
  // Tidy settings: 4-space indentation, numeric entities, XHTML output and no
  // line wrapping.
  const tidyOptions = {
    indent: true,
    'indent-spaces': 4,
    'numeric-entities': true,
    'output-xhtml': true,
    wrap: 0,
  }

  htmltidy(html, tidyOptions, callbackFn)
}
|
|
|
|
/**
 * Stores the page source as `source.html` in `destRoot` and then runs
 * Readability on it to produce `expected.html` and `expected-metadata.json`.
 *
 * @param {*} error - Error from the previous step; when set it is logged and
 *   nothing is written.
 * @param {string} source - HTML source of the page.
 * @param {string} destRoot - Directory of the test case.
 * @param {object} options - Extra options forwarded to runReadability.
 */
function onResponseReceived(error, source, destRoot, options) {
  if (error) {
    console.error("Couldn't tidy source html!")
    console.error(error)
    return
  }

  if (debug) {
    console.log('writing')
  }

  // Continuation that runs once source.html has been written (or failed).
  const afterSourceWritten = async (writeErr) => {
    if (writeErr) {
      console.error("Couldn't write data to source.html!")
      console.error(writeErr)
      return
    }

    if (debug) {
      console.log('Running readability stuff')
    }

    const expectedPath = path.join(destRoot, 'expected.html')
    const metadataPath = path.join(destRoot, 'expected-metadata.json')
    await runReadability(source, expectedPath, metadataPath, options)
  }

  fs.writeFile(path.join(destRoot, 'source.html'), source, afterSourceWritten)
}
|
|
|
|
/**
 * Parses `source` with Readability and writes the results to disk.
 *
 * The extracted article content goes (pretty-printed) to `destPath`. The
 * remaining result fields, plus the isProbablyReaderable verdict as
 * `readerable`, are written as JSON to `metadataDestPath`. When parsing fails
 * or yields no result, the error is logged and nothing is written.
 *
 * @param {string} source - HTML source to parse.
 * @param {string} destPath - Where to write the expected article HTML.
 * @param {string} metadataDestPath - Where to write the expected metadata JSON.
 * @param {object} options - Extra Readability options; they override the
 *   defaults set here.
 * @returns {Promise<void>}
 */
async function runReadability(source, destPath, metadataDestPath, options) {
  console.log('running readability')

  const uri = 'http://fakehost/test/page.html'
  let result
  let readerable
  try {
    // Use linkedom for isProbablyReaderable because it supports querySelectorAll
    const dom = parseHTML(source).document
    readerable = isProbablyReaderable(dom)
    // We pass `caption` as a class to check that passing in extra classes works,
    // given that it appears in some of the test documents.
    const reader = new Readability(dom, {
      classesToPreserve: ['caption'],
      url: uri,
      ...options,
    })
    result = await reader.parse()
  } catch (ex) {
    // `stack` is a string, not an array, so calling forEach on it would throw
    // here and hide the real failure. console.error already prints an Error's
    // stack and copes with non-Error values.
    console.error(ex)
  }

  console.log('result', result)
  if (!result) {
    console.error(
      'No content generated by readability, not going to write expected.html!'
    )
    return
  }

  fs.writeFile(destPath, prettyPrint(result.content), (fileWriteErr) => {
    if (fileWriteErr) {
      console.error("Couldn't write data to expected.html!")
      console.error(fileWriteErr)
    }

    // Delete the result data we don't care about checking.
    delete result.content
    delete result.textContent
    delete result.length
    delete result.dom

    // Add isProbablyReaderable result
    result.readerable = readerable

    fs.writeFile(
      metadataDestPath,
      JSON.stringify(result, null, 2) + '\n',
      (metadataWriteErr) => {
        if (metadataWriteErr) {
          console.error("Couldn't write data to expected-metadata.json!")
          console.error(metadataWriteErr)
        }
      }
    )
  })
}
|
|
|
|
// Command-line entry point. Usage: `<slug> [url]`, or `all` to regenerate
// every entry found directly under testcaseRoot.
if (process.argv.length < 3) {
  console.error(
    "Need at least a destination slug and potentially a URL (if the slug doesn't have source)."
  )
  // Missing arguments is a usage error, so signal failure to the caller.
  process.exit(1)
}

if (process.argv[2] === 'all') {
  fs.readdir(testcaseRoot, (err, files) => {
    if (err) {
      console.error('error reading testcases')
      console.error(err)
      return
    }

    files.forEach((file) => {
      generateTestcase(file)
    })
  })
} else {
  generateTestcase(process.argv[2])
}
|