// dockerfile/examples/omnivore/content-fetch/readabilityjs/test/generate-testcase.js
//
// 386 lines
// 12 KiB
// JavaScript

var debug = false
var path = require('path')
var fs = require('fs')
var prettyPrint = require('./utils').prettyPrint
var htmltidy = require('htmltidy2').tidy
var { Readability, isProbablyReaderable } = require('../index')
const { parseHTML } = require('linkedom')
const puppeteer = require('puppeteer-extra')
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin())
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
// Directory next to this script that holds the test cases; generateTestcase
// joins the requested slug onto it to find a test case's folder.
var testcaseRoot = path.join(__dirname, 'test-pages')
// Optional page URL (the second CLI argument). It may be undefined; fetchSource
// reports the problem and exits if a download is needed without it.
var argURL = process.argv[3]
// User agent that userAgentForUrl returns when no NON_BOT_HOSTS entry applies.
const DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
// User agent that userAgentForUrl returns for the hosts in NON_BOT_HOSTS.
const NON_BOT_DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
// Hosts that are fetched with NON_BOT_DESKTOP_USER_AGENT.
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
// Hosts for which enableJavascriptForUrl returns false, i.e. pages that
// fetchSource loads with JavaScript turned off.
const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com']
/**
 * Picks the User-Agent string to present when fetching `url`.
 *
 * Hosts listed in NON_BOT_HOSTS, and their subdomains, get
 * NON_BOT_DESKTOP_USER_AGENT. Every other URL, including one that cannot be
 * parsed, gets DESKTOP_USER_AGENT.
 *
 * @param {string} url - Absolute URL of the page about to be fetched.
 * @returns {string} The User-Agent string to use.
 */
const userAgentForUrl = (url) => {
  try {
    const { hostname } = new URL(url)
    // Match the listed host itself or a real subdomain of it. A bare
    // `endsWith(host)` would also match unrelated domains that merely share
    // the suffix, e.g. "notbloomberg.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    )
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT
    }
  } catch (e) {
    // An unparsable URL is not fatal here: log it and use the default agent.
    console.log('error getting user agent for url', url, e)
  }
  return DESKTOP_USER_AGENT
}
/**
 * Decides whether the page at `url` should be loaded with JavaScript enabled.
 *
 * Hosts listed in NON_SCRIPT_HOSTS, and their subdomains, are loaded with
 * JavaScript disabled. Every other URL, including one that cannot be parsed,
 * keeps JavaScript enabled.
 *
 * @param {string} url - Absolute URL of the page about to be fetched.
 * @returns {boolean} `false` when scripts should be disabled, `true` otherwise.
 */
const enableJavascriptForUrl = (url) => {
  try {
    const { hostname } = new URL(url)
    // Match the listed host itself or a real subdomain of it. A bare
    // `endsWith(host)` would also match unrelated domains that merely share
    // the suffix, e.g. "notmedium.com".
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    )
    if (isNonScriptHost) {
      return false
    }
  } catch (e) {
    // An unparsable URL is not fatal here: log it and keep scripts enabled.
    console.log('error getting hostname for url', url, e)
  }
  return true
}
/**
 * Generates (or regenerates) the test case stored in `<testcaseRoot>/<slug>`.
 *
 * When the folder already contains `source.html`, that file is reused.
 * Otherwise the page at `argURL` is downloaded with fetchSource and the URL is
 * recorded in `url.txt`. Either way the HTML is handed to onResponseReceived
 * together with the Readability options for this slug.
 *
 * @param {string} slug - Test case folder name relative to the test-pages
 *   directory; slugs starting with `newsletters/` get newsletter options.
 * @returns {void} All work happens asynchronously through callbacks.
 */
function generateTestcase(slug) {
  const options = { debug }
  if (slug.startsWith('newsletters/')) {
    // keep the newsletter content in tables
    options.keepTables = true
    options.ignoreLinkDensity = true
  }
  const destRoot = path.join(testcaseRoot, slug)
  const sourceFile = path.join(destRoot, 'source.html')

  // Downloads the page at argURL and records where it came from.
  const fetchAndProcess = () => {
    // Only record the URL when there is one: fs.writeFile throws synchronously
    // on non-string data, which would hide fetchSource's own "pass a URL"
    // message for a missing argument.
    if (argURL) {
      fs.writeFile(path.join(destRoot, 'url.txt'), argURL, (writeErr) => {
        if (writeErr) {
          console.error("Couldn't write url.txt!")
          console.error(writeErr)
        }
      })
    }
    // fetchSource is async; make sure a rejection is reported rather than
    // left as a floating promise.
    Promise.resolve(
      fetchSource(argURL, (fetchErr, data) => {
        onResponseReceived(fetchErr, data, destRoot, options)
      })
    ).catch((fetchErr) => {
      console.error(fetchErr)
      process.exitCode = 1
    })
  }

  // `recursive: true` creates missing parents for nested slugs and does not
  // fail when the folder already exists, so an error here is a real error.
  fs.mkdir(destRoot, { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      console.error(`Couldn't create test case directory ${destRoot}`)
      console.error(mkdirErr)
      process.exitCode = 1
      return
    }
    // Read directly instead of checking existence first (fs.exists is
    // deprecated): ENOENT means there is no cached source yet.
    fs.readFile(sourceFile, { encoding: 'utf-8' }, (readFileErr, data) => {
      if (!readFileErr) {
        onResponseReceived(null, data, destRoot, options)
        return
      }
      if (readFileErr.code === 'ENOENT') {
        fetchAndProcess()
        return
      }
      console.error("Source existed but couldn't be read?")
      process.exit(1)
    })
  })
}
/**
 * Renders `url` in headless Chromium and passes the resulting HTML on for
 * tidying.
 *
 * The page is loaded with a host-specific user agent and JavaScript setting,
 * MathJax scripts are blocked, the page is scrolled (for at most 5 seconds) to
 * trigger lazy loading, and blurred images / background images are rewritten
 * before the document markup is captured.
 *
 * @param {string} url - Page to download. When falsy an error is printed and
 *   the process exits with status 1.
 * @param {Function} callbackFn - On success it is handed to sanitizeSource,
 *   which passes it to htmltidy (callers in this file treat its arguments as
 *   `(error, html)`). On failure it is called with the error as its only
 *   argument.
 * @returns {Promise<void>} Settles once the browser has been closed and the
 *   callback has been scheduled; fetch errors are reported through
 *   `callbackFn`, not by rejecting.
 */
async function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!")
    process.exit(1)
  }

  let browser
  let page
  let domContent
  let fetchError = null
  try {
    // Launching and page setup live inside the try so that a failure here is
    // reported to the caller and the browser is still closed.
    browser = await puppeteer.launch({
      args: [
        '--allow-running-insecure-content',
        '--autoplay-policy=user-gesture-required',
        '--disable-component-update',
        '--disable-domain-reliability',
        '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
        '--disable-print-preview',
        '--disable-setuid-sandbox',
        '--disable-site-isolation-trials',
        '--disable-speech-api',
        '--disable-web-security',
        '--disk-cache-size=33554432',
        '--enable-features=SharedArrayBuffer',
        '--hide-scrollbars',
        '--disable-gpu',
        '--mute-audio',
        '--no-default-browser-check',
        '--no-pings',
        '--no-sandbox',
        '--no-zygote',
        '--window-size=1920,1080',
        '--disable-extensions',
      ],
      defaultViewport: {
        deviceScaleFactor: 1,
        hasTouch: false,
        height: 1080,
        isLandscape: true,
        isMobile: false,
        width: 1920,
      },
      headless: true,
      executablePath: process.env.CHROMIUM_PATH || '/opt/homebrew/bin/chromium',
    })
    page = await browser.newPage()
    if (!enableJavascriptForUrl(url)) {
      await page.setJavaScriptEnabled(false)
    }
    await page.setUserAgent(userAgentForUrl(url))

    /*
     * Disallow MathJax from running in Puppeteer and modifying the document,
     * we shall instead run it in our frontend application to transform any
     * mathjax content when present.
     */
    await page.setRequestInterception(true)
    page.on('request', (request) => {
      const isMathJaxScript =
        request.resourceType() === 'script' &&
        request.url().toLowerCase().includes('mathjax')
      if (isMathJaxScript) {
        request.abort()
      } else {
        request.continue()
      }
    })

    await page.goto(url, { waitUntil: ['networkidle2'] })

    /* scroll with a 5 second timeout */
    // A plain timer is used instead of page.waitForTimeout, which is
    // deprecated and no longer exists in newer Puppeteer releases.
    let scrollTimer
    const scrollTimeout = new Promise((resolve) => {
      scrollTimer = setTimeout(resolve, 5000)
    })
    // Scrolling is best effort: a failure is logged and never aborts the fetch.
    const scrolled = page
      .evaluate(`(async () => {
        /* credit: https://github.com/puppeteer/puppeteer/issues/305 */
        return new Promise((resolve, reject) => {
          let scrollHeight = document.body.scrollHeight;
          let totalHeight = 0;
          let distance = 500;
          let timer = setInterval(() => {
            window.scrollBy(0, distance);
            totalHeight += distance;
            if(totalHeight >= scrollHeight){
              clearInterval(timer);
              resolve(true);
            }
          }, 10);
        });
      })()`)
      .catch((e) => {
        console.error('error in scrolling url', { e, url })
      })
    try {
      await Promise.race([scrolled, scrollTimeout])
    } finally {
      // Do not keep the process alive for the rest of the 5 seconds.
      clearTimeout(scrollTimer)
    }

    // Capture the document markup after dropping blurred images and turning
    // empty background-image elements into <img> nodes. This callback runs
    // inside the page, so it must not reference anything from this file.
    domContent = await page.evaluate(() => {
      // Deliberately not global: exec() then always searches from index 0 and
      // no lastIndex bookkeeping is needed between elements.
      const BI_SRC_REGEXP = /url\("(.+?)"\)/i
      Array.from(document.body.getElementsByTagName('*')).forEach((el) => {
        const style = window.getComputedStyle(el)
        const filter = style.getPropertyValue('filter')
        const isBlurred = Boolean(filter) && filter.startsWith('blur')
        const tagName = el.tagName ? el.tagName.toLowerCase() : ''
        const isImage = ['img', 'image'].includes(tagName)
        const backgroundImage = style.getPropertyValue('background-image')
        const hasBackgroundImage = !['', 'none'].includes(backgroundImage)

        // Removing blurred images since they are mostly the copies of lazy
        // loaded ones; blurred background-image nodes are dropped as well so
        // no <img> is created for them.
        if (isBlurred && (isImage || hasBackgroundImage)) {
          el.parentNode && el.parentNode.removeChild(el)
          return
        }
        if (!hasBackgroundImage) {
          return
        }

        // convert all nodes with background image to img nodes
        const matchedSRC = BI_SRC_REGEXP.exec(backgroundImage)
        // Replacing element only if there is no content inside, b/c might remove important div with content.
        // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
        // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
        if (matchedSRC && matchedSRC[1] && !el.src && !el.textContent) {
          const img = document.createElement('img')
          img.src = matchedSRC[1]
          el.parentNode && el.parentNode.replaceChild(img, el)
        }
      })
      return document.documentElement.innerHTML
    })
  } catch (error) {
    fetchError = error
  } finally {
    // Close the page first, then the browser. A failure to close one must not
    // prevent closing the other, otherwise Chromium processes are leaked.
    for (const resource of [page, browser]) {
      if (!resource) {
        continue
      }
      try {
        await resource.close()
      } catch (closeError) {
        console.error('error closing puppeteer resource', closeError)
      }
    }
  }

  if (fetchError) {
    console.error('Error occured while fetching content')
    console.error(fetchError)
    // Tell the caller about the failure instead of silently never calling back.
    callbackFn(fetchError)
    return
  }
  sanitizeSource(domContent, callbackFn)
}
/**
 * Runs raw HTML through htmltidy with this script's fixed formatting options
 * (XHTML output, numeric entities, 4-space indentation, no line wrapping).
 *
 * @param {string} html - Markup to tidy.
 * @param {Function} callbackFn - Passed to htmltidy unchanged as its
 *   completion callback.
 * @returns {void}
 */
function sanitizeSource(html, callbackFn) {
  const tidyOptions = {
    indent: true,
    'indent-spaces': 4,
    'numeric-entities': true,
    'output-xhtml': true,
    wrap: 0,
  }
  htmltidy(html, tidyOptions, callbackFn)
}
/**
 * Stores the tidied page source and kicks off the Readability run for it.
 *
 * On `error` the problem is logged and nothing is written. Otherwise `source`
 * is saved as `source.html` inside `destRoot`, and once that write succeeds
 * runReadability is asked to produce `expected.html` and
 * `expected-metadata.json` in the same folder.
 *
 * @param {*} error - Truthy when fetching/tidying the source failed.
 * @param {string} source - Tidied HTML of the page.
 * @param {string} destRoot - Folder of the test case being generated.
 * @param {object} options - Extra Readability options, forwarded untouched.
 * @returns {void}
 */
function onResponseReceived(error, source, destRoot, options) {
  // Progress output is only printed when the module-level debug flag is on.
  const trace = (message) => {
    if (debug) {
      console.log(message)
    }
  }
  const inDestRoot = (fileName) => path.join(destRoot, fileName)

  if (error) {
    console.error("Couldn't tidy source html!")
    console.error(error)
    return
  }

  const afterSourceWritten = async (writeErr) => {
    if (writeErr) {
      console.error("Couldn't write data to source.html!")
      console.error(writeErr)
      return
    }
    trace('Running readability stuff')
    await runReadability(
      source,
      inDestRoot('expected.html'),
      inDestRoot('expected-metadata.json'),
      options
    )
  }

  trace('writing')
  fs.writeFile(inDestRoot('source.html'), source, afterSourceWritten)
}
/**
 * Runs Readability over `source` and writes the expected output files.
 *
 * The parsed article content is pretty-printed into `destPath`. The remaining
 * result fields (everything except `content`, `textContent`, `length` and
 * `dom`), plus the isProbablyReaderable verdict as `readerable`, are written
 * as JSON to `metadataDestPath`. If parsing fails or yields nothing, the error
 * is logged and no file is written.
 *
 * @param {string} source - HTML to parse.
 * @param {string} destPath - Where to write the pretty-printed article HTML.
 * @param {string} metadataDestPath - Where to write the metadata JSON.
 * @param {object} options - Extra Readability options; they override the
 *   defaults set here.
 * @returns {Promise<void>} Resolves after both writes have been attempted;
 *   write failures are logged rather than thrown.
 */
async function runReadability(source, destPath, metadataDestPath, options) {
  console.log('running readability')
  const uri = 'http://fakehost/test/page.html'
  let result
  let readerable
  try {
    // Use linkedom for isProbablyReaderable because it supports querySelectorAll
    const dom = parseHTML(source).document
    readerable = isProbablyReaderable(dom)
    // We pass `caption` as a class to check that passing in extra classes works,
    // given that it appears in some of the test documents.
    const myReader = new Readability(dom, {
      classesToPreserve: ['caption'],
      url: uri,
      ...options,
    })
    result = await myReader.parse()
  } catch (ex) {
    // console.error already prints the stack. `ex.stack` is a string, so the
    // former `ex.stack.forEach(...)` threw a TypeError from inside this
    // handler and hid the real error.
    console.error(ex)
  }
  console.log('result', result)
  if (!result) {
    console.error(
      'No content generated by readability, not going to write expected.html!'
    )
    return
  }

  const prettyContent = prettyPrint(result.content)
  try {
    await fs.promises.writeFile(destPath, prettyContent)
  } catch (fileWriteErr) {
    // Keep going: the metadata file is still worth writing.
    console.error("Couldn't write data to expected.html!")
    console.error(fileWriteErr)
  }

  // Drop the result data we don't care about checking.
  const omittedKeys = new Set(['content', 'textContent', 'length', 'dom'])
  const metadata = Object.fromEntries(
    Object.entries(result).filter(([key]) => !omittedKeys.has(key))
  )
  // Add isProbablyReaderable result
  metadata.readerable = readerable
  try {
    await fs.promises.writeFile(
      metadataDestPath,
      `${JSON.stringify(metadata, null, 2)}\n`
    )
  } catch (metadataWriteErr) {
    console.error("Couldn't write data to expected-metadata.json!")
    console.error(metadataWriteErr)
  }
}
// Command-line entry point: `node generate-testcase.js <slug|all> [url]`.
// `all` regenerates every entry found directly inside the test-pages
// directory; any other value is treated as the slug of a single test case.
if (process.argv.length < 3) {
  console.error(
    "Need at least a destination slug and potentially a URL (if the slug doesn't have source)."
  )
  // A usage error is a failure, so report it with a non-zero exit status.
  process.exit(1)
}
if (process.argv[2] === 'all') {
  fs.readdir(testcaseRoot, (err, files) => {
    if (err) {
      console.error('error reading testcases')
      console.error(err)
      process.exitCode = 1
      return
    }
    files.forEach((file) => {
      generateTestcase(file)
    })
  })
} else {
  generateTestcase(process.argv[2])
}