dockerfile/examples/omnivore/content-fetch/puppeteer-parse/index.js

1054 lines
32 KiB
JavaScript

/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
const { encode } = require("urlsafe-base64");
const crypto = require("crypto");
const Url = require('url');
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
const signToken = promisify(jwt.sign);
const os = require('os');
const { Storage } = require('@google-cloud/storage');
const { parseHTML } = require('linkedom');
const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
const { Readability } = require("@omnivore/readability");
const puppeteer = require('puppeteer-extra');
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const createDOMPurify = require("dompurify");
// Google Cloud Storage client, used below to resolve the preview image bucket.
const storage = new Storage();
// Origins allowed to request preview images; comma-separated in ALLOWED_ORIGINS, empty when unset.
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
// Bucket that receives preview screenshots; undefined when PREVIEW_IMAGE_BUCKET is not set.
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
// Temp file the preview screenshot is written to before being uploaded.
// NOTE(review): the path is fixed, so concurrent preview requests would share one file — confirm this is acceptable.
const filePath = `${os.tmpdir()}/previewImage.png`;
// User agent strings. NOTE(review): the three desktop variants below are currently identical strings.
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
// Hosts that are served NON_BOT_DESKTOP_USER_AGENT (see userAgentForUrl).
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
// Hosts that are loaded with JavaScript disabled (see enableJavascriptForUrl).
const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com', 'fortelabs.com'];
// Content types allowed through the document-level request interception in retrievePage.
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
// Endpoint that receives import status updates (see sendImportStatusUpdate).
const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL;
const REQUEST_TIMEOUT = 30000; // 30 seconds
// Kept as a string: it is compared with the 'x-cloudtasks-taskretrycount' request header in fetchContent.
const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1';
/**
 * Picks the user agent string to use when loading `url`.
 *
 * Hosts listed in NON_BOT_HOSTS (and their subdomains) get
 * NON_BOT_DESKTOP_USER_AGENT; everything else, including URLs that cannot be
 * parsed, gets DESKTOP_USER_AGENT.
 *
 * @param {string} url - absolute URL of the page about to be loaded
 * @returns {string} the user agent string
 */
const userAgentForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the listed host itself or one of its subdomains. A bare suffix
    // check would also match unrelated domains such as "notbloomberg.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`),
    );
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT;
    }
  } catch (e) {
    console.log('error getting user agent for url', url, e)
  }
  return DESKTOP_USER_AGENT
};
/**
 * Fetches `url` through the ScrapingBee API (JavaScript rendering off,
 * premium US proxy) and parses the response with linkedom.
 *
 * Never throws: on any failure the error message is logged and a result with
 * empty `domContent` and the URL as title is returned.
 *
 * @param {string} url - page to fetch
 * @returns {Promise<{title: string, domContent: string, url: string}>}
 */
const fetchContentWithScrapingBee = async (url) => {
  const requestOptions = {
    params: {
      'api_key': process.env.SCRAPINGBEE_API_KEY,
      'url': url,
      'render_js': 'false',
      'premium_proxy': 'true',
      'country_code': 'us',
    },
    timeout: REQUEST_TIMEOUT,
  };
  try {
    const scrapingBeeResponse = await axios.get('https://app.scrapingbee.com/api/v1', requestOptions);
    const parsedDocument = parseHTML(scrapingBeeResponse.data).document;
    return {
      title: parsedDocument.title,
      domContent: parsedDocument.documentElement.outerHTML,
      url,
    };
  } catch (e) {
    console.error('error fetching with scrapingbee', e.message)
    return { title: url, domContent: '', url }
  }
}
/**
 * Decides whether JavaScript should stay enabled when loading `url`.
 *
 * Returns false for hosts listed in NON_SCRIPT_HOSTS (and their subdomains),
 * true otherwise, including when the URL cannot be parsed.
 *
 * @param {string} url - absolute URL of the page about to be loaded
 * @returns {boolean}
 */
const enableJavascriptForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the listed host itself or one of its subdomains. A bare suffix
    // check would also disable scripts on unrelated domains ("notmedium.com").
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`),
    );
    if (isNonScriptHost) {
      return false;
    }
  } catch (e) {
    console.log('error getting hostname for url', url, e)
  }
  return true
};
// launch Puppeteer
// The browser is launched once, when this module is loaded; callers await
// this shared promise to get the browser instance.
const getBrowserPromise = (async () => {
  console.log("starting puppeteer browser")
  const chromiumFlags = [
    '--allow-running-insecure-content',
    '--autoplay-policy=user-gesture-required',
    '--disable-component-update',
    '--disable-domain-reliability',
    '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
    '--disable-print-preview',
    '--disable-setuid-sandbox',
    '--disable-site-isolation-trials',
    '--disable-speech-api',
    '--disable-web-security',
    '--disk-cache-size=33554432',
    '--enable-features=SharedArrayBuffer',
    '--hide-scrollbars',
    '--disable-gpu',
    '--mute-audio',
    '--no-default-browser-check',
    '--no-pings',
    '--no-sandbox',
    '--no-zygote',
    '--window-size=1920,1080',
    '--disable-extensions',
  ].filter(Boolean);
  // 1920x1080 desktop viewport, matching the --window-size flag above
  const desktopViewport = {
    deviceScaleFactor: 1,
    hasTouch: false,
    height: 1080,
    isLandscape: true,
    isMobile: false,
    width: 1920,
  };
  const launchOptions = {
    args: chromiumFlags,
    defaultViewport: desktopViewport,
    executablePath: process.env.CHROMIUM_PATH,
    // headless only when LAUNCH_HEADLESS is set to a non-empty value
    headless: Boolean(process.env.LAUNCH_HEADLESS),
    timeout: 120000, // 2 minutes
  };
  return puppeteer.launch(launchOptions);
})();
/**
 * Streams the object at `contentObjUrl` into a pre-signed upload URL.
 *
 * @param {{ uploadSignedUrl: string }} uploadTarget - upload request result; only `uploadSignedUrl` is used
 * @param {string} contentType - value sent as the Content-Type header of the PUT
 * @param {string} contentObjUrl - URL the content is downloaded from
 * @returns {Promise<object|null>} the axios PUT response, or null when the
 *   download or the upload fails (the error message is logged)
 */
const uploadToSignedUrl = async ({ uploadSignedUrl }, contentType, contentObjUrl) => {
  try {
    const stream = await axios.get(contentObjUrl, { responseType: 'stream', timeout: REQUEST_TIMEOUT });
    // `await` is required here: returning the bare promise would let a failed
    // PUT reject past this try/catch instead of yielding null.
    return await axios.put(uploadSignedUrl, stream.data, {
      headers: {
        'Content-Type': contentType,
      },
      maxBodyLength: 1000000000,
      maxContentLength: 100000000,
      timeout: REQUEST_TIMEOUT,
    });
  } catch (error) {
    console.error('error uploading to signed url', error.message);
    return null;
  }
};
/**
 * Requests an upload id and a signed upload URL for a PDF through the
 * backend's `uploadFileRequest` GraphQL mutation.
 *
 * @param {string} userId - user the auth token is signed for
 * @param {string} url - URL of the PDF
 * @param {string} articleSavingRequestId - sent as `clientRequestId`
 * @returns {Promise<object|null>} the `uploadFileRequest` payload, or null when
 *   the backend reports error codes or the request fails
 */
const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
  const query = `mutation UploadFileRequest($input: UploadFileRequestInput!) {
uploadFileRequest(input:$input) {
... on UploadFileRequestError {
errorCodes
}
... on UploadFileRequestSuccess {
id
uploadSignedUrl
}
}
}`;
  const input = {
    url,
    contentType: 'application/pdf',
    clientRequestId: articleSavingRequestId,
  };
  const data = JSON.stringify({ query, variables: { input } });
  const requestConfig = {
    headers: {
      Cookie: `auth=${auth};`,
      'Content-Type': 'application/json',
    },
    timeout: REQUEST_TIMEOUT,
  };
  try {
    const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, requestConfig);
    const uploadFileRequest = response.data.data.uploadFileRequest;
    const errorCodes = uploadFileRequest.errorCodes;
    if (errorCodes && errorCodes.length > 0) {
      console.error('Error while getting upload id and signed url', errorCodes[0]);
      return null;
    }
    return uploadFileRequest;
  } catch (e) {
    console.error('error getting upload id and signed url', e.message);
    return null;
  }
};
/**
 * Uploads the PDF at `url` to backend storage.
 *
 * Validates the URL, asks the backend for an upload id plus signed URL, then
 * streams the PDF into that signed URL.
 *
 * @param {string} url - location of the PDF
 * @param {string} userId - owner of the upload
 * @param {string} articleSavingRequestId - client request id for the upload
 * @returns {Promise<string>} id of the created upload
 * @throws {Error} when the URL is rejected, no signed URL is obtained, or the upload fails
 */
const uploadPdf = async (url, userId, articleSavingRequestId) => {
  validateUrlString(url);

  const signedUpload = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
  if (!signedUpload) {
    throw new Error('error while getting upload id and signed url');
  }

  const putResponse = await uploadToSignedUrl(signedUpload, 'application/pdf', url);
  if (!putResponse) {
    throw new Error('error while uploading pdf');
  }

  const { id: uploadFileId } = signedUpload;
  return uploadFileId;
};
/**
 * Sends the `createArticle` GraphQL mutation to the backend on behalf of a user.
 *
 * @param {string} userId - user the auth token is signed for
 * @param {object} input - value passed as the mutation's `$input` variable
 * @returns {Promise<object|null>} the `createArticle` payload, or null when the
 *   backend reports error codes or the request fails
 */
const sendCreateArticleMutation = async (userId, input) => {
  const query = `mutation CreateArticle ($input: CreateArticleInput!){
createArticle(input:$input){
... on CreateArticleSuccess{
createdArticle{
id
}
}
... on CreateArticleError{
errorCodes
}
}
}`;
  const data = JSON.stringify({ query, variables: { input } });
  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
  const requestConfig = {
    headers: {
      Cookie: `auth=${auth};`,
      'Content-Type': 'application/json',
    },
    timeout: REQUEST_TIMEOUT,
  };
  try {
    const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, requestConfig);
    const createArticle = response.data.data.createArticle;
    const errorCodes = createArticle.errorCodes;
    if (errorCodes && errorCodes.length > 0) {
      console.error('error while creating article', errorCodes[0]);
      return null;
    }
    return createArticle;
  } catch (error) {
    console.error('error creating article', error.message);
    return null;
  }
};
/**
 * Sends the `savePage` GraphQL mutation to the backend on behalf of a user.
 *
 * @param {string} userId - user the auth token is signed for
 * @param {object} input - value passed as the mutation's `$input` variable
 * @returns {Promise<object|null>} the `savePage` payload on success,
 *   `{ error: 'UNAUTHORIZED' }` when that is the first error code, or null for
 *   any other error code or a failed request
 */
const sendSavePageMutation = async (userId, input) => {
  const query = `mutation SavePage ($input: SavePageInput!){
savePage(input:$input){
... on SaveSuccess{
url
clientRequestId
}
... on SaveError{
errorCodes
}
}
}`;
  const data = JSON.stringify({ query, variables: { input } });
  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
  const requestConfig = {
    headers: {
      Cookie: `auth=${auth};`,
      'Content-Type': 'application/json',
    },
    timeout: REQUEST_TIMEOUT,
  };
  try {
    const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, requestConfig);
    const savePage = response.data.data.savePage;
    const errorCodes = savePage.errorCodes;
    const hasErrors = errorCodes && errorCodes.length > 0;
    if (!hasErrors) {
      return savePage;
    }
    const firstErrorCode = errorCodes[0];
    console.error('error while saving page', firstErrorCode);
    // UNAUTHORIZED is surfaced separately so the caller can tell it apart from other failures
    return firstErrorCode === 'UNAUTHORIZED' ? { error: 'UNAUTHORIZED' } : null;
  } catch (error) {
    console.error('error saving page', error.message);
    return null;
  }
};
/**
 * Creates an article for an already-uploaded PDF via `createArticle`.
 *
 * The optional settings were previously read from undeclared variables, which
 * made every call throw a ReferenceError; they are now an optional trailing
 * argument, so existing four-argument calls keep working.
 *
 * @param {string} userId - owner of the article
 * @param {string} url - PDF URL; passed through `encodeURI` before sending
 * @param {string} uploadFileId - id of the uploaded file
 * @param {string} articleSavingRequestId - id of the saving request
 * @param {{state?: *, labels?: *, source?: *, folder?: *}} [options] - extra
 *   fields forwarded unchanged in the mutation input
 * @returns {Promise<object|null>} result of sendCreateArticleMutation
 */
const saveUploadedPdf = async (
  userId,
  url,
  uploadFileId,
  articleSavingRequestId,
  { state, labels, source, folder } = {},
) => {
  return sendCreateArticleMutation(userId, {
    url: encodeURI(url),
    articleSavingRequestId,
    uploadFileId,
    state,
    labels,
    source,
    folder,
  });
};
/**
 * Reports the import status of a task to the importer metrics collector.
 *
 * Best effort: any failure (token signing or the HTTP call) is logged and
 * swallowed, so this never rejects.
 *
 * @param {string} userId - user the auth token is signed for
 * @param {string} taskId - import task being reported
 * @param {string} status - status value to report
 * @returns {Promise<void>}
 */
const sendImportStatusUpdate = async (userId, taskId, status) => {
  try {
    const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
    const payload = { taskId, status };
    const requestConfig = {
      headers: {
        'Authorization': auth,
        'Content-Type': 'application/json',
      },
      timeout: REQUEST_TIMEOUT,
    };
    await axios.post(IMPORTER_METRICS_COLLECTOR_URL, payload, requestConfig);
  } catch (e) {
    console.error('error while sending import status update', e);
  }
};
/**
 * HTTP handler that fetches the page (or PDF) at the requested URL and saves
 * it for a user.
 *
 * Flow, as implemented below:
 *  1. Read the parameters from the query string and/or the request body.
 *  2. Let the custom content handlers rewrite the URL or supply title/content.
 *  3. Otherwise load the page with Puppeteer. PDFs are uploaded and saved via
 *     `createArticle`; HTML is captured from the page, with ScrapingBee used
 *     when the page is detected as blocked.
 *  4. On any error, fall back to ScrapingBee for non-PDF content.
 *  5. In `finally`: close the browser context, run Readability, call
 *     `savePage`, report the import status and send the HTTP status code.
 *
 * @param {object} req - request exposing `query`, `body` and `headers`
 * @param {object} res - response; the status is sent from the `finally` block
 * @returns {Promise<*>}
 */
async function fetchContent(req, res) {
  const functionStartTime = Date.now();
  // Treat a missing query/body as empty so every field is read the same way
  // (previously some fields dereferenced req.body without a guard).
  const query = req.query || {};
  const body = req.body || {};
  const userId = query.userId || body.userId;
  const articleSavingRequestId = query.saveRequestId || body.saveRequestId;
  const state = body.state
  const labels = body.labels
  const source = body.source || 'puppeteer-parse';
  const taskId = body.taskId; // taskId is used to update import status
  const urlStr = query.url || body.url;
  const locale = query.locale || body.locale;
  const timezone = query.timezone || body.timezone;
  const rssFeedUrl = body.rssFeedUrl;
  const savedAt = body.savedAt;
  const publishedAt = body.publishedAt;
  const folder = body.folder;
  let logRecord = {
    url: urlStr,
    userId,
    articleSavingRequestId,
    labels: {
      source,
    },
    state,
    labelsToAdd: labels,
    taskId: taskId,
    locale,
    timezone,
    rssFeedUrl,
    savedAt,
    publishedAt,
    folder,
  };
  console.info(`Article parsing request`, logRecord);
  let url, context, page, finalUrl, title, content, contentType, importStatus, statusCode = 200;
  try {
    // NOTE(review): getUrl throws for a missing or rejected URL, so the 400
    // branch below looks unreachable and such requests end up as 500 via the
    // catch block — confirm which status is intended.
    url = getUrl(urlStr);
    if (!url) {
      logRecord.urlIsInvalid = true;
      logRecord.error = 'Valid URL to parse not specified';
      statusCode = 400;
      return;
    }
    // pre handle url with custom handlers
    try {
      const browser = await getBrowserPromise;
      const result = await preHandleContent(url, browser);
      if (result && result.url) {
        // Validate the URL supplied by the handler (not the already validated
        // original) before switching to it; a rejected URL lands in the catch
        // below and the original URL is kept.
        validateUrlString(result.url);
        url = result.url;
      }
      if (result && result.title) { title = result.title }
      if (result && result.content) { content = result.content }
      if (result && result.contentType) { contentType = result.contentType }
    } catch (e) {
      console.info('error with handler: ', e);
    }
    if ((!content || !title) && contentType !== 'application/pdf') {
      const result = await retrievePage(url, logRecord, functionStartTime, locale, timezone);
      if (result && result.context) { context = result.context }
      if (result && result.page) { page = result.page }
      if (result && result.finalUrl) { finalUrl = result.finalUrl }
      if (result && result.contentType) { contentType = result.contentType }
    } else {
      finalUrl = url
    }
    if (contentType === 'application/pdf') {
      const uploadFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
      const uploadedPdf = await sendCreateArticleMutation(userId, {
        url: encodeURI(finalUrl),
        articleSavingRequestId,
        uploadFileId,
        state,
        labels,
        source,
        folder,
        rssFeedUrl,
        savedAt,
        publishedAt,
      });
      if (!uploadedPdf) {
        statusCode = 500;
        logRecord.error = 'error while saving uploaded pdf';
      } else {
        importStatus = 'imported';
      }
    } else {
      if (!content || !title) {
        const result = await retrieveHtml(page, logRecord);
        if (result.isBlocked) {
          const sbResult = await fetchContentWithScrapingBee(url)
          title = sbResult.title
          content = sbResult.domContent
        } else {
          title = result.title;
          content = result.domContent;
        }
      } else {
        console.info('using prefetched content and title');
      }
      logRecord.fetchContentTime = Date.now() - functionStartTime;
    }
  } catch (e) {
    logRecord.error = e.message;
    console.error(`Error while retrieving page`, logRecord);
    statusCode = 500;
    // fallback to scrapingbee for non pdf content
    if (url && contentType !== 'application/pdf') {
      console.info('fallback to scrapingbee', url);
      const fetchStartTime = Date.now();
      const sbResult = await fetchContentWithScrapingBee(url);
      content = sbResult.domContent;
      title = sbResult.title;
      logRecord.fetchContentTime = Date.now() - fetchStartTime;
      statusCode = 200;
    }
  } finally {
    // close browser context if it was opened
    if (context) {
      await context.close();
    }
    // save non pdf content
    if (url && contentType !== 'application/pdf') {
      // parse content if it is not empty
      let readabilityResult = null;
      if (content) {
        let document = parseHTML(content).document;
        // preParse content
        const preParsedDom = await preParseContent(url, document)
        if (preParsedDom) {
          document = preParsedDom
        }
        readabilityResult = await getReadabilityResult(url, document);
      }
      const apiResponse = await sendSavePageMutation(userId, {
        url,
        clientRequestId: articleSavingRequestId,
        title,
        originalContent: content,
        parseResult: readabilityResult,
        state,
        labels,
        rssFeedUrl,
        savedAt,
        publishedAt,
        source,
        folder,
      });
      if (!apiResponse) {
        logRecord.error = 'error while saving page';
        statusCode = 500;
      } else if (apiResponse.error === 'UNAUTHORIZED') {
        console.info('user is deleted, do not retry', logRecord);
        return res.sendStatus(200);
      } else {
        importStatus = readabilityResult ? 'imported' : 'failed';
      }
    }
    logRecord.totalTime = Date.now() - functionStartTime;
    console.info(`parse-page`, logRecord);
    // mark import failed on the last failed retry
    const retryCount = req.headers['x-cloudtasks-taskretrycount'];
    // Loose comparison kept on purpose: MAX_RETRY_COUNT is a string and the
    // header value is compared against it as-is.
    if (retryCount == MAX_RETRY_COUNT) {
      console.info('max retry count reached');
      importStatus = importStatus || 'failed';
    }
    // send import status to update the metrics
    if (taskId && importStatus) {
      await sendImportStatusUpdate(userId, taskId, importStatus);
    }
    res.sendStatus(statusCode);
  }
}
/**
 * Rejects URLs this service must not fetch: non-http(s) schemes, loopback or
 * unspecified hosts, and private or link-local IP literals.
 *
 * Only IP literals are range-checked, so a DNS name such as "10.example.com"
 * is no longer mistaken for a private address. Names that resolve to private
 * addresses are NOT detected here.
 *
 * @param {string} url - absolute URL to validate
 * @returns {void}
 * @throws {TypeError} when `url` cannot be parsed by `new URL`
 * @throws {Error} when the protocol or host is not allowed
 */
function validateUrlString(url) {
  const u = new URL(url);
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed')
  }
  const hostname = u.hostname;
  // Make sure the domain is not localhost (by name, or IPv6 loopback/unspecified)
  if (
    hostname === 'localhost' ||
    hostname.endsWith('.localhost') ||
    hostname === '[::1]' ||
    hostname === '[::]'
  ) {
    throw new Error('Invalid URL is localhost')
  }
  // The URL parser normalizes IPv4 hosts to dotted decimal, so one pattern is enough
  const ipv4 = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(hostname);
  if (ipv4) {
    const first = Number(ipv4[1]);
    const second = Number(ipv4[2]);
    // 0.0.0.0/8 and 127.0.0.0/8 both refer to the local host
    if (first === 0 || first === 127) {
      throw new Error('Invalid URL is localhost')
    }
    // Make sure the domain is not a private IP
    const isPrivate =
      first === 10 ||
      (first === 172 && second >= 16 && second <= 31) ||
      (first === 192 && second === 168) ||
      (first === 169 && second === 254); // link-local, incl. cloud metadata endpoints
    if (isPrivate) {
      throw new Error('Invalid URL is private ip')
    }
  }
  // IPv6 unique-local (fc00::/7) and link-local (fe80::/10) literals
  if (/^\[(fc|fd|fe[89ab])[0-9a-f]*:/.test(hostname)) {
    throw new Error('Invalid URL is private ip')
  }
}
/**
 * Extracts the first http(s) URL found in a piece of text.
 *
 * @param {string} urlStr - text that may contain a URL
 * @returns {string|null} the first match, or null when the input is empty or
 *   contains no URL
 */
function tryParseUrl(urlStr) {
  // a regular expression to match all URLs
  const regex = /(https?:\/\/[^\s]+)/g;
  const found = urlStr ? urlStr.match(regex) : null;
  // only the first match is used
  return found ? found[0] : null;
}
/**
 * Extracts, validates and normalizes the URL contained in `urlStr`.
 *
 * @param {string} urlStr - raw input that should contain an http(s) URL
 * @returns {string} the `href` of the parsed URL
 * @throws {Error} 'No URL specified' when no URL is found, or whatever
 *   validateUrlString throws for a rejected URL
 */
function getUrl(urlStr) {
  const candidate = tryParseUrl(urlStr)
  if (candidate) {
    validateUrlString(candidate);
    return Url.parse(candidate).href;
  }
  throw new Error('No URL specified');
}
/**
 * Opens `url` in a fresh incognito browser context and waits for it to load.
 *
 * The returned `context` is left open on success; the caller must close it.
 * On failure the context is closed here before the error is rethrown — this
 * now also covers failures during page setup, which used to leak the context.
 *
 * @param {string} url - page to open; validated with validateUrlString first
 * @param {object} logRecord - mutable log record; timing, finalUrl and
 *   contentType are written to it
 * @param {number} functionStartTime - epoch ms used as the base for timings
 * @param {string} [locale] - sent as the Accept-Language header when set
 * @param {string} [timezone] - emulated timezone when set
 * @returns {Promise<{context: object, page: object, response?: object, finalUrl: string, contentType: string}>}
 * @throws rethrows the setup/navigation error unless a PDF response was seen
 */
async function retrievePage(url, logRecord, functionStartTime, locale, timezone) {
  validateUrlString(url);
  const browser = await getBrowserPromise;
  logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
  const context = await browser.createIncognitoBrowserContext();
  // Puppeteer fails during download of PDF files,
  // so record the failure and use those items
  let lastPdfUrl = undefined;
  let page;
  try {
    page = await context.newPage()
    if (!enableJavascriptForUrl(url)) {
      await page.setJavaScriptEnabled(false);
    }
    await page.setUserAgent(userAgentForUrl(url));
    // set locale for the page
    if (locale) {
      await page.setExtraHTTPHeaders({ 'Accept-Language': locale });
    }
    // set timezone for the page
    if (timezone) {
      await page.emulateTimezone(timezone);
    }
    const client = await page.target().createCDPSession();
    // intercept request when response headers was received
    await client.send('Network.setRequestInterception', {
      patterns: [
        {
          urlPattern: '*',
          resourceType: 'Document',
          interceptionStage: 'HeadersReceived',
        },
      ],
    });
    const path = require('path');
    const download_path = path.resolve('./download_dir/');
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      userDataDir: './',
      downloadPath: download_path,
    })
    client.on('Network.requestIntercepted', async e => {
      const headers = e.responseHeaders || {};
      const [contentType] = (headers['content-type'] || headers['Content-Type'] || '')
        .toLowerCase()
        .split(';');
      const obj = { interceptionId: e.interceptionId };
      if (e.responseStatusCode >= 200 && e.responseStatusCode < 300) {
        // We only check content-type on success responses
        // as it doesn't matter what the content type is for things
        // like redirects
        if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
          obj['errorReason'] = 'BlockedByClient';
        }
      }
      try {
        await client.send('Network.continueInterceptedRequest', obj);
        // eslint-disable-next-line no-empty
      } catch {}
    });
    /*
     * Disallow MathJax from running in Puppeteer and modifying the document,
     * we shall instead run it in our frontend application to transform any
     * mathjax content when present.
     */
    await page.setRequestInterception(true);
    let requestCount = 0;
    page.on('request', request => {
      if (request.resourceType() === 'font') {
        // Disallow fonts from loading
        request.abort();
        return;
      }
      // Abort further non-font requests once the per-page budget is used up
      if (requestCount++ > 100) {
        request.abort();
        return;
      }
      if (
        request.resourceType() === 'script' &&
        request.url().toLowerCase().indexOf('mathjax') > -1
      ) {
        request.abort();
        return
      }
      request.continue();
    });
    page.on('response', response => {
      if (response.headers()['content-type'] === 'application/pdf') {
        lastPdfUrl = response.url();
      }
    });
    const response = await page.goto(url, { timeout: 30 * 1000, waitUntil: ['networkidle2'] });
    const finalUrl = response.url();
    const contentType = response.headers()['content-type'];
    logRecord.finalUrl = response.url();
    logRecord.contentType = response.headers()['content-type'];
    return { context, page, response, finalUrl, contentType };
  } catch (error) {
    // lastPdfUrl can only be set once navigation has started, so a setup
    // failure always falls through to closing the context.
    if (lastPdfUrl) {
      return { context, page, finalUrl: lastPdfUrl, contentType: 'application/pdf' };
    }
    await context.close();
    throw error;
  }
}
/**
 * Captures the title and HTML of an already-loaded Puppeteer page.
 *
 * Steps: read the title, scroll the page (raced against a 5 second timeout),
 * collect the body HTML of Instagram child frames, then run an in-page script
 * that removes blurred images, turns empty background-image elements into
 * <img> nodes, inlines the collected iframes and returns the document HTML.
 *
 * Errors are recorded on `logRecord` rather than thrown.
 *
 * @param {object} page - Puppeteer page returned by retrievePage
 * @param {object} logRecord - mutable log record; title, timing and error
 *   flags are written to it
 * @returns {Promise<{isBlocked: true}|{domContent: string, title: (string|undefined)}>}
 *   `{ isBlocked: true }` when a known block-page marker is found
 */
async function retrieveHtml(page, logRecord) {
  let domContent = '', title;
  try {
    title = await page.title();
    logRecord.title = title;
    const pageScrollingStart = Date.now();
    /* scroll with a 5 seconds timeout */
    await Promise.race([
      new Promise(resolve => {
        (async function () {
          try {
            await page.evaluate(`(async () => {
/* credit: https://github.com/puppeteer/puppeteer/issues/305 */
return new Promise((resolve, reject) => {
let scrollHeight = document.body.scrollHeight;
let totalHeight = 0;
let distance = 500;
let timer = setInterval(() => {
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight){
clearInterval(timer);
resolve(true);
}
}, 10);
});
})()`);
          } catch (e) {
            logRecord.scrollError = true;
          } finally {
            resolve(true);
          }
        })();
      }),
      page.waitForTimeout(5000),
    ]);
    logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };
    const iframes = {};
    const urls = [];
    const framesPromises = [];
    // No "g" flag: a global regex keeps lastIndex between .test() calls, which
    // made every second matching frame in this loop get skipped.
    const allowedUrls = /instagram\.com/i;
    for (const frame of page.mainFrame().childFrames()) {
      if (frame.url() && allowedUrls.test(frame.url())) {
        urls.push(frame.url());
        framesPromises.push(frame.evaluate(el => el.innerHTML, await frame.$('body')));
      }
    }
    (await Promise.all(framesPromises)).forEach((frame, index) => (iframes[urls[index]] = frame));
    const domContentCapturingStart = Date.now();
    // get document body with all hidden elements removed
    domContent = await page.evaluate(iframes => {
      const BI_SRC_REGEXP = /url\("(.+?)"\)/gi;
      Array.from(document.body.getElementsByTagName('*')).forEach(el => {
        const style = window.getComputedStyle(el);
        try {
          // Removing blurred images since they are mostly the copies of lazy loaded ones
          if (el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase())) {
            const filter = style.getPropertyValue('filter');
            if (filter && filter.startsWith('blur')) {
              el.parentNode && el.parentNode.removeChild(el);
            }
          }
        } catch (err) {
          // throw Error('error with element: ' + JSON.stringify(Array.from(document.body.getElementsByTagName('*'))))
        }
        // convert all nodes with background image to img nodes
        if (!['', 'none'].includes(style.getPropertyValue('background-image'))) {
          const filter = style.getPropertyValue('filter');
          // avoiding image nodes with a blur effect creation
          if (filter && filter.startsWith('blur')) {
            el && el.parentNode && el.parentNode.removeChild(el);
          } else {
            const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image'));
            // Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage
            // More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
            BI_SRC_REGEXP.lastIndex = 0;
            if (matchedSRC && matchedSRC[1] && !el.src) {
              // Replacing element only of there are no content inside, b/c might remove important div with content.
              // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
              // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
              if (!el.textContent) {
                const img = document.createElement('img');
                img.src = matchedSRC[1];
                el && el.parentNode && el.parentNode.replaceChild(img, el);
              }
            }
          }
        }
        if (el.tagName === 'IFRAME') {
          if (iframes[el.src]) {
            const newNode = document.createElement('div');
            newNode.className = 'omnivore-instagram-embed';
            newNode.innerHTML = iframes[el.src];
            el && el.parentNode && el.parentNode.replaceChild(newNode, el);
          }
        }
      });
      // Markers of known block/challenge pages
      if (document.querySelector('[data-translate="managed_checking_msg"]') ||
        document.getElementById('px-block-form-wrapper')) {
        return 'IS_BLOCKED'
      }
      return document.documentElement.outerHTML;
    }, iframes);
    logRecord.puppeteerSuccess = true;
    logRecord.timing = {
      ...logRecord.timing,
      contenCaptured: Date.now() - domContentCapturingStart,
    };
    // [END puppeteer-block]
  } catch (e) {
    if (e.message.startsWith('net::ERR_BLOCKED_BY_CLIENT at ')) {
      logRecord.blockedByClient = true;
    } else {
      logRecord.puppeteerSuccess = false;
      logRecord.puppeteerError = {
        message: e.message,
        stack: e.stack,
      };
    }
  }
  if (domContent === 'IS_BLOCKED') {
    return { isBlocked: true };
  }
  return { domContent, title };
}
/**
 * HTTP handler that renders a preview image for an allowed page.
 *
 * Loads the page, reads the screenshot selector and the destination path from
 * the page's own <meta> tags, screenshots the selected element, uploads it to
 * the preview bucket and redirects to the CDN location.
 *
 * The page is now always closed through try/finally (it used to leak when
 * navigation, the meta lookup or the screenshot threw), and a URL rejected by
 * getUrl is answered with 400 instead of escaping as an exception.
 *
 * @param {object} req - request exposing `get`, `query` and `body`
 * @param {object} res - response exposing `sendStatus` and `redirect`
 * @returns {Promise<*>}
 */
async function preview(req, res) {
  const functionStartTime = Date.now();
  // Grabbing execution and trace ids to attach logs to the appropriate function call
  const execution_id = req.get('function-execution-id');
  const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
  // NOTE(review): buildconsole is not defined in the visible part of this
  // file — confirm it exists, otherwise this line throws a ReferenceError.
  const console = buildconsole('cloudfunctions.googleapis.com%2Fcloud-functions', {
    trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
    labels: {
      execution_id: execution_id,
    },
  });
  if (!process.env.PREVIEW_IMAGE_BUCKET) {
    console.error(`PREVIEW_IMAGE_BUCKET not set`)
    return res.sendStatus(500);
  }
  const urlStr = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
  let url;
  try {
    url = getUrl(urlStr);
  } catch (error) {
    // getUrl throws for a missing or rejected URL; leave url unset so the
    // 400 branch below handles it.
    url = undefined;
  }
  console.log('preview request url', url);
  const logRecord = {
    url,
    query: req.query,
    origin: req.get('Origin'),
    labels: {
      source: 'publicImagePreview',
    },
  };
  console.info(`Public preview image generation request`, logRecord);
  if (!url) {
    logRecord.urlIsInvalid = true;
    console.error(`Valid URL to parse is not specified`, logRecord);
    return res.sendStatus(400);
  }
  const { origin } = new URL(url);
  if (!ALLOWED_ORIGINS.some(o => o === origin)) {
    logRecord.forbiddenOrigin = true;
    console.error(`This origin is not allowed: ${origin}`, logRecord);
    return res.sendStatus(400);
  }
  const browser = await getBrowserPromise;
  logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
  const page = await browser.newPage();
  let destination;
  try {
    const pageLoadingStart = Date.now();
    const modifiedUrl = new URL(url);
    modifiedUrl.searchParams.append('fontSize', '24');
    modifiedUrl.searchParams.append('adjustAspectRatio', '1.91');
    try {
      await page.goto(modifiedUrl.toString());
      logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart };
    } catch (error) {
      console.log('error going to page: ', modifiedUrl)
      console.log(error)
      throw error
    }
    // We lookup the destination path from our own page content and avoid trusting any passed query params
    // selector - CSS selector of the element to get screenshot of
    const selector = decodeURIComponent(
      await page.$eval(
        "head > meta[name='omnivore:preview_image_selector']",
        element => element.content,
      ),
    );
    if (!selector) {
      logRecord.selectorIsInvalid = true;
      console.error(`Valid element selector is not specified`, logRecord);
      return res.sendStatus(400);
    }
    logRecord.selector = selector;
    // destination - destination pathname for the image to save with
    destination = decodeURIComponent(
      await page.$eval(
        "head > meta[name='omnivore:preview_image_destination']",
        element => element.content,
      ),
    );
    if (!destination) {
      logRecord.destinationIsInvalid = true;
      console.error(`Valid file destination is not specified`, logRecord);
      return res.sendStatus(400);
    }
    logRecord.destination = destination;
    const screenshotTakingStart = Date.now();
    try {
      await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load
    } catch (error) {
      logRecord.elementNotFound = true;
      console.error(`Element is not presented on the page`, logRecord);
      return res.sendStatus(400);
    }
    const element = await page.$(selector);
    await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer
    logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart };
  } finally {
    // Release the page on every path: early 400 returns, thrown errors and success
    await page.close();
  }
  try {
    const [file] = await previewBucket.upload(filePath, {
      destination,
      metadata: logRecord,
    });
    logRecord.file = file.metadata;
  } catch (e) {
    console.log('error uploading to bucket, this is non-fatal', e)
  }
  console.info(`preview-image`, logRecord);
  return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
}
// Options passed to DOMPurify.sanitize in getPurifiedContent.
const DOM_PURIFY_CONFIG = {
  // iframes are allowed through here and then filtered by domPurifySanitizeHook
  ADD_TAGS: ['iframe'],
  // extra attributes allowed in addition to DOMPurify's defaults
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  // attributes that are always stripped
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}
/**
 * DOMPurify `uponSanitizeElement` hook that only lets YouTube embed iframes
 * through.
 *
 * An iframe is kept when its `src` is a YouTube embed URL; when only its
 * `data-src` is, that value is copied to `src`. Every other iframe is removed
 * from its parent. Non-iframe elements are left untouched.
 *
 * @param {object} node - element being sanitized
 * @param {{tagName: string}} data - hook data supplied by DOMPurify
 * @returns {void}
 */
function domPurifySanitizeHook(node, data) {
  if (data.tagName !== 'iframe') {
    return
  }
  const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
  const isYoutubeEmbed = (value) => Boolean(value) && urlRegex.test(value)
  const src = node.getAttribute('src') || ''
  const dataSrc = node.getAttribute('data-src') || ''
  if (isYoutubeEmbed(src)) {
    return
  }
  if (isYoutubeEmbed(dataSrc)) {
    // lazy-loaded embed: promote data-src so the iframe keeps working
    node.setAttribute('src', dataSrc)
    return
  }
  node.parentNode?.removeChild(node)
}
/**
 * Sanitizes `html` with DOMPurify (using DOM_PURIFY_CONFIG and the iframe
 * hook) and parses the cleaned markup into a new document.
 *
 * A fresh DOMPurify instance is created for every call.
 *
 * @param {*} html - content handed to DOMPurify.sanitize
 * @returns {object} document parsed from the sanitized markup
 */
function getPurifiedContent(html) {
  const purifier = createDOMPurify(parseHTML(''))
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)
  const sanitizedMarkup = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  const { document: purifiedDocument } = parseHTML(sanitizedMarkup)
  return purifiedDocument
}
/**
 * Signs a URL for the image proxy: HMAC-SHA256 keyed with IMAGE_PROXY_SECRET,
 * passed through the `encode` helper from urlsafe-base64.
 *
 * @param {string} url - value to sign
 * @returns {string} encoded signature
 */
function signImageProxyUrl(url) {
  const hmac = crypto.createHmac('sha256', process.env.IMAGE_PROXY_SECRET)
  hmac.update(url)
  const rawSignature = hmac.digest()
  return encode(rawSignature)
}
/**
 * Builds the image-proxy URL for an image.
 *
 * When IMAGE_PROXY_URL or IMAGE_PROXY_SECRET is not set, the original URL is
 * returned unchanged. Otherwise the result has the form
 * `<proxy>/<width>x<height>,s<signature>/<url>`, where the signature covers
 * `<url>#<width>x<height>`.
 *
 * @param {string} url - original image URL
 * @param {number} [width=0] - requested width
 * @param {number} [height=0] - requested height
 * @returns {string}
 */
function createImageProxyUrl(url, width = 0, height = 0) {
  const proxyBase = process.env.IMAGE_PROXY_URL
  const isConfigured = Boolean(proxyBase) && Boolean(process.env.IMAGE_PROXY_SECRET)
  if (!isConfigured) {
    return url
  }
  const dimensions = `${width}x${height}`
  const signature = signImageProxyUrl(`${url}#${dimensions}`)
  return `${proxyBase}/${dimensions},s${signature}/${url}`
}
/**
 * Runs Readability on a document, retrying once on a purified copy.
 *
 * The document is tried as-is first. Only if that yields no article (or
 * throws) is getPurifiedContent called and the purified document tried.
 * Parsing errors are logged, not thrown.
 *
 * @param {string} url - page URL, passed to Readability
 * @param {object} document - parsed document to extract the article from
 * @returns {Promise<object|null>} the parsed article, or null when both
 *   attempts fail
 */
async function getReadabilityResult(url, document) {
  // Producers are lazy so the purified copy is only built when it is needed
  const documentProducers = [
    () => document,
    () => getPurifiedContent(document),
  ]
  for (const produceDocument of documentProducers) {
    const candidate = produceDocument()
    if (candidate) {
      try {
        const readability = new Readability(candidate, {
          createImageProxyUrl,
          url,
        })
        const article = await readability.parse()
        if (article) {
          return article
        }
      } catch (error) {
        console.log('parsing error for url', url, error)
      }
    }
  }
  return null
}
// Public entry points of this module: the two HTTP handlers.
module.exports = {
  // fetches a page/PDF and saves it for a user
  fetchContent,
  // renders and uploads a preview image
  preview,
};