dockerfile/examples/omnivore/content-fetch/puppeteer-parse/index.js

/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
const { encode } = require("urlsafe-base64");
const crypto = require("crypto");

const Url = require('url');
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
const signToken = promisify(jwt.sign);
const os = require('os');
const { Storage } = require('@google-cloud/storage');
const { parseHTML } = require('linkedom');
const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
const { Readability } = require("@omnivore/readability");

const puppeteer = require('puppeteer-extra');

// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());

// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));

const createDOMPurify = require("dompurify");

// Google Cloud Storage client; only used to resolve the preview image bucket below.
const storage = new Storage();
// Comma-separated list of origins the preview endpoint accepts (empty list when unset).
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
// Bucket that receives generated preview images; undefined when PREVIEW_IMAGE_BUCKET is unset.
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;

// Temp file the preview screenshot is written to before upload.
// NOTE(review): this is a single fixed path, so concurrent preview requests would share it.
const filePath = `${os.tmpdir()}/previewImage.png`;

// User-Agent strings. The three desktop variants currently hold the same value.
// MOBILE_USER_AGENT and BOT_DESKTOP_USER_AGENT are not referenced elsewhere in this file.
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
// Hosts that are loaded with NON_BOT_DESKTOP_USER_AGENT (see userAgentForUrl).
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
// Hosts that are loaded with JavaScript disabled (see enableJavascriptForUrl).
const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com', 'fortelabs.com'];

// Content types a successful document response may have; anything else is
// blocked by the request interceptor in retrievePage.
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];

// Endpoint that receives import status updates (see sendImportStatusUpdate).
const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL;

// Timeout applied to every axios request in this file.
const REQUEST_TIMEOUT = 30000; // 30 seconds
// Compared against the `x-cloudtasks-taskretrycount` request header in fetchContent;
// kept as a string because both the env value and the header are strings.
const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1';

/**
 * Picks the User-Agent string to use when loading `url`.
 *
 * Hosts listed in NON_BOT_HOSTS (and their subdomains) get
 * NON_BOT_DESKTOP_USER_AGENT; everything else, including strings that do not
 * parse as a URL, gets DESKTOP_USER_AGENT.
 *
 * @param {string} url - URL about to be loaded.
 * @returns {string} User-Agent header value.
 */
const userAgentForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the listed domain itself or one of its subdomains. A bare
    // `endsWith(host)` would also match unrelated domains such as
    // "notbloomberg.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    );
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT;
    }
  } catch (e) {
    console.log('error getting user agent for url', url, e)
  }
  return DESKTOP_USER_AGENT
};

/**
 * Fetches a page through the ScrapingBee API (no JS rendering, premium US
 * proxy) and returns its title and serialized HTML.
 *
 * Never rejects: on any failure the error message is logged and an empty
 * result (title falls back to the URL) is returned.
 *
 * @param {string} url - Page to fetch.
 * @returns {Promise<{title: string, domContent: string, url: string}>}
 */
const fetchContentWithScrapingBee = async (url) => {
  try {
    const scrapingBeeParams = {
      'api_key': process.env.SCRAPINGBEE_API_KEY,
      'url': url,
      'render_js': 'false',
      'premium_proxy': 'true',
      'country_code': 'us',
    };
    const { data: html } = await axios.get('https://app.scrapingbee.com/api/v1', {
      params: scrapingBeeParams,
      timeout: REQUEST_TIMEOUT,
    });

    const { document: parsed } = parseHTML(html);
    return { title: parsed.title, domContent: parsed.documentElement.outerHTML, url };
  } catch (err) {
    console.error('error fetching with scrapingbee', err.message);

    return { title: url, domContent: '', url };
  }
}

/**
 * Decides whether JavaScript should be enabled when loading `url`.
 *
 * Hosts listed in NON_SCRIPT_HOSTS (and their subdomains) are loaded with
 * JavaScript disabled. Strings that do not parse as a URL are logged and
 * treated as script-enabled.
 *
 * @param {string} url - URL about to be loaded.
 * @returns {boolean} false when JavaScript must be disabled, true otherwise.
 */
const enableJavascriptForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the listed domain itself or one of its subdomains. A bare
    // `endsWith(host)` would also match unrelated domains such as
    // "notmedium.com".
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    );
    if (isNonScriptHost) {
      return false;
    }
  } catch (e) {
    console.log('error getting hostname for url', url, e)
  }
  return true
};

/**
 * Promise for the Puppeteer browser, launched once when this module loads.
 * Every request handler awaits this same promise rather than launching its
 * own browser.
 */
const getBrowserPromise = (async () => {
  // Chromium command-line switches passed to the launched browser.
  const chromiumFlags = [
    '--allow-running-insecure-content',
    '--autoplay-policy=user-gesture-required',
    '--disable-component-update',
    '--disable-domain-reliability',
    '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
    '--disable-print-preview',
    '--disable-setuid-sandbox',
    '--disable-site-isolation-trials',
    '--disable-speech-api',
    '--disable-web-security',
    '--disk-cache-size=33554432',
    '--enable-features=SharedArrayBuffer',
    '--hide-scrollbars',
    '--disable-gpu',
    '--mute-audio',
    '--no-default-browser-check',
    '--no-pings',
    '--no-sandbox',
    '--no-zygote',
    '--window-size=1920,1080',
    '--disable-extensions',
  ].filter(Boolean);

  // Desktop viewport matching the --window-size switch above.
  const desktopViewport = {
    deviceScaleFactor: 1,
    hasTouch: false,
    height: 1080,
    isLandscape: true,
    isMobile: false,
    width: 1920
  };

  console.log("starting puppeteer browser")
  return puppeteer.launch({
    args: chromiumFlags,
    defaultViewport: desktopViewport,
    executablePath: process.env.CHROMIUM_PATH,
    // Headless only when LAUNCH_HEADLESS is set to a non-empty value.
    headless: Boolean(process.env.LAUNCH_HEADLESS),
    timeout: 120000, // 2 minutes
  });
})();

/**
 * Streams the object at `contentObjUrl` into a signed upload URL.
 *
 * @param {{uploadSignedUrl: string}} uploadResult - Result of the upload file
 *   request; only `uploadSignedUrl` is used.
 * @param {string} contentType - Value for the Content-Type header of the PUT.
 * @param {string} contentObjUrl - URL the content is downloaded from.
 * @returns {Promise<object|null>} The axios PUT response, or null when the
 *   download or the upload failed (the error message is logged).
 */
const uploadToSignedUrl = async ({ uploadSignedUrl }, contentType, contentObjUrl) => {
  try {
    const stream = await axios.get(contentObjUrl, { responseType: 'stream', timeout: REQUEST_TIMEOUT });
    // `await` here so a failed PUT rejects inside this try block and is
    // reported as null like every other failure, instead of escaping as a
    // rejection the catch below never sees.
    return await axios.put(uploadSignedUrl, stream.data, {
      headers: {
        'Content-Type': contentType,
      },
      maxBodyLength: 1000000000,
      maxContentLength: 100000000,
      timeout: REQUEST_TIMEOUT,
    });
  } catch (error) {
    console.error('error uploading to signed url', error.message);
    return null;
  }
};

/**
 * Asks the backend (GraphQL `uploadFileRequest` mutation) for an upload id
 * and a signed URL to upload a PDF to.
 *
 * @param {string} userId - User the request is signed for.
 * @param {string} url - Source URL of the PDF.
 * @param {string} articleSavingRequestId - Sent as `clientRequestId`.
 * @returns {Promise<object|null>} The `uploadFileRequest` payload, or null
 *   when the backend reported error codes or the request failed.
 */
const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);

  const query = `mutation UploadFileRequest($input: UploadFileRequestInput!) {
      uploadFileRequest(input:$input) {
        ... on UploadFileRequestError {
          errorCodes
        }
        ... on UploadFileRequestSuccess {
          id
          uploadSignedUrl
        }
      }
    }`;
  const input = {
    url,
    contentType: 'application/pdf',
    clientRequestId: articleSavingRequestId,
  };
  const payload = JSON.stringify({ query, variables: { input } });

  try {
    const { data: body } = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, payload, {
      headers: {
        Cookie: `auth=${auth};`,
        'Content-Type': 'application/json',
      },
      timeout: REQUEST_TIMEOUT,
    });

    const uploadFileRequest = body.data.uploadFileRequest;
    const errorCodes = uploadFileRequest.errorCodes;
    if (errorCodes && errorCodes.length > 0) {
      console.error('Error while getting upload id and signed url', errorCodes[0]);
      return null;
    }

    return uploadFileRequest;
  } catch (err) {
    console.error('error getting upload id and signed url', err.message);
    return null;
  }
};

/**
 * Uploads the PDF at `url` to backend storage.
 *
 * Validates the URL, requests a signed upload URL, then streams the PDF into
 * it.
 *
 * @param {string} url - Location of the PDF.
 * @param {string} userId - Owner of the upload.
 * @param {string} articleSavingRequestId - Client request id for the upload.
 * @returns {Promise<string>} Id of the created upload file.
 * @throws {Error} When the URL is rejected by validateUrlString, when no
 *   signed URL could be obtained, or when the upload itself failed.
 */
const uploadPdf = async (url, userId, articleSavingRequestId) => {
  validateUrlString(url);

  const signedUpload = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
  if (signedUpload) {
    const putResponse = await uploadToSignedUrl(signedUpload, 'application/pdf', url);
    if (putResponse) {
      return signedUpload.id;
    }
    throw new Error('error while uploading pdf');
  }
  throw new Error('error while getting upload id and signed url');
};

/**
 * Sends the GraphQL `createArticle` mutation to the backend on behalf of a
 * user.
 *
 * @param {string} userId - User the request is signed for.
 * @param {object} input - `CreateArticleInput` variables.
 * @returns {Promise<object|null>} The `createArticle` payload, or null when
 *   the backend reported error codes or the request failed.
 */
const sendCreateArticleMutation = async (userId, input) => {
  const query = `mutation CreateArticle ($input: CreateArticleInput!){
          createArticle(input:$input){
            ... on CreateArticleSuccess{
              createdArticle{
                id
            }
        }
          ... on CreateArticleError{
              errorCodes
          }
      }
    }`;
  const payload = JSON.stringify({ query, variables: { input } });

  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
  try {
    const { data: body } = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, payload, {
      headers: {
        Cookie: `auth=${auth};`,
        'Content-Type': 'application/json',
      },
      timeout: REQUEST_TIMEOUT,
    });

    const createArticle = body.data.createArticle;
    const errorCodes = createArticle.errorCodes;
    if (errorCodes && errorCodes.length > 0) {
      console.error('error while creating article', errorCodes[0]);
      return null;
    }

    return createArticle;
  } catch (err) {
    console.error('error creating article', err.message);
    return null;
  }
};

/**
 * Sends the GraphQL `savePage` mutation to the backend on behalf of a user.
 *
 * @param {string} userId - User the request is signed for.
 * @param {object} input - `SavePageInput` variables.
 * @returns {Promise<object|null>} The `savePage` payload on success;
 *   `{ error: 'UNAUTHORIZED' }` when that is the first reported error code;
 *   null for any other reported error or when the request failed.
 */
const sendSavePageMutation = async (userId, input) => {
  const query = `mutation SavePage ($input: SavePageInput!){
          savePage(input:$input){
            ... on SaveSuccess{
              url
              clientRequestId
            }
            ... on SaveError{
                errorCodes
            }
          }
    }`;
  const payload = JSON.stringify({ query, variables: { input } });

  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
  try {
    const { data: body } = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, payload, {
      headers: {
        Cookie: `auth=${auth};`,
        'Content-Type': 'application/json',
      },
      timeout: REQUEST_TIMEOUT,
    });

    const savePage = body.data.savePage;
    const errorCodes = savePage.errorCodes;
    if (!errorCodes || errorCodes.length === 0) {
      return savePage;
    }

    const [firstErrorCode] = errorCodes;
    console.error('error while saving page', firstErrorCode);
    // Callers treat UNAUTHORIZED specially, so surface it instead of null.
    return firstErrorCode === 'UNAUTHORIZED' ? { error: 'UNAUTHORIZED' } : null;
  } catch (err) {
    console.error('error saving page', err.message);
    return null;
  }
};

/**
 * Creates an article for an already uploaded PDF.
 *
 * The original body read `state`, `labels`, `source` and `folder` from the
 * enclosing scope, where none of them exist, so every call threw a
 * ReferenceError. They are now taken from an optional trailing options
 * object; omitted values are sent as undefined.
 *
 * @param {string} userId - Owner of the article.
 * @param {string} url - Source URL of the PDF (URI-encoded before sending).
 * @param {string} uploadFileId - Id returned by the upload file request.
 * @param {string} articleSavingRequestId - Id of the saving request.
 * @param {{state?: *, labels?: *, source?: *, folder?: *}} [options] -
 *   Optional article attributes forwarded to the mutation.
 * @returns {Promise<object|null>} Result of sendCreateArticleMutation.
 */
const saveUploadedPdf = async (
  userId,
  url,
  uploadFileId,
  articleSavingRequestId,
  { state, labels, source, folder } = {},
) => {
  return sendCreateArticleMutation(userId, {
      url: encodeURI(url),
      articleSavingRequestId,
      uploadFileId: uploadFileId,
      state,
      labels,
      source,
      folder,
    },
  );
};

/**
 * Reports the import status of a task to the importer metrics collector.
 *
 * Best effort: any failure (signing or HTTP) is logged and swallowed so the
 * caller is never interrupted.
 *
 * @param {string} userId - User the request is signed for.
 * @param {string} taskId - Import task being reported.
 * @param {string} status - New status of the task.
 * @returns {Promise<void>}
 */
const sendImportStatusUpdate = async (userId, taskId, status) => {
  try {
    const token = await signToken({ uid: userId }, process.env.JWT_SECRET);
    const statusPayload = { taskId, status };
    const requestConfig = {
      headers: {
        'Authorization': token,
        'Content-Type': 'application/json',
      },
      timeout: REQUEST_TIMEOUT,
    };

    await axios.post(IMPORTER_METRICS_COLLECTOR_URL, statusPayload, requestConfig);
  } catch (err) {
    console.error('error while sending import status update', err);
  }
};

/**
 * HTTP handler: fetches the content behind a URL and saves it for a user.
 *
 * Parameters are read from the query string first and the body second
 * (`userId`, `saveRequestId`, `url`, `locale`, `timezone`); the remaining
 * options (`state`, `labels`, `source`, `taskId`, `rssFeedUrl`, `savedAt`,
 * `publishedAt`, `folder`) come from the body only.
 *
 * Flow:
 *  1. Parse and validate the URL.
 *  2. Give the custom content handlers a chance to supply url/title/content.
 *  3. Otherwise load the page with Puppeteer.
 *  4. PDFs are uploaded and saved with the createArticle mutation.
 *  5. Everything else is parsed with Readability and saved with the savePage
 *     mutation (in the `finally` block, so it also runs after the
 *     ScrapingBee fallback in the `catch` block).
 *
 * Responds with 200 on success, 400 for an invalid URL and 500 when saving
 * failed. A deleted user (UNAUTHORIZED from savePage) also gets 200 so the
 * task is not retried.
 *
 * @param {object} req - HTTP request; `query`, `body` and `headers` are read.
 * @param {object} res - HTTP response; only `sendStatus` is called.
 * @returns {Promise<*>}
 */
async function fetchContent(req, res) {
  const functionStartTime = Date.now();

  // Treat a missing query/body/headers object as empty so reading the
  // request parameters can never throw before the try/finally below.
  const query = req.query || {};
  const body = req.body || {};
  const headers = req.headers || {};

  const userId = query.userId || body.userId;
  const articleSavingRequestId = query.saveRequestId || body.saveRequestId;
  const state = body.state
  const labels = body.labels
  const source = body.source || 'puppeteer-parse';
  const taskId = body.taskId; // taskId is used to update import status
  const urlStr = query.url || body.url;
  const locale = query.locale || body.locale;
  const timezone = query.timezone || body.timezone;
  const rssFeedUrl = body.rssFeedUrl;
  const savedAt = body.savedAt;
  const publishedAt = body.publishedAt;
  const folder = body.folder;

  const logRecord = {
    url: urlStr,
    userId,
    articleSavingRequestId,
    labels: {
      source,
    },
    state,
    labelsToAdd: labels,
    taskId: taskId,
    locale,
    timezone,
    rssFeedUrl,
    savedAt,
    publishedAt,
    folder,
  };

  console.info(`Article parsing request`, logRecord);

  let url, context, page, finalUrl, title, content, contentType, importStatus, statusCode = 200;
  try {
    url = getUrl(urlStr);
    if (!url) {
      logRecord.urlIsInvalid = true;
      logRecord.error = 'Valid URL to parse not specified';
      statusCode = 400;
      return;
    }

    // pre handle url with custom handlers
    try {
      const browser = await getBrowserPromise;
      const result = await preHandleContent(url, browser);
      if (result && result.url) {
        // Validate the URL the handler wants to switch to (not the one that
        // was already validated by getUrl) before adopting it.
        validateUrlString(result.url);
        url = result.url;
      }
      if (result && result.title) { title = result.title }
      if (result && result.content) { content = result.content }
      if (result && result.contentType) { contentType = result.contentType }
    } catch (e) {
      console.info('error with handler: ', e);
    }

    if ((!content || !title) && contentType !== 'application/pdf') {
      const result = await retrievePage(url, logRecord, functionStartTime, locale, timezone);
      if (result && result.context) { context = result.context }
      if (result && result.page) { page = result.page }
      if (result && result.finalUrl) { finalUrl = result.finalUrl }
      if (result && result.contentType) { contentType = result.contentType }
    } else {
      finalUrl = url
    }

    if (contentType === 'application/pdf') {
      const uploadFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
      const uploadedPdf = await sendCreateArticleMutation(userId, {
        url: encodeURI(finalUrl),
        articleSavingRequestId,
        uploadFileId,
        state,
        labels,
        source,
        folder,
        rssFeedUrl,
        savedAt,
        publishedAt,
      });
      if (!uploadedPdf) {
        statusCode = 500;
        logRecord.error = 'error while saving uploaded pdf';
      } else {
        importStatus = 'imported';
      }
    } else {
      if (!content || !title) {
        const result = await retrieveHtml(page, logRecord);
        if (result.isBlocked) {
          const sbResult = await fetchContentWithScrapingBee(url)
          title = sbResult.title
          content = sbResult.domContent
        } else {
          title = result.title;
          content = result.domContent;
        }
      } else {
        console.info('using prefetched content and title');
      }
      logRecord.fetchContentTime = Date.now() - functionStartTime;
    }
  } catch (e) {
    logRecord.error = e.message;
    console.error(`Error while retrieving page`, logRecord);
    statusCode = 500;

    // fallback to scrapingbee for non pdf content
    if (url && contentType !== 'application/pdf') {
      console.info('fallback to scrapingbee', url);

      const fetchStartTime = Date.now();
      const sbResult = await fetchContentWithScrapingBee(url);
      content = sbResult.domContent;
      title = sbResult.title;
      logRecord.fetchContentTime = Date.now() - fetchStartTime;
      statusCode = 200;
    }
  } finally {
    // close browser context if it was opened
    if (context) {
      await context.close();
    }
    // save non pdf content
    if (url && contentType !== 'application/pdf') {
      // parse content if it is not empty
      let readabilityResult = null;
      if (content) {
        let document = parseHTML(content).document;
        // preParse content
        const preParsedDom = await preParseContent(url, document)
        if (preParsedDom) {
          document = preParsedDom
        }
        readabilityResult = await getReadabilityResult(url, document);
      }

      const apiResponse = await sendSavePageMutation(userId, {
        url,
        clientRequestId: articleSavingRequestId,
        title,
        originalContent: content,
        parseResult: readabilityResult,
        state,
        labels,
        rssFeedUrl,
        savedAt,
        publishedAt,
        source,
        folder,
      });
      if (!apiResponse) {
        logRecord.error = 'error while saving page';
        statusCode = 500;
      } else if (apiResponse.error === 'UNAUTHORIZED') {
        console.info('user is deleted, do not retry', logRecord);
        return res.sendStatus(200);
      } else {
        importStatus = readabilityResult ? 'imported' : 'failed';
      }
    }

    logRecord.totalTime = Date.now() - functionStartTime;
    console.info(`parse-page`, logRecord);

    // mark import failed on the last failed retry
    const retryCount = headers['x-cloudtasks-taskretrycount'];
    // Both values are compared as strings (header and env values are strings).
    if (retryCount !== undefined && String(retryCount) === String(MAX_RETRY_COUNT)) {
      console.info('max retry count reached');
      importStatus = importStatus || 'failed';
    }

    // send import status to update the metrics
    if (taskId && importStatus) {
      await sendImportStatusUpdate(userId, taskId, importStatus);
    }

    res.sendStatus(statusCode);
  }
}

/**
 * Rejects URLs this service must not fetch (basic SSRF guard).
 *
 * Checks, in order: the protocol is http/https; the host is not a
 * loopback/unspecified address; the host is not a private or link-local
 * IPv4 address. Only the literal hostname is inspected, so DNS names that
 * resolve to internal addresses are NOT caught here.
 *
 * @param {string} url - URL to check.
 * @throws {TypeError} When `url` cannot be parsed by `new URL`.
 * @throws {Error} When one of the checks above fails.
 */
function validateUrlString(url) {
  const u = new URL(url);
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed')
  }

  // Ignore a trailing root dot ("localhost." is the same host as "localhost").
  const hostname = u.hostname.replace(/\.$/, '');

  // The URL parser normalizes every IPv4 notation (decimal, hex, octal) to
  // dotted-decimal, so four numeric labels reliably identify an IPv4 host.
  const labels = hostname.split('.');
  const isIpv4 = labels.length === 4 && labels.every((label) => /^\d{1,3}$/.test(label));
  const first = isIpv4 ? Number(labels[0]) : NaN;
  const second = isIpv4 ? Number(labels[1]) : NaN;

  // Make sure the domain is not localhost (including 127.0.0.0/8 and ::1)
  const isLoopback =
    hostname === 'localhost' ||
    hostname === '0.0.0.0' ||
    hostname === '[::1]' ||
    first === 127;
  if (isLoopback) {
    throw new Error('Invalid URL is localhost')
  }

  // Make sure the domain is not a private IP:
  // 10/8, 172.16/12, 192.168/16 and link-local 169.254/16 (cloud metadata).
  const isPrivateIpv4 =
    first === 10 ||
    (first === 172 && second >= 16 && second <= 31) ||
    (first === 192 && second === 168) ||
    (first === 169 && second === 254);
  // The original prefix test is kept so nothing that used to be rejected
  // becomes allowed.
  if (isPrivateIpv4 || /^(10|172\.16|192\.168)\..*/.test(hostname)) {
    throw new Error('Invalid URL is private ip')
  }
}

/**
 * Extracts the first http(s) URL found in a string.
 *
 * @param {string} urlStr - Text that may contain a URL.
 * @returns {string|null} The first match, or null when the input is empty
 *   or contains no http(s) URL.
 */
function tryParseUrl(urlStr) {
  if (!urlStr) {
    return null;
  }

  // Collect every http(s) URL in the text; only the first one is used.
  const found = urlStr.match(/(https?:\/\/[^\s]+)/g);
  return found ? found[0] : null;
}

/**
 * Extracts, validates and normalizes the URL contained in `urlStr`.
 *
 * @param {string} urlStr - Raw text holding the URL.
 * @returns {string} The `href` of the first URL found.
 * @throws {Error} 'No URL specified' when no http(s) URL is present, or
 *   whatever validateUrlString throws for a rejected URL.
 */
function getUrl(urlStr) {
  const candidate = tryParseUrl(urlStr)
  if (candidate) {
    validateUrlString(candidate);
    return Url.parse(candidate).href;
  }

  throw new Error('No URL specified');
}

/**
 * Opens `url` in a fresh incognito browser context and waits for the page to
 * load.
 *
 * On success the caller owns the returned `context` and must close it. On
 * failure the context is closed here before the error is rethrown, unless a
 * PDF response was observed, in which case the PDF's URL is returned instead
 * and the caller still owns the context.
 *
 * @param {string} url - Page to load; validated with validateUrlString.
 * @param {object} logRecord - Mutable log record; timing, finalUrl and
 *   contentType are added to it.
 * @param {number} functionStartTime - Epoch ms the request started at.
 * @param {string} [locale] - Sent as the Accept-Language header when set.
 * @param {string} [timezone] - Emulated timezone when set.
 * @returns {Promise<{context: object, page: object, response?: object,
 *   finalUrl: string, contentType: string}>}
 * @throws {Error} When validation, page setup or navigation fails and no
 *   PDF response was seen.
 */
async function retrievePage(url, logRecord, functionStartTime, locale, timezone) {
  validateUrlString(url);

  const browser = await getBrowserPromise;
  logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };

  const context = await browser.createIncognitoBrowserContext();

  // Declared outside the try block so the catch handler can read them.
  let page;
  // Puppeteer fails during download of PDF files,
  // so record the failure and use those items
  let lastPdfUrl = undefined;

  // Everything after the context is created runs inside this try so that a
  // failure in any setup step closes the context instead of leaking it.
  try {
    page = await context.newPage()

    if (!enableJavascriptForUrl(url)) {
      await page.setJavaScriptEnabled(false);
    }
    await page.setUserAgent(userAgentForUrl(url));

    // set locale for the page
    if (locale) {
      await page.setExtraHTTPHeaders({ 'Accept-Language': locale });
    }

    // set timezone for the page
    if (timezone) {
      await page.emulateTimezone(timezone);
    }

    const client = await page.target().createCDPSession();

    // intercept document requests once their response headers were received
    await client.send('Network.setRequestInterception', {
      patterns: [
        {
          urlPattern: '*',
          resourceType: 'Document',
          interceptionStage: 'HeadersReceived',
        },
      ],
    });

    const path = require('path');
    const download_path = path.resolve('./download_dir/');

    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      userDataDir: './',
      downloadPath: download_path,
    })

    client.on('Network.requestIntercepted', async e => {
      const headers = e.responseHeaders || {};

      const [contentType] = (headers['content-type'] || headers['Content-Type'] || '')
        .toLowerCase()
        .split(';');
      const obj = { interceptionId: e.interceptionId };

      if (e.responseStatusCode >= 200 && e.responseStatusCode < 300) {
        // We only check content-type on success responses
        // as it doesn't matter what the content type is for things
        // like redirects
        if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
          obj['errorReason'] = 'BlockedByClient';
        }
      }

      try {
        await client.send('Network.continueInterceptedRequest', obj);
        // eslint-disable-next-line no-empty
      } catch {}
    });

    /*
      * Disallow MathJax from running in Puppeteer and modifying the document,
      * we shall instead run it in our frontend application to transform any
      * mathjax content when present.
      */
    await page.setRequestInterception(true);
    let requestCount = 0;
    page.on('request', request => {
      if (request.resourceType() === 'font') {
        // Disallow fonts from loading
        request.abort();
        return;
      }
      // Cap the number of non-font requests a single page may issue.
      if (requestCount++ > 100) {
        request.abort();
        return;
      }
      if (
        request.resourceType() === 'script' &&
        request.url().toLowerCase().indexOf('mathjax') > -1
      ) {
        request.abort();
        return
      }
      request.continue();
    });

    page.on('response', response => {
      if (response.headers()['content-type'] === 'application/pdf') {
        lastPdfUrl = response.url();
      }
    });

    const response = await page.goto(url, { timeout: 30 * 1000, waitUntil: ['networkidle2'] });
    const finalUrl = response.url();
    const contentType = response.headers()['content-type'];

    logRecord.finalUrl = response.url();
    logRecord.contentType = response.headers()['content-type'];

    return { context, page, response, finalUrl, contentType };
  } catch (error) {
    if (lastPdfUrl) {
      return { context, page, finalUrl: lastPdfUrl, contentType: 'application/pdf' };
    }
    await context.close();
    throw error;
  }
}

/**
 * Extracts the title and cleaned-up HTML of an already loaded page.
 *
 * Steps: read the title, scroll the page (at most 5 seconds) to trigger lazy
 * loading, capture the body HTML of Instagram iframes, then run an in-page
 * script that removes blurred images, converts empty background-image nodes
 * to `img` nodes, inlines the captured iframes and returns the document HTML.
 *
 * Never rejects: failures are recorded on `logRecord` and whatever was
 * collected so far is returned.
 *
 * @param {object} page - Puppeteer page that has finished navigating.
 * @param {object} logRecord - Mutable log record; title, timing and error
 *   details are added to it.
 * @returns {Promise<{isBlocked: true}|{domContent: string, title: (string|undefined)}>}
 *   `{ isBlocked: true }` when the page shows a known bot-block marker.
 */
async function retrieveHtml(page, logRecord) {
  let domContent = '', title;
  try {
    title = await page.title();
    logRecord.title = title;

    const pageScrollingStart = Date.now();
    /* scroll with a 5 seconds timeout */
    await Promise.race([
      new Promise(resolve => {
        (async function () {
          try {
            await page.evaluate(`(async () => {
                /* credit: https://github.com/puppeteer/puppeteer/issues/305 */
                return new Promise((resolve, reject) => {
                  let scrollHeight = document.body.scrollHeight;
                  let totalHeight = 0;
                  let distance = 500;
                  let timer = setInterval(() => {
                    window.scrollBy(0, distance);
                    totalHeight += distance;
                    if(totalHeight >= scrollHeight){
                      clearInterval(timer);
                      resolve(true);
                    }
                  }, 10);
                });
              })()`);
          } catch (e) {
            logRecord.scrollError = true;
          } finally {
            resolve(true);
          }
        })();
      }),
      page.waitForTimeout(5000),
    ]);
    logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };

    const iframes = {};
    const urls = [];
    const framesPromises = [];
    // No "g" flag: a global regex keeps `lastIndex` between `.test()` calls,
    // which made every other matching frame in this loop be skipped.
    const allowedUrls = /instagram\.com/i;

    for (const frame of page.mainFrame().childFrames()) {
      if (frame.url() && allowedUrls.test(frame.url())) {
        urls.push(frame.url());
        framesPromises.push(frame.evaluate(el => el.innerHTML, await frame.$('body')));
      }
    }

    // Map each captured frame URL to the inner HTML of its body.
    (await Promise.all(framesPromises)).forEach((frame, index) => (iframes[urls[index]] = frame));

    const domContentCapturingStart = Date.now();
    // get document body with all hidden elements removed
    domContent = await page.evaluate(iframes => {
      const BI_SRC_REGEXP = /url\("(.+?)"\)/gi;

      Array.from(document.body.getElementsByTagName('*')).forEach(el => {
        const style = window.getComputedStyle(el);

        try {
          // Removing blurred images since they are mostly the copies of lazy loaded ones
          if (el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase())) {
            const filter = style.getPropertyValue('filter');
            if (filter && filter.startsWith('blur')) {
              el.parentNode && el.parentNode.removeChild(el);
            }
          }
        } catch (err) {
          // throw Error('error with element: ' + JSON.stringify(Array.from(document.body.getElementsByTagName('*'))))
        }

        // convert all nodes with background image to img nodes
        if (!['', 'none'].includes(style.getPropertyValue('background-image'))) {
          const filter = style.getPropertyValue('filter');
          // avoiding image nodes with a blur effect creation
          if (filter && filter.startsWith('blur')) {
            el && el.parentNode && el.parentNode.removeChild(el);
          } else {
            const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image'));
            // Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage
            // More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
            BI_SRC_REGEXP.lastIndex = 0;

            if (matchedSRC && matchedSRC[1] && !el.src) {
              // Replacing element only of there are no content inside, b/c might remove important div with content.
              // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
              // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
              if (!el.textContent) {
                const img = document.createElement('img');
                img.src = matchedSRC[1];
                el && el.parentNode && el.parentNode.replaceChild(img, el);
              }
            }
          }
        }

        if (el.tagName === 'IFRAME') {
          if (iframes[el.src]) {
            const newNode = document.createElement('div');
            newNode.className = 'omnivore-instagram-embed';
            newNode.innerHTML = iframes[el.src];
            el && el.parentNode && el.parentNode.replaceChild(newNode, el);
          }
        }
      });

      // Markers of known bot-protection interstitials.
      if (document.querySelector('[data-translate="managed_checking_msg"]') ||
        document.getElementById('px-block-form-wrapper')) {
        return 'IS_BLOCKED'
      }

      return document.documentElement.outerHTML;
    }, iframes);
    logRecord.puppeteerSuccess = true;
    logRecord.timing = {
      ...logRecord.timing,
      contenCaptured: Date.now() - domContentCapturingStart,
    };

    // [END puppeteer-block]
  } catch (e) {
    if (e.message.startsWith('net::ERR_BLOCKED_BY_CLIENT at ')) {
      logRecord.blockedByClient = true;
    } else {
      logRecord.puppeteerSuccess = false;
      logRecord.puppeteerError = {
        message: e.message,
        stack: e.stack,
      };
    }
  }
  if (domContent === 'IS_BLOCKED') {
    return { isBlocked: true };
  }
  return { domContent, title };
}

/**
 * Reads the decoded `content` of a `<meta name="...">` tag in the page head.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} name - Value of the meta tag's `name` attribute.
 * @returns {Promise<string>} Decoded content, or '' when the tag is missing
 *   or its content cannot be decoded (callers treat '' as "not specified").
 */
async function readPreviewMeta(page, name) {
  try {
    const content = await page.$eval(`head > meta[name='${name}']`, element => element.content);
    return decodeURIComponent(content);
  } catch (error) {
    // $eval throws when the selector matches nothing and decodeURIComponent
    // throws on malformed escapes; both mean the value is unusable.
    return '';
  }
}

/**
 * HTTP handler: renders a page from an allowed origin, screenshots the
 * element it designates and redirects to the uploaded preview image.
 *
 * The element selector and the destination path are read from the page's own
 * meta tags (`omnivore:preview_image_selector`,
 * `omnivore:preview_image_destination`), never from request parameters.
 *
 * Responds with 500 when PREVIEW_IMAGE_BUCKET is not configured, 400 for an
 * invalid URL, a forbidden origin, missing meta tags or a missing element,
 * and otherwise redirects to the image on the CDN. Navigation and screenshot
 * errors are rethrown after the page has been closed.
 *
 * @param {object} req - HTTP request; `get`, `query` and `body` are read.
 * @param {object} res - HTTP response; `sendStatus` and `redirect` are called.
 * @returns {Promise<*>}
 */
async function preview(req, res) {
  const functionStartTime = Date.now();
  // Grabbing execution and trace ids to attach logs to the appropriate function call
  const execution_id = req.get('function-execution-id');
  const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
  // `buildconsole` is not defined in this file. Use it when the runtime
  // provides it and fall back to the global console otherwise, instead of
  // failing every request with a ReferenceError.
  const logger = typeof buildconsole === 'function'
    ? buildconsole('cloudfunctions.googleapis.com%2Fcloud-functions', {
      trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
      labels: {
        execution_id: execution_id,
      },
    })
    : console;

  if (!process.env.PREVIEW_IMAGE_BUCKET) {
    logger.error(`PREVIEW_IMAGE_BUCKET not set`)
    return res.sendStatus(500);
  }

  const urlStr = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
  // getUrl throws for a missing or rejected URL; turn that into the 400
  // response below instead of an unhandled rejection.
  let url;
  let urlError;
  try {
    url = getUrl(urlStr);
  } catch (error) {
    urlError = error.message;
  }
  logger.log('preview request url', url);

  const logRecord = {
    url,
    query: req.query,
    origin: req.get('Origin'),
    labels: {
      source: 'publicImagePreview',
    },
  };

  logger.info(`Public preview image generation request`, logRecord);

  if (!url) {
    logRecord.urlIsInvalid = true;
    if (urlError) {
      logRecord.error = urlError;
    }
    logger.error(`Valid URL to parse is not specified`, logRecord);
    return res.sendStatus(400);
  }
  const { origin } = new URL(url);
  if (!ALLOWED_ORIGINS.some(o => o === origin)) {
    logRecord.forbiddenOrigin = true;
    logger.error(`This origin is not allowed: ${origin}`, logRecord);
    return res.sendStatus(400);
  }

  const browser = await getBrowserPromise;
  logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };

  const page = await browser.newPage();
  let destination;
  // The finally below closes the page on every exit path (early 400
  // responses, thrown errors and success) so pages are never leaked.
  try {
    const pageLoadingStart = Date.now();
    const modifiedUrl = new URL(url);
    modifiedUrl.searchParams.append('fontSize', '24');
    modifiedUrl.searchParams.append('adjustAspectRatio', '1.91');
    try {
      await page.goto(modifiedUrl.toString());
      logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart };
    } catch (error) {
      logger.log('error going to page: ', modifiedUrl)
      logger.log(error)
      throw error
    }

    // We lookup the destination path from our own page content and avoid trusting any passed query params
    // selector - CSS selector of the element to get screenshot of
    const selector = await readPreviewMeta(page, 'omnivore:preview_image_selector');
    if (!selector) {
      logRecord.selectorIsInvalid = true;
      logger.error(`Valid element selector is not specified`, logRecord);
      return res.sendStatus(400);
    }
    logRecord.selector = selector;

    // destination - destination pathname for the image to save with
    destination = await readPreviewMeta(page, 'omnivore:preview_image_destination');
    if (!destination) {
      logRecord.destinationIsInvalid = true;
      logger.error(`Valid file destination is not specified`, logRecord);
      return res.sendStatus(400);
    }
    logRecord.destination = destination;

    const screenshotTakingStart = Date.now();
    try {
      await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load
    } catch (error) {
      logRecord.elementNotFound = true;
      logger.error(`Element is not presented on the page`, logRecord);
      return res.sendStatus(400);
    }
    const element = await page.$(selector);
    await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer
    logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart };
  } finally {
    try {
      await page.close();
    } catch (error) {
      // A failed close must not replace the response or the original error.
      logger.error('error closing preview page', error);
    }
  }

  try {
    const [file] = await previewBucket.upload(filePath, {
      destination,
      metadata: logRecord,
    });
    logRecord.file = file.metadata;
  } catch (e) {
    logger.log('error uploading to bucket, this is non-fatal', e)
  }

  logger.info(`preview-image`, logRecord);
  return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
}

// Options passed to DOMPurify.sanitize in getPurifiedContent.
const DOM_PURIFY_CONFIG = {
  // Allow iframes through; domPurifySanitizeHook then removes every iframe
  // that is not a YouTube embed.
  ADD_TAGS: ['iframe'],
  // Extra attributes to keep (embed-related iframe attributes).
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  // data-* attributes that are always stripped.
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}

/**
 * DOMPurify `uponSanitizeElement` hook that keeps only YouTube embed iframes.
 *
 * An iframe whose `src` is a YouTube embed is left untouched; one whose
 * `data-src` is a YouTube embed gets that value copied into `src`; any other
 * iframe is removed from its parent. Non-iframe elements are ignored.
 *
 * @param {object} node - Element being sanitized.
 * @param {{tagName: string}} data - Hook data supplied by DOMPurify.
 */
function domPurifySanitizeHook(node, data) {
  if (data.tagName !== 'iframe') {
    return
  }

  const youtubeEmbedPattern = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
  const isYoutubeEmbed = (value) => Boolean(value) && youtubeEmbedPattern.test(value)

  const currentSrc = node.getAttribute('src') || ''
  const lazySrc = node.getAttribute('data-src') || ''

  if (isYoutubeEmbed(currentSrc)) {
    return
  }

  if (isYoutubeEmbed(lazySrc)) {
    // Promote the lazily loaded source so the embed actually renders.
    node.setAttribute('src', lazySrc)
    return
  }

  node.parentNode?.removeChild(node)
}

/**
 * Sanitizes content with DOMPurify (using DOM_PURIFY_CONFIG and the iframe
 * hook) and parses the sanitized output into a new document.
 *
 * @param {*} html - Content handed to DOMPurify.sanitize.
 * @returns {object} Document parsed from the sanitized output.
 */
function getPurifiedContent(html) {
  // A fresh window per call keeps the DOMPurify instance and its hook
  // isolated from other calls.
  const purifier = createDOMPurify(parseHTML(''))
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)

  const sanitized = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  const { document: cleanDocument } = parseHTML(sanitized)
  return cleanDocument
}

/**
 * Signs a URL for the image proxy: HMAC-SHA256 keyed with
 * IMAGE_PROXY_SECRET, passed through the URL-safe base64 `encode` helper.
 *
 * @param {string} url - Text to sign.
 * @returns {string} Encoded signature.
 */
function signImageProxyUrl(url) {
  const hmac = crypto.createHmac('sha256', process.env.IMAGE_PROXY_SECRET)
  hmac.update(url)
  return encode(hmac.digest())
}

/**
 * Builds the image proxy URL for an image.
 *
 * @param {string} url - Original image URL.
 * @param {number} [width=0] - Requested width.
 * @param {number} [height=0] - Requested height.
 * @returns {string} `<proxy>/<width>x<height>,s<signature>/<url>`, or the
 *   original URL when IMAGE_PROXY_URL or IMAGE_PROXY_SECRET is not set.
 */
function createImageProxyUrl(url, width = 0, height = 0) {
  const { IMAGE_PROXY_URL: proxyBase, IMAGE_PROXY_SECRET: proxySecret } = process.env
  if (!(proxyBase && proxySecret)) {
    return url
  }

  // The signature covers the URL together with the requested dimensions.
  const dimensions = `${width}x${height}`
  const signature = signImageProxyUrl(`${url}#${dimensions}`)

  return `${proxyBase}/${dimensions},s${signature}/${url}`
}

/**
 * Extracts the readable article from a document.
 *
 * Readability is run on the document as given first; when that yields
 * nothing (or throws) it is run again on a purified copy.
 *
 * @param {string} url - URL of the page, forwarded to Readability.
 * @param {object} document - Parsed document to read.
 * @returns {Promise<object|null>} The parsed article, or null when neither
 *   attempt produced one.
 */
async function getReadabilityResult(url, document) {
  // Lazily evaluated so the purified copy is only built when the first
  // attempt did not produce an article.
  const candidateFactories = [
    () => document,
    () => getPurifiedContent(document),
  ]

  for (const produceCandidate of candidateFactories) {
    const candidate = produceCandidate()
    if (candidate) {
      try {
        const reader = new Readability(candidate, { createImageProxyUrl, url })
        const article = await reader.parse()
        if (article) {
          return article
        }
      } catch (error) {
        console.log('parsing error for url', url, error)
      }
    }
  }

  return null
}

// Public entry points of this module: the two HTTP handlers defined above.
module.exports = {
  fetchContent,
  preview,
};