dockerfile/examples/omnivore/content-fetch/content-handler/test/ars-technica.test.ts

83 lines
2.6 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { ArsTechnicaHandler } from '../src/websites/ars-technica-handler'
import fs from 'fs'
import nock from 'nock'
import { expect } from 'chai'
import { parseHTML } from 'linkedom'
/**
 * Exercises `ArsTechnicaHandler.preHandle` against a three-page article
 * fixture stored under `./test/data/ars-multipage/`.
 *
 * The three fixture pages are served through `nock` interceptors on
 * `https://arstechnica.com`, at `/article/`, `/article/2/` and `/article/3/`.
 * Every test calls `preHandle` with the first-page URL and inspects the
 * returned `title` / `dom`.
 */
describe('Testing parsing multi-page articles from arstechnica.', () => {
  // URL of the first page; it matches the `/article/` interceptor below.
  const articleUrl = 'https://arstechnica.com/article/'

  // Fixtures are assigned exactly once in `before`. Declaring them as
  // non-nullable removes the need for `!` / `?.` at every use and makes a
  // missing fixture fail loudly instead of silently comparing `undefined`.
  let originalArticle: Document
  let htmlPg1: string
  let htmlPg2: string
  let htmlPg3: string

  // Reads a fixture as UTF-8 text. Relative paths are resolved by Node against
  // the process working directory, not against this file's location.
  const load = (path: string): string => fs.readFileSync(path, 'utf8')

  before(() => {
    htmlPg1 = load('./test/data/ars-multipage/ars-technica-page-1.html')
    htmlPg2 = load('./test/data/ars-multipage/ars-technica-page-2.html')
    htmlPg3 = load('./test/data/ars-multipage/ars-technica-page-3.html')
    // Untouched parse of page 1, used as the "before" baseline in assertions.
    originalArticle = parseHTML(htmlPg1).document
  })

  beforeEach(() => {
    // Interceptors are re-registered for every test because `afterEach`
    // clears them all.
    nock('https://arstechnica.com').get('/article/').reply(200, htmlPg1)
    nock('https://arstechnica.com').get('/article/2/').reply(200, htmlPg2)
    nock('https://arstechnica.com').get('/article/3/').reply(200, htmlPg3)
  })

  afterEach(() => {
    // Drop any interceptors a test did not use so they cannot leak into the
    // next test.
    nock.cleanAll()
  })

  it('should parse the title of the Ars Technica article.', async () => {
    const response = await new ArsTechnicaHandler().preHandle(articleUrl)

    // The title is taken from the document. A strict `equal` against a string
    // already fails on `undefined`, so no separate undefined check is needed.
    // NOTE(review): the expected text has no apostrophe in "Whats" - confirm
    // it matches the fixture's title byte-for-byte.
    expect(response.title).to.equal(
      'Whats going on with the reports of a room-temperature superconductor? | Ars Technica'
    )
  })

  it('should remove the navigation links', async () => {
    const response = await new ArsTechnicaHandler().preHandle(articleUrl)

    // Precondition: the raw fixture does contain the pagination nav.
    expect(originalArticle.querySelector('nav.page-numbers')).not.to.be.null
    // If `dom` is missing the length is `undefined`, which also fails here.
    expect(response.dom?.querySelectorAll('nav.page-numbers').length).to.equal(
      0
    )
  })

  it('should append all new content into the main article', async () => {
    const response = await new ArsTechnicaHandler().preHandle(articleUrl)

    // The inserted container carries the `nextPageContents` class so its
    // presence can be checked: absent in the raw fixture, present afterwards.
    expect(
      originalArticle.getElementsByClassName('nextPageContents').length
    ).to.equal(0)
    // A missing `dom` counts as zero inserted containers and fails the test.
    expect(
      response.dom?.getElementsByClassName('nextPageContents').length ?? 0
    ).not.to.equal(0)
  })

  it('should remove any related content links.', async () => {
    const response = await new ArsTechnicaHandler().preHandle(articleUrl)

    // This exists in the fixture HTML, but is expected to be gone after
    // pre-handling.
    expect(
      response.dom?.getElementsByClassName(
        'ArticleRelatedContentModule_root__BBa6g'
      ).length
    ).to.eql(0)
  })
})