dockerfile/examples/omnivore/content-fetch/content-handler/test/ars-technica.test.ts

83 lines
2.6 KiB
TypeScript
Raw Permalink Normal View History

2024-03-15 14:52:38 +08:00
import { ArsTechnicaHandler } from '../src/websites/ars-technica-handler'
import fs from 'fs'
import nock from 'nock'
import { expect } from 'chai'
import { parseHTML } from 'linkedom'
/**
 * Test suite for `ArsTechnicaHandler.preHandle` on a multi-page article.
 *
 * Three saved HTML pages are served through nock at
 * `https://arstechnica.com/article/`, `/article/2/` and `/article/3/`, so the
 * handler never touches the network. Each test calls `preHandle` on the first
 * page URL and inspects the returned `title` / `dom`.
 */
describe('Testing parsing multi-page articles from arstechnica.', () => {
  const ARS_ORIGIN = 'https://arstechnica.com'
  const ARTICLE_URL = `${ARS_ORIGIN}/article/`

  // Assigned once in `before`; a failed fixture read throws there and aborts
  // the suite, so the tests can treat these as always defined.
  let originalArticle: Document
  let htmlPg1: string
  let htmlPg2: string
  let htmlPg3: string

  /** Reads a fixture file from disk as UTF-8 text. */
  const load = (path: string): string => fs.readFileSync(path, 'utf8')

  before(() => {
    htmlPg1 = load('./test/data/ars-multipage/ars-technica-page-1.html')
    htmlPg2 = load('./test/data/ars-multipage/ars-technica-page-2.html')
    htmlPg3 = load('./test/data/ars-multipage/ars-technica-page-3.html')
    // Untouched parse of page 1, used as the "before" baseline in assertions.
    originalArticle = parseHTML(htmlPg1).document
  })

  beforeEach(() => {
    // Interceptors are re-registered before every test and cleared in
    // `afterEach`, so each test starts with a fresh set of three.
    nock(ARS_ORIGIN)
      .get('/article/')
      .reply(200, htmlPg1)
      .get('/article/2/')
      .reply(200, htmlPg2)
      .get('/article/3/')
      .reply(200, htmlPg3)
  })

  afterEach(() => {
    nock.cleanAll()
  })

  it('should parse the title of the Ars Technica article.', async () => {
    const response = await new ArsTechnicaHandler().preHandle(ARTICLE_URL)

    // We grab the title from the document. An exact match also proves it is
    // defined, so no separate "not undefined" check is needed.
    expect(response.title).to.equal(
      'Whats going on with the reports of a room-temperature superconductor? | Ars Technica'
    )
  })

  it('should remove the navigation links', async () => {
    const response = await new ArsTechnicaHandler().preHandle(ARTICLE_URL)

    // The pagination nav is present in the raw fixture...
    expect(originalArticle.querySelector('nav.page-numbers')).not.to.be.null
    // ...and gone from the handled DOM. If `dom` were missing the length would
    // be `undefined`, which also fails this check.
    expect(
      response.dom?.querySelectorAll('nav.page-numbers').length
    ).to.equal(0)
  })

  it('should append all new content into the main article', async () => {
    const response = await new ArsTechnicaHandler().preHandle(ARTICLE_URL)

    // We name the div to ensure we can validate that it has been inserted:
    // the class is absent from the raw fixture and present after handling.
    expect(
      originalArticle.getElementsByClassName('nextPageContents').length
    ).to.equal(0)
    expect(
      response.dom?.getElementsByClassName('nextPageContents').length ?? 0
    ).not.to.equal(0)
  })

  it('should remove any related content links.', async () => {
    const response = await new ArsTechnicaHandler().preHandle(ARTICLE_URL)

    // NOTE(review): this only checks the class is absent after handling; it
    // does not verify the class exists in the fixture to begin with — confirm
    // the selector is the right one for Ars Technica pages.
    expect(
      response.dom?.getElementsByClassName(
        'ArticleRelatedContentModule_root__BBa6g'
      ).length
    ).to.equal(0)
  })
})