dockerfile/examples/omnivore/content-fetch/readabilityjs/test/test-readability.js

364 lines
15 KiB
JavaScript

var chai = require("chai");
var sinon = require("sinon");
var chaiAsPromised = require("chai-as-promised");
const { parseHTML } = require("linkedom");
const nock = require("nock");
chai.use(chaiAsPromised);
chai.config.includeStack = true;
var expect = chai.expect;
var Readability = require("../index").Readability;
var JSDOMParser = require("../JSDOMParser");
var prettyPrint = require("./utils").prettyPrint;
const isOmnivore = process.env.IS_OMNIVORE;
var testPages = require("./utils").getTestPages(isOmnivore);
/**
 * Builds a fresh Error that carries over the message and stack of `err`.
 *
 * @param {Error} err - The error to copy from.
 * @returns {Error} A new plain Error with the same `message` and `stack`.
 */
function reformatError(err) {
  return Object.assign(new Error(err.message), { stack: err.stack });
}
/**
 * Returns the node that follows `fromNode` in document order.
 *
 * The first child wins if there is one; otherwise the next sibling of the
 * closest node on the ancestor chain (starting with `fromNode` itself) that
 * has one.
 *
 * @param {Node} fromNode - Node to step from.
 * @returns {Node|null} The following node, or null when the walk is finished.
 */
function inOrderTraverse(fromNode) {
  if (fromNode.firstChild) {
    return fromNode.firstChild;
  }
  // Climb until some node on the way up has a sibling to continue with.
  for (var cursor = fromNode; cursor; cursor = cursor.parentNode) {
    if (cursor.nextSibling) {
      return cursor.nextSibling;
    }
  }
  return null;
}
/**
 * Steps to the next node in document order, skipping text nodes whose
 * content is empty or whitespace only.
 *
 * @param {Node} fromNode - Node to step from.
 * @returns {Node|null} The next significant node, or null at the end.
 */
function inOrderIgnoreEmptyTextNodes(fromNode) {
  var TEXT_NODE = 3;
  var candidate = inOrderTraverse(fromNode);
  while (candidate && candidate.nodeType == TEXT_NODE && !candidate.textContent.trim()) {
    candidate = inOrderTraverse(candidate);
  }
  return candidate;
}
/**
 * Walks two DOMs in lockstep, handing each pair of nodes to `callback`.
 *
 * Note the argument order: the expected DOM comes before the actual DOM,
 * while the callback receives the actual node first.
 *
 * @param {function(Node, Node): boolean} callback - Called as
 *   `callback(actualNode, expectedNode)`; a falsy return stops the walk.
 * @param {Document} expectedDOM - DOM holding the expected tree.
 * @param {Document} actualDOM - DOM holding the tree under test.
 */
function traverseDOM(callback, expectedDOM, actualDOM) {
  function startNode(dom) {
    return dom.documentElement || dom.childNodes[0];
  }

  var actualNode = startNode(actualDOM);
  var expectedNode = startNode(expectedDOM);

  // The callback is still invoked when only one side has a node left, so it
  // can report the mismatch; it is expected to return false in that case.
  while ((actualNode || expectedNode) && callback(actualNode, expectedNode)) {
    actualNode = inOrderIgnoreEmptyTextNodes(actualNode);
    expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode);
  }
}
/**
 * Collapses every run of whitespace into a single space, the way HTML
 * rendering does.
 *
 * @param {string} str - Text to normalize.
 * @returns {string} The text with whitespace runs replaced by one space.
 */
function htmlTransform(str) {
  var pieces = str.split(/\s+/);
  return pieces.join(" ");
}
/**
 * Registers a mocha suite that runs Readability over one test page and checks
 * the extracted content and metadata against the expected values.
 *
 * @param {string} label - Title of the suite.
 * @param {function(string): Document} domGenerationFn - Builds a DOM document
 *   from an HTML string; used for the source page and for both content trees.
 * @param {string} source - HTML of the page to parse.
 * @param {string} expectedContent - Expected article HTML.
 * @param {Object} expectedMetadata - Expected `title`, `byline`, `excerpt`,
 *   `siteName` and, optionally, `dir`.
 * @param {string} uri - Passed to Readability as its `url` option.
 */
function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata, uri) {
  describe(label, function() {
    this.timeout(30000);

    var result;

    before(async function() {
      try {
        var doc = domGenerationFn(source);
        // Provide one class name to preserve, which we know appears in a few
        // of the test documents.
        var myReader = new Readability(doc, { classesToPreserve: ["caption"], url: uri });
        result = await myReader.parse();
      } catch (err) {
        throw reformatError(err);
      }
    });

    it("should return a result object", function() {
      expect(result).to.include.keys("content", "title", "excerpt", "byline");
    });

    it("should extract expected content", function() {
      // Short human-readable description of a node for failure messages.
      function nodeStr(n) {
        if (!n) {
          return "(no node)";
        }
        if (n.nodeType === 3) {
          return "#text(" + htmlTransform(n.textContent) + ")";
        }
        if (n.nodeType !== 1) {
          return "some other node type: " + n.nodeType + " with data " + n.data;
        }
        var rv = n.localName;
        if (n.id) {
          rv += "#" + n.id;
        }
        if (n.className) {
          rv += ".(" + n.className + ")";
        }
        return rv;
      }

      // Selector-like path from the nearest ancestor with an id (or BODY).
      function genPath(node) {
        if (node.id) {
          return "#" + node.id;
        }
        if (node.tagName == "BODY") {
          return "body";
        }
        var parent = node.parentNode;
        if (!parent) {
          // Ran out of ancestors without meeting an id or BODY; stop here
          // instead of recursing into a missing parent.
          return nodeStr(node);
        }
        var parentPath = genPath(parent);
        var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1;
        return parentPath + " > " + nodeStr(node) + ":nth-child(" + index + ")";
      }

      function findableNodeDesc(node) {
        return genPath(node) + "(in: ``" + node.parentNode.innerHTML + "``)";
      }

      function attributesForNode(node) {
        return Array.from(node.attributes).map(function(attr) {
          return attr.name + "=" + attr.value;
        }).join(",");
      }

      var actualDOM = domGenerationFn(prettyPrint(result.content));
      var expectedDOM = domGenerationFn(prettyPrint(expectedContent));

      traverseDOM(function(actualNode, expectedNode) {
        if (!actualNode || !expectedNode) {
          // One tree ended before the other.
          expect(nodeStr(actualNode), "Should have a node from both DOMs").eql(nodeStr(expectedNode));
          return false;
        }

        var actualDesc = nodeStr(actualNode);
        var expectedDesc = nodeStr(expectedNode);
        if (actualDesc != expectedDesc) {
          expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc);
          return false;
        }

        if (actualNode.nodeType === 3) {
          // Compare text for text nodes:
          var actualText = htmlTransform(actualNode.textContent);
          var expectedText = htmlTransform(expectedNode.textContent);
          expect(actualText, findableNodeDesc(actualNode)).eql(expectedText);
          if (actualText != expectedText) {
            return false;
          }
        } else if (actualNode.nodeType === 1) {
          // Compare attributes for element nodes:
          var actualNodeDesc = attributesForNode(actualNode);
          var expectedNodeDesc = attributesForNode(expectedNode);
          var desc = "node " + nodeStr(actualNode) + " attributes (" + actualNodeDesc + ") should match (" + expectedNodeDesc + ") ";
          expect(actualNode.attributes.length, desc).eql(expectedNode.attributes.length);
          for (var i = 0; i < actualNode.attributes.length; i++) {
            var attr = actualNode.attributes[i].name;
            var actualValue = actualNode.getAttribute(attr);
            var expectedValue = expectedNode.getAttribute(attr);
            // The value under test goes into expect(); the fixture value into eql().
            expect(actualValue, "node (" + findableNodeDesc(actualNode) + ") attribute " + attr + " should match").eql(expectedValue);
          }
        }
        return true;
      // traverseDOM takes the expected DOM first, then the actual DOM.
      }, expectedDOM, actualDOM);
    });

    it("should extract expected title", function() {
      expect(result.title).eql(expectedMetadata.title);
    });

    it("should extract expected byline", function() {
      expect(result.byline).eql(expectedMetadata.byline);
    });

    it("should extract expected excerpt", function() {
      expect(result.excerpt).eql(expectedMetadata.excerpt);
    });

    it("should extract expected site name", function() {
      expect(result.siteName).eql(expectedMetadata.siteName);
    });

    // The direction test is only registered when the metadata specifies one.
    if (expectedMetadata.dir) {
      it("should extract expected direction", function() {
        expect(result.dir).eql(expectedMetadata.dir);
      });
    }
  });
}
/**
 * Strips every comment node from the subtree rooted at `node`, in place.
 *
 * @param {Node} node - Root of the subtree to clean; only element children
 *   are descended into.
 */
function removeCommentNodesRecursively(node) {
  // Work from a snapshot, last child first, so removals cannot shift the
  // children that are still to be visited.
  var children = Array.from(node.childNodes).reverse();
  children.forEach(function(child) {
    if (child.nodeType === child.COMMENT_NODE) {
      node.removeChild(child);
      return;
    }
    if (child.nodeType === child.ELEMENT_NODE) {
      removeCommentNodesRecursively(child);
    }
  });
}
// Unit tests for the Readability constructor options and for parse().
describe("Readability API", function() {
  describe("#constructor", function() {
    var doc = new JSDOMParser().parse("<html><div>yo</div></html>");

    it("should accept a debug option", function() {
      expect(new Readability(doc)._debug).eql(false);
      expect(new Readability(doc, {debug: true})._debug).eql(true);
    });

    it("should accept a nbTopCandidates option", function() {
      expect(new Readability(doc)._nbTopCandidates).eql(5);
      expect(new Readability(doc, {nbTopCandidates: 42})._nbTopCandidates).eql(42);
    });

    it("should accept a maxElemsToParse option", function() {
      expect(new Readability(doc)._maxElemsToParse).eql(0);
      expect(new Readability(doc, {maxElemsToParse: 42})._maxElemsToParse).eql(42);
    });

    it("should accept a keepClasses option", function() {
      expect(new Readability(doc)._keepClasses).eql(false);
      expect(new Readability(doc, {keepClasses: true})._keepClasses).eql(true);
      expect(new Readability(doc, {keepClasses: false})._keepClasses).eql(false);
    });
  });

  describe("#parse", function() {
    // The first test page serves as a generic document for the class-cleaning tests.
    var exampleSource = testPages[0].source;

    it("shouldn't parse oversized documents as per configuration", async function() {
      var doc = new JSDOMParser().parse("<html><div>yo</div></html>");
      await expect(
        (new Readability(doc, { maxElemsToParse: 1 })).parse()
      ).to.be.rejectedWith("Aborting parsing document; 2 elements found");
    });

    it("should run _cleanClasses with default configuration", async function() {
      var doc = parseHTML(exampleSource).document;
      var parser = new Readability(doc);
      parser._cleanClasses = sinon.fake();
      await parser.parse();
      expect(parser._cleanClasses.called).eql(true);
    });

    it("should run _cleanClasses when option keepClasses = false", async function() {
      var doc = parseHTML(exampleSource).document;
      var parser = new Readability(doc, { keepClasses: false });
      parser._cleanClasses = sinon.fake();
      await parser.parse();
      expect(parser._cleanClasses.called).eql(true);
    });

    it("shouldn't run _cleanClasses when option keepClasses = true", async function() {
      var doc = parseHTML(exampleSource).document;
      var parser = new Readability(doc, { keepClasses: true });
      parser._cleanClasses = sinon.fake();
      await parser.parse();
      expect(parser._cleanClasses.called).eql(false);
    });

    // Currently skipped (xit).
    xit("should use custom content serializer sent as option", async function() {
      var dom = parseHTML("<html><body>My cat: <img src=''></body></html>");
      var expected_xhtml = "<div xmlns=\"http://www.w3.org/1999/xhtml\" id=\"readability-page-1\" class=\"page\">My cat: <img src=\"\" /></div>";
      var xml = new dom.window.XMLSerializer();
      // parse() is awaited before reading `content`; reading it off the
      // pending promise would always yield undefined.
      var content = (await (new Readability(dom.window.document, {
        serializer: function(el) {
          return xml.serializeToString(el.firstChild);
        }
      })).parse()).content;
      expect(content).eql(expected_xhtml);
    });

    it("should not proxy image with data uri", async function() {
      var dom = parseHTML("<html><body>My cat: <img src=\"data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAAUA" +
        "AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==\"" +
        " alt=\"Red dot\" /></body></html>");
      var expected_xhtml = "<DIV class=\"page\" id=\"readability-page-1\">My cat: <img src=\"data:image/png;base64," +
        " iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0" +
        "Y4OHwAAAABJRU5ErkJggg==\" alt=\"Red dot\"></DIV>";
      var content = (await (new Readability(dom.document)).parse()).content;
      expect(content).eql(expected_xhtml);
    });

    it("should handle srcset elements with density descriptors", async function() {
      var dom = parseHTML('<html><body>My image: <img src="https://webkit.org/demos/srcset/image-src.png" ' +
        'srcset="https://webkit.org/demos/srcset/image-1x.png 1x, ' +
        'https://webkit.org/demos/srcset/image-2x.png 2x, ' +
        'https://webkit.org/demos/srcset/image-3x.png 3x, ' +
        'https://webkit.org/demos/srcset/image-4x.png 4x">' +
        '</body></html>');
      var expected_xhtml = '<DIV class="page" id="readability-page-1">My image: ' +
        '<img data-omnivore-original-src="https://webkit.org/demos/srcset/image-src.png" ' +
        'src="https://webkit.org/demos/srcset/image-src.png" ' +
        'srcset="https://webkit.org/demos/srcset/image-1x.png 1x,' +
        'https://webkit.org/demos/srcset/image-2x.png 2x,' +
        'https://webkit.org/demos/srcset/image-3x.png 3x,' +
        'https://webkit.org/demos/srcset/image-4x.png 4x,"></DIV>';
      // Identity proxy: image URLs are returned unchanged.
      var content = (await (new Readability(dom.document, {
        createImageProxyUrl: function(url) {
          return url;
        }
      })).parse()).content;
      expect(content).eql(expected_xhtml);
    });

    it("should remove srcset elements that are lazy loading placeholders", async function() {
      var dom = parseHTML('<html><body>My image: <img class="shrinkToFit jetpack-lazy-image" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&#038;ssl=1" alt width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&amp;is-pending-load=1#038;ssl=1" srcset=""></body></html>');
      var expected_xhtml = '<DIV class="page" id="readability-page-1">' +
        'My image: <img data-omnivore-original-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" alt="" width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1">' +
        '</DIV>';
      // Identity proxy: image URLs are returned unchanged.
      var content = (await (new Readability(dom.document, {
        createImageProxyUrl: function(url) {
          return url;
        }
      })).parse()).content;
      expect(content).eql(expected_xhtml);
    });
  });
});
// Runs the content/metadata suite over every page returned by getTestPages().
describe("Test pages", function() {
  before(function() {
    // mock the substack redirect to twitter
    nock('https://email.mg2.substack.com')
      .get('/c/eJxNkcuO3SAMhp8m7E5EzC1ZsDjT0fQidVG1qtRVRIKToCYQAZmZ9OlLejaVEBhk-__5PJqMc4in3kPK5Nr6fO6oPb6lFXPGSI6EsXdWK8ZAcQXEam6bVrTEpX6KiJtxq87xQLIfw-pGk13wVwWojlNKFo1UMpgYWCrbtoNJmIYh60AMYKxopoewOaxDP6LGV4xn8EjGsG3o89WLrHrJeU8Vu1fwUlZ-c5e9uuSU2_cF1xX9gHHG-PW5vKRs8pFK0AjoWq6KdMNapVoJvGIvpdEz0ApkLsEdPrLz6dP5Pv76LNOfnx_gxxd6J04DBaACZDkphZrVVqnR0nZCaVs6GVFxus1Qp2MoeuPvyw2Jegl-HsJNheC-JX7DkuX8EN5vFrdQh8271xCxNvt-Ieuvbx7e5bNHb4YV7YNmfgzlH99-Ro-xDMv2JutGCiY6yRsGgj_gFURCyk6wRpHixoZS5fXmxsXgmv6n8xdPFKS3')
      .reply(302, '', [
        'location',
        'https://twitter.com/ShellenbergerMD/status/1529847068138778624?s=20&t=A2G3yBHyxcYI6szVC2TJ0A'
      ]);
    nock('https://twitter.com')
      .get('/ShellenbergerMD/status/1529847068138778624')
      .query({"s":"20","t":"A2G3yBHyxcYI6szVC2TJ0A"})
      .reply(200);
  });

  after(function() {
    // Remove the interceptors registered above so they do not stay active
    // for suites that run after this one.
    nock.cleanAll();
  });

  testPages.forEach(function(testPage) {
    describe(testPage.dir, function() {
      var uri = "http://fakehost/test/page.html";

      runTestsWithItems("linkedom", function(source) {
        var doc = parseHTML(source).document;
        removeCommentNodesRecursively(doc);
        return doc;
      }, testPage.source, testPage.expectedContent, testPage.expectedMetadata, uri);

      // The JSDOMParser variant below is disabled.
      // NOTE(review): this call omits the trailing `uri` argument that
      // runTestsWithItems accepts - confirm before re-enabling.
      // runTestsWithItems("JSDOMParser", function(source) {
      // var parser = new JSDOMParser();
      // var doc = parser.parse(source, uri);
      // if (parser.errorState) {
      // console.error("Parsing this DOM caused errors:", parser.errorState);
      // return null;
      // }
      // return doc;
      // }, testPage.source, testPage.expectedContent, testPage.expectedMetadata);
    });
  });
});