var chai = require("chai"); var sinon = require("sinon"); var chaiAsPromised = require("chai-as-promised"); const { parseHTML } = require("linkedom"); const nock = require("nock"); chai.use(chaiAsPromised); chai.config.includeStack = true; var expect = chai.expect; var Readability = require("../index").Readability; var JSDOMParser = require("../JSDOMParser"); var prettyPrint = require("./utils").prettyPrint; const isOmnivore = process.env.IS_OMNIVORE; var testPages = require("./utils").getTestPages(isOmnivore); function reformatError(err) { var formattedError = new Error(err.message); formattedError.stack = err.stack; return formattedError; } function inOrderTraverse(fromNode) { if (fromNode.firstChild) { return fromNode.firstChild; } while (fromNode && !fromNode.nextSibling) { fromNode = fromNode.parentNode; } return fromNode ? fromNode.nextSibling : null; } function inOrderIgnoreEmptyTextNodes(fromNode) { do { fromNode = inOrderTraverse(fromNode); } while (fromNode && fromNode.nodeType == 3 && !fromNode.textContent.trim()); return fromNode; } function traverseDOM(callback, expectedDOM, actualDOM) { var actualNode = actualDOM.documentElement || actualDOM.childNodes[0]; var expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0]; while (actualNode || expectedNode) { // We'll stop if we don't have both actualNode and expectedNode if (!callback(actualNode, expectedNode)) { break; } actualNode = inOrderIgnoreEmptyTextNodes(actualNode); expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode); } } // Collapse subsequent whitespace like HTML: function htmlTransform(str) { return str.replace(/\s+/g, " "); } function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata, uri) { describe(label, function() { this.timeout(30000); var result; before(async function() { try { var doc = domGenerationFn(source); // Provide one class name to preserve, which we know appears in a few // of the test documents. var myReader = new Readability(doc, { classesToPreserve: ["caption"], url: uri }); result = await myReader.parse(); } catch (err) { throw reformatError(err); } }); it("should return a result object", function() { expect(result).to.include.keys("content", "title", "excerpt", "byline"); }); it("should extract expected content", function() { function nodeStr(n) { if (!n) { return "(no node)"; } if (n.nodeType == 3) { return "#text(" + htmlTransform(n.textContent) + ")"; } if (n.nodeType != 1) { return "some other node type: " + n.nodeType + " with data " + n.data; } var rv = n.localName; if (n.id) { rv += "#" + n.id; } if (n.className) { rv += ".(" + n.className + ")"; } return rv; } function genPath(node) { if (node.id) { return "#" + node.id; } if (node.tagName == "BODY") { return "body"; } var parent = node.parentNode; var parentPath = genPath(parent); var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1; return parentPath + " > " + nodeStr(node) + ":nth-child(" + index + ")"; } function findableNodeDesc(node) { return genPath(node) + "(in: ``" + node.parentNode.innerHTML + "``)"; } function attributesForNode(node) { return Array.from(node.attributes).map(function(attr) { return attr.name + "=" + attr.value; }).join(","); } var actualDOM = domGenerationFn(prettyPrint(result.content)); var expectedDOM = domGenerationFn(prettyPrint(expectedContent)); traverseDOM(function(actualNode, expectedNode) { if (actualNode && expectedNode) { var actualDesc = nodeStr(actualNode); var expectedDesc = nodeStr(expectedNode); if (actualDesc != expectedDesc) { expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc); return false; } // Compare text for text nodes: if (actualNode.nodeType == 3) { var actualText = htmlTransform(actualNode.textContent); var expectedText = htmlTransform(expectedNode.textContent); expect(actualText, findableNodeDesc(actualNode)).eql(expectedText); if (actualText != expectedText) { return false; } // Compare attributes for element nodes: } else if (actualNode.nodeType == 1) { var actualNodeDesc = attributesForNode(actualNode); var expectedNodeDesc = attributesForNode(expectedNode); var desc = "node " + nodeStr(actualNode) + " attributes (" + actualNodeDesc + ") should match (" + expectedNodeDesc + ") "; expect(actualNode.attributes.length, desc).eql(expectedNode.attributes.length); for (var i = 0; i < actualNode.attributes.length; i++) { var attr = actualNode.attributes[i].name; var actualValue = actualNode.getAttribute(attr); var expectedValue = expectedNode.getAttribute(attr); expect(expectedValue, "node (" + findableNodeDesc(actualNode) + ") attribute " + attr + " should match").eql(actualValue); } } } else { expect(nodeStr(actualNode), "Should have a node from both DOMs").eql(nodeStr(expectedNode)); return false; } return true; }, actualDOM, expectedDOM); }); it("should extract expected title", function() { expect(result.title).eql(expectedMetadata.title); }); it("should extract expected byline", function() { expect(result.byline).eql(expectedMetadata.byline); }); it("should extract expected excerpt", function() { expect(result.excerpt).eql(expectedMetadata.excerpt); }); it("should extract expected site name", function() { expect(result.siteName).eql(expectedMetadata.siteName); }); expectedMetadata.dir && it("should extract expected direction", function() { expect(result.dir).eql(expectedMetadata.dir); }); }); } function removeCommentNodesRecursively(node) { for (var i = node.childNodes.length - 1; i >= 0; i--) { var child = node.childNodes[i]; if (child.nodeType === child.COMMENT_NODE) { node.removeChild(child); } else if (child.nodeType === child.ELEMENT_NODE) { removeCommentNodesRecursively(child); } } } describe("Readability API", function() { describe("#constructor", function() { var doc = new JSDOMParser().parse("