新增构建OpenSSL镜像相关文件
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
node_modules
|
||||
build
|
||||
.env*
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
@@ -0,0 +1,4 @@
|
||||
node_modules/
|
||||
dist/
|
||||
readabilityjs/
|
||||
src/generated/
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"extends": "../../.eslintrc",
|
||||
"parserOptions": {
|
||||
"project": "tsconfig.json"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
# This file specifies files that are *not* uploaded to Google Cloud Platform
|
||||
# using gcloud. It follows the same syntax as .gitignore, with the addition of
|
||||
# "#!include" directives (which insert the entries of the given .gitignore-style
|
||||
# file at that point).
|
||||
#
|
||||
# For more information, run:
|
||||
# $ gcloud topic gcloudignore
|
||||
#
|
||||
.gcloudignore
|
||||
# If you would like to upload your .git directory, .gitignore file or files
|
||||
# from your .gitignore file, remove the corresponding line
|
||||
# below:
|
||||
.git
|
||||
.gitignore
|
||||
|
||||
node_modules
|
||||
@@ -0,0 +1,27 @@
|
||||
FROM node:18.16-alpine

# NOTE(review): the comment below promises a non-privileged user, but no USER
# directive is ever issued, so the container runs as root — confirm intent.
# Run everything after as non-privileged user.
WORKDIR /app

# Copy workspace-level manifests and shared config first so the dependency
# install layer is cached independently of source changes.
COPY package.json .
COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .

# Only this workspace member's manifest — keeps the install layer cacheable
# until its dependencies actually change.
COPY /packages/pdf-handler/package.json ./packages/pdf-handler/package.json

RUN yarn install --pure-lockfile

# Bring in the sources and build this workspace.
ADD /packages/pdf-handler ./packages/pdf-handler
RUN yarn workspace @omnivore/pdf-handler build

# After building, fetch the production dependencies
RUN rm -rf /app/packages/pdf-handler/node_modules
RUN rm -rf /app/node_modules
RUN yarn install --pure-lockfile --production

EXPOSE 8080

CMD ["yarn", "workspace", "@omnivore/pdf-handler", "start"]
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"extension": ["ts"],
|
||||
"spec": "test/**/*.test.ts",
|
||||
"require": "test/babel-register.js"
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
{
|
||||
"name": "@omnivore/pdf-handler",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "build/src/index.js",
|
||||
"types": "build/src/index.d.ts",
|
||||
"files": [
|
||||
"build/src"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"keywords": [],
|
||||
"scripts": {
|
||||
"test": "yarn mocha -r ts-node/register --config mocha-config.json",
|
||||
"test:typecheck": "tsc --noEmit",
|
||||
"lint": "eslint src --ext ts,js,tsx,jsx",
|
||||
"compile": "tsc",
|
||||
"build": "tsc",
|
||||
"start": "functions-framework --source=build/src/ --target=pdfHandler",
|
||||
"dev": "concurrently \"tsc -w\" \"nodemon --watch ./build/ --exec npm run start\"",
|
||||
"gcloud-deploy": "gcloud functions deploy pdfHandler --region=$npm_config_region --runtime nodejs14 --trigger-bucket=$npm_config_bucket --env-vars-file=../gcf-shared/env-$npm_config_env.yaml",
|
||||
"deploy": "yarn build && yarn gcloud-deploy"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^14.11.2",
|
||||
"chai": "^4.3.6",
|
||||
"chai-string": "^1.5.0",
|
||||
"mocha": "^10.0.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@google-cloud/functions-framework": "3.1.2",
|
||||
"@google-cloud/pubsub": "^4.0.0",
|
||||
"@google-cloud/storage": "^7.0.1",
|
||||
"@sentry/serverless": "^7.77.0",
|
||||
"axios": "^0.27.2",
|
||||
"concurrently": "^7.0.0",
|
||||
"pdfjs-dist": "^2.9.359"
|
||||
},
|
||||
"volta": {
|
||||
"extends": "../../package.json"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
/* eslint-disable prefer-const */
|
||||
/* eslint-disable @typescript-eslint/restrict-template-expressions */
|
||||
import { Storage } from '@google-cloud/storage'
|
||||
import { parsePdf } from './pdf'
|
||||
import axios from 'axios'
|
||||
|
||||
const storage = new Storage()
|
||||
|
||||
// POST the parsed PDF content (plus optional metadata) to the backend's
// pubsub content endpoint, wrapped in a Pub/Sub push-style envelope.
// Fire-and-forget helper for the one-off backfill script below.
const postUpdate = async (
  fileId: string,
  content: string,
  title?: string,
  author?: string,
  description?: string
) => {
  // SECURITY(review): this URL embeds a long-lived auth token directly in
  // source control. Move it to an environment variable / secret manager and
  // rotate the committed token.
  const url =
    'https://backend-dot-omnivore-production.wl.r.appspot.com/svc/pubsub/content/search?token=aYYLeK0kYlwnQg0wBMHO6EoAjf0LkoQ4Dyx0NGtpdjbh7F52EzHda8'

  // const localUrl =
  //   'http://localhost:4000/svc/pubsub/content/search?token=aYYLeK0kYlwnQg0wBMHO6EoAjf0LkoQ4Dyx0NGtpdjbh7F52EzHda8'

  const data = JSON.stringify({
    fileId,
    content,
    title,
    author,
    description,
  })

  // The endpoint expects the Pub/Sub push format: the JSON payload is
  // base64-encoded under message.data.
  const body = {
    message: {
      data: Buffer.from(data).toString('base64'),
    },
  }

  const res = await axios.post(url, body)
  console.log('res', res.status)
}
|
||||
|
||||
const listFiles = async () => {
|
||||
const res = await storage
|
||||
.bucket('omnivore')
|
||||
.getFiles({ prefix: 'u/', maxResults: 50 })
|
||||
console.log('result', res)
|
||||
|
||||
const [files] = res
|
||||
console.log('Files:')
|
||||
for (const file of files) {
|
||||
const url = file.publicUrl()
|
||||
const [isPublic] = await file.isPublic()
|
||||
console.log(file.publicUrl(), 'is public:', isPublic)
|
||||
if (isPublic) {
|
||||
const parsed = await parsePdf(new URL(url))
|
||||
// console.log(text)
|
||||
// console.log('\n\n')
|
||||
await postUpdate(
|
||||
file.name,
|
||||
parsed.content,
|
||||
parsed.title,
|
||||
parsed.author,
|
||||
parsed.description
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
listFiles().catch(console.error)
|
||||
@@ -0,0 +1,135 @@
|
||||
import { PubSub } from '@google-cloud/pubsub'
|
||||
import { GetSignedUrlConfig, Storage } from '@google-cloud/storage'
|
||||
import * as Sentry from '@sentry/serverless'
|
||||
import { parsePdf } from './pdf'
|
||||
|
||||
// Initialize Sentry error reporting for this Cloud Function. Tracing is
// disabled (tracesSampleRate: 0); only errors are reported.
Sentry.GCPFunction.init({
  dsn: process.env.SENTRY_DSN,
  tracesSampleRate: 0,
})

// Shared GCP clients; credentials come from the runtime environment.
const pubsub = new PubSub()
const storage = new Storage()
// Pub/Sub topic carrying parsed page-content updates to the backend.
const CONTENT_UPDATE_TOPIC = 'updatePageContent'
|
||||
|
||||
interface StorageEventData {
|
||||
bucket: string
|
||||
name: string
|
||||
contentType: string
|
||||
}
|
||||
|
||||
function isStorageEventData(event: any): event is StorageEventData {
|
||||
return 'name' in event && 'bucket' in event && 'contentType' in event
|
||||
}
|
||||
|
||||
// Ensure this is a finalize event and that it is stored in the `u/` directory and is a PDF
|
||||
const shouldHandle = (data: StorageEventData) => {
|
||||
return (
|
||||
data.name.startsWith('u/') &&
|
||||
data.contentType.toLowerCase() === 'application/pdf'
|
||||
)
|
||||
}
|
||||
|
||||
// Create a short-lived V4 signed read URL for the notified object so the PDF
// parser can fetch it without bucket credentials.
// Returns undefined (rather than throwing) when signing fails.
const getDocumentUrl = async (
  data: StorageEventData
): Promise<URL | undefined> => {
  const options: GetSignedUrlConfig = {
    version: 'v4',
    action: 'read',
    // URL is valid for 240 minutes (4 hours) from now.
    expires: Date.now() + 240 * 60 * 1000,
  }

  try {
    const bucket = storage.bucket(data.bucket)
    const file = bucket.file(data.name)
    const [url] = await file.getSignedUrl(options)
    return new URL(url)
  } catch (e) {
    console.debug('error getting signed url', e)
    return undefined
  }
}
|
||||
|
||||
export const updatePageContent = (
|
||||
fileId: string,
|
||||
content: string,
|
||||
title?: string,
|
||||
author?: string,
|
||||
description?: string
|
||||
): Promise<string | undefined> => {
|
||||
return pubsub
|
||||
.topic(CONTENT_UPDATE_TOPIC)
|
||||
.publish(
|
||||
Buffer.from(
|
||||
JSON.stringify({ fileId, content, title, author, description })
|
||||
)
|
||||
)
|
||||
.catch((err) => {
|
||||
console.error('error publishing conentUpdate:', err)
|
||||
return undefined
|
||||
})
|
||||
}
|
||||
|
||||
const getStorageEventData = (
|
||||
pubSubMessage: string
|
||||
): StorageEventData | undefined => {
|
||||
try {
|
||||
const str = Buffer.from(pubSubMessage, 'base64').toString().trim()
|
||||
const obj = JSON.parse(str) as unknown
|
||||
if (isStorageEventData(obj)) {
|
||||
return obj
|
||||
}
|
||||
} catch (err) {
|
||||
console.log('error deserializing event: ', { pubSubMessage, err })
|
||||
}
|
||||
return undefined
|
||||
}
|
||||
|
||||
// HTTP entry point for Pub/Sub push notifications about Cloud Storage object
// changes. Expects the standard push envelope { message: { data: <base64> } }.
// Responds 200 "ok" for messages it ignores (so Pub/Sub does not redeliver),
// 404 when a signed URL cannot be produced, and 500 on processing errors.
export const pdfHandler = Sentry.GCPFunction.wrapHttpFunction(
  async (req, res) => {
    /* eslint-disable @typescript-eslint/no-unsafe-member-access */
    if ('message' in req.body && 'data' in req.body.message) {
      const pubSubMessage = req.body.message.data as string
      const data = getStorageEventData(pubSubMessage)
      if (data) {
        try {
          if (shouldHandle(data)) {
            console.log('handling pdf data', data)

            // A signed URL lets the parser download the (private) object.
            const url = await getDocumentUrl(data)
            console.log('PDF url: ', url)
            if (!url) {
              console.log('Could not fetch PDF', data.bucket, data.name)
              return res.status(404).send('Could not fetch PDF')
            }

            // Extract text + metadata, then publish the update; the publish
            // result is a message id or undefined on failure (logged below).
            const parsed = await parsePdf(url)
            const result = await updatePageContent(
              data.name,
              parsed.content,
              parsed.title,
              parsed.author,
              parsed.description
            )
            console.log(
              'publish result',
              result,
              'title',
              parsed.title,
              'author',
              parsed.author
            )
          } else {
            console.log('not handling pdf data', data)
          }
        } catch (err) {
          console.log('error handling event', { err, data })
          return res.status(500).send('Error handling event')
        }
      }
    } else {
      console.log('no pubsub message')
    }
    res.send('ok')
  }
)
|
||||
@@ -0,0 +1,212 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-call */
|
||||
/* eslint-disable @typescript-eslint/restrict-plus-operands */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-argument */
|
||||
import { getDocument as _getDocument } from 'pdfjs-dist/legacy/build/pdf'
|
||||
import {
|
||||
TextItem,
|
||||
PDFPageProxy,
|
||||
PDFDocumentProxy,
|
||||
} from 'pdfjs-dist/types/display/api'
|
||||
|
||||
// A single page of extracted text, one entry per visual line.
interface Page {
  lines: string[]
}

// Maximum length (in characters) of a title derived from page text when the
// PDF metadata has no Title entry. Used by getDocumentTitle below.
// (Previous comment claimed this was unused — it is not.)
const MAX_TITLE_LENGTH = 95

// Keys read from the PDF metadata "info" dictionary.
type MetadataInfoKey =
  | 'Title'
  | 'Author'
  | 'Subject'
  | 'CreationDate'
  | 'ModDate'

// Shape of the (untyped) info dictionary returned by pdf.js getMetadata().
interface MetadataInfo {
  Title?: string
  Author?: string
  CreationDate?: string
  ModDate?: string
  Subject?: string
}

// Result of parsing a PDF: the full extracted text plus optional metadata.
interface ParsedPdf {
  content: string
  title?: string
  author?: string
  description?: string
}
|
||||
|
||||
export const parsePdf = async (url: URL): Promise<ParsedPdf> => {
|
||||
const documentLoadingTask = _getDocument(url)
|
||||
const document = await documentLoadingTask.promise
|
||||
|
||||
const text = await getDocumentText(document)
|
||||
// eslint-disable-next-line no-control-regex
|
||||
const result: ParsedPdf = { content: text.replace(/\x00/g, '') }
|
||||
|
||||
const title = await getMetadataItem(document, 'Title')
|
||||
if (title) result.title = title
|
||||
|
||||
const author = await getMetadataItem(document, 'Author')
|
||||
if (author) result.author = author
|
||||
|
||||
const description = await getMetadataItem(document, 'Subject')
|
||||
if (description) result.description = description
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
export const getDocument = (source: string): Promise<PDFDocumentProxy> => {
|
||||
const documentLoadingTask = _getDocument(source)
|
||||
return documentLoadingTask.promise
|
||||
}
|
||||
|
||||
const getMetadataItem = async (
|
||||
document: PDFDocumentProxy,
|
||||
key: MetadataInfoKey
|
||||
): Promise<string | undefined> => {
|
||||
return await document
|
||||
.getMetadata()
|
||||
.then((metadata) => metadata.info as MetadataInfo)
|
||||
.then((info) => {
|
||||
return info[key]
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * Best-effort document title: prefer the Title metadata entry; otherwise
 * fall back to the first MAX_TITLE_LENGTH characters of page-1 text.
 */
export const getDocumentTitle = async (
  document: PDFDocumentProxy
): Promise<string | undefined> => {
  const title = await getMetadataItem(document, 'Title')
  if (title) {
    return title
  }

  // Attempt to grab the title from the first page.
  // readPdfText returns newline-joined lines, so the breaks are removed here.
  // NOTE(review): breaks are joined with '' (not ' '); the test fixtures show
  // extracted lines carry trailing spaces, which is what keeps words at line
  // boundaries from running together — confirm that holds generally.
  const pageText = await readPdfText(document, 1)
  if (pageText.length) {
    const result = pageText.substring(0, MAX_TITLE_LENGTH)
    return result.split('\n').join('')
  }

  return undefined
}
|
||||
|
||||
export const getDocumentText = async (
|
||||
document: PDFDocumentProxy
|
||||
): Promise<string> => {
|
||||
const pages = await readPdfText(document)
|
||||
return pages
|
||||
}
|
||||
|
||||
export const readPdfText = async (
|
||||
document: PDFDocumentProxy,
|
||||
maxPages: number | undefined = undefined
|
||||
): Promise<string> => {
|
||||
const pages: Page[] = []
|
||||
const numPages = maxPages || document.numPages
|
||||
|
||||
for (let i = 0; i < numPages; i++) {
|
||||
pages.push(await parsePage(await document.getPage(i + 1)))
|
||||
}
|
||||
|
||||
return pages.reduce((accum, page) => {
|
||||
return accum.concat(page.lines.join('\n') + '\n')
|
||||
}, '')
|
||||
}
|
||||
|
||||
const parsePage = async (pdfPage: PDFPageProxy): Promise<Page> => {
|
||||
const rawContent = await pdfPage.getTextContent()
|
||||
return parsePageItems(
|
||||
rawContent.items.filter((item): item is TextItem => 'str' in item)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses individual text items generated by pdf.js This allows lower level control of what actually
|
||||
* gets parsed. For example, a consumer of this function may remove entire sections of the pdf text
|
||||
* prior to passing items in here. See parsePage function above for example usage.
|
||||
*
|
||||
* @param pdfItems An array of TextItem items.
|
||||
*/
|
||||
const parsePageItems = (pdfItems: TextItem[]): Page => {
|
||||
const lineData: { [y: number]: TextItem[] } = {}
|
||||
|
||||
for (let i = 0; i < pdfItems.length; i++) {
|
||||
const item = pdfItems[i]
|
||||
const y = item.transform[5]
|
||||
/* eslint-disable no-prototype-builtins */
|
||||
if (!lineData.hasOwnProperty(y)) {
|
||||
lineData[y] = []
|
||||
}
|
||||
lineData[y].push(item)
|
||||
}
|
||||
|
||||
const yCoords = Object.keys(lineData)
|
||||
.map((key) => Number(key))
|
||||
// b - a here because the bottom is y = 0 so we want that to be last
|
||||
.sort((a, b) => b - a)
|
||||
// insert an empty line between any 2 lines where their distance is greater than the upper line's height
|
||||
.reduce((accum: number[], currentY, index, array) => {
|
||||
const nextY = array[index + 1]
|
||||
if (nextY != undefined) {
|
||||
const currentLineHeight: number = lineData[currentY].reduce(
|
||||
(finalValue, current) =>
|
||||
finalValue > current.height ? finalValue : current.height,
|
||||
-1
|
||||
)
|
||||
|
||||
// currentY - nextY because currentY will be higher than nextY
|
||||
if (Math.floor((currentY - nextY) / currentLineHeight) > 1) {
|
||||
const newY = currentY - currentLineHeight
|
||||
lineData[newY] = []
|
||||
return accum.concat(currentY, newY)
|
||||
}
|
||||
}
|
||||
return accum.concat(currentY)
|
||||
}, [])
|
||||
|
||||
const lines: string[] = []
|
||||
for (let i = 0; i < yCoords.length; i++) {
|
||||
const y = yCoords[i]
|
||||
// sort by x position (position in line)
|
||||
const lineItems = lineData[y]
|
||||
.sort((a, b) => a.transform[4] - b.transform[4])
|
||||
.filter((item) => !!item.str)
|
||||
let line = lineItems.length ? lineItems[0].str : ''
|
||||
for (let j = 1; j < lineItems.length; j++) {
|
||||
const item = lineItems[j]
|
||||
const lastItem = lineItems[j - 1]
|
||||
const xDiff = item.transform[4] - (lastItem.transform[4] + lastItem.width)
|
||||
|
||||
// insert spaces for items that are far apart horizontally
|
||||
if (
|
||||
item.height !== 0 &&
|
||||
(xDiff > item.height || xDiff > lastItem.height)
|
||||
) {
|
||||
const spaceCountA = Math.ceil(xDiff / item.height)
|
||||
let spaceCount = spaceCountA
|
||||
if (lastItem.height !== item.height) {
|
||||
const spaceCountB = Math.ceil(xDiff / lastItem.height)
|
||||
spaceCount = spaceCountA > spaceCountB ? spaceCountA : spaceCountB
|
||||
}
|
||||
|
||||
if (isNaN(spaceCount) || isFinite(spaceCount) === false) {
|
||||
spaceCount = 1
|
||||
}
|
||||
|
||||
line += Array(spaceCount).fill('').join(' ')
|
||||
}
|
||||
line += item.str
|
||||
}
|
||||
lines.push(line)
|
||||
}
|
||||
|
||||
return {
|
||||
lines,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
// Hook @babel/register so mocha can require TS/TSX sources directly.
const { default: register } = require('@babel/register');

register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] });
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,54 @@
|
||||
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import chaiString from 'chai-string'
import {
  getDocument,
  getDocumentText,
  getDocumentTitle,
  parsePdf,
} from '../../src/pdf'

// chai-string supplies the startWith assertion used below.
chai.use(chaiString)

describe('open a simple PDF with a set title', () => {
  it('should return the title', async () => {
    const doc = await getDocument('./test/pdf/data/pdf-simple-test.pdf')
    const result = await getDocumentTitle(doc)
    expect('Document1').to.equal(result)
  })
  it('should return the document text', async () => {
    const doc = await getDocument('./test/pdf/data/pdf-simple-test.pdf')
    const result = await getDocumentText(doc)
    // Note the trailing spaces on each extracted line: getDocumentTitle's
    // fallback relies on them when it strips newlines with join('').
    expect(result).to.equal(
      'This is the page title \n \nThis is some more text \n'
    )
  })
})

describe('open a complex PDF with no title', () => {
  // With no Title metadata, the title falls back to page-1 text.
  it('should return some initial content as the title', async () => {
    const doc = await getDocument('./test/pdf/data/pdf-complex-test.pdf')
    const result = await getDocumentTitle(doc)
    expect(result).to.startWith(
      'Improving communications around vaccine breakthrough and vaccine effectiveness'
    )
  })

  it('should be less than the max title length', async () => {
    const doc = await getDocument('./test/pdf/data/pdf-complex-test.pdf')
    const result = await getDocumentTitle(doc)
    // 95 mirrors MAX_TITLE_LENGTH in src/pdf.ts.
    expect(result?.length).to.lessThanOrEqual(95)
  })
})

describe('open a PDF with metadata set', () => {
  it('should return metadata', async () => {
    const parsed = await parsePdf(
      new URL('file://' + __dirname + '/data/welcome_to_your_library.pdf')
    )
    expect(parsed.title).to.eq('Welcome to your Omnivore Library')
    expect(parsed.author).to.eq('Jackson Harper')
    expect(parsed.description).to.eq('This is the description of my PDF')
  })
})
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"extends": "@tsconfig/node14/tsconfig.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "build",
|
||||
"rootDir": ".",
|
||||
"lib": ["dom"]
|
||||
},
|
||||
"include": ["src"]
|
||||
}
|
||||
Reference in New Issue
Block a user