Add files for building the OpenSSL image

commit 132c17af2d
parent 43337c1a0b
committed by huty, 2024-03-15 14:52:38 +08:00
10119 changed files with 1581963 additions and 0 deletions


@@ -0,0 +1,5 @@
node_modules
build
.env*
Dockerfile
.dockerignore


@@ -0,0 +1,4 @@
node_modules/
dist/
readabilityjs/
src/generated/


@@ -0,0 +1,6 @@
{
"extends": "../../.eslintrc",
"parserOptions": {
"project": "tsconfig.json"
}
}


@@ -0,0 +1,16 @@
# This file specifies files that are *not* uploaded to Google Cloud Platform
# using gcloud. It follows the same syntax as .gitignore, with the addition of
# "#!include" directives (which insert the entries of the given .gitignore-style
# file at that point).
#
# For more information, run:
# $ gcloud topic gcloudignore
#
.gcloudignore
# If you would like to upload your .git directory, .gitignore file or files
# from your .gitignore file, remove the corresponding line
# below:
.git
.gitignore
node_modules
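# Illustration (not part of the original file): entries from this package's
# .gitignore could be pulled in with an #!include directive, written as:
# #!include:.gitignore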


@@ -0,0 +1,27 @@
FROM node:18.16-alpine
# Run everything after as non-privileged user.
WORKDIR /app
COPY package.json .
COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
COPY /packages/pdf-handler/package.json ./packages/pdf-handler/package.json
RUN yarn install --pure-lockfile
ADD /packages/pdf-handler ./packages/pdf-handler
RUN yarn workspace @omnivore/pdf-handler build
# After building, fetch the production dependencies
RUN rm -rf /app/packages/pdf-handler/node_modules
RUN rm -rf /app/node_modules
RUN yarn install --pure-lockfile --production
EXPOSE 8080
CMD ["yarn", "workspace", "@omnivore/pdf-handler", "start"]
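# Usage sketch (not part of the original Dockerfile). The COPY/ADD paths are
# rooted at the repository top level, so the build context should be the repo
# root; the image tag and Dockerfile path below are assumptions:
#   docker build -f packages/pdf-handler/Dockerfile -t omnivore/pdf-handler .
#   docker run -p 8080:8080 omnivore/pdf-handler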


@@ -0,0 +1,5 @@
{
"extension": ["ts"],
"spec": "test/**/*.test.ts",
"require": "test/babel-register.js"
}


@@ -0,0 +1,41 @@
{
"name": "@omnivore/pdf-handler",
"version": "1.0.0",
"description": "",
"main": "build/src/index.js",
"types": "build/src/index.d.ts",
"files": [
"build/src"
],
"license": "Apache-2.0",
"keywords": [],
"scripts": {
"test": "yarn mocha -r ts-node/register --config mocha-config.json",
"test:typecheck": "tsc --noEmit",
"lint": "eslint src --ext ts,js,tsx,jsx",
"compile": "tsc",
"build": "tsc",
"start": "functions-framework --source=build/src/ --target=pdfHandler",
"dev": "concurrently \"tsc -w\" \"nodemon --watch ./build/ --exec npm run start\"",
"gcloud-deploy": "gcloud functions deploy pdfHandler --region=$npm_config_region --runtime nodejs14 --trigger-bucket=$npm_config_bucket --env-vars-file=../gcf-shared/env-$npm_config_env.yaml",
"deploy": "yarn build && yarn gcloud-deploy"
},
"devDependencies": {
"@types/node": "^14.11.2",
"chai": "^4.3.6",
"chai-string": "^1.5.0",
"mocha": "^10.0.0"
},
"dependencies": {
"@google-cloud/functions-framework": "3.1.2",
"@google-cloud/pubsub": "^4.0.0",
"@google-cloud/storage": "^7.0.1",
"@sentry/serverless": "^7.77.0",
"axios": "^0.27.2",
"concurrently": "^7.0.0",
"pdfjs-dist": "^2.9.359"
},
"volta": {
"extends": "../../package.json"
}
}


@@ -0,0 +1,67 @@
/* eslint-disable prefer-const */
/* eslint-disable @typescript-eslint/restrict-template-expressions */
import { Storage } from '@google-cloud/storage'
import { parsePdf } from './pdf'
import axios from 'axios'
const storage = new Storage()
const postUpdate = async (
fileId: string,
content: string,
title?: string,
author?: string,
description?: string
) => {
const url =
'https://backend-dot-omnivore-production.wl.r.appspot.com/svc/pubsub/content/search?token=aYYLeK0kYlwnQg0wBMHO6EoAjf0LkoQ4Dyx0NGtpdjbh7F52EzHda8'
// const localUrl =
// 'http://localhost:4000/svc/pubsub/content/search?token=aYYLeK0kYlwnQg0wBMHO6EoAjf0LkoQ4Dyx0NGtpdjbh7F52EzHda8'
const data = JSON.stringify({
fileId,
content,
title,
author,
description,
})
const body = {
message: {
data: Buffer.from(data).toString('base64'),
},
}
const res = await axios.post(url, body)
console.log('res', res.status)
}
const listFiles = async () => {
const res = await storage
.bucket('omnivore')
.getFiles({ prefix: 'u/', maxResults: 50 })
console.log('result', res)
const [files] = res
console.log('Files:')
for (const file of files) {
const url = file.publicUrl()
const [isPublic] = await file.isPublic()
console.log(file.publicUrl(), 'is public:', isPublic)
if (isPublic) {
const parsed = await parsePdf(new URL(url))
// console.log(text)
// console.log('\n\n')
await postUpdate(
file.name,
parsed.content,
parsed.title,
parsed.author,
parsed.description
)
}
}
}
listFiles().catch(console.error)


@@ -0,0 +1,135 @@
import { PubSub } from '@google-cloud/pubsub'
import { GetSignedUrlConfig, Storage } from '@google-cloud/storage'
import * as Sentry from '@sentry/serverless'
import { parsePdf } from './pdf'
Sentry.GCPFunction.init({
dsn: process.env.SENTRY_DSN,
tracesSampleRate: 0,
})
const pubsub = new PubSub()
const storage = new Storage()
const CONTENT_UPDATE_TOPIC = 'updatePageContent'
interface StorageEventData {
bucket: string
name: string
contentType: string
}
function isStorageEventData(event: any): event is StorageEventData {
return 'name' in event && 'bucket' in event && 'contentType' in event
}
// Handle only objects stored under the `u/` prefix with a PDF content type
// (the bucket trigger delivers finalize events)
const shouldHandle = (data: StorageEventData) => {
return (
data.name.startsWith('u/') &&
data.contentType.toLowerCase() === 'application/pdf'
)
}
const getDocumentUrl = async (
data: StorageEventData
): Promise<URL | undefined> => {
const options: GetSignedUrlConfig = {
version: 'v4',
action: 'read',
expires: Date.now() + 240 * 60 * 1000,
}
try {
const bucket = storage.bucket(data.bucket)
const file = bucket.file(data.name)
const [url] = await file.getSignedUrl(options)
return new URL(url)
} catch (e) {
console.debug('error getting signed url', e)
return undefined
}
}
export const updatePageContent = (
fileId: string,
content: string,
title?: string,
author?: string,
description?: string
): Promise<string | undefined> => {
return pubsub
.topic(CONTENT_UPDATE_TOPIC)
.publish(
Buffer.from(
JSON.stringify({ fileId, content, title, author, description })
)
)
.catch((err) => {
      console.error('error publishing contentUpdate:', err)
return undefined
})
}
const getStorageEventData = (
pubSubMessage: string
): StorageEventData | undefined => {
try {
const str = Buffer.from(pubSubMessage, 'base64').toString().trim()
const obj = JSON.parse(str) as unknown
if (isStorageEventData(obj)) {
return obj
}
} catch (err) {
console.log('error deserializing event: ', { pubSubMessage, err })
}
return undefined
}
export const pdfHandler = Sentry.GCPFunction.wrapHttpFunction(
async (req, res) => {
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
if ('message' in req.body && 'data' in req.body.message) {
const pubSubMessage = req.body.message.data as string
const data = getStorageEventData(pubSubMessage)
if (data) {
try {
if (shouldHandle(data)) {
console.log('handling pdf data', data)
const url = await getDocumentUrl(data)
console.log('PDF url: ', url)
if (!url) {
console.log('Could not fetch PDF', data.bucket, data.name)
return res.status(404).send('Could not fetch PDF')
}
const parsed = await parsePdf(url)
const result = await updatePageContent(
data.name,
parsed.content,
parsed.title,
parsed.author,
parsed.description
)
console.log(
'publish result',
result,
'title',
parsed.title,
'author',
parsed.author
)
} else {
console.log('not handling pdf data', data)
}
} catch (err) {
console.log('error handling event', { err, data })
return res.status(500).send('Error handling event')
}
}
} else {
console.log('no pubsub message')
}
res.send('ok')
}
)
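// A minimal sketch (not part of this commit) of the Pub/Sub push body the
// handler above expects: req.body.message.data carries a base64-encoded GCS
// object notification with at least `bucket`, `name` and `contentType`.
// The bucket and object names below are illustrative assumptions.
const exampleNotification: StorageEventData = {
  bucket: 'omnivore-demo-bucket',
  name: 'u/some-user/some-upload.pdf',
  contentType: 'application/pdf',
}
const examplePushBody = {
  message: {
    data: Buffer.from(JSON.stringify(exampleNotification)).toString('base64'),
  },
}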


@@ -0,0 +1,212 @@
/* eslint-disable @typescript-eslint/no-unsafe-call */
/* eslint-disable @typescript-eslint/restrict-plus-operands */
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unsafe-argument */
import { getDocument as _getDocument } from 'pdfjs-dist/legacy/build/pdf'
import {
TextItem,
PDFPageProxy,
PDFDocumentProxy,
} from 'pdfjs-dist/types/display/api'
interface Page {
lines: string[]
}
// Maximum number of characters to use when deriving a title from page text
const MAX_TITLE_LENGTH = 95
type MetadataInfoKey =
| 'Title'
| 'Author'
| 'Subject'
| 'CreationDate'
| 'ModDate'
interface MetadataInfo {
Title?: string
Author?: string
CreationDate?: string
ModDate?: string
Subject?: string
}
interface ParsedPdf {
content: string
title?: string
author?: string
description?: string
}
export const parsePdf = async (url: URL): Promise<ParsedPdf> => {
const documentLoadingTask = _getDocument(url)
const document = await documentLoadingTask.promise
const text = await getDocumentText(document)
// eslint-disable-next-line no-control-regex
const result: ParsedPdf = { content: text.replace(/\x00/g, '') }
const title = await getMetadataItem(document, 'Title')
if (title) result.title = title
const author = await getMetadataItem(document, 'Author')
if (author) result.author = author
const description = await getMetadataItem(document, 'Subject')
if (description) result.description = description
return result
}
export const getDocument = (source: string): Promise<PDFDocumentProxy> => {
const documentLoadingTask = _getDocument(source)
return documentLoadingTask.promise
}
const getMetadataItem = async (
document: PDFDocumentProxy,
key: MetadataInfoKey
): Promise<string | undefined> => {
return await document
.getMetadata()
.then((metadata) => metadata.info as MetadataInfo)
.then((info) => {
return info[key]
})
}
export const getDocumentTitle = async (
document: PDFDocumentProxy
): Promise<string | undefined> => {
const title = await getMetadataItem(document, 'Title')
if (title) {
return title
}
// Attempt to grab the title from the first page
// because extracted text is returned as joined
// lines, we replace the line breaks with spaces
const pageText = await readPdfText(document, 1)
if (pageText.length) {
const result = pageText.substring(0, MAX_TITLE_LENGTH)
return result.split('\n').join('')
}
return undefined
}
export const getDocumentText = async (
document: PDFDocumentProxy
): Promise<string> => {
const pages = await readPdfText(document)
return pages
}
export const readPdfText = async (
document: PDFDocumentProxy,
maxPages: number | undefined = undefined
): Promise<string> => {
const pages: Page[] = []
const numPages = maxPages || document.numPages
for (let i = 0; i < numPages; i++) {
pages.push(await parsePage(await document.getPage(i + 1)))
}
return pages.reduce((accum, page) => {
return accum.concat(page.lines.join('\n') + '\n')
}, '')
}
const parsePage = async (pdfPage: PDFPageProxy): Promise<Page> => {
const rawContent = await pdfPage.getTextContent()
return parsePageItems(
rawContent.items.filter((item): item is TextItem => 'str' in item)
)
}
/**
* Parses individual text items generated by pdf.js This allows lower level control of what actually
* gets parsed. For example, a consumer of this function may remove entire sections of the pdf text
* prior to passing items in here. See parsePage function above for example usage.
*
* @param pdfItems An array of TextItem items.
*/
const parsePageItems = (pdfItems: TextItem[]): Page => {
const lineData: { [y: number]: TextItem[] } = {}
for (let i = 0; i < pdfItems.length; i++) {
const item = pdfItems[i]
const y = item.transform[5]
/* eslint-disable no-prototype-builtins */
if (!lineData.hasOwnProperty(y)) {
lineData[y] = []
}
lineData[y].push(item)
}
const yCoords = Object.keys(lineData)
.map((key) => Number(key))
// b - a here because the bottom is y = 0 so we want that to be last
.sort((a, b) => b - a)
    // insert an empty line between two lines when the vertical gap between them is at least twice the upper line's height
.reduce((accum: number[], currentY, index, array) => {
const nextY = array[index + 1]
if (nextY != undefined) {
const currentLineHeight: number = lineData[currentY].reduce(
(finalValue, current) =>
finalValue > current.height ? finalValue : current.height,
-1
)
// currentY - nextY because currentY will be higher than nextY
if (Math.floor((currentY - nextY) / currentLineHeight) > 1) {
const newY = currentY - currentLineHeight
lineData[newY] = []
return accum.concat(currentY, newY)
}
}
return accum.concat(currentY)
}, [])
const lines: string[] = []
for (let i = 0; i < yCoords.length; i++) {
const y = yCoords[i]
// sort by x position (position in line)
const lineItems = lineData[y]
.sort((a, b) => a.transform[4] - b.transform[4])
.filter((item) => !!item.str)
let line = lineItems.length ? lineItems[0].str : ''
for (let j = 1; j < lineItems.length; j++) {
const item = lineItems[j]
const lastItem = lineItems[j - 1]
const xDiff = item.transform[4] - (lastItem.transform[4] + lastItem.width)
// insert spaces for items that are far apart horizontally
if (
item.height !== 0 &&
(xDiff > item.height || xDiff > lastItem.height)
) {
const spaceCountA = Math.ceil(xDiff / item.height)
let spaceCount = spaceCountA
if (lastItem.height !== item.height) {
const spaceCountB = Math.ceil(xDiff / lastItem.height)
spaceCount = spaceCountA > spaceCountB ? spaceCountA : spaceCountB
}
        if (!Number.isFinite(spaceCount)) {
          spaceCount = 1
        }
line += Array(spaceCount).fill('').join(' ')
}
line += item.str
}
lines.push(line)
}
return {
lines,
}
}
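// Hedged usage sketch (not part of this commit): parsePdf accepts a URL,
// either a signed GCS URL as in index.ts or a local file:// URL as the tests
// use, and resolves to the extracted text plus any Title/Author/Subject
// metadata. The file path below is a placeholder.
const exampleParse = async (): Promise<void> => {
  const parsed = await parsePdf(new URL('file:///tmp/example.pdf'))
  console.log(parsed.title, parsed.author, parsed.content.length)
}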


@@ -0,0 +1,3 @@
const register = require('@babel/register').default;
register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] });


@@ -0,0 +1,54 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import chaiString from 'chai-string'
import {
getDocument,
getDocumentText,
getDocumentTitle,
parsePdf,
} from '../../src/pdf'
chai.use(chaiString)
describe('open a simple PDF with a set title', () => {
it('should return the title', async () => {
const doc = await getDocument('./test/pdf/data/pdf-simple-test.pdf')
const result = await getDocumentTitle(doc)
    expect(result).to.equal('Document1')
})
it('should return the document text', async () => {
const doc = await getDocument('./test/pdf/data/pdf-simple-test.pdf')
const result = await getDocumentText(doc)
expect(result).to.equal(
'This is the page title \n \nThis is some more text \n'
)
})
})
describe('open a complex PDF with no title', () => {
it('should return some initial content as the title', async () => {
const doc = await getDocument('./test/pdf/data/pdf-complex-test.pdf')
const result = await getDocumentTitle(doc)
expect(result).to.startWith(
'Improving communications around vaccine breakthrough and vaccine effectiveness'
)
})
it('should be less than the max title length', async () => {
const doc = await getDocument('./test/pdf/data/pdf-complex-test.pdf')
const result = await getDocumentTitle(doc)
expect(result?.length).to.lessThanOrEqual(95)
})
})
describe('open a PDF with metadata set', () => {
it('should return metadata', async () => {
const parsed = await parsePdf(
new URL('file://' + __dirname + '/data/welcome_to_your_library.pdf')
)
expect(parsed.title).to.eq('Welcome to your Omnivore Library')
expect(parsed.author).to.eq('Jackson Harper')
expect(parsed.description).to.eq('This is the description of my PDF')
})
})


@@ -0,0 +1,9 @@
{
"extends": "@tsconfig/node14/tsconfig.json",
"compilerOptions": {
"outDir": "build",
"rootDir": ".",
"lib": ["dom"]
},
"include": ["src"]
}