-
Notifications
You must be signed in to change notification settings - Fork 220
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
enable pdf bulk export #9283
base: feat/135772-pdf-page-bulk-export
Are you sure you want to change the base?
enable pdf bulk export #9283
Changes from 16 commits
e852b29
9c5a514
aa6154f
7095c0f
e00dfc4
ee444b3
4209913
52fa250
8b7cec1
b0636d3
5fe093a
45d88ed
48f12c9
426cf56
d0ae507
7ee3eaf
4bae443
c07165d
b98f78e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,23 @@ | ||
import { createHash } from 'crypto'; | ||
import fs from 'fs'; | ||
import path from 'path'; | ||
import { Writable } from 'stream'; | ||
import { Writable, pipeline } from 'stream'; | ||
import { pipeline as pipelinePromise } from 'stream/promises'; | ||
|
||
|
||
import type { IUser } from '@growi/core'; | ||
import { | ||
getIdForRef, getIdStringForRef, type IPage, isPopulated, SubscriptionStatusType, | ||
} from '@growi/core'; | ||
import { getParentPath, normalizePath } from '@growi/core/dist/utils/path-utils'; | ||
import { pdfCtrlSyncJobStatus, PdfCtrlSyncJobStatus202Status, PdfCtrlSyncJobStatusBodyStatus } from '@growi/pdf-converter/dist/client-library'; | ||
import type { Archiver } from 'archiver'; | ||
import archiver from 'archiver'; | ||
import gc from 'expose-gc/function'; | ||
import type { HydratedDocument } from 'mongoose'; | ||
import mongoose from 'mongoose'; | ||
import remark from 'remark'; | ||
import html from 'remark-html'; | ||
|
||
import type { SupportedActionType } from '~/interfaces/activity'; | ||
import { SupportedAction, SupportedTargetModel } from '~/interfaces/activity'; | ||
|
@@ -23,6 +27,7 @@ import type { IAttachmentDocument } from '~/server/models/attachment'; | |
import { Attachment } from '~/server/models/attachment'; | ||
import type { PageModel, PageDocument } from '~/server/models/page'; | ||
import Subscription from '~/server/models/subscription'; | ||
import { configManager } from '~/server/service/config-manager'; | ||
import type { FileUploader } from '~/server/service/file-uploader'; | ||
import type { IMultipartUploader } from '~/server/service/file-uploader/multipart-uploader'; | ||
import { preNotifyService } from '~/server/service/pre-notify'; | ||
|
@@ -81,14 +86,15 @@ class PageBulkExportService implements IPageBulkExportService { | |
/** | ||
* Create a new page bulk export job and execute it | ||
*/ | ||
async createAndExecuteOrRestartBulkExportJob(basePagePath: string, currentUser, activityParameters: ActivityParameters, restartJob = false): Promise<void> { | ||
async createAndExecuteOrRestartBulkExportJob( | ||
basePagePath: string, format: PageBulkExportFormat, currentUser, activityParameters: ActivityParameters, restartJob = false, | ||
): Promise<void> { | ||
const basePage = await this.pageModel.findByPathAndViewer(basePagePath, currentUser, null, true); | ||
|
||
if (basePage == null) { | ||
throw new Error('Base page not found or not accessible'); | ||
} | ||
|
||
const format = PageBulkExportFormat.md; | ||
const duplicatePageBulkExportJobInProgress: HydratedDocument<PageBulkExportJobDocument> | null = await PageBulkExportJob.findOne({ | ||
user: currentUser, | ||
page: basePage, | ||
|
@@ -276,14 +282,21 @@ class PageBulkExportService implements IPageBulkExportService { | |
|
||
this.pageBulkExportJobManager.updateJobStream(pageBulkExportJob._id, pageSnapshotsReadable); | ||
|
||
return pipelinePromise(pageSnapshotsReadable, pagesWritable); | ||
if (pageBulkExportJob.format === PageBulkExportFormat.pdf) { | ||
pipeline(pageSnapshotsReadable, pagesWritable, (err) => { if (err != null) logger.error(err); }); | ||
await this.startAndWaitPdfExportFinish(pageBulkExportJob); | ||
} | ||
else { | ||
await pipelinePromise(pageSnapshotsReadable, pagesWritable); | ||
} | ||
} | ||
|
||
/** | ||
* Get a Writable that writes the page body temporarily to fs | ||
*/ | ||
private getPageWritable(pageBulkExportJob: PageBulkExportJobDocument): Writable { | ||
const outputDir = this.getTmpOutputDir(pageBulkExportJob); | ||
const isHtmlPath = pageBulkExportJob.format === PageBulkExportFormat.pdf; | ||
const outputDir = this.getTmpOutputDir(pageBulkExportJob, isHtmlPath); | ||
return new Writable({ | ||
objectMode: true, | ||
write: async(page: PageBulkExportPageSnapshotDocument, encoding, callback) => { | ||
|
@@ -292,25 +305,101 @@ class PageBulkExportService implements IPageBulkExportService { | |
|
||
if (revision != null && isPopulated(revision)) { | ||
const markdownBody = revision.body; | ||
const pathNormalized = `${normalizePath(page.path)}.${PageBulkExportFormat.md}`; | ||
const format = pageBulkExportJob.format === PageBulkExportFormat.pdf ? 'html' : pageBulkExportJob.format; | ||
const pathNormalized = `${normalizePath(page.path)}.${format}`; | ||
const fileOutputPath = path.join(outputDir, pathNormalized); | ||
const fileOutputParentPath = getParentPath(fileOutputPath); | ||
|
||
await fs.promises.mkdir(fileOutputParentPath, { recursive: true }); | ||
await fs.promises.writeFile(fileOutputPath, markdownBody); | ||
|
||
if (pageBulkExportJob.format === PageBulkExportFormat.md) { | ||
await fs.promises.writeFile(fileOutputPath, markdownBody); | ||
} | ||
else { | ||
const htmlString = await this.convertMdToHtml(markdownBody); | ||
await fs.promises.writeFile(fileOutputPath, htmlString); | ||
} | ||
pageBulkExportJob.lastExportedPagePath = page.path; | ||
await pageBulkExportJob.save(); | ||
} | ||
} | ||
catch (err) { | ||
callback(err); | ||
// update status to notify failure and report to pdf converter in startAndWaitPdfExportFinish | ||
pageBulkExportJob.status = PageBulkExportJobStatus.failed; | ||
await pageBulkExportJob.save(); | ||
return; | ||
} | ||
callback(); | ||
}, | ||
}); | ||
} | ||
|
||
private async convertMdToHtml(md: string): Promise<string> { | ||
const htmlString = (await remark() | ||
.use(html) | ||
.process(md)) | ||
.toString(); | ||
|
||
return htmlString; | ||
} | ||
|
||
/** | ||
* Start pdf export by requesting pdf-converter and keep updating/checking the status until the export is done | ||
* ref) https://dev.growi.org/66ee8495830566b31e02c953#growi | ||
* @param pageBulkExportJob page bulk export job in execution | ||
*/ | ||
private async startAndWaitPdfExportFinish(pageBulkExportJob: PageBulkExportJobDocument): Promise<void> { | ||
const jobCreatedAt = pageBulkExportJob.createdAt; | ||
if (jobCreatedAt == null) throw new Error('createdAt is not set'); | ||
|
||
const exportJobExpirationSeconds = configManager.getConfig('crowi', 'app:bulkExportJobExpirationSeconds'); | ||
const jobExpirationDate = new Date(jobCreatedAt.getTime() + exportJobExpirationSeconds * 1000); | ||
let status: PdfCtrlSyncJobStatusBodyStatus = PdfCtrlSyncJobStatusBodyStatus.HTML_EXPORT_IN_PROGRESS; | ||
|
||
const lastExportPagePath = (await PageBulkExportPageSnapshot.findOne({ pageBulkExportJob }).sort({ path: -1 }))?.path; | ||
if (lastExportPagePath == null) throw new Error('lastExportPagePath is missing'); | ||
|
||
return new Promise<void>((resolve, reject) => { | ||
// Request sync job API until the pdf export is done. If pdf export status is updated in growi, send the status to pdf-converter. | ||
const interval = setInterval(async() => { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. この setInterval、job の数だけ走るのはあまりよくない気がする。 また、サーバーが再起動した時に job を参照して再開してほしい。 参考:
実装アイデアメモ: cronJob は2種類あってもいいかも。
|
||
if (new Date() > jobExpirationDate) { | ||
reject(new BulkExportJobExpiredError()); | ||
} | ||
try { | ||
const latestPageBulkExportJob = await PageBulkExportJob.findById(pageBulkExportJob._id); | ||
if (latestPageBulkExportJob == null) throw new Error('pageBulkExportJob is missing'); | ||
if (latestPageBulkExportJob.lastExportedPagePath === lastExportPagePath) { | ||
status = PdfCtrlSyncJobStatusBodyStatus.HTML_EXPORT_DONE; | ||
} | ||
|
||
if (latestPageBulkExportJob.status === PageBulkExportJobStatus.failed) { | ||
status = PdfCtrlSyncJobStatusBodyStatus.FAILED; | ||
} | ||
|
||
const res = await pdfCtrlSyncJobStatus({ | ||
jobId: pageBulkExportJob._id.toString(), expirationDate: jobExpirationDate.toISOString(), status, | ||
}, { baseURL: configManager.getConfig('crowi', 'app:pageBulkExportPdfConverterUrl') }); | ||
|
||
if (res.data.status === PdfCtrlSyncJobStatus202Status.PDF_EXPORT_DONE) { | ||
clearInterval(interval); | ||
resolve(); | ||
} | ||
else if (res.data.status === PdfCtrlSyncJobStatus202Status.FAILED) { | ||
clearInterval(interval); | ||
reject(new Error('PDF export failed')); | ||
} | ||
} | ||
catch (err) { | ||
// continue the loop if the host is not ready | ||
if (!['ENOTFOUND', 'ECONNREFUSED'].includes(err.code)) { | ||
clearInterval(interval); | ||
reject(err); | ||
} | ||
} | ||
}, 60 * 1000 * 1); | ||
}); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. このメソッドはレビューしきれてないので引き続き |
||
|
||
/** | ||
* Execute a pipeline that reads the page files from the temporal fs directory, compresses them, and uploads to the cloud storage | ||
*/ | ||
|
@@ -377,6 +466,8 @@ class PageBulkExportService implements IPageBulkExportService { | |
} | ||
catch (err) { | ||
await multipartUploader.abortUpload(); | ||
pageBulkExportJob.status = PageBulkExportJobStatus.failed; | ||
await pageBulkExportJob.save(); | ||
callback(err); | ||
return; | ||
} | ||
|
@@ -405,9 +496,14 @@ class PageBulkExportService implements IPageBulkExportService { | |
|
||
/** | ||
* Get the output directory on the fs to temporarily store page files before compressing and uploading | ||
* @param pageBulkExportJob page bulk export job in execution | ||
* @param isHtmlPath whether the tmp output path is for html files | ||
*/ | ||
private getTmpOutputDir(pageBulkExportJob: PageBulkExportJobDocument): string { | ||
return `${this.tmpOutputRootDir}/${pageBulkExportJob._id}`; | ||
private getTmpOutputDir(pageBulkExportJob: PageBulkExportJobDocument, isHtmlPath = false): string { | ||
if (isHtmlPath) { | ||
return path.join(this.tmpOutputRootDir, 'html', pageBulkExportJob._id.toString()); | ||
} | ||
return path.join(this.tmpOutputRootDir, pageBulkExportJob._id.toString()); | ||
} | ||
|
||
async notifyExportResult( | ||
|
@@ -442,6 +538,12 @@ class PageBulkExportService implements IPageBulkExportService { | |
fs.promises.rm(this.getTmpOutputDir(pageBulkExportJob), { recursive: true, force: true }), | ||
]; | ||
|
||
if (pageBulkExportJob.format === PageBulkExportFormat.pdf) { | ||
promises.push( | ||
fs.promises.rm(this.getTmpOutputDir(pageBulkExportJob, true), { recursive: true, force: true }), | ||
); | ||
} | ||
|
||
const fileUploadService: FileUploader = this.crowi.fileUploadService; | ||
if (pageBulkExportJob.uploadKey != null && pageBulkExportJob.uploadId != null) { | ||
promises.push(fileUploadService.abortPreviousMultipartUpload(pageBulkExportJob.uploadKey, pageBulkExportJob.uploadId)); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. slackbot-proxy を起動しなくても app が使えるように、 開発時に開発者がやらないといけないことはマニュアルに書いて周知する。 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. こちら pdf-converter の起動ではなく、クライアントコード生成のための build になります。 それはそれとして、pdf-converter の開発ドキュメントは書こうと思います。 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
なるほど。理解した。 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,7 @@ | ||
{ | ||
"extends": "./tsconfig.json", | ||
"compilerOptions": { | ||
"noEmit": false, | ||
}, | ||
"exclude": ["node_modules", "dist", "test"] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"$schema": "https://turbo.build/schema.json", | ||
"extends": ["//"], | ||
"tasks": { | ||
"dev:pdf-converter": { | ||
"cache": false, | ||
"persistent": true | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remark().use(html)
で得られるインスタンスは保持して使い回した方がいいんじゃないかな。どこまで差が出るかはわからないけど。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
対応しました。
インスタンス変数にして全ての job で共通にするのは並列で処理が走ったときバグる可能性があるように思えたので、ジョブごとに stream 開始前に定義して引数として渡す形にしました。
引数の型ですが、remark().use(html) の返り値の type が unified のものですが、
remark が依存している unified と growi が package.json で指定している unified のバージョンが異なるっぽくて、同様の type を付与するとエラーが生じてしまいます。
そのため引数に type は付与していません。
(バージョンを対応させようとすると、esm only の remark 系のパッケージが必要となり、エラーが生じてしまいます)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
dynamic-import
で対応できる。 import { dynamicImport } from '@cspell/dynamic-import';
でソース内検索