Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable pdf bulk export #9283

Open
wants to merge 19 commits into
base: feat/135772-pdf-page-bulk-export
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
e852b29
enable pdf bulk export
arafubeatbox Oct 20, 2024
9c5a514
Merge branch 'feat/135772-155547-pdf-converter-tsed-devcontainer' int…
arafubeatbox Oct 27, 2024
aa6154f
Merge branch 'feat/135772-155547-pdf-converter-tsed-devcontainer' int…
arafubeatbox Nov 4, 2024
7095c0f
remove file added in wrong merge
arafubeatbox Nov 4, 2024
e00dfc4
run pnpm install
arafubeatbox Nov 4, 2024
ee444b3
Merge branch 'feat/135772-155547-pdf-converter-tsed-devcontainer' int…
arafubeatbox Nov 4, 2024
4209913
fix pdf-converter port number
arafubeatbox Nov 4, 2024
52fa250
add noEmit false to pdf-converter tsconfig
arafubeatbox Nov 4, 2024
8b7cec1
get pdf convert job status before cleanup
arafubeatbox Nov 4, 2024
b0636d3
Merge branch 'feat/135772-155547-pdf-converter-tsed-devcontainer' int…
arafubeatbox Nov 4, 2024
5fe093a
Merge branch 'feat/135772-pdf-page-bulk-export' into feat/135772-1533…
arafubeatbox Nov 12, 2024
45d88ed
add comments
arafubeatbox Nov 12, 2024
48f12c9
Merge branch 'feat/135772-pdf-page-bulk-export' into feat/135772-1533…
arafubeatbox Nov 12, 2024
426cf56
change pdf-converter dev command to dev:pdf-converter
arafubeatbox Nov 13, 2024
d0ae507
Merge branch 'feat/135772-pdf-page-bulk-export' into feat/135772-1533…
arafubeatbox Nov 19, 2024
7ee3eaf
Merge branch 'feat/135772-pdf-page-bulk-export' into feat/135772-1533…
arafubeatbox Nov 19, 2024
4bae443
create remark instance before pdf conversion stream starts
arafubeatbox Nov 22, 2024
c07165d
Merge branch 'feat/135772-pdf-page-bulk-export' into feat/135772-1533…
arafubeatbox Nov 22, 2024
b98f78e
add PageBulkExportPdfConvertCronService
arafubeatbox Nov 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions apps/app/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
"@growi/remark-growi-directive": "workspace:^",
"@growi/remark-lsx": "workspace:^",
"@growi/slack": "workspace:^",
"@growi/pdf-converter": "workspace:^",
"@keycloak/keycloak-admin-client": "^18.0.0",
"@slack/web-api": "^6.2.4",
"@slack/webhook": "^6.0.0",
Expand Down Expand Up @@ -202,10 +203,12 @@
"rehype-sanitize": "^6.0.0",
"rehype-slug": "^6.0.0",
"rehype-toc": "^3.0.2",
"remark": "^13.0.0",
"remark-breaks": "^4.0.0",
"remark-directive": "^3.0.0",
"remark-frontmatter": "^5.0.0",
"remark-gfm": "^4.0.0",
"remark-html": "^11.0.0",
"remark-math": "^6.0.0",
"remark-parse": "^11.0.0",
"remark-rehype": "^11.1.1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ const PageBulkExportSelectModal = (): JSX.Element => {
<button className="btn btn-primary" type="button" onClick={() => startBulkExport(PageBulkExportFormat.md)}>
{t('page_export.markdown')}
</button>
{/* TODO: enable in https://redmine.weseek.co.jp/issues/135772 */}
{/* <button className="btn btn-primary ms-2" type="button" onClick={() => startBulkExport(PageBulkExportFormat.pdf)}>PDF</button> */}
<button className="btn btn-primary ms-2" type="button" onClick={() => startBulkExport(PageBulkExportFormat.pdf)}>PDF</button>
</div>
</ModalBody>
</Modal>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ module.exports = (crowi: Crowi): Router => {
};

try {
await pageBulkExportService?.createAndExecuteOrRestartBulkExportJob(path, req.user, activityParameters, restartJob);
await pageBulkExportService?.createAndExecuteOrRestartBulkExportJob(path, format, req.user, activityParameters, restartJob);
return res.apiv3({}, 204);
}
catch (err) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
import { createHash } from 'crypto';
import fs from 'fs';
import path from 'path';
import { Writable } from 'stream';
import { Writable, pipeline } from 'stream';
import { pipeline as pipelinePromise } from 'stream/promises';


import type { IUser } from '@growi/core';
import {
getIdForRef, getIdStringForRef, type IPage, isPopulated, SubscriptionStatusType,
} from '@growi/core';
import { getParentPath, normalizePath } from '@growi/core/dist/utils/path-utils';
import { pdfCtrlSyncJobStatus, PdfCtrlSyncJobStatus202Status, PdfCtrlSyncJobStatusBodyStatus } from '@growi/pdf-converter/dist/client-library';
import type { Archiver } from 'archiver';
import archiver from 'archiver';
import gc from 'expose-gc/function';
import type { HydratedDocument } from 'mongoose';
import mongoose from 'mongoose';
import remark from 'remark';
import html from 'remark-html';

import type { SupportedActionType } from '~/interfaces/activity';
import { SupportedAction, SupportedTargetModel } from '~/interfaces/activity';
Expand All @@ -23,6 +27,7 @@ import type { IAttachmentDocument } from '~/server/models/attachment';
import { Attachment } from '~/server/models/attachment';
import type { PageModel, PageDocument } from '~/server/models/page';
import Subscription from '~/server/models/subscription';
import { configManager } from '~/server/service/config-manager';
import type { FileUploader } from '~/server/service/file-uploader';
import type { IMultipartUploader } from '~/server/service/file-uploader/multipart-uploader';
import { preNotifyService } from '~/server/service/pre-notify';
Expand Down Expand Up @@ -81,14 +86,15 @@ class PageBulkExportService implements IPageBulkExportService {
/**
* Create a new page bulk export job and execute it
*/
async createAndExecuteOrRestartBulkExportJob(basePagePath: string, currentUser, activityParameters: ActivityParameters, restartJob = false): Promise<void> {
async createAndExecuteOrRestartBulkExportJob(
basePagePath: string, format: PageBulkExportFormat, currentUser, activityParameters: ActivityParameters, restartJob = false,
): Promise<void> {
const basePage = await this.pageModel.findByPathAndViewer(basePagePath, currentUser, null, true);

if (basePage == null) {
throw new Error('Base page not found or not accessible');
}

const format = PageBulkExportFormat.md;
const duplicatePageBulkExportJobInProgress: HydratedDocument<PageBulkExportJobDocument> | null = await PageBulkExportJob.findOne({
user: currentUser,
page: basePage,
Expand Down Expand Up @@ -276,14 +282,21 @@ class PageBulkExportService implements IPageBulkExportService {

this.pageBulkExportJobManager.updateJobStream(pageBulkExportJob._id, pageSnapshotsReadable);

return pipelinePromise(pageSnapshotsReadable, pagesWritable);
if (pageBulkExportJob.format === PageBulkExportFormat.pdf) {
pipeline(pageSnapshotsReadable, pagesWritable, (err) => { if (err != null) logger.error(err); });
await this.startAndWaitPdfExportFinish(pageBulkExportJob);
}
else {
await pipelinePromise(pageSnapshotsReadable, pagesWritable);
}
}

/**
* Get a Writable that writes the page body temporarily to fs
*/
private getPageWritable(pageBulkExportJob: PageBulkExportJobDocument): Writable {
const outputDir = this.getTmpOutputDir(pageBulkExportJob);
const isHtmlPath = pageBulkExportJob.format === PageBulkExportFormat.pdf;
const outputDir = this.getTmpOutputDir(pageBulkExportJob, isHtmlPath);
return new Writable({
objectMode: true,
write: async(page: PageBulkExportPageSnapshotDocument, encoding, callback) => {
Expand All @@ -292,25 +305,101 @@ class PageBulkExportService implements IPageBulkExportService {

if (revision != null && isPopulated(revision)) {
const markdownBody = revision.body;
const pathNormalized = `${normalizePath(page.path)}.${PageBulkExportFormat.md}`;
const format = pageBulkExportJob.format === PageBulkExportFormat.pdf ? 'html' : pageBulkExportJob.format;
const pathNormalized = `${normalizePath(page.path)}.${format}`;
const fileOutputPath = path.join(outputDir, pathNormalized);
const fileOutputParentPath = getParentPath(fileOutputPath);

await fs.promises.mkdir(fileOutputParentPath, { recursive: true });
await fs.promises.writeFile(fileOutputPath, markdownBody);

if (pageBulkExportJob.format === PageBulkExportFormat.md) {
await fs.promises.writeFile(fileOutputPath, markdownBody);
}
else {
const htmlString = await this.convertMdToHtml(markdownBody);
await fs.promises.writeFile(fileOutputPath, htmlString);
}
pageBulkExportJob.lastExportedPagePath = page.path;
await pageBulkExportJob.save();
}
}
catch (err) {
callback(err);
// update status to notify failure and report to pdf converter in startAndWaitPdfExportFinish
pageBulkExportJob.status = PageBulkExportJobStatus.failed;
await pageBulkExportJob.save();
return;
}
callback();
},
});
}

private async convertMdToHtml(md: string): Promise<string> {
const htmlString = (await remark()
.use(html)
.process(md))
.toString();

return htmlString;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remark().use(html) で得られるインスタンスは保持して使い回した方がいいんじゃないかな
どこまで差が出るかはわからないけど

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

対応しました。
インスタンス変数にして全ての job で共通にするのは並列で処理が走ったときバグる可能性があるように思えたので、ジョブごとに stream 開始前に定義して引数として渡す形にしました。

引数の型ですが、remark().use(html) の返り値の type が unified のものですが、

unified.Processor<remark.RemarkOptions>

remark が依存している unified と growi が package.json で指定している unified のバージョンが異なるっぽくて、同様の type を付与するとエラーが生じてしまいます。
そのため引数に type は付与していません。
(バージョンを対応させようとすると、esm only の remark 系のパッケージが必要となり、エラーが生じてしまいます)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • 使われるバージョンが一致するように別タスクで処理した方がいいように思う
  • サーバーサイドでの ESM only package のインポートは、dynamic-import で対応できる
    • import { dynamicImport } from '@cspell/dynamic-import'; でソース内検索


/**
* Start pdf export by requesting pdf-converter and keep updating/checking the status until the export is done
* ref) https://dev.growi.org/66ee8495830566b31e02c953#growi
* @param pageBulkExportJob page bulk export job in execution
*/
private async startAndWaitPdfExportFinish(pageBulkExportJob: PageBulkExportJobDocument): Promise<void> {
const jobCreatedAt = pageBulkExportJob.createdAt;
if (jobCreatedAt == null) throw new Error('createdAt is not set');

const exportJobExpirationSeconds = configManager.getConfig('crowi', 'app:bulkExportJobExpirationSeconds');
const jobExpirationDate = new Date(jobCreatedAt.getTime() + exportJobExpirationSeconds * 1000);
let status: PdfCtrlSyncJobStatusBodyStatus = PdfCtrlSyncJobStatusBodyStatus.HTML_EXPORT_IN_PROGRESS;

const lastExportPagePath = (await PageBulkExportPageSnapshot.findOne({ pageBulkExportJob }).sort({ path: -1 }))?.path;
if (lastExportPagePath == null) throw new Error('lastExportPagePath is missing');

return new Promise<void>((resolve, reject) => {
// Request sync job API until the pdf export is done. If pdf export status is updated in growi, send the status to pdf-converter.
const interval = setInterval(async() => {
Copy link
Member

@yuki-takei yuki-takei Nov 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

この setInterval、job の数だけ走るのはあまりよくない気がする。
setInterval よりシステムワイドに node-cron を使うべきではないか?

また、サーバーが再起動した時に job を参照して再開してほしい。

参考:

  • thread-deletion-cron.ts
  • vector-store-file-deletion-cron.ts

実装アイデアメモ

cronJob は2種類あってもいいかも。

  • 1つは処理完了待ちの job で、10秒とか比較的短いインターバルで動いて fail / done へステータスを遷移させる
  • もう1つは上記の処理完了待ち job の start/stop を担当する (2~3分の長めのインターバルでよい)
    • in progress な job が 1 つでもあれば処理完了待ち job を start させる
    • in progress な job が 0 個なら処理完了待ち job を stop させる
    • サーバー再起動時はこれを start しておくことで、数分後には必要があれば処理完了待ち job が start することを期待

if (new Date() > jobExpirationDate) {
reject(new BulkExportJobExpiredError());
}
try {
const latestPageBulkExportJob = await PageBulkExportJob.findById(pageBulkExportJob._id);
if (latestPageBulkExportJob == null) throw new Error('pageBulkExportJob is missing');
if (latestPageBulkExportJob.lastExportedPagePath === lastExportPagePath) {
status = PdfCtrlSyncJobStatusBodyStatus.HTML_EXPORT_DONE;
}

if (latestPageBulkExportJob.status === PageBulkExportJobStatus.failed) {
status = PdfCtrlSyncJobStatusBodyStatus.FAILED;
}

const res = await pdfCtrlSyncJobStatus({
jobId: pageBulkExportJob._id.toString(), expirationDate: jobExpirationDate.toISOString(), status,
}, { baseURL: configManager.getConfig('crowi', 'app:pageBulkExportPdfConverterUrl') });

if (res.data.status === PdfCtrlSyncJobStatus202Status.PDF_EXPORT_DONE) {
clearInterval(interval);
resolve();
}
else if (res.data.status === PdfCtrlSyncJobStatus202Status.FAILED) {
clearInterval(interval);
reject(new Error('PDF export failed'));
}
}
catch (err) {
// continue the loop if the host is not ready
if (!['ENOTFOUND', 'ECONNREFUSED'].includes(err.code)) {
clearInterval(interval);
reject(err);
}
}
}, 60 * 1000 * 1);
});
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

このメソッドはレビューしきれてないので引き続き


/**
* Execute a pipeline that reads the page files from the temporal fs directory, compresses them, and uploads to the cloud storage
*/
Expand Down Expand Up @@ -377,6 +466,8 @@ class PageBulkExportService implements IPageBulkExportService {
}
catch (err) {
await multipartUploader.abortUpload();
pageBulkExportJob.status = PageBulkExportJobStatus.failed;
await pageBulkExportJob.save();
callback(err);
return;
}
Expand Down Expand Up @@ -405,9 +496,14 @@ class PageBulkExportService implements IPageBulkExportService {

/**
* Get the output directory on the fs to temporarily store page files before compressing and uploading
* @param pageBulkExportJob page bulk export job in execution
* @param isHtmlPath whether the tmp output path is for html files
*/
private getTmpOutputDir(pageBulkExportJob: PageBulkExportJobDocument): string {
return `${this.tmpOutputRootDir}/${pageBulkExportJob._id}`;
private getTmpOutputDir(pageBulkExportJob: PageBulkExportJobDocument, isHtmlPath = false): string {
if (isHtmlPath) {
return path.join(this.tmpOutputRootDir, 'html', pageBulkExportJob._id.toString());
}
return path.join(this.tmpOutputRootDir, pageBulkExportJob._id.toString());
}

async notifyExportResult(
Expand Down Expand Up @@ -442,6 +538,12 @@ class PageBulkExportService implements IPageBulkExportService {
fs.promises.rm(this.getTmpOutputDir(pageBulkExportJob), { recursive: true, force: true }),
];

if (pageBulkExportJob.format === PageBulkExportFormat.pdf) {
promises.push(
fs.promises.rm(this.getTmpOutputDir(pageBulkExportJob, true), { recursive: true, force: true }),
);
}

const fileUploadService: FileUploader = this.crowi.fileUploadService;
if (pageBulkExportJob.uploadKey != null && pageBulkExportJob.uploadId != null) {
promises.push(fileUploadService.abortPreviousMultipartUpload(pageBulkExportJob.uploadKey, pageBulkExportJob.uploadId));
Expand Down
6 changes: 6 additions & 0 deletions apps/app/src/server/service/config-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,12 @@ const ENV_VAR_NAME_TO_CONFIG_INFO: Record<string, EnvConfig> = {
type: ValueType.NUMBER,
default: 5,
},
BULK_EXPORT_PDF_CONVERTER_URL: {
ns: 'crowi',
key: 'app:pageBulkExportPdfConverterUrl',
type: ValueType.STRING,
default: 'http://pdf-converter:3010',
},
AI_ENABLED: {
ns: 'crowi',
key: 'app:aiEnabled',
Expand Down
4 changes: 2 additions & 2 deletions apps/app/turbo.json
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

slackbot-proxy を起動しなくても app が使えるように、
pdf-converter もまた app の起動に必須ではないので、dependsOn からは外してほしい。

開発時に開発者がやらないといけないことはマニュアルに書いて周知する。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

こちら pdf-converter の起動ではなく、クライアントコード生成のための build になります。
pdf-converter の起動は無関係であり、これを取り除くと Error: Cannot find module '@growi/pdf-converter/dist/client-library'apps/app/src/features/page-bulk-export/server/service/page-bulk-export/index.ts で生じてしまいます。

それはそれとして、pdf-converter の開発ドキュメントは書こうと思います。
ストーリー作成しました。
https://redmine.weseek.co.jp/issues/157930

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

クライアントコード生成のための build になります

なるほど。理解した。
apps/* 間の依存はない方がいいかなとも思ったけど、どうなんだろう。
@growi/slack みたいに分けてすっきりするなら分けていいけど、分けるメリットを感じないならそのままでOK

Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"outputLogs": "new-only"
},
"dev": {
"dependsOn": ["^dev", "dev:migrate", "dev:pre:styles"],
"dependsOn": ["^dev", "dev:migrate", "dev:pre:styles", "@growi/pdf-converter#build"],
"cache": false,
"persistent": true
},
Expand All @@ -56,7 +56,7 @@
},

"lint": {
"dependsOn": ["^dev", "dev:pre:styles"]
"dependsOn": ["^dev", "dev:pre:styles", "@growi/pdf-converter#build"]
},

"test": {
Expand Down
2 changes: 1 addition & 1 deletion apps/pdf-converter/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"license": "MIT",
"private": true,
"scripts": {
"dev": "nodemon --watch \"src/**/*.ts\" --ignore \"node_modules/**/*\" --exec ts-node -r \"dotenv-flow/config\" src/index.ts",
"dev:pdf-converter": "nodemon --watch \"src/**/*.ts\" --ignore \"node_modules/**/*\" --exec ts-node -r \"dotenv-flow/config\" src/index.ts",
yuki-takei marked this conversation as resolved.
Show resolved Hide resolved
"lint": "pnpm eslint **/*.{js,ts}",
"gen:client-code": "tsed run generate-swagger --output ./specs && orval",
"build": "pnpm gen:client-code && tsc -p tsconfig.build.json"
Expand Down
3 changes: 2 additions & 1 deletion apps/pdf-converter/src/controllers/pdf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@ class PdfCtrl {
const expirationDate = new Date(expirationDateStr);
try {
await this.pdfConvertService.registerOrUpdateJob(jobId, expirationDate, growiJobStatus);
const status = this.pdfConvertService.getJobStatus(jobId); // get status before cleanup
this.pdfConvertService.cleanUpJobList();
return { status: this.pdfConvertService.getJobStatus(jobId) };
return { status };
}
catch (err) {
this.logger.error('Failed to register or update job', err);
Expand Down
3 changes: 3 additions & 0 deletions apps/pdf-converter/tsconfig.build.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
{
"extends": "./tsconfig.json",
"compilerOptions": {
"noEmit": false,
},
"exclude": ["node_modules", "dist", "test"]
}
10 changes: 10 additions & 0 deletions apps/pdf-converter/turbo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"$schema": "https://turbo.build/schema.json",
"extends": ["//"],
"tasks": {
"dev:pdf-converter": {
"cache": false,
"persistent": true
}
}
}
Loading
Loading