weseek · arafubeatbox · Oct 20, 2024 · Oct 27, 2024 · Nov 4, 2024 · Nov 4, 2024
diff --git a/apps/app/package.json b/apps/app/package.json
@@ -83,6 +83,7 @@
     "@growi/remark-growi-directive": "workspace:^",
     "@growi/remark-lsx": "workspace:^",
     "@growi/slack": "workspace:^",
+    "@growi/pdf-converter": "workspace:^",
     "@keycloak/keycloak-admin-client": "^18.0.0",
     "@slack/web-api": "^6.2.4",
     "@slack/webhook": "^6.0.0",
@@ -202,10 +203,12 @@
     "rehype-sanitize": "^6.0.0",
     "rehype-slug": "^6.0.0",
     "rehype-toc": "^3.0.2",
+    "remark": "^13.0.0",
     "remark-breaks": "^4.0.0",
     "remark-directive": "^3.0.0",
     "remark-frontmatter": "^5.0.0",
     "remark-gfm": "^4.0.0",
+    "remark-html": "^11.0.0",
     "remark-math": "^6.0.0",
     "remark-parse": "^11.0.0",
     "remark-rehype": "^11.1.1",

diff --git a/apps/app/src/features/page-bulk-export/client/components/PageBulkExportSelectModal.tsx b/apps/app/src/features/page-bulk-export/client/components/PageBulkExportSelectModal.tsx
@@ -69,8 +69,7 @@ const PageBulkExportSelectModal = (): JSX.Element => {
               <button className="btn btn-primary" type="button" onClick={() => startBulkExport(PageBulkExportFormat.md)}>
                 {t('page_export.markdown')}
               </button>
-              {/* TODO: enable in https://redmine.weseek.co.jp/issues/135772 */}
-              {/* <button className="btn btn-primary ms-2" type="button" onClick={() => startBulkExport(PageBulkExportFormat.pdf)}>PDF</button> */}
+              <button className="btn btn-primary ms-2" type="button" onClick={() => startBulkExport(PageBulkExportFormat.pdf)}>PDF</button>
             </div>
           </ModalBody>
         </Modal>

diff --git a/apps/app/src/features/page-bulk-export/server/routes/apiv3/page-bulk-export.ts b/apps/app/src/features/page-bulk-export/server/routes/apiv3/page-bulk-export.ts
@@ -42,7 +42,7 @@ module.exports = (crowi: Crowi): Router => {
     };
 
     try {
-      await pageBulkExportService?.createAndExecuteOrRestartBulkExportJob(path, req.user, activityParameters, restartJob);
+      await pageBulkExportService?.createAndExecuteOrRestartBulkExportJob(path, format, req.user, activityParameters, restartJob);
       return res.apiv3({}, 204);
     }
     catch (err) {

diff --git a/apps/app/src/features/page-bulk-export/server/service/page-bulk-export/index.ts b/apps/app/src/features/page-bulk-export/server/service/page-bulk-export/index.ts
@@ -1,19 +1,23 @@
 import { createHash } from 'crypto';
 import fs from 'fs';
 import path from 'path';
-import { Writable } from 'stream';
+import { Writable, pipeline } from 'stream';
 import { pipeline as pipelinePromise } from 'stream/promises';
 
+
 import type { IUser } from '@growi/core';
 import {
   getIdForRef, getIdStringForRef, type IPage, isPopulated, SubscriptionStatusType,
 } from '@growi/core';
 import { getParentPath, normalizePath } from '@growi/core/dist/utils/path-utils';
+import { pdfCtrlSyncJobStatus, PdfCtrlSyncJobStatus202Status, PdfCtrlSyncJobStatusBodyStatus } from '@growi/pdf-converter/dist/client-library';
 import type { Archiver } from 'archiver';
 import archiver from 'archiver';
 import gc from 'expose-gc/function';
 import type { HydratedDocument } from 'mongoose';
 import mongoose from 'mongoose';
+import remark from 'remark';
+import html from 'remark-html';
 
 import type { SupportedActionType } from '~/interfaces/activity';
 import { SupportedAction, SupportedTargetModel } from '~/interfaces/activity';
@@ -23,6 +27,7 @@ import type { IAttachmentDocument } from '~/server/models/attachment';
 import { Attachment } from '~/server/models/attachment';
 import type { PageModel, PageDocument } from '~/server/models/page';
 import Subscription from '~/server/models/subscription';
+import { configManager } from '~/server/service/config-manager';
 import type { FileUploader } from '~/server/service/file-uploader';
 import type { IMultipartUploader } from '~/server/service/file-uploader/multipart-uploader';
 import { preNotifyService } from '~/server/service/pre-notify';
@@ -81,14 +86,15 @@ class PageBulkExportService implements IPageBulkExportService {
   /**
    * Create a new page bulk export job and execute it
    */
-  async createAndExecuteOrRestartBulkExportJob(basePagePath: string, currentUser, activityParameters: ActivityParameters, restartJob = false): Promise<void> {
+  async createAndExecuteOrRestartBulkExportJob(
+      basePagePath: string, format: PageBulkExportFormat, currentUser, activityParameters: ActivityParameters, restartJob = false,
+  ): Promise<void> {
     const basePage = await this.pageModel.findByPathAndViewer(basePagePath, currentUser, null, true);
 
     if (basePage == null) {
       throw new Error('Base page not found or not accessible');
     }
 
-    const format = PageBulkExportFormat.md;
     const duplicatePageBulkExportJobInProgress: HydratedDocument<PageBulkExportJobDocument> | null = await PageBulkExportJob.findOne({
       user: currentUser,
       page: basePage,
@@ -276,14 +282,21 @@ class PageBulkExportService implements IPageBulkExportService {
 
     this.pageBulkExportJobManager.updateJobStream(pageBulkExportJob._id, pageSnapshotsReadable);
 
-    return pipelinePromise(pageSnapshotsReadable, pagesWritable);
+    if (pageBulkExportJob.format === PageBulkExportFormat.pdf) {
+      pipeline(pageSnapshotsReadable, pagesWritable, (err) => { if (err != null) logger.error(err); });
+      await this.startAndWaitPdfExportFinish(pageBulkExportJob);
+    }
+    else {
+      await pipelinePromise(pageSnapshotsReadable, pagesWritable);
+    }
   }
 
   /**
    * Get a Writable that writes the page body temporarily to fs
    */
   private getPageWritable(pageBulkExportJob: PageBulkExportJobDocument): Writable {
-    const outputDir = this.getTmpOutputDir(pageBulkExportJob);
+    const isHtmlPath = pageBulkExportJob.format === PageBulkExportFormat.pdf;
+    const outputDir = this.getTmpOutputDir(pageBulkExportJob, isHtmlPath);
     return new Writable({
       objectMode: true,
       write: async(page: PageBulkExportPageSnapshotDocument, encoding, callback) => {
@@ -292,25 +305,101 @@ class PageBulkExportService implements IPageBulkExportService {
 
           if (revision != null && isPopulated(revision)) {
             const markdownBody = revision.body;
-            const pathNormalized = `${normalizePath(page.path)}.${PageBulkExportFormat.md}`;
+            const format = pageBulkExportJob.format === PageBulkExportFormat.pdf ? 'html' : pageBulkExportJob.format;
+            const pathNormalized = `${normalizePath(page.path)}.${format}`;
             const fileOutputPath = path.join(outputDir, pathNormalized);
             const fileOutputParentPath = getParentPath(fileOutputPath);
-
             await fs.promises.mkdir(fileOutputParentPath, { recursive: true });
-            await fs.promises.writeFile(fileOutputPath, markdownBody);
+
+            if (pageBulkExportJob.format === PageBulkExportFormat.md) {
+              await fs.promises.writeFile(fileOutputPath, markdownBody);
+            }
+            else {
+              const htmlString = await this.convertMdToHtml(markdownBody);
+              await fs.promises.writeFile(fileOutputPath, htmlString);
+            }
             pageBulkExportJob.lastExportedPagePath = page.path;
             await pageBulkExportJob.save();
           }
         }
         catch (err) {
           callback(err);
+          // update status to notify failure and report to pdf converter in startAndWaitPdfExportFinish
+          pageBulkExportJob.status = PageBulkExportJobStatus.failed;
+          await pageBulkExportJob.save();
           return;
         }
         callback();
       },
     });
   }
 
+  /**
+   * Convert a markdown string to an HTML string with remark and the remark-html plugin
+   */
+  private async convertMdToHtml(md: string): Promise<string> {
+    const processor = remark().use(html);
+    const processedFile = await processor.process(md);
+    return processedFile.toString();
+  }
+
+  /**
+   * Start pdf export by requesting pdf-converter and keep updating/checking the status until the export is done
+   * ref) https://dev.growi.org/66ee8495830566b31e02c953#growi
+   * @param pageBulkExportJob page bulk export job in execution
+   * @throws BulkExportJobExpiredError when the job expires, Error when the export fails or required data is missing
+   */
+  private async startAndWaitPdfExportFinish(pageBulkExportJob: PageBulkExportJobDocument): Promise<void> {
+    const jobCreatedAt = pageBulkExportJob.createdAt;
+    if (jobCreatedAt == null) throw new Error('createdAt is not set');
+    const exportJobExpirationSeconds = configManager.getConfig('crowi', 'app:bulkExportJobExpirationSeconds');
+    const jobExpirationDate = new Date(jobCreatedAt.getTime() + exportJobExpirationSeconds * 1000);
+    let status: PdfCtrlSyncJobStatusBodyStatus = PdfCtrlSyncJobStatusBodyStatus.HTML_EXPORT_IN_PROGRESS;
+
+    const lastExportPagePath = (await PageBulkExportPageSnapshot.findOne({ pageBulkExportJob }).sort({ path: -1 }))?.path;
+    if (lastExportPagePath == null) throw new Error('lastExportPagePath is missing');
+
+    return new Promise<void>((resolve, reject) => {
+      // Request sync job API until the pdf export is done. If pdf export status is updated in growi, send the status to pdf-converter.
+      const interval = setInterval(async() => {
+        // stop polling once the job is expired; otherwise the timer would keep firing after the promise is rejected
+        if (new Date() > jobExpirationDate) {
+          clearInterval(interval);
+          reject(new BulkExportJobExpiredError());
+          return;
+        }
+        try {
+          const latestPageBulkExportJob = await PageBulkExportJob.findById(pageBulkExportJob._id);
+          if (latestPageBulkExportJob == null) throw new Error('pageBulkExportJob is missing');
+          if (latestPageBulkExportJob.lastExportedPagePath === lastExportPagePath) {
+            status = PdfCtrlSyncJobStatusBodyStatus.HTML_EXPORT_DONE;
+          }
+          if (latestPageBulkExportJob.status === PageBulkExportJobStatus.failed) {
+            status = PdfCtrlSyncJobStatusBodyStatus.FAILED;
+          }
+          const res = await pdfCtrlSyncJobStatus({
+            jobId: pageBulkExportJob._id.toString(), expirationDate: jobExpirationDate.toISOString(), status,
+          }, { baseURL: configManager.getConfig('crowi', 'app:pageBulkExportPdfConverterUrl') });
+          if (res.data.status === PdfCtrlSyncJobStatus202Status.PDF_EXPORT_DONE) {
+            clearInterval(interval);
+            resolve();
+          }
+          else if (res.data.status === PdfCtrlSyncJobStatus202Status.FAILED) {
+            clearInterval(interval);
+            reject(new Error('PDF export failed'));
+          }
+        }
+        catch (err) {
+          // continue the loop if the host is not ready
+          if (!['ENOTFOUND', 'ECONNREFUSED'].includes(err?.code)) {
+            clearInterval(interval);
+            reject(err);
+          }
+        }
+      }, 60 * 1000 * 1);
+    });
+  }
+
   /**
    * Execute a pipeline that reads the page files from the temporal fs directory, compresses them, and uploads to the cloud storage
    */
@@ -377,6 +466,8 @@ class PageBulkExportService implements IPageBulkExportService {
         }
         catch (err) {
           await multipartUploader.abortUpload();
+          pageBulkExportJob.status = PageBulkExportJobStatus.failed;
+          await pageBulkExportJob.save();
           callback(err);
           return;
         }
@@ -405,9 +496,14 @@ class PageBulkExportService implements IPageBulkExportService {
 
   /**
    * Get the output directory on the fs to temporarily store page files before compressing and uploading
+   * @param pageBulkExportJob page bulk export job in execution
+   * @param isHtmlPath whether the tmp output path is for html files
    */
-  private getTmpOutputDir(pageBulkExportJob: PageBulkExportJobDocument): string {
-    return `${this.tmpOutputRootDir}/${pageBulkExportJob._id}`;
+  private getTmpOutputDir(pageBulkExportJob: PageBulkExportJobDocument, isHtmlPath = false): string {
+    // html files are kept under a dedicated 'html' subdirectory of the tmp output root
+    const jobId = pageBulkExportJob._id.toString();
+    const segments = isHtmlPath ? ['html', jobId] : [jobId];
+    return path.join(this.tmpOutputRootDir, ...segments);
   }
 
   async notifyExportResult(
@@ -442,6 +538,12 @@ class PageBulkExportService implements IPageBulkExportService {
       fs.promises.rm(this.getTmpOutputDir(pageBulkExportJob), { recursive: true, force: true }),
     ];
 
+    if (pageBulkExportJob.format === PageBulkExportFormat.pdf) {
+      promises.push(
+        fs.promises.rm(this.getTmpOutputDir(pageBulkExportJob, true), { recursive: true, force: true }),
+      );
+    }
+
     const fileUploadService: FileUploader = this.crowi.fileUploadService;
     if (pageBulkExportJob.uploadKey != null && pageBulkExportJob.uploadId != null) {
       promises.push(fileUploadService.abortPreviousMultipartUpload(pageBulkExportJob.uploadKey, pageBulkExportJob.uploadId));

diff --git a/apps/app/src/server/service/config-loader.ts b/apps/app/src/server/service/config-loader.ts
@@ -775,6 +775,12 @@ const ENV_VAR_NAME_TO_CONFIG_INFO: Record<string, EnvConfig> = {
     type: ValueType.NUMBER,
     default: 5,
   },
+  BULK_EXPORT_PDF_CONVERTER_URL: {
+    ns: 'crowi',
+    key: 'app:pageBulkExportPdfConverterUrl',
+    type: ValueType.STRING,
+    default: 'http://pdf-converter:3010',
+  },
   AI_ENABLED: {
     ns: 'crowi',
     key: 'app:aiEnabled',

diff --git a/apps/app/turbo.json b/apps/app/turbo.json
@@ -45,7 +45,7 @@
       "outputLogs": "new-only"
     },
     "dev": {
-      "dependsOn": ["^dev", "dev:migrate", "dev:pre:styles"],
+      "dependsOn": ["^dev", "dev:migrate", "dev:pre:styles", "@growi/pdf-converter#build"],
       "cache": false,
       "persistent": true
     },
@@ -56,7 +56,7 @@
     },
 
     "lint": {
-      "dependsOn": ["^dev", "dev:pre:styles"]
+      "dependsOn": ["^dev", "dev:pre:styles", "@growi/pdf-converter#build"]
     },
 
     "test": {

diff --git a/apps/pdf-converter/package.json b/apps/pdf-converter/package.json
@@ -6,7 +6,7 @@
   "license": "MIT",
   "private": true,
   "scripts": {
-    "dev": "nodemon --watch \"src/**/*.ts\" --ignore \"node_modules/**/*\" --exec ts-node -r \"dotenv-flow/config\" src/index.ts",
+    "dev:pdf-converter": "nodemon --watch \"src/**/*.ts\" --ignore \"node_modules/**/*\" --exec ts-node -r \"dotenv-flow/config\" src/index.ts",
     "lint": "pnpm eslint **/*.{js,ts}",
     "gen:client-code": "tsed run generate-swagger --output ./specs && orval",
     "build": "pnpm gen:client-code && tsc -p tsconfig.build.json"

diff --git a/apps/pdf-converter/src/controllers/pdf.ts b/apps/pdf-converter/src/controllers/pdf.ts
@@ -37,8 +37,9 @@ class PdfCtrl {
     const expirationDate = new Date(expirationDateStr);
     try {
       await this.pdfConvertService.registerOrUpdateJob(jobId, expirationDate, growiJobStatus);
+      const status = this.pdfConvertService.getJobStatus(jobId); // get status before cleanup
       this.pdfConvertService.cleanUpJobList();
-      return { status: this.pdfConvertService.getJobStatus(jobId) };
+      return { status };
     }
     catch (err) {
       this.logger.error('Failed to register or update job', err);

diff --git a/apps/pdf-converter/tsconfig.build.json b/apps/pdf-converter/tsconfig.build.json
@@ -1,4 +1,7 @@
 {
   "extends": "./tsconfig.json",
+  "compilerOptions": {
+    "noEmit": false,
+  },
   "exclude": ["node_modules", "dist", "test"]
 }
diff --git a/apps/pdf-converter/turbo.json b/apps/pdf-converter/turbo.json
@@ -0,0 +1,10 @@
+{
+  "$schema": "https://turbo.build/schema.json",
+  "extends": ["//"],
+  "tasks": {
+    "dev:pdf-converter": {
+      "cache": false,
+      "persistent": true
+    }
+  }
+}