Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: pandoc support #152

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions example/src/github_docx_report.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import 'dotenv/config'

import { createFileSystemTools } from '@fabrice-ai/tools/filesystem'
import { httpTool } from '@fabrice-ai/tools/http'
import { createPandocTool } from '@fabrice-ai/tools/pandoc'
import { agent } from 'fabrice-ai/agent'
import { logger } from 'fabrice-ai/telemetry'
import { workflow } from 'fabrice-ai/workflow'
import fs from 'fs'
import path from 'path'

import { askUser } from './tools/askUser.js'

/** Directory that holds the example's generated artifacts. */
export const workingDir = path.resolve(import.meta.dirname, '../assets/')

// Remove artifacts left over from a previous run so the workflow starts clean.
// `for...of` yields the file names themselves; `for...in` would yield the array
// indices ("0", "1"), so the stale files would never be matched or deleted.
const filesToCleanup = ['project-summary.docx', 'project-summary.md']
for (const file of filesToCleanup) {
  const stalePath = path.join(workingDir, file)
  if (fs.existsSync(stalePath)) fs.rmSync(stalePath)
}

/** Absolute path of the Word document the workflow is asked to produce. */
export const outputPath = path.join(workingDir, 'project-summary.docx')

// Agent whose only tool is `askUser` (imported from ./tools/askUser.js).
// Its description directs it to collect the input the other agents need;
// presumably `askUser` prompts the person running the example — confirm in that module.
const human = agent({
description: `
Use askUser tool to get the required input information for other agents`,
tools: {
askUser,
},
})

// Agent responsible for fetching data over HTTP. It is given a single tool,
// `httpTool`, and its description tells it to use that tool for APIs and web pages.
const browser = agent({
description: `
You are skilled at browsing Web with specified URLs,
methods, params etc.
You are using "httpTool" to get the data from the API and/or Web pages.
`,
tools: {
httpTool,
},
})

// File-system tool set created with `workingDir` as its working directory.
// The report agent below picks `saveFile` and `readFile` from it.
// NOTE(review): presumably the tools refuse paths outside `workingDir` — confirm in @fabrice-ai/tools.
const fsTools = createFileSystemTools({
workingDir,
})

// Pandoc tool set bound to the example's working directory, mirroring `fsTools`.
const pandocTools = createPandocTool({ workingDir })

/**
 * Agent that produces the report document. It can convert files with Pandoc
 * and save/read files through the file-system tool set.
 */
const reportCreator = agent({
  description: `
Your role is to create a project report and save it in Microsfot Office, "docx" file.
I am able to read, save and convert documents and files using my toolkit.
`,
  tools: {
    convertFileWithPandoc: pandocTools.convertFileWithPandoc,
    saveFile: fsTools.saveFile,
    readFile: fsTools.readFile,
  },
})

// Workflow definition for the GitHub-to-docx report example.
// - team: the three agents declared above (human, browser, reportCreator).
// - description: the task steps — ask for a repository locator, fetch it from the
//   GitHub REST API, write a Markdown report and convert it to docx at `outputPath`.
// - knowledge: restricts file output to `workingDir`.
// - output: the expected shape of the final result.
// The prompt strings interpolate `outputPath` / `workingDir`, so the agents see absolute paths.
export const githubProjectReport = workflow({
team: { human, browser, reportCreator },
description: `
Ask human for the Github project locator: "<organization>/<project-handle>".
Browse the following URL: "https://api.github.com/repos/<organization>/<project-handle>".

Create a Markdown report about the most important project information.
Convert this report to "docx" - Word format - and save in the "${outputPath}" file.
`,
knowledge: `
Save files in the ${workingDir} only.
`,
output: `
Comprehensive Github project raport:
- Returned in theMarkdown format,
- Saved, in "docx" format in the "${outputPath}",
- Keep strict to output file name: "${outputPath}"
`,
// `logger` comes from fabrice-ai/telemetry and is passed as the workflow's snapshot hook.
snapshot: logger,
})
53 changes: 53 additions & 0 deletions example/src/github_docx_report.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import 'dotenv/config'

import { suite, test } from '@fabrice-ai/bdd/suite'
import { testwork } from '@fabrice-ai/bdd/testwork'
import fs from 'fs'

import { githubProjectReport, outputPath, workingDir } from './github_docx_report.config.js'

// Check attached to the `browser` agent.
const browserChecks = [
  test(
    '0_github_check',
    'Browser agent shoud use the "httpTool" to browse Github for project details'
  ),
]

// Check attached to the `reportCreator` agent.
const reportCreatorChecks = [
  test(
    '1_file_operations',
    `The reportCreator agent is using saveFile, readFile or convertFileWithPandoc tools to operate only within the ${workingDir} directory`
  ),
]

// Workflow-level check with a custom callback: passes only when the docx file exists on disk.
const finalOutputCheck = test(
  '2_finalOutput',
  `Final report saved to ${outputPath} file`,
  async (workflow, state) => {
    const reportExists = fs.existsSync(outputPath)
    const reasoning = reportExists
      ? 'Output file saved correctly'
      : `Output file ${outputPath} does not exist`
    return { passed: reportExists, reasoning, id: '2_finalOutput' }
  }
)

/**
 * Runs the GitHub report workflow through `testwork` against a suite made of
 * the per-agent checks and the workflow-level file-existence check above.
 */
const testResults = await testwork(
  githubProjectReport,
  suite({
    description: 'Black box testing suite',
    team: {
      browser: browserChecks,
      reportCreator: reportCreatorChecks,
    },
    workflow: [finalOutputCheck],
  })
)

// Print the outcome and mirror it in the process exit code:
// 0 when the suite passed, -1 when it failed.
if (testResults.passed) {
  console.log('✅ Test suite passed')
  process.exit(0)
}

console.log('🚨 Test suite failed')
process.exit(-1)
10 changes: 10 additions & 0 deletions example/src/github_docx_report.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import 'dotenv/config'

import { solution } from 'fabrice-ai/solution'
import { teamwork } from 'fabrice-ai/teamwork'

import { githubProjectReport } from './github_docx_report.config.js'

// Run the workflow to completion, then print what `solution` extracts from the final state.
const finalState = await teamwork(githubProjectReport)
const report = solution(finalState)

console.log(report)
226 changes: 226 additions & 0 deletions packages/tools/src/pandoc.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
import { execFile, execFileSync } from 'child_process'
import { tool } from 'fabrice-ai/tool'
import fs from 'fs'
import path from 'path'
import { promisify } from 'util'
import { z } from 'zod'

// Promise-returning variant of `child_process.execFile`, used by `runPandoc`
// for asynchronous Pandoc invocations.
const execFileAsync = promisify(execFile)

/**
 * Options accepted by `createPandocTool`.
 * Both fields are optional; omitted fields fall back to the module-level `defaults`.
 */
interface PandocToolOptions {
/**
 * Name of, or path to, the pandoc binary.
 * Defaults to 'pandoc', i.e. whatever is found on PATH.
 */
pandocPath?: string

/**
 * Directory used as Pandoc's working directory and as the root against which
 * the file-based tool resolves its input/output file names.
 * Defaults to `process.cwd()`. It should already exist and be writable (e.g. '/tmp').
 */
workingDir?: string
}

/**
 * Fallback values merged underneath the caller's options in `createPandocTool`.
 * Note that `process.cwd()` is evaluated once, when this module is loaded,
 * not each time a tool set is created.
 */
const defaults: Required<PandocToolOptions> = {
pandocPath: 'pandoc',
workingDir: process.cwd(),
}

/**
 * Verifies that the Pandoc binary can be executed by running
 * `<pandocPath> --version` synchronously with all stdio discarded.
 *
 * @param pandocPath - Name of, or path to, the pandoc executable to probe.
 * @throws Error carrying installation hints when the binary cannot be started
 *   or the probe command fails.
 */
function ensurePandocExists(pandocPath: string): void {
  try {
    execFileSync(pandocPath, ['--version'], { stdio: 'ignore' })
  } catch {
    // Whatever the underlying failure was (missing binary, no execute permission,
    // non-zero exit), it is reported as one actionable message.
    // The Homebrew CLI is `brew`; there is no `homebrew` command.
    throw new Error(
      `Pandoc not found or not executable at path: ${pandocPath}. Go to https://pandoc.org for installation details. Use "brew install pandoc" if you are on MacOS and using Homebrew.`
    )
  }
}

/**
 * Resolves `fileName` against `workingDir` and rejects any result that lies
 * outside of that directory.
 *
 * Containment is decided with `path.relative` rather than a string prefix test:
 * a prefix test would accept sibling directories that merely share the prefix
 * (e.g. "/tmp/work-evil" for a working directory of "/tmp/work").
 * The check is purely lexical; symbolic links are not resolved.
 *
 * @param workingDir - Root directory that files must stay within.
 * @param fileName - Relative (or absolute) file name supplied by the caller.
 * @returns The absolute, normalized path of the file.
 * @throws Error when the resolved path escapes `workingDir`.
 */
function resolveInWorkingDir(workingDir: string, fileName: string): string {
  const root = path.resolve(workingDir)
  const resolved = path.resolve(root, fileName)

  // `relative` is "" for the root itself, starts with ".." when the target is
  // above/beside the root, and is absolute when no relative path exists
  // (e.g. a different drive on Windows).
  const relative = path.relative(root, resolved)
  const escapesRoot =
    relative === '..' || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)

  if (escapesRoot) {
    throw new Error(`File path "${fileName}" is outside the working directory: ${workingDir}`)
  }
  return resolved
}

/**
 * Runs Pandoc once and resolves with whatever it wrote to stdout.
 *
 * Two input modes are supported:
 * (A) file-based: input/output files are named in `args`, `inputData` is omitted;
 * (B) content-based: `inputData` is written to Pandoc's stdin.
 *
 * The asynchronous `execFile` has no `input` option (only the synchronous
 * variants do), so the data is written to the child's stdin stream explicitly.
 * Stdin is always closed, otherwise a Pandoc process reading from stdin would
 * wait for end-of-input forever.
 *
 * @param pandocPath - Name of, or path to, the pandoc executable.
 * @param args - Command-line arguments passed to Pandoc verbatim.
 * @param workingDir - Directory the process is started in.
 * @param inputData - Optional content to feed through stdin.
 * @returns `stdout` as a Buffer when `inputData` was supplied, otherwise as a UTF-8 string.
 * @throws Rejects with the `execFile` error when the process cannot be started,
 *   exits with a failure, or produces more than 50 MB of output.
 */
async function runPandoc({
  pandocPath,
  args,
  workingDir,
  inputData,
}: {
  pandocPath: string
  args: string[]
  workingDir: string
  inputData?: string | Buffer
}): Promise<{ stdout: string | Buffer }> {
  const hasInput = inputData !== undefined

  const pending = execFileAsync(pandocPath, args, {
    cwd: workingDir,
    maxBuffer: 50 * 1024 * 1024, // 50 MB
    // Content-based conversions may yield binary output (docx, ...), so read raw bytes.
    encoding: hasInput ? 'buffer' : 'utf8',
  })

  const stdin = pending.child.stdin
  if (stdin) {
    // A child that exits before consuming its input makes the write fail (EPIPE).
    // That failure is already reported through the rejected promise, so the
    // stream-level error only needs to be kept from becoming an unhandled event.
    stdin.on('error', () => {})
    if (hasInput) {
      stdin.end(inputData)
    } else {
      stdin.end()
    }
  }

  const { stdout } = await pending
  return { stdout }
}

/**
 * Builds the Pandoc tool set.
 *
 * The supplied options are merged over `defaults`, and the Pandoc binary is
 * probed immediately, so a missing installation fails at creation time rather
 * than on first use.
 *
 * @param options - Optional overrides for the pandoc binary path and working directory.
 * @returns An object with two tools:
 *   `convertFileWithPandoc` (file in, file out) and
 *   `convertContentWithPandoc` (string in, string or base64 out).
 * @throws Error when the Pandoc binary cannot be executed.
 */
export function createPandocTool(options?: PandocToolOptions) {
  const { pandocPath, workingDir } = { ...defaults, ...options }

  // Fail fast if Pandoc is not usable.
  ensurePandocExists(pandocPath)

  /**
   * File-based conversion. Both file names are resolved inside `workingDir`;
   * the input must exist beforehand and the output must exist afterwards.
   * Resolves with a JSON string describing the conversion.
   */
  const convertFileWithPandoc = tool({
    description:
      'Converts a file from one format to another (via Pandoc). Requires inputFileName & outputFileName in workingDir. No direct content is handled.',
    parameters: z.object({
      fromFormat: z.string().describe('E.g. "markdown", "html", "docx"'),
      toFormat: z.string().describe('E.g. "pdf", "docx", "html"'),
      inputFileName: z.string().describe('File in workingDir to read from'),
      outputFileName: z.string().describe('File in workingDir to write to'),
    }),
    execute: async ({ fromFormat, toFormat, inputFileName, outputFileName }) => {
      try {
        const inputPath = resolveInWorkingDir(workingDir, inputFileName)
        const outputPath = resolveInWorkingDir(workingDir, outputFileName)

        if (!fs.existsSync(inputPath)) {
          throw new Error(`Input file does not exist: ${inputPath}`)
        }

        // `-o` makes Pandoc write the result to the output file instead of stdout.
        await runPandoc({
          pandocPath,
          workingDir,
          args: [inputPath, '-f', fromFormat, '-t', toFormat, '-o', outputPath],
        })

        if (!fs.existsSync(outputPath)) {
          throw new Error(`Output file not created: ${outputPath}`)
        }

        const summary = { success: true, fromFormat, toFormat, inputPath, outputPath }
        return JSON.stringify(summary)
      } catch (error) {
        // Every failure above, including the checks, is reported under one prefix.
        throw new Error(`Pandoc file-based conversion failed: ${error}`)
      }
    },
  })

  /**
   * Content-based conversion. The content is handed to Pandoc via stdin and the
   * converted result is captured from stdout (no `-o` argument is passed).
   * `returnAsBase64` selects whether the captured bytes are returned
   * base64-encoded or decoded as UTF-8 text.
   */
  const convertContentWithPandoc = tool({
    description:
      'Converts raw string content from one format to another (via Pandoc). Returns output as text or base64-encoded.',
    parameters: z.object({
      fromFormat: z.string().describe('E.g. "markdown", "html"'),
      toFormat: z.string().describe('E.g. "docx", "pdf", "html"'),
      content: z.string().describe('Raw content to convert.'),
      returnAsBase64: z
        .boolean()
        .describe(
          'If true (default), returns base64 (useful for docx/pdf). Otherwise, returns plain text.'
        ),
    }),
    execute: async ({ fromFormat, toFormat, content, returnAsBase64 }) => {
      try {
        const { stdout } = await runPandoc({
          pandocPath,
          workingDir,
          args: ['-f', fromFormat, '-t', toFormat],
          inputData: content,
        })

        // Normalize to bytes first; stdout can arrive as a Buffer or a string.
        const bytes = Buffer.isBuffer(stdout) ? stdout : Buffer.from(stdout, 'utf8')
        const output = bytes.toString(returnAsBase64 ? 'base64' : 'utf8')

        return JSON.stringify({ success: true, fromFormat, toFormat, returnAsBase64, output })
      } catch (error) {
        throw new Error(`Pandoc content-based conversion failed: ${error}`)
      }
    },
  })

  return { convertFileWithPandoc, convertContentWithPandoc }
}