diff --git a/README.md b/README.md index 63adbae..e420393 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # sfm-utils -Utilities to parse text files (e.g. from Toolbox) and migrate SFM markers into a format suitable for Paratext. -Each book input is written to individual .sfm files. +Utilities to parse book translations in SFM text files (.txt, .rtf, .sfm) into JSON objects, and then write out the books into SFM suitable for Paratext or .tsv. +When directories are processed, each book input is written to individual .sfm files. Assumptions: * Each text file is for a single chapter of a book @@ -13,7 +13,7 @@ Note for developers: Replace `sfm-utils.exe` references with `node dist/index.js Command-line ```bash -Usage: sfm-utils.exe -p p_arg [-t t_arg | -d d_arg | -j j_arg | -s s_arg] +Usage: sfm-utils.exe -p p_arg [-f f_arg | -t t_arg | -d d_arg | -j j_arg | -s s_arg] ``` Parameters @@ -21,7 +21,8 @@ Parameters Required -p [Paratext project name (can be 3-character abbreviation)] - Optional - one of: + Optional for processing txt or sfm files - one of: + -f [A single SFM file (can be an entire book)] -t [A single Toolbox text file (one chapter of a book)] -d [Directory of Toolbox text files for a single book (one chapter per file)] -j [JSON file representing a single book - used for testing conversion to SFM] @@ -34,11 +35,6 @@ Parameters ``` ### Help -Obtaining the sfm-utils version: -```bash -sfm-utils.exe --version -``` - For additional help: ```bash sfm-utils.exe -h @@ -49,7 +45,7 @@ sfm-utils.exe -h ## Developer Setup These utilities require Git, Node.js, and TypeScript (installed locally). -Back translations in .rtf text files will also need UnRTF installed for converting the Rich Text format. +Back translations in .rtf text files will also need UnRTF installed for converting the Rich Text format (only works on Linux). 
### Install Git Download and install Git diff --git a/src/books.ts b/src/books.ts index 1672e3f..148d6d4 100644 --- a/src/books.ts +++ b/src/books.ts @@ -719,9 +719,9 @@ export const bookInfo: bookType[] = [ code: "XXE", name: "Extra Book E", num: 98, - chapters: 999, - versesInChapter: [0], - verses: 0 + chapters: 1, + versesInChapter: [0, 462], + verses: 462 }, { code: "XXF", @@ -743,13 +743,24 @@ export const bookInfo: bookType[] = [ //#endregion /** - * Description of the unit within a chapter. + * Description of the unit within a chapter or header. */ export type unitSubtype = - "padding" | - "chapter" | - "verse" | - "section"; + "padding" | + + // Units in headers + "header" | + "toc1" | + "toc2" | + "toc3" | + "main_title" | + "chapter_label" | + + // Units in chapters + "chapter" | + "verse" | + "section" | + "paragraph"; export interface unitType { type: unitSubtype, @@ -762,7 +773,8 @@ export interface unitType { export interface objType { header: { projectName: string, - bookInfo: bookType + bookInfo: bookType, + markers: unitType[] }, content: unitType[] } @@ -771,7 +783,8 @@ export const PLACEHOLDER_BOOK: bookType = bookInfo[0]; export const PLACEHOLDER_BOOK_OBJ: objType = { "header": { "projectName" : "", - "bookInfo" : PLACEHOLDER_BOOK + "bookInfo" : PLACEHOLDER_BOOK, + "markers": [] }, "content": [] } @@ -862,7 +875,7 @@ export function getBookByName(name: string): bookType { case 'I Corinthians': case '1Corinthians': case 'x1Corinthians': - case '1 Cor': + case '1 Cor': bookName = "1 Corinthians"; break; case '2Corinthians': @@ -880,9 +893,9 @@ export function getBookByName(name: string): bookType { break; case 'Phil': bookName = 'Philippians'; - break; + break; case '1Thessalonians': - case '1 Thess': + case '1 Thess': bookName = '1 Thessalonians'; break; case '2Thessalonians': @@ -890,7 +903,7 @@ export function getBookByName(name: string): bookType { bookName = '2 Thessalonians'; break; case '1Timothy': - case '1 Tim': + case '1 Tim': bookName 
= '1 Timothy'; break; case '2Timothy': @@ -898,7 +911,7 @@ export function getBookByName(name: string): bookType { bookName = '2 Timothy'; break; case '1Peter': - case '1 Pet': + case '1 Pet': bookName = '1 Peter'; break; case '2Peter': diff --git a/src/index.ts b/src/index.ts index 7bd506c..6d0df2c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -4,6 +4,7 @@ import { CommanderError, program } from 'commander'; import * as fs from 'fs'; import * as backTranslation from './backTranslation.js'; import * as books from './books.js'; +import * as path from 'path'; import * as toolbox from './toolbox.js'; import require from './cjs-require.js'; import * as sfm from './sfm.js'; @@ -19,6 +20,7 @@ program .description("Utilities to 1) parse Toolbox text files into JSON Objects. " + "2) take a JSON file and write out an .SFM file for Paratext.") .option("-b, --back ", "path to back translation rtf text file") + .option("-f, --sfm ", "path to SFM file") .option("-t, --text ", "path to a Toolbox text file") .option("-bd, --backDirectory ", "path to directory containing multiple RTF text files") .option("-d, --directory ", "path to directory containing multiple Toolbox text files") @@ -44,6 +46,9 @@ if (debugMode) { if (options.back) { console.log(`Back Translation text file path: "${options.back}"`); } + if (options.sfm) { + console.log(`SFM file path: "${options.sfm}"`); + } if (options.text) { console.log(`Toolbox text file path: "${options.text}"`); } @@ -86,6 +91,10 @@ if (options.back && !fs.existsSync(options.back)) { console.error("Can't open back translation text file " + options.back); process.exit(1); } +if (options.sfm && !fs.existsSync(options.sfm)) { + console.error("Can't open SFM file " + options.sfm); + process.exit(1); +} if (options.backDirectory && !fs.existsSync(options.backDirectory)) { console.error("Can't open back translation directory " + options.backDirectory); process.exit(1); @@ -104,9 +113,9 @@ if (options.superDirectory && 
!fs.existsSync(options.superDirectory)) { } // Validate one of the optional parameters is given -if (!options.back && !options.text && !options.backDirectory && !options.directory && +if (!options.back && !options.sfm && !options.text && !options.backDirectory && !options.directory && !options.json && !options.backSuperDirectory && !options.superDirectory) { - console.error("Need to pass another optional parameter [-b -t -bd -d -j -bs or -s]"); + console.error("Need to pass another optional parameter [-b -f -t -bd -d -j -bs or -s]"); process.exit(1); } @@ -121,6 +130,9 @@ if (options.json) { // Parse an rtf text file into a JSON object const bookObj: books.objType = books.PLACEHOLDER_BOOK_OBJ; processBackText(options.back, bookObj); +} else if (options.sfm) { + const bookObj: books.objType = books.PLACEHOLDER_BOOK_OBJ; + processSFMText(options.sfm, bookObj); } else if (options.text) { // Parse a txt file into a JSON object const bookObj: books.objType = books.PLACEHOLDER_BOOK_OBJ; @@ -320,6 +332,69 @@ function processText(filepath: string, bookObj: books.objType): books.objType { return bookObj; } +/** + * Take an SFM file and make a JSON book type object + * @param {string} filepath - file path of a single text file + * @param {books.bookType} bookObj - the book object to modify + * @returns {books.bookType} bookObj - modified book object + */ +function processSFMText(filepath: string, bookObj: books.objType): books.objType { + const bookInfo = toolbox.getBookAndChapter(filepath); + const currentChapter = bookInfo.chapterNumber; + const bookType = books.getBookByName(bookInfo.bookName); + if (bookInfo.bookName === "Placeholder") { + // Skip invalid book name + console.warn('Skipping invalid book name'); + return bookObj; + } else if (currentChapter > bookType.chapters) { + // Skip invalid chapter number + console.warn('Skipping invalid chapter number ' + currentChapter + ' when ' + + bookObj.header.bookInfo.name + ' only has ' + bookType.chapters + ' chapters.'); 
+ return bookObj; + } + + if (bookObj.content.length == 0) { + bookObj = toolbox.initializeBookObj(bookInfo.bookName, options.projectName); + } + + if (!bookObj.content[currentChapter]) { + console.error(`${bookInfo.bookName} has insufficient chapters allocated to handle ${currentChapter}. Exiting`); + process.exit(1); + } + // Initialize all chapters for book + for (let ch:number=1; ch<= bookObj.content.length-1; ch++) { + if (bookObj.content[ch].type != "chapter") { + // Initialize current chapter + bookObj.content[ch].type = "chapter"; + bookObj.content[ch].content = []; + } + } + sfm.updateObj(bookObj, filepath, s, debugMode); + + // For single file parameter, write valid output + if (options.text && bookObj.header.bookInfo.code !== "000") { + // For testing, write out book JSON Object + writeJSON(bookObj); + + //valid JSON Object to SFM + sfm.convertToSFM(bookObj, s); + } else if (options.sfm && bookObj.header.bookInfo.code !== "000") { + const basename = path.parse(path.basename(filepath)).name; + + // For testing, write out book JSON Object + writeJSON(bookObj, basename + '.json'); + + if (bookObj.header.bookInfo.code == 'XXE') { + // Special SFM file written to TSV + sfm.convertToTSV(bookObj, basename); + } else { + //valid JSON Object to SFM + sfm.convertToSFM(bookObj, s); + } + } + + return bookObj; +} /** * Take a JSON file and make an SFM file @@ -350,18 +425,21 @@ async function processJSON(filepath: string){ /** * Write JSON file (for testing purposes). - * Filename will be [##][XYZ][Project name].json + * If filename is not provided, it will be [##][XYZ][Project name].json * ## - 2-digit book number * XYZ - 3 character book code * Project name - Paratext project name * @param {books.bookType} bookObj - the book object to write to file + * @param {string} filename - filename to write (optional). 
*/ -function writeJSON(bookObj: books.objType) { +function writeJSON(bookObj: books.objType, filename : string = '') { if (debugMode) { // Add leading 0 if book number < 10 const padZero = bookObj.header.bookInfo.num < 10 ? '0' : ''; - const filename = padZero + bookObj.header.bookInfo.num + + if (filename == '') { + filename = padZero + bookObj.header.bookInfo.num + bookObj.header.bookInfo.code + bookObj.header.projectName + '.json'; + } fs.writeFileSync('./' + filename, JSON.stringify(bookObj, null, 2)); console.info(`Writing out "${filename}"`); } diff --git a/src/sfm.ts b/src/sfm.ts index 0ff35c8..b92324a 100644 --- a/src/sfm.ts +++ b/src/sfm.ts @@ -1,9 +1,147 @@ // Copyright 2022 SIL International -// Utilities for converting a JSON file to USFM +// Utilities for converting from USFM to JSON, and JSON to USFM or TSV import * as books from './books.js'; +import * as toolbox from './toolbox.js'; import * as sfmConsole from './sfmConsole.js'; import * as fs from 'fs'; +/** + * Regex to parse \v verse marker + */ +export const V_PATTERN = /\\v\s+(\d+)\s+(.+)/; + + +/** + * Parse an SFM text file and modify the corresponding + * book Object containing the chapter information + * @param {books.objType} bookObj - Book object to modify + * @param {string} file - Path to the SFM file + * @param {sfmConsole.SFMConsole} s - Object that maintains logging + * @param {boolean} debugMode - Whether to print additional logging + */ +export function updateObj(bookObj: books.objType, file: string, + s: sfmConsole.SFMConsole, debugMode = false) { + + let currentChapter = 1; // Start with first chapter + // Read in SFM file and strip out empty lines + let sfmFile = fs.readFileSync(file, 'utf-8'); + sfmFile = sfmFile.replace(/(\r?\n){2,}/g, '\r\n'); + const sfmData = sfmFile.split(/\r?\n/); + let section_title_written = false; + if (sfmData[sfmData.length - 1] == '') { + // If last line empty, remove it + sfmData.pop(); + } + + // Split each line on marker and content + const 
markerPattern = /(\\_?[A-Za-z0-9]+)(\s+)?(.+)?/; + let verseNum = 2; // Keep track of the current verse to write + + sfmData.forEach(line => { + if (line.trim() === '') { + // Skip + return; + } + const lineMatch = line.match(markerPattern); + // Skip markers lacking content + if (lineMatch && lineMatch[2] != '') { + const marker: toolbox.markerType = lineMatch[1] as toolbox.markerType; + const content: string = lineMatch[3]; + const unit: books.unitType = { + "type": "padding", + "number": verseNum, + "text": content + }; + + // Basic processing mode + switch (marker) { + case '\\_sh': + case '\\ft' : + case '\\gl' : + case '\\ref' : + // Markers to ignore + break; + + // Header Markers + case '\\h' : + unit.type = "header"; + unit.number = 1; + if (content) { + unit.text = content; + } + bookObj.header.markers.push(unit); + break; + case '\\toc1' : + case '\\toc2' : + case '\\toc3' : + unit.type = marker.substring(1) as books.unitSubtype; + unit.number = 1; + if (content) { + unit.text = content; + } + bookObj.header.markers.push(unit); + break; + case '\\mt' : + unit.type = "main_title"; + unit.number = 1; + if (content) { + unit.text = content; + } + bookObj.header.markers.push(unit); + break; + case '\\cl' : + unit.type = "chapter_label"; + unit.number = 1; // Doesn't matter + if (content) { + unit.text = content; + } + bookObj.header.markers.push(unit); + break; + + // Content markers + case '\\c' : + // Update to new current chapter + currentChapter = parseInt(content); + section_title_written = false; + break; + case '\\s' : + // Write section content + unit.type = "section"; + unit.text = content; + unit.number = (section_title_written) ? 
2 : 1; + section_title_written = true; + + // Add section + bookObj.content[currentChapter].content.push(unit); + break; + case '\\v' : { + const vPatternMatch = line.trim().match(V_PATTERN); + if (vPatternMatch) { + verseNum = parseInt(vPatternMatch[1]); + + // Write verse + unit.type = "verse"; + unit.number = verseNum; + unit.text = vPatternMatch[2]; + bookObj.content[currentChapter].content.push(unit); + } + break; + } + case '\\p' : { + // Write paragraph + unit.type = "paragraph"; + unit.number = 1; // number doesn't matter + unit.text = lineMatch[3] ? lineMatch[3] : ''; + bookObj.content[currentChapter].content.push(unit); + break; + } + default: + console.warn('Skipping unexpected marker:' + marker); + } + } + }); +} + /** * Parse a JSON file and converts it to USFM * @param {Books.objType} bookObj - a book type of JSON object @@ -13,25 +151,57 @@ export function convertToSFM(bookObj: books.objType, s: sfmConsole.SFMConsole) const ID_MARKER = "\\id "; const USFM_MARKER = "\\usfm "; const HEADER_MARKER = "\\h "; - const TOC_MARKER = "\\toc1 "; + const TOC1_MARKER = "\\toc1 "; + const TOC2_MARKER = "\\toc2 "; + const TOC3_MARKER = "\\toc3 "; const MAIN_TITLE_MARKER = "\\mt "; const CHAPTER_MARKER = "\\c "; + const CHAPTER_LABEL_MARKER = "\\cl "; const SECTION_MARKER = "\\s"; // number gets added later const PARAGRAPH_MARKER = "\n\\p"; const VERSE_MARKER = "\\v "; const CRLF = "\n"; - const chapters = bookObj.content; let SFMtext = ""; + // These were the initial headers written + /* SFMtext += ID_MARKER + bookObj.header.bookInfo.code + ' ' + bookObj.header.projectName + CRLF; SFMtext += USFM_MARKER + '3.0' + CRLF; SFMtext += HEADER_MARKER + bookObj.header.bookInfo.name + CRLF; - SFMtext += TOC_MARKER + bookObj.header.bookInfo.name + CRLF; + SFMtext += TOC1_MARKER + bookObj.header.bookInfo.name + CRLF; SFMtext += MAIN_TITLE_MARKER + bookObj.header.bookInfo.name + CRLF; + */ + SFMtext += ID_MARKER + bookObj.header.bookInfo.code + ' ' + 
bookObj.header.projectName + CRLF; + bookObj.header.markers.forEach(function(marker) { + const text = marker.text ? marker.text : ''; + switch(marker.type) { + case "header" : + SFMtext += HEADER_MARKER + text + CRLF; + break; + case "toc1" : + SFMtext += TOC1_MARKER + text + CRLF; + break; + case "toc2" : + SFMtext += TOC2_MARKER + text + CRLF; + break; + case "toc3" : + SFMtext += TOC3_MARKER + text + CRLF; + break; + case "main_title" : + SFMtext += MAIN_TITLE_MARKER + text + CRLF; + break; + case "chapter_label" : + SFMtext += CHAPTER_LABEL_MARKER + text + CRLF; + break; + default: + throw 'Invalid type on ' + JSON.stringify(marker) + '. \nUnexpected header marker.'; + } + }) + const chapters = bookObj.content; chapters.forEach(function(chapter) { if(chapter.number != 0){ SFMtext += CHAPTER_MARKER + chapter.number + CRLF; @@ -58,6 +228,11 @@ export function convertToSFM(bookObj: books.objType, s: sfmConsole.SFMConsole) SFMtext += VERSE_MARKER + unit.number + '-' + unit.bridgeEnd + ' ' + unit.text + CRLF; } break; + case "paragraph": { + const text = unit.text != '' ? ' ' + unit.text : ''; + SFMtext += PARAGRAPH_MARKER + text + CRLF; + break; + } default: throw 'Invalid type on ' + JSON.stringify(unit) + '. \nLooking for "section" or "verse".'; } @@ -74,3 +249,31 @@ export function convertToSFM(bookObj: books.objType, s: sfmConsole.SFMConsole) const padZero = bookObj.header.bookInfo.num < 10 ? '0': ''; fs.writeFileSync('./' + padZero + bookNum + bookCode + projectName + '.SFM', SFMtext); } + +/** + * Parse a JSON file and converts it to TSV. 
Only writing verses out + * @param {Books.objType} bookObj - a book type of JSON object + * @param {string} filepath - the original filename + */ +export function convertToTSV(bookObj: books.objType, filepath: string) { + const CRLF = "\n"; + + const chapters = bookObj.content; + + let TSVtext = ""; + + chapters.forEach(function(chapter) { + if(chapter.number != 0) { + if(chapter.content){ + chapter.content.forEach(v => { + if (v.type == "verse") { + TSVtext += v.number + '\t' + v.text + CRLF; + } + }); + } + } + }); + + const padZero = bookObj.header.bookInfo.num < 10 ? '0': ''; + fs.writeFileSync('./' + padZero + filepath + '.TSV', TSVtext); +} diff --git a/src/toolbox.ts b/src/toolbox.ts index 80f88d6..7e679e4 100644 --- a/src/toolbox.ts +++ b/src/toolbox.ts @@ -12,7 +12,7 @@ import * as sfmConsole from './sfmConsole.js'; * VS_AS_VERSE - `\vs` marks verse numbers along with section headers. * Uses the state machine (actions) */ -type modeType = +export type modeType = "TX_AS_VERSE" | "VS_AS_VERSE"; @@ -33,13 +33,21 @@ type actionType = */ export type markerType = // These are processed - "\\tx" | - "\\vs" | + "\\tx" | + "\\vs" | + "\\v" | + "\\s" | + "\\p" | + "\\h" | + "\\toc3" | + "\\toc1" | + "\\toc2" | + "\\mt" | + "\\cl" | // These are ignored "\\_sh" | "\\c" | - "\\cl" | "\\ft" | "\\gl" | "\\ref" | @@ -111,8 +119,13 @@ export function getVerseBridge(line: string, verseNum: number) : bridgeType { */ export function getBookAndChapter(file: string) : fileInfoType { const filename = path.parse(file).base; + const pattern = /([0-9A-Za-z]+)_(Ch|ch)?(\d+)[_\s]?.*\.txt/; const match = filename.match(pattern); + + const patternSFM = /([0-9]{2})([0-9A-Za-z]{3}).+\.(SFM|sfm)/; + const matchSFM = filename.match(patternSFM); + const obj: fileInfoType = { bookName: "Placeholder", chapterNumber: 0 @@ -124,6 +137,14 @@ export function getBookAndChapter(file: string) : fileInfoType { obj.bookName = bookName; obj.chapterNumber = parseInt(match[3]); } + // Attempt to parse 
SFM file name + } else if (matchSFM) { + const bookCode = matchSFM[2] as books.CodeType; + const bookName = books.getBookByCode(bookCode).name; + if (bookName !== "Placeholder") { + obj.bookName = bookName; + obj.chapterNumber = 1; // Special dataset to put everything into chapter 1 + } } else { console.warn('Unable to determine info from: ' + filename); } @@ -145,7 +166,8 @@ export function initializeBookObj(bookName: string, projectName: string) : books const bookObj : books.objType = { "header": { "projectName" : projectName, - "bookInfo" : bookType + "bookInfo" : bookType, + "markers": [], }, "content": [] }; @@ -186,9 +208,9 @@ export function updateObj(bookObj: books.objType, file: string, currentChapter: // Determine the mode of how to process the file let mode: modeType = 'TX_AS_VERSE'; - const modePattern = /\\vs\s+\d+/; + const modePatternVS = /\\vs\s+\d+/; toolboxData.every(line => { - if (line.match(modePattern)) { + if (line.match(modePatternVS)) { // Change mode and break out mode = 'VS_AS_VERSE'; return false; @@ -199,7 +221,7 @@ export function updateObj(bookObj: books.objType, file: string, currentChapter: // Split each line on marker and content const markerPattern = /(\\_?[A-Za-z]+)\s?(.*)/; - let verseNum = 1; // Keep track of the current verse to write + let verseNum = 2; // Keep track of the current verse to write. This may need to revert to 1 let action : actionType = 'START'; let section_title_written = false; toolboxData.forEach(line => {