Skip to content

Commit

Permalink
refactor: Move sfm logic
Browse files Browse the repository at this point in the history
  • Loading branch information
darcywong00 committed Dec 9, 2023
1 parent 08dc8ff commit dc5b494
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 58 deletions.
16 changes: 6 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

# sfm-utils
Utilities to parse text files (e.g. from Toolbox) and migrate SFM markers into a format suitable for Paratext.
Each book input is written to individual .sfm files.
Utilities to parse book translations in SFM text files (.txt, .rtf, .sfm) into JSON objects, and then write the books out either as SFM suitable for Paratext or as .tsv files.
When a directory is processed, each book is written to its own .sfm file.

Assumptions:
* Each text file is for a single chapter of a book
Expand All @@ -13,15 +13,16 @@ Note for developers: Replace `sfm-utils.exe` references with `node dist/index.js

Command-line
```bash
Usage: sfm-utils.exe -p p_arg [-t t_arg | -d d_arg | -j j_arg | -s s_arg]
Usage: sfm-utils.exe -p p_arg [-f f_arg | -t t_arg | -d d_arg | -j j_arg | -s s_arg]
```

Parameters
```bash
Required
-p [Paratext project name (can be 3-character abbreviation)]

Optional - one of:
Optional for processing txt or sfm files - one of:
-f [A single SFM file (can be an entire book)]
-t [A single Toolbox text file (one chapter of a book)]
-d [Directory of Toolbox text files for a single book (one chapter per file)]
-j [JSON file representing a single book - used for testing conversion to SFM]
Expand All @@ -34,11 +35,6 @@ Parameters
```

### Help
Obtaining the sfm-utils version:
```bash
sfm-utils.exe --version
```

For additional help:
```bash
sfm-utils.exe -h
Expand All @@ -49,7 +45,7 @@ sfm-utils.exe -h

## Developer Setup
These utilities require Git, Node.js, and TypeScript (installed locally).
Back translations in .rtf text files will also need UnRTF installed for converting the Rich Text format.
Processing back translations in .rtf text files also requires UnRTF to be installed to convert the Rich Text Format (this conversion only works on Linux).

### Install Git
Download and install Git
Expand Down
17 changes: 7 additions & 10 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -327,14 +327,6 @@ function processText(filepath: string, bookObj: books.objType): books.objType {

//valid JSON Object to SFM
sfm.convertToSFM(bookObj, s);
} else if (options.sfm && bookObj.header.bookInfo.code !== "000") {
const basename = path.parse(path.basename(filepath)).name;

// For testing, write out book JSON Object
writeJSON(bookObj, basename + '.json');

// Write JSON Object to TSV file
sfm.convertToTSV(bookObj, basename);
}

return bookObj;
Expand Down Expand Up @@ -392,8 +384,13 @@ function processSFMText(filepath: string, bookObj: books.objType): books.objType
// For testing, write out book JSON Object
writeJSON(bookObj, basename + '.json');

//valid JSON Object to SFM
sfm.convertToSFM(bookObj, s);
if (bookObj.header.bookInfo.code == 'XXE') {
// Special SFM file written to TSV
sfm.convertToTSV(bookObj, basename);
} else {
//valid JSON Object to SFM
sfm.convertToSFM(bookObj, s);
}
}

return bookObj;
Expand Down
19 changes: 13 additions & 6 deletions src/sfm.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
// Copyright 2022 SIL International
// Utilities for converting a JSON file to USFM
// Utilities for converting from USFM to JSON, and JSON to USFM or TSV
import * as books from './books.js';
import * as toolbox from './toolbox.js';
import * as sfmConsole from './sfmConsole.js';
import * as fs from 'fs';

/**
* Regex to parse \v verse marker
* Capture group 1 is the verse number (one or more digits).
* Capture group 2 is the verse text: everything after the whitespace that
* follows the verse number (at least one character is required).
* The pattern is not anchored, so it matches a \v marker anywhere in the line.
*/
export const V_PATTERN = /\\v\s+(\d+)\s+(.+)/;


/**
* Parse an SFM text file and modify the corresponding
* book Object containing the chapter information
Expand All @@ -21,7 +27,6 @@ export function updateObj(bookObj: books.objType, file: string,
let sfmFile = fs.readFileSync(file, 'utf-8');
sfmFile = sfmFile.replace(/(\r?\n){2,}/g, '\r\n');
const sfmData = sfmFile.split(/\r?\n/);
const SECTION_TITLE = 'title.';
let section_title_written = false;
if (sfmData[sfmData.length - 1] == '') {
// If last line empty, remove it
Expand Down Expand Up @@ -110,7 +115,7 @@ export function updateObj(bookObj: books.objType, file: string,
bookObj.content[currentChapter].content.push(unit);
break;
case '\\v' : {
const vPatternMatch = line.trim().match(toolbox.V_PATTERN);
const vPatternMatch = line.trim().match(V_PATTERN);
if (vPatternMatch) {
verseNum = parseInt(vPatternMatch[1]);

Expand Down Expand Up @@ -160,7 +165,7 @@ export function convertToSFM(bookObj: books.objType, s: sfmConsole.SFMConsole)

let SFMtext = "";

// These were the initial header
// These were the initial headers written
/*
SFMtext += ID_MARKER + bookObj.header.bookInfo.code + ' ' + bookObj.header.projectName + CRLF;
SFMtext += USFM_MARKER + '3.0' + CRLF;
Expand Down Expand Up @@ -246,7 +251,7 @@ export function convertToSFM(bookObj: books.objType, s: sfmConsole.SFMConsole)
}

/**
* Parse a JSON file and converts it to TSV
* Parses a JSON book object and converts it to TSV. Only verses are written out.
* @param {Books.objType} bookObj - a book type of JSON object
* @param {string} filepath - the original filename
*/
Expand All @@ -261,7 +266,9 @@ export function convertToTSV(bookObj: books.objType, filepath: string) {
if(chapter.number != 0) {
if(chapter.content){
chapter.content.forEach(v => {
TSVtext += v.number + '\t' + v.text + CRLF;
if (v.type == "verse") {
TSVtext += v.number + '\t' + v.text + CRLF;
}
});
}
}
Expand Down
35 changes: 3 additions & 32 deletions src/toolbox.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ import * as sfmConsole from './sfmConsole.js';
*/
export type modeType =
"TX_AS_VERSE" |
"VS_AS_VERSE" |
"V_AS_VERSE";
"VS_AS_VERSE";

/**
* States for VS_AS_VERSE processing mode
Expand Down Expand Up @@ -72,8 +71,6 @@ export interface bridgeType {
*/
export const VS_PATTERN = /\\vs\s+\*?(\d+|\(?section title\)?|\(?section heading\)?|\(\d+-\d+\)|\[\d+-\d+\]|\d+-\d+)\s?\(?([a-z])?\)?\??.*/;

export const V_PATTERN = /\\v\s+(\d+)\s+(.+)/;

/**
* Regex to parse all the variations of verse bridges to extract verse ranges
* (13-14)
Expand Down Expand Up @@ -146,9 +143,8 @@ export function getBookAndChapter(file: string) : fileInfoType {
const bookName = books.getBookByCode(bookCode).name;
if (bookName !== "Placeholder") {
obj.bookName = bookName;
// obj.chapterNumber TODO
obj.chapterNumber = 1; // Special dataset to put everything into chapter 1
}
obj.chapterNumber = 1; // Special dataset to put everything into chapter 1
} else {
console.warn('Unable to determine info from: ' + filename);
}
Expand Down Expand Up @@ -213,24 +209,19 @@ export function updateObj(bookObj: books.objType, file: string, currentChapter:
// Determine the mode of how to process the file
let mode: modeType = 'TX_AS_VERSE';
const modePatternVS = /\\vs\s+\d+/;
const modePatternV = /\\v\s+\d+.*/;
toolboxData.every(line => {
if (line.match(modePatternVS)) {
// Change mode and break out
mode = 'VS_AS_VERSE';
return false;
} else if (line.match(modePatternV)) {
// Change mode and break out
mode = 'V_AS_VERSE';
return false;
}
// Continue the every() loop
return true;
});

// Split each line on marker and content
const markerPattern = /(\\_?[A-Za-z]+)\s?(.*)/;
let verseNum = 2; // Keep track of the current verse to write
let verseNum = 2; // Keep track of the current verse to write. This may need to revert to 1
let action : actionType = 'START';
let section_title_written = false;
toolboxData.forEach(line => {
Expand Down Expand Up @@ -423,26 +414,6 @@ export function updateObj(bookObj: books.objType, file: string, currentChapter:
section_title_written = true;
break;
}
} else if (mode == 'V_AS_VERSE') {
if (marker != "\\tx" && marker != "\\v") {
// Skip all other markers for now
return;
}
if (marker == '\\v') {
const vsPatternMatch = line.trim().match(V_PATTERN);
if(vsPatternMatch){
verseNum = parseInt(vsPatternMatch[1]);

unit.type = "verse";
unit.number = verseNum;
unit.text = vsPatternMatch[2];
bookObj.content[currentChapter].content.push(unit);
} else {
// Skip unrecognized \vs line
s.log('warn', `${bookObj.header.bookInfo.name} ch ${currentChapter}: Skipping unrecognized line "${line}".`);
return;
}
}
}
} else {
if (lineMatch && lineMatch[2] != '') {
Expand Down

0 comments on commit dc5b494

Please sign in to comment.