Skip to content

Commit

Permalink
Added tagged_titlecase function and -t option to build-title
Browse files Browse the repository at this point in the history
  • Loading branch information
drgrigg committed Dec 15, 2023
1 parent 6a06d6c commit b64d99b
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 2 deletions.
7 changes: 7 additions & 0 deletions se/commands/build_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@ def build_title(plain_output: bool) -> int:
parser = argparse.ArgumentParser(description="Generate the title of an XHTML file based on its headings and update the file’s <title> element.")
parser.add_argument("-n", "--no-newline", dest="newline", action="store_false", help="with --stdout, don’t end output with a newline")
parser.add_argument("-s", "--stdout", action="store_true", help="print to stdout intead of writing to the file")
parser.add_argument("-t", "--titlecase", action="store_true", help="titlecase both the title element and heading element")
parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML file, or a directory containing XHTML files")
args = parser.parse_args()

targets = se.get_target_filenames(args.targets, ".xhtml")

if args.stdout and args.titlecase:
se.print_error("The [bash]--titlecase[/] option cannot be used with the [bash]--stdout[/] option.", plain_output=plain_output)
return se.InvalidArgumentsException.code

if args.stdout and (len(targets) > 1):
se.print_error("Multiple targets or directories are only allowed without the [bash]--stdout[/] option.", plain_output=plain_output)
return se.InvalidArgumentsException.code
Expand All @@ -37,6 +42,8 @@ def build_title(plain_output: bool) -> int:
dom = se.easy_xml.EasyXmlTree(file.read())

title = se.formatting.generate_title(dom)
if args.titlecase:
title = se.formatting.titlecase(title)

if args.stdout:
if args.newline:
Expand Down
113 changes: 111 additions & 2 deletions se/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -1258,9 +1258,119 @@ def titlecase(text: str) -> str:

# Like `Will-o’-the-Wisp`
text = regex.sub(r"(?<=-)(O’|The)-", lambda result: result.group(1).lower() + "-", text)

return text

def check_badly_formed(text: str) -> bool:
    """
    Helper function to determine if the supplied string starts partway through a tag.

    INPUTS
    text: The string to inspect, possibly including angle brackets

    RETURNS
    True if a right angle bracket occurs before any left angle bracket
    (e.g. `h3>A TITLE</h3`), otherwise False
    """

    right_angle_index = text.find(">")

    # `str.find` returns -1 when the character is absent. With no `>` at all
    # there is no tag for the string to start inside of; any `<` present just
    # opens a tag that is never closed.
    if right_angle_index == -1:
        return False

    left_angle_index = text.find("<")

    # A `>` with no `<` anywhere can only be closing a tag that was opened
    # before the start of the string. Otherwise compare real positions.
    return left_angle_index == -1 or right_angle_index < left_angle_index


def remove_tags(text: str) -> str:
    """
    Remove HTML tags from a string.

    INPUTS
    text: The string including tags

    OUTPUTS
    The characters of the string that lie outside of tags, with leading
    and trailing whitespace stripped
    """

    # Walk the string one character at a time instead of using a regex, so
    # that a fragment which begins inside a tag, like `h3>A TITLE</h3`, is
    # still handled.
    kept_chars = []
    inside_tag = check_badly_formed(text)

    for char in text:
        if inside_tag:
            # Tag contents are dropped; `>` ends the tag
            if char == ">":
                inside_tag = False
        elif char == "<":
            inside_tag = True
        else:
            kept_chars.append(char)

    return "".join(kept_chars).strip()


def process_tagged_string(text: str) -> str:
    """
    Applies titlecasing to those parts of a string outside of html tags.

    The characters outside of tags are collected, titlecased as a single
    string, and then written back into their original positions; characters
    inside tags (including the angle brackets) are copied through unchanged.

    INPUTS
    text: The string including tags to be titlecased

    OUTPUTS
    A titlecased version of the tagged input string
    """

    # First pass: record, for every character, whether it lies outside a tag,
    # and collect the outside characters in order.
    in_a_tag = check_badly_formed(text)
    outside_flags = []
    outside_chars = []
    for char in text:
        if in_a_tag:
            outside_flags.append(False)
            if char == ">":
                in_a_tag = False
        elif char == "<":
            in_a_tag = True
            outside_flags.append(False)
        else:
            outside_flags.append(True)
            outside_chars.append(char)

    untagged = "".join(outside_chars)

    # Titlecase only the stripped core, then re-attach the original leading
    # and trailing whitespace. This keeps `cased` aligned character-for-character
    # with the outside-of-tag characters of `text`; indexing into a stripped
    # string shifts every letter and runs off the end when `text` has
    # whitespace outside a tag at either end.
    core = untagged.strip()
    leading = len(untagged) - len(untagged.lstrip())
    # NOTE(review): assumes titlecase() returns a string of the same length
    # as its input - TODO confirm
    cased = untagged[:leading] + titlecase(core) + untagged[leading + len(core):]

    # Second pass: rebuild the string, taking outside characters from the
    # titlecased text and tag characters from the original.
    pieces = []
    cased_index = 0
    for char, is_outside in zip(text, outside_flags):
        if is_outside:
            pieces.append(cased[cased_index])
            cased_index += 1
        else:
            pieces.append(char)

    return "".join(pieces)

def tagged_titlecase(text: str) -> str:
    """
    Titlecase a string which may include <abbr> and other tags.

    The text outside of tags is titlecased as a whole via
    process_tagged_string(), then the contents of each
    <i epub:type="se:name.*"> element are titlecased again on their own.

    INPUTS
    text: The string to titlecase

    OUTPUTS
    A titlecased version of the tagged input string
    """

    # Without any tags the plain titlecase function is all that is needed
    if "<" not in text:
        return titlecase(text)

    result = process_tagged_string(text)

    # Names of books, plays, vessels, etc. are titlecased as separate units.
    # Groups: 1 = the se:name.* suffix, 2 = any further attributes, 3 = the element contents.
    name_pattern = regex.compile(r'<i epub:type="se:name\.(.*?)"(.*?)>(.*?)</i>')

    # There may be several such names. The iterator runs over the string as it
    # was before any replacement below.
    for found in name_pattern.finditer(result):
        name_kind, other_attributes, inner = found.group(1, 2, 3)

        # Recurse, because the name may itself contain tags such as <abbr>
        recased_inner = tagged_titlecase(inner)

        rebuilt = f'<i epub:type="se:name.{name_kind}"{other_attributes}>{recased_inner}</i>'
        result = result.replace(found.group(0), rebuilt)

    return result

def make_url_safe(text: str) -> str:
"""
Return a URL-safe version of the input. For example, the string "Mother's Day" becomes "mothers-day".
Expand Down Expand Up @@ -1442,7 +1552,6 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str:

if h_elements:
h_element = h_elements[0]

# Strip any endnote references first
for node in h_element.xpath("//*[contains(@epub:type, 'noteref')]"):
node.remove()
Expand Down

0 comments on commit b64d99b

Please sign in to comment.