diff --git a/se/commands/build_title.py b/se/commands/build_title.py
index 83630710..1bf8127a 100644
--- a/se/commands/build_title.py
+++ b/se/commands/build_title.py
@@ -17,11 +17,16 @@ def build_title(plain_output: bool) -> int:
parser = argparse.ArgumentParser(description="Generate the title of an XHTML file based on its headings and update the file’s
element.")
parser.add_argument("-n", "--no-newline", dest="newline", action="store_false", help="with --stdout, don’t end output with a newline")
parser.add_argument("-s", "--stdout", action="store_true", help="print to stdout intead of writing to the file")
+ parser.add_argument("-t", "--titlecase", action="store_true", help="titlecase both the title element and heading element")
parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML file, or a directory containing XHTML files")
args = parser.parse_args()
targets = se.get_target_filenames(args.targets, ".xhtml")
+ if args.stdout and args.titlecase:
+ se.print_error("The [bash]--titlecase[/] option cannot be used with the [bash]--stdout[/] option.", plain_output=plain_output)
+ return se.InvalidArgumentsException.code
+
if args.stdout and (len(targets) > 1):
se.print_error("Multiple targets or directories are only allowed without the [bash]--stdout[/] option.", plain_output=plain_output)
return se.InvalidArgumentsException.code
@@ -37,6 +42,8 @@ def build_title(plain_output: bool) -> int:
dom = se.easy_xml.EasyXmlTree(file.read())
title = se.formatting.generate_title(dom)
+ if args.titlecase:
+ title = se.formatting.titlecase(title)
if args.stdout:
if args.newline:
diff --git a/se/formatting.py b/se/formatting.py
index 9335ba77..f0b86782 100644
--- a/se/formatting.py
+++ b/se/formatting.py
@@ -1258,9 +1258,119 @@ def titlecase(text: str) -> str:
# Like `Will-o’-the-Wisp`
text = regex.sub(r"(?<=-)(O’|The)-", lambda result: result.group(1).lower() + "-", text)
-
return text
def check_badly_formed(text: str) -> bool:
    """
    Helper function to determine if the supplied string starts partway through a tag.

    INPUTS
    text: The string to inspect, normally one containing angle brackets

    RETURNS
    True if a right angle bracket occurs before any left angle bracket, otherwise False
    """

    right_angle_index = text.find(">")

    # `str.find` returns -1 when the character is absent, so the two indexes
    # cannot simply be compared: with no `>` at all there is no tag to be inside of.
    if right_angle_index == -1:
        return False

    left_angle_index = text.find("<")

    # A `>` with no `<` anywhere (like `h3>A TITLE`) also means we started inside a tag.
    return left_angle_index == -1 or right_angle_index < left_angle_index
+
+
def remove_tags(text: str) -> str:
    """
    Remove HTML tags from a string.

    INPUTS
    text: The string including tags

    OUTPUTS
    A version of the string stripped of tags, with leading and trailing whitespace removed
    """

    # Take out the tags by transcribing the string and omitting them.
    # With a well-formed string we could do this more efficiently with a regex,
    # but this way is more certain in case a string is badly formed,
    # e.g. one that begins partway through a tag like `h3>A TITLE`.
    in_a_tag = check_badly_formed(text)
    kept_characters = []

    for character in text:
        if in_a_tag:
            # Everything up to and including the closing bracket is dropped
            if character == ">":
                in_a_tag = False
        elif character == "<":
            in_a_tag = True
        else:
            kept_characters.append(character)

    return "".join(kept_characters).strip()
+
+
def process_tagged_string(text: str) -> str:
    """
    Applies titlecasing to those parts of a string outside of html tags.

    INPUTS
    text: The string including tags to be titlecased

    OUTPUTS
    A titlecased version of the tagged input string. Tags are copied through
    untouched. If titlecasing changes the length of the text, so that letters
    can no longer be mapped back to their positions, the input is returned unchanged.
    """

    # Record the position of every character that lies outside a tag
    text_positions = []
    in_a_tag = check_badly_formed(text)
    for index, character in enumerate(text):
        if in_a_tag:
            in_a_tag = character != ">"
        elif character == "<":
            in_a_tag = True
        else:
            text_positions.append(index)

    untagged = "".join(text[index] for index in text_positions)

    # Titlecase the text without its surrounding whitespace...
    stripped = untagged.strip()
    cased = titlecase(stripped)

    # The letters are written back position by position, which only works
    # if titlecasing kept a one-to-one correspondence between characters.
    if len(cased) != len(stripped):
        return text

    # ...then put that whitespace back, so that each cased character lines up
    # with the position it came from. Without this, whitespace before the first
    # word shifts every letter and the final lookups run past the end of the string.
    leading = len(untagged) - len(untagged.lstrip())
    realigned = untagged[:leading] + cased + untagged[leading + len(stripped):]

    characters = list(text)
    for index, character in zip(text_positions, realigned):
        characters[index] = character  # transfer the titlecased letter

    return "".join(characters)
+
def tagged_titlecase(text: str) -> str:
    """
    Titlecase a string which may include HTML tags.

    Calls SE titlecase on a version of the string stripped of tags,
    then re-applies the casing around those tags.

    INPUTS
    text: The string to titlecase

    OUTPUTS
    A titlecased version of the tagged input string
    """

    if "<" not in text:  # if no tags, process normally
        return titlecase(text)

    cased_string = process_tagged_string(text)  # treat it specially

    # Now we have to check for book, play, etc. titles and process them again as
    # separate units, because a title inside a title gets its own casing.
    # NOTE(review): this assumes such names are marked up as
    # `<i epub:type="se:name...">...</i>` - confirm against the markup actually used.
    # Groups: 1 = opening tag, 2 = rest of the `se:name` value, 3 = contents, 4 = closing tag.
    regex_pattern = regex.compile(r'(<i epub:type="se:name([^"]*)"[^>]*>)(.*?)(</i>)')

    def _retitle(match) -> str:
        # Recursive call, because the name may contain tags of its own.
        # It terminates because the contents are always shorter than the whole match.
        titled_semantic = tagged_titlecase(match.group(3))
        return f"{match.group(1)}{titled_semantic}{match.group(4)}"

    # A single substitution pass handles every such name in the string
    return regex_pattern.sub(_retitle, cased_string)
+
def make_url_safe(text: str) -> str:
"""
Return a URL-safe version of the input. For example, the string "Mother's Day" becomes "mothers-day".
@@ -1442,7 +1552,6 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str:
if h_elements:
h_element = h_elements[0]
-
# Strip any endnote references first
for node in h_element.xpath("//*[contains(@epub:type, 'noteref')]"):
node.remove()