def check_badly_formed(text: str) -> bool:
    """
    Helper function to determine if the supplied string starts partway through a tag.

    INPUTS
    text: The string including angle brackets

    OUTPUTS
    True if the string contains a right angle bracket that appears BEFORE the first
    left angle bracket (for example `h3>A TITLE</h3`), otherwise False.
    A string with no right angle bracket at all, or with no left angle bracket
    at all, returns False.
    """

    right_angle_index = text.find(">")
    left_angle_index = text.find("<")

    # `str.find()` returns -1 when the character is absent. Without the explicit
    # guard, a string that has a `<` but no `>` would compare -1 against a real
    # index and be misreported as starting inside a tag.
    return right_angle_index != -1 and right_angle_index < left_angle_index


def _split_tagged_text(text: str):
    """
    Helper function to split a string into alternating runs of tag and non-tag text.

    INPUTS
    text: The string including tags; it may start partway through a tag

    OUTPUTS
    A list of `(is_tag, chunk)` tuples in document order. Tag chunks include their
    angle brackets. Joining every chunk reproduces `text` exactly.
    """

    chunks = []
    buffer = []
    in_a_tag = check_badly_formed(text)

    for char in text:
        if in_a_tag:
            buffer.append(char)
            if char == ">":
                chunks.append((True, "".join(buffer)))
                buffer = []
                in_a_tag = False
        elif char == "<":
            # A tag is opening: flush the plain text gathered so far
            if buffer:
                chunks.append((False, "".join(buffer)))
            buffer = [char]
            in_a_tag = True
        else:
            buffer.append(char)

    # Whatever is left is either trailing plain text or an unterminated tag
    if buffer:
        chunks.append((in_a_tag, "".join(buffer)))

    return chunks


def remove_tags(text: str) -> str:
    """
    Remove HTML tags from a string.

    The string is scanned by hand and not with a regex, so that badly formed
    input such as `h3>A TITLE</h3` is still handled.

    INPUTS
    text: The string including tags

    OUTPUTS
    A version of the string stripped of tags, and of leading and trailing whitespace
    """

    return "".join(chunk for is_tag, chunk in _split_tagged_text(text) if not is_tag).strip()


def process_tagged_string(text: str) -> str:
    """
    Applies titlecasing to those parts of a string outside of HTML tags.

    INPUTS
    text: The string including tags to be titlecased

    OUTPUTS
    A titlecased version of the tagged input string, with every tag left untouched
    """

    chunks = _split_tagged_text(text)
    plain = "".join(chunk for is_tag, chunk in chunks if not is_tag)

    # Titlecase only the stripped text, exactly as `remove_tags()` would supply it,
    # then put the surrounding whitespace back. This keeps the offsets used below
    # aligned with the original string when it starts or ends with whitespace.
    core = plain.strip()
    leading_length = len(plain) - len(plain.lstrip())
    cased = plain[:leading_length] + titlecase(core) + plain[leading_length + len(core):]

    output = []
    offset = 0
    for is_tag, chunk in chunks:
        if is_tag:
            output.append(chunk)
        else:
            # Hand each plain-text run its own slice of the titlecased text
            output.append(cased[offset:offset + len(chunk)])
            offset += len(chunk)

    # NOTE(review): this assumes `titlecase()` preserves the length of its input - confirm.
    # If it ever returns a longer string, the surplus is appended so that no text is lost.
    output.append(cased[offset:])

    return "".join(output)


def tagged_titlecase(text: str) -> str:
    """
    Titlecase a string which includes <abbr> and other tags.

    Calls SE titlecase on a version of the string stripped of tags,
    then re-applies the casing around those tags.

    INPUTS
    text: The string to titlecase

    OUTPUTS
    A titlecased version of the tagged input string
    """

    # If there are no tags, process normally
    if "<" not in text:
        return titlecase(text)

    cased_string = process_tagged_string(text)

    # Names of books, plays, vessels, etc. are titles in their own right, so each one
    # is titlecased again as a separate unit
    name_pattern = regex.compile(r'<i epub:type="se:name\.(.*?)"(.*?)>(.*?)</i>')

    def _retitle_name(match) -> str:
        # Keep the opening tag verbatim, and recurse on the contents because a name
        # may contain tags of its own, such as <abbr>
        opening_tag = match.group(0)[:match.start(3) - match.start()]
        return f"{opening_tag}{tagged_titlecase(match.group(3))}</i>"

    return name_pattern.sub(_retitle_name, cased_string)
For example, the string "Mother's Day" becomes "mothers-day". @@ -1442,7 +1552,6 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str: if h_elements: h_element = h_elements[0] - # Strip any endnote references first for node in h_element.xpath("//*[contains(@epub:type, 'noteref')]"): node.remove()