Added tagged_titlecase function and -t option to build-title

standardebooks · Dec 15, 2023 · b64d99b · b64d99b
1 parent 6a06d6c
commit b64d99b
Show file tree

Hide file tree

Showing 2 changed files with 118 additions and 2 deletions.
diff --git a/se/commands/build_title.py b/se/commands/build_title.py
@@ -17,11 +17,16 @@ def build_title(plain_output: bool) -> int:
 	parser = argparse.ArgumentParser(description="Generate the title of an XHTML file based on its headings and update the file’s <title> element.")
 	parser.add_argument("-n", "--no-newline", dest="newline", action="store_false", help="with --stdout, don’t end output with a newline")
 	parser.add_argument("-s", "--stdout", action="store_true", help="print to stdout intead of writing to the file")
+	parser.add_argument("-t", "--titlecase", action="store_true", help="titlecase both the title element and heading element")
 	parser.add_argument("targets", metavar="TARGET", nargs="+", help="an XHTML file, or a directory containing XHTML files")
 	args = parser.parse_args()
 
 	targets = se.get_target_filenames(args.targets, ".xhtml")
 
+	if args.stdout and args.titlecase:
+		se.print_error("The [bash]--titlecase[/] option cannot be used with the [bash]--stdout[/] option.", plain_output=plain_output)
+		return se.InvalidArgumentsException.code		
+
 	if args.stdout and (len(targets) > 1):
 		se.print_error("Multiple targets or directories are only allowed without the [bash]--stdout[/] option.", plain_output=plain_output)
 		return se.InvalidArgumentsException.code
@@ -37,6 +42,8 @@ def build_title(plain_output: bool) -> int:
 				dom = se.easy_xml.EasyXmlTree(file.read())
 
 				title = se.formatting.generate_title(dom)
+				if args.titlecase:
+					title = se.formatting.titlecase(title)
 
 				if args.stdout:
 					if args.newline:

diff --git a/se/formatting.py b/se/formatting.py
@@ -1258,9 +1258,119 @@ def titlecase(text: str) -> str:
 
 	# Like `Will-o’-the-Wisp`
 	text = regex.sub(r"(?<=-)(O’|The)-", lambda result: result.group(1).lower() + "-", text)
-
 	return text
 
+def check_badly_formed(text: str) -> bool:
+	"""
+	Helper function to determine if the supplied string starts partway through a tag.
+
+	INPUTS
+	text: The string to inspect, possibly containing angle brackets
+
+	OUTPUTS
+	True if a `>` occurs before any `<` (i.e. the string opens inside a tag), otherwise False
+	"""
+
+	right_angle_index = text.find(">")
+
+	# `str.find()` returns -1 when there is no match. Without any `>` there is no tag
+	# to close, so the string cannot start inside one. Comparing the raw -1 against the
+	# index of `<` would wrongly report True for input like `A TITLE</h3`.
+	if right_angle_index == -1:
+		return False
+
+	left_angle_index = text.find("<")
+
+	# A `>` with no `<` at all (like `h3>A TITLE`) also closes a tag that was opened
+	# before the start of the string.
+	return left_angle_index == -1 or right_angle_index < left_angle_index
+
+
+def remove_tags(text: str) -> str:
+	"""
+	Remove HTML tags from a string.
+
+	Everything from a `<` up to and including the next `>` is dropped. The string is
+	scanned character by character instead of with a regex so that a fragment that
+	starts or ends partway through a tag, like `h3>A TITLE</h3`, is still handled.
+
+	INPUTS
+	text: The string including tags
+
+	OUTPUTS
+	A version of the string stripped of tags, with leading and trailing whitespace removed
+	"""
+
+	kept_chars = []
+
+	# The string may already be inside a tag at its very first character
+	in_a_tag = check_badly_formed(text)
+
+	for char in text:
+		if in_a_tag:
+			# Skip everything up to and including the closing angle bracket
+			if char == ">":
+				in_a_tag = False
+		elif char == "<":
+			in_a_tag = True
+		else:
+			kept_chars.append(char)
+
+	return "".join(kept_chars).strip()
+
+
+def process_tagged_string(text: str) -> str:
+	"""
+	Applies titlecasing to those parts of a string outside of HTML tags.
+
+	The characters outside of tags are collected, stripped, and passed to `titlecase()`
+	as one string; the result is then written back over the original characters,
+	position by position. Tags, and any whitespace surrounding the text, are left untouched.
+
+	INPUTS
+	text: The string including tags to be titlecased
+
+	OUTPUTS
+	A titlecased version of the tagged input string
+	"""
+
+	output_chars = list(text)
+	plain_chars = []
+	plain_positions = []  # Index in `text` of each character in `plain_chars`
+
+	# The string may already be inside a tag at its very first character
+	in_a_tag = check_badly_formed(text)
+
+	for position, char in enumerate(text):
+		if in_a_tag:
+			if char == ">":
+				in_a_tag = False
+		elif char == "<":
+			in_a_tag = True
+		else:
+			plain_chars.append(char)
+			plain_positions.append(position)
+
+	plain_text = "".join(plain_chars)
+
+	# Only the stripped text is titlecased, so remember how much leading whitespace was
+	# skipped; otherwise every cased letter lands too early and the final lookups run
+	# past the end of the cased string.
+	core_text = plain_text.strip()
+	leading_whitespace = len(plain_text) - len(plain_text.lstrip())
+
+	cased = titlecase(core_text)
+
+	# NOTE(review): this assumes `titlecase()` returns a string of the same length as its
+	# input - confirm. If it returns a longer one the excess is ignored, and if it returns
+	# a shorter one the remaining characters keep their original case.
+	for offset, cased_char in enumerate(cased[:len(core_text)]):
+		output_chars[plain_positions[leading_whitespace + offset]] = cased_char
+
+	return "".join(output_chars)
+
+def tagged_titlecase(text: str) -> str:
+	"""
+	Titlecase a string which includes <abbr> and other tags.
+
+	Calls SE titlecase on a version of the string stripped of tags,
+	then re-applies the casing around those tags. The contents of any
+	`<i epub:type="se:name.*">` element are then titlecased again as a separate unit.
+
+	INPUTS
+	text: The string to titlecase
+
+	OUTPUTS
+	A titlecased version of the tagged input string
+	"""
+
+	if "<" not in text:  # No tags, process normally
+		return titlecase(text)
+
+	cased_string = process_tagged_string(text)
+
+	def retitle_name(match) -> str:
+		# Recurse because the name may contain tags of its own, such as <abbr>
+		titled_semantic = tagged_titlecase(match.group(3))
+		return f'<i epub:type="se:name.{match.group(1)}"{match.group(2)}>{titled_semantic}</i>'
+
+	# This pattern looks for book/play/vessel etc. names, which have to be processed again
+	# as separate units. There may be more than one; `regex.sub()` calls `retitle_name()`
+	# once per match and rewrites only the matched span.
+	return regex.sub(r'<i epub:type="se:name\.(.*?)"(.*?)>(.*?)</i>', retitle_name, cased_string)
+
 def make_url_safe(text: str) -> str:
 	"""
 	Return a URL-safe version of the input. For example, the string "Mother's Day" becomes "mothers-day".
@@ -1442,7 +1552,6 @@ def generate_title(xhtml: Union[str, EasyXmlTree]) -> str:
 
 		if h_elements:
 			h_element = h_elements[0]
-
 			# Strip any endnote references first
 			for node in h_element.xpath("//*[contains(@epub:type, 'noteref')]"):
 				node.remove()