From cf67d369acc68abb7bbcf45090af8a31b38361ff Mon Sep 17 00:00:00 2001 From: Andrew Myers Date: Wed, 18 Dec 2024 15:02:17 -0500 Subject: [PATCH] Forces encoding of XML to UTF-8 prior to converting to JSON Fixes #2894. --- app/controllers/api_controller.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/app/controllers/api_controller.rb b/app/controllers/api_controller.rb index 33f37a0010..713f274e32 100644 --- a/app/controllers/api_controller.rb +++ b/app/controllers/api_controller.rb @@ -55,6 +55,15 @@ def show # escape double quotes (because they may appear in node values) xml = xml.gsub(%(\"), %(\\\")) + # Non-ASCII characters that are valid UTF-8 will throw an error during + # the translation process. Since we want UTF-8 in and out, forcing + # encoding to UTF-8 here should alleviate issues where a multibyte + # char is not recognized. However, if non-UTF-8 encodings are being + # used, then this still may error, and we need to re-open the discussion + # about how/whether to support other encodings, which would have to be + # stored/read within the PBCore documents themselves, I would think. + xml.force_encoding('UTF-8') + json = pbcore_xml_to_json_xsl_doc.transform(Nokogiri::XML(xml)) render json: JSON.pretty_generate( JSON.parse(json)