From c68449a65cb594f4cc449795ad4cc057eb30856f Mon Sep 17 00:00:00 2001 From: rmhowe425 <45905457+rmhowe425@users.noreply.github.com> Date: Tue, 11 Jul 2023 16:52:05 -0400 Subject: [PATCH] DEPR: Remove literal string input for read_xml (#53809) * Updating documentation and adding deprecation logic for read_xml. * Fixing documentation issue and adding unit test * Updating unit tests and documentation. * Fixing unit tests and documentation issues * Fixing unit tests and documentation issues * Fixing unit tests and documentation issues * Fixing import error in documentation * Updated deprecation logic per reviewer recommendations. * Updating deprecation logic and documentation per reviewer recommendations. * Fixing logic error * Fixing implementation per reviewer recommendations. * Updating implementation per reviewer recommendations. * Cleaning up the deprecation logic a bit. * Updating implementation per reviewer recommendations. * Updating unit tests * Fixing discrepancy in doc string. * Updating implementation based on reviewer recommendations. --- doc/source/user_guide/io.rst | 13 +-- doc/source/whatsnew/v1.5.0.rst | 3 +- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/io/xml.py | 30 +++++- pandas/tests/io/xml/test_xml.py | 128 +++++++++++++++---------- pandas/tests/io/xml/test_xml_dtypes.py | 30 +++--- 6 files changed, 133 insertions(+), 72 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ec0e7d0636b07..4d4b9e086e9e5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2919,6 +2919,7 @@ Read an XML string: .. ipython:: python + from io import StringIO xml = """ @@ -2941,7 +2942,7 @@ Read an XML string: """ - df = pd.read_xml(xml) + df = pd.read_xml(StringIO(xml)) df Read a URL with no options: @@ -2961,7 +2962,7 @@ as a string: f.write(xml) with open(file_path, "r") as f: - df = pd.read_xml(f.read()) + df = pd.read_xml(StringIO(f.read())) df Read in the content of the "books.xml" as instance of ``StringIO`` or @@ -3052,7 +3053,7 @@ For example, below XML contains a namespace with prefix, ``doc``, and URI at """ - df = pd.read_xml(xml, + df = pd.read_xml(StringIO(xml), xpath="//doc:row", namespaces={"doc": "https://example.com"}) df @@ -3082,7 +3083,7 @@ But assigning *any* temporary name to correct URI allows parsing by nodes. """ - df = pd.read_xml(xml, + df = pd.read_xml(StringIO(xml), xpath="//pandas:row", namespaces={"pandas": "https://example.com"}) df @@ -3117,7 +3118,7 @@ However, if XPath does not reference node names such as default, ``/*``, then """ - df = pd.read_xml(xml, xpath="./row") + df = pd.read_xml(StringIO(xml), xpath="./row") df shows the attribute ``sides`` on ``shape`` element was not parsed as @@ -3218,7 +3219,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``: """ - df = pd.read_xml(xml, stylesheet=xsl) + df = pd.read_xml(StringIO(xml), stylesheet=xsl) df For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml` diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9653226b96196..44728e7e552ab 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -221,6 +221,7 @@ apply converter methods, and parse dates (:issue:`43567`). .. ipython:: python + from io import StringIO xml_dates = """ @@ -244,7 +245,7 @@ apply converter methods, and parse dates (:issue:`43567`). """ df = pd.read_xml( - xml_dates, + StringIO(xml_dates), dtype={'sides': 'Int64'}, converters={'degrees': str}, parse_dates=['date'] diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6fcddad70f22b..7450fc6fdc1da 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -313,6 +313,7 @@ Deprecations - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`) - Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`) +- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`) - Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`) - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index bb165c4724022..6421f710f80d6 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -11,6 +11,7 @@ Any, Callable, ) +import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -19,6 +20,7 @@ ParserError, ) from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -29,6 +31,7 @@ file_exists, get_handle, infer_compression, + is_file_like, is_fsspec_url, is_url, stringify_path, @@ -802,6 +805,22 @@ def _parse( p: _EtreeFrameParser | _LxmlFrameParser + if isinstance(path_or_buffer, str) and not any( + [ + is_file_like(path_or_buffer), + file_exists(path_or_buffer), + is_url(path_or_buffer), + is_fsspec_url(path_or_buffer), + ] + ): + warnings.warn( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if parser == "lxml": lxml = import_optional_dependency("lxml.etree", errors="ignore") @@ -894,6 +913,10 @@ def read_xml( string or a path. The string can further be a URL. Valid URL schemes include http, ftp, s3, and file. + .. deprecated:: 2.1.0 + Passing xml literal strings is deprecated. + Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead. + xpath : str, optional, default './\*' The XPath to parse required set of nodes for migration to DataFrame. XPath should return a collection of elements and not a single @@ -1049,6 +1072,7 @@ def read_xml( Examples -------- + >>> import io >>> xml = ''' ... ... @@ -1068,7 +1092,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(xml) + >>> df = pd.read_xml(io.StringIO(xml)) >>> df shape degrees sides 0 square 360 4.0 @@ -1082,7 +1106,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(xml, xpath=".//row") + >>> df = pd.read_xml(io.StringIO(xml), xpath=".//row") >>> df shape degrees sides 0 square 360 4.0 @@ -1108,7 +1132,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(xml, + >>> df = pd.read_xml(io.StringIO(xml), ... xpath="//doc:row", ... namespaces={{"doc": "https://example.com"}}) >>> df diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index a3a1646bc4748..1a64d9910d8bf 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -246,6 +246,19 @@ ) +@td.skip_if_no("lxml") +def test_literal_xml_deprecation(): + # GH 53809 + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + read_xml(xml_default_nmsp) + + @pytest.fixture(params=["rb", "r"]) def mode(request): return request.param @@ -300,7 +313,7 @@ def test_parser_consistency_file(xml_books): def test_parser_consistency_url(parser, httpserver): httpserver.serve_content(content=xml_default_nmsp) - df_xpath = read_xml(xml_default_nmsp, parser=parser) + df_xpath = read_xml(StringIO(xml_default_nmsp), parser=parser) df_iter = read_xml( BytesIO(xml_default_nmsp.encode()), parser=parser, @@ -353,6 +366,11 @@ def test_file_buffered_reader_string(xml_books, parser, mode): with open(xml_books, mode, encoding="utf-8" if mode == "r" else None) as f: xml_obj = f.read() + if mode == "rb": + xml_obj = StringIO(xml_obj.decode()) + elif mode == "r": + xml_obj = StringIO(xml_obj) + df_str = read_xml(xml_obj, parser=parser) df_expected = DataFrame( @@ -373,6 +391,11 @@ def test_file_buffered_reader_no_xml_declaration(xml_books, parser, mode): next(f) xml_obj = f.read() + if mode == "rb": + xml_obj = StringIO(xml_obj.decode()) + elif mode == "r": + xml_obj = StringIO(xml_obj) + df_str = read_xml(xml_obj, parser=parser) df_expected = DataFrame( @@ -391,7 +414,7 @@ def test_file_buffered_reader_no_xml_declaration(xml_books, parser, mode): def test_string_charset(parser): txt = "<中文標籤>12" - df_str = read_xml(txt, parser=parser) + df_str = read_xml(StringIO(txt), parser=parser) df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0]) @@ -449,34 +472,48 @@ def test_empty_string_lxml(val): ] ) with pytest.raises(XMLSyntaxError, match=msg): - read_xml(val, parser="lxml") + if isinstance(val, str): + read_xml(StringIO(val), parser="lxml") + else: + read_xml(BytesIO(val), parser="lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_etree(val): with pytest.raises(ParseError, match="no element found"): - read_xml(val, parser="etree") + if isinstance(val, str): + read_xml(StringIO(val), parser="etree") + else: + read_xml(BytesIO(val), parser="etree") @td.skip_if_no("lxml") def test_wrong_file_path_lxml(): - from lxml.etree import XMLSyntaxError - + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) filename = os.path.join("data", "html", "books.xml") with pytest.raises( - XMLSyntaxError, - match=("Start tag expected, '<' not found"), + FutureWarning, + match=msg, ): read_xml(filename, parser="lxml") def test_wrong_file_path_etree(): + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) filename = os.path.join("data", "html", "books.xml") with pytest.raises( - ParseError, - match=("not well-formed"), + FutureWarning, + match=msg, ): read_xml(filename, parser="etree") @@ -539,7 +576,7 @@ def test_bad_xpath_lxml(xml_books): def test_default_namespace(parser): df_nmsp = read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), xpath=".//ns:row", namespaces={"ns": "http://example.com"}, parser=parser, @@ -565,7 +602,7 @@ def test_default_namespace(parser): def test_prefix_namespace(parser): df_nmsp = read_xml( - xml_prefix_nmsp, + StringIO(xml_prefix_nmsp), xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser=parser, @@ -589,14 +626,14 @@ def test_prefix_namespace(parser): @td.skip_if_no("lxml") def test_consistency_default_namespace(): df_lxml = read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), xpath=".//ns:row", namespaces={"ns": "http://example.com"}, parser="lxml", ) df_etree = read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="etree", @@ -608,14 +645,14 @@ def test_consistency_default_namespace(): @td.skip_if_no("lxml") def test_consistency_prefix_namespace(): df_lxml = read_xml( - xml_prefix_nmsp, + StringIO(xml_prefix_nmsp), xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="lxml", ) df_etree = read_xml( - xml_prefix_nmsp, + StringIO(xml_prefix_nmsp), xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="etree", @@ -652,7 +689,7 @@ def test_none_namespace_prefix(key): TypeError, match=("empty namespace prefix is not supported in XPath") ): read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), xpath=".//kml:Placemark", namespaces={key: "http://www.opengis.net/kml/2.2"}, parser="lxml", @@ -741,7 +778,7 @@ def test_empty_attrs_only(parser): ValueError, match=("xpath does not return any nodes or attributes"), ): - read_xml(xml, xpath="./row", attrs_only=True, parser=parser) + read_xml(StringIO(xml), xpath="./row", attrs_only=True, parser=parser) def test_empty_elems_only(parser): @@ -756,7 +793,7 @@ def test_empty_elems_only(parser): ValueError, match=("xpath does not return any nodes or attributes"), ): - read_xml(xml, xpath="./row", elems_only=True, parser=parser) + read_xml(StringIO(xml), xpath="./row", elems_only=True, parser=parser) @td.skip_if_no("lxml") @@ -781,8 +818,8 @@ def test_attribute_centric_xml(): """ - df_lxml = read_xml(xml, xpath=".//station") - df_etree = read_xml(xml, xpath=".//station", parser="etree") + df_lxml = read_xml(StringIO(xml), xpath=".//station") + df_etree = read_xml(StringIO(xml), xpath=".//station", parser="etree") df_iter_lx = read_xml_iterparse(xml, iterparse={"station": ["Name", "coords"]}) df_iter_et = read_xml_iterparse( @@ -834,7 +871,10 @@ def test_repeat_names(parser): """ df_xpath = read_xml( - xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"] + StringIO(xml), + xpath=".//shape", + parser=parser, + names=["type_dim", "shape", "type_edge"], ) df_iter = read_xml_iterparse( @@ -876,7 +916,9 @@ def test_repeat_values_new_names(parser): ellipse """ - df_xpath = read_xml(xml, xpath=".//shape", parser=parser, names=["name", "group"]) + df_xpath = read_xml( + StringIO(xml), xpath=".//shape", parser=parser, names=["name", "group"] + ) df_iter = read_xml_iterparse( xml, @@ -919,7 +961,7 @@ def test_repeat_elements(parser): """ df_xpath = read_xml( - xml, + StringIO(xml), xpath=".//shape", parser=parser, names=["name", "family", "degrees", "sides"], @@ -1154,8 +1196,8 @@ def test_style_charset(): """ - df_orig = read_xml(xml) - df_style = read_xml(xml, stylesheet=xsl) + df_orig = read_xml(StringIO(xml)) + df_style = read_xml(StringIO(xml), stylesheet=xsl) tm.assert_frame_equal(df_orig, df_style) @@ -1287,30 +1329,18 @@ def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc): @td.skip_if_no("lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_stylesheet(val): - from lxml.etree import XMLSyntaxError - + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) kml = os.path.join("data", "xml", "cta_rail_lines.kml") - with pytest.raises( - XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found") - ): + with pytest.raises(FutureWarning, match=msg): read_xml(kml, stylesheet=val) # ITERPARSE - - -def test_string_error(parser): - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - read_xml( - xml_default_nmsp, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) - - def test_file_like_iterparse(xml_books, parser, mode): with open(xml_books, mode, encoding="utf-8" if mode == "r" else None) as f: if mode == "r" and parser == "lxml": @@ -1492,7 +1522,7 @@ def test_comment(parser): """ - df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser) df_iter = read_xml_iterparse( xml, parser=parser, iterparse={"shape": ["name", "type"]} @@ -1528,7 +1558,7 @@ def test_dtd(parser): """ - df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser) df_iter = read_xml_iterparse( xml, parser=parser, iterparse={"shape": ["name", "type"]} @@ -1564,7 +1594,7 @@ def test_processing_instruction(parser): """ - df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser) df_iter = read_xml_iterparse( xml, parser=parser, iterparse={"shape": ["name", "type"]} @@ -1842,7 +1872,7 @@ def test_online_stylesheet(): """ df_xsl = read_xml( - xml, + StringIO(xml), xpath=".//tr[td and position() <= 6]", names=["title", "artist"], stylesheet=xsl, @@ -1982,7 +2012,7 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): string_array_na = ArrowStringArray(pa.array(["x", None])) with pd.option_context("mode.string_storage", string_storage): - result = read_xml(data, parser=parser, dtype_backend=dtype_backend) + result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) expected = DataFrame( { diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 911b540dbc380..fb24902efc0f5 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -1,5 +1,7 @@ from __future__ import annotations +from io import StringIO + import pytest from pandas.errors import ParserWarning @@ -81,7 +83,7 @@ def read_xml_iterparse(data, **kwargs): def test_dtype_single_str(parser): - df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) + df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( xml_types, parser=parser, @@ -102,7 +104,7 @@ def test_dtype_single_str(parser): def test_dtypes_all_str(parser): - df_result = read_xml(xml_dates, dtype="string", parser=parser) + df_result = read_xml(StringIO(xml_dates), dtype="string", parser=parser) df_iter = read_xml_iterparse( xml_dates, parser=parser, @@ -126,7 +128,7 @@ def test_dtypes_all_str(parser): def test_dtypes_with_names(parser): df_result = read_xml( - xml_dates, + StringIO(xml_dates), names=["Col1", "Col2", "Col3", "Col4"], dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64[ns]"}, parser=parser, @@ -153,7 +155,7 @@ def test_dtypes_with_names(parser): def test_dtype_nullable_int(parser): - df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) + df_result = read_xml(StringIO(xml_types), dtype={"sides": "Int64"}, parser=parser) df_iter = read_xml_iterparse( xml_types, parser=parser, @@ -174,7 +176,7 @@ def test_dtype_nullable_int(parser): def test_dtype_float(parser): - df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) + df_result = read_xml(StringIO(xml_types), dtype={"degrees": "float"}, parser=parser) df_iter = read_xml_iterparse( xml_types, parser=parser, @@ -214,7 +216,7 @@ def test_both_dtype_converters(parser): with tm.assert_produces_warning(ParserWarning, match="Both a converter and dtype"): df_result = read_xml( - xml_types, + StringIO(xml_types), dtype={"degrees": "str"}, converters={"degrees": str}, parser=parser, @@ -235,7 +237,9 @@ def test_both_dtype_converters(parser): def test_converters_str(parser): - df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser) + df_result = read_xml( + StringIO(xml_types), converters={"degrees": str}, parser=parser + ) df_iter = read_xml_iterparse( xml_types, parser=parser, @@ -258,7 +262,7 @@ def test_converters_str(parser): def test_converters_date(parser): convert_to_datetime = lambda x: to_datetime(x) df_result = read_xml( - xml_dates, converters={"date": convert_to_datetime}, parser=parser + StringIO(xml_dates), converters={"date": convert_to_datetime}, parser=parser ) df_iter = read_xml_iterparse( xml_dates, @@ -305,7 +309,7 @@ def test_callable_str_converters(xml_books, parser, iterparse): def test_parse_dates_column_name(parser): - df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) + df_result = read_xml(StringIO(xml_dates), parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( xml_dates, parser=parser, @@ -327,7 +331,7 @@ def test_parse_dates_column_name(parser): def test_parse_dates_column_index(parser): - df_result = read_xml(xml_dates, parse_dates=[3], parser=parser) + df_result = read_xml(StringIO(xml_dates), parse_dates=[3], parser=parser) df_iter = read_xml_iterparse( xml_dates, parser=parser, @@ -349,7 +353,7 @@ def test_parse_dates_column_index(parser): def test_parse_dates_true(parser): - df_result = read_xml(xml_dates, parse_dates=True, parser=parser) + df_result = read_xml(StringIO(xml_dates), parse_dates=True, parser=parser) df_iter = read_xml_iterparse( xml_dates, @@ -401,7 +405,7 @@ def test_parse_dates_dictionary(parser): """ df_result = read_xml( - xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser + StringIO(xml), parse_dates={"date_end": ["year", "month", "day"]}, parser=parser ) df_iter = read_xml_iterparse( xml, @@ -459,7 +463,7 @@ def test_day_first_parse_dates(parser): with tm.assert_produces_warning( UserWarning, match="Parsing dates in %d/%m/%Y format" ): - df_result = read_xml(xml, parse_dates=["date"], parser=parser) + df_result = read_xml(StringIO(xml), parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( xml, parse_dates=["date"],