diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 18cafe18ae393..bfd5a7f4de1e7 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -225,6 +225,7 @@ Other enhancements - Add support for parsing ``ISO 8601``-like timestamps with negative signs to :class:`Timedelta` (:issue:`37172`) - Add support for unary operators in :class:`FloatingArray` (:issue:`38749`) - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`) +- :meth:`pandas.read_csv` now accepts an argument ``use_nullable_dtypes`` that allows reading data directly into the nullable integer and boolean data types (:issue:`36712`) - :meth:`Series.round` and :meth:`DataFrame.round` now work with nullable integer and floating dtypes (:issue:`38844`) - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`) @@ -644,8 +645,10 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) +- :class:`StringArray` now accepts nan-likes (``None``, ``nan``, ``NA``) in its constructor in addition to strings.
- Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) + Build ===== diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index c890bfbfe3b7d..58d2569ad5e4c 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -157,7 +157,7 @@ def maybe_convert_numeric( def ensure_string_array( arr, na_value: object = np.nan, - convert_na_value: bool = True, + coerce: str = "all", copy: bool = True, skipna: bool = True, ) -> np.ndarray: ... # np.ndarray[object] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 341cfe4d6fac6..55d82641db479 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -98,6 +98,7 @@ from pandas._libs.missing cimport ( is_null_timedelta64, isnaobj, ) +from pandas._libs.missing import checknull from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -682,12 +683,14 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: cpdef ndarray[object] ensure_string_array( arr, object na_value=np.nan, - bint convert_na_value=True, + coerce="all", bint copy=True, bint skipna=True, ): """ - Returns a new numpy array with object dtype and only strings and na values. + Checks that all elements in numpy array are string or null + and returns a new numpy array with object dtype + and only strings and na values if so. Otherwise, raise a ValueError. Parameters ---------- @@ -695,8 +698,16 @@ cpdef ndarray[object] ensure_string_array( The values to be converted to str, if needed. na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. - convert_na_value : bool, default True - If False, existing na values will be used unchanged in the new array. + coerce : {{'all', 'strict-null', 'null', 'non-null', None}}, default 'all' + Whether to coerce non-string elements to strings. + - 'all' will convert null values and non-null non-string values.
+ - 'strict-null' will only convert pd.NA, np.nan, or None to na_value + without converting other non-strings. + - 'null' will convert nulls to na_value w/out converting other non-strings. + - 'non-null' will only convert non-null non-string elements to string. + - None will not convert anything. + If coerce is not 'all', a ValueError will be raised for values + that are not strings or na_value. copy : bool, default True Whether to ensure that a new array is returned. skipna : bool, default True @@ -710,6 +721,7 @@ cpdef ndarray[object] ensure_string_array( """ cdef: Py_ssize_t i = 0, n = len(arr) + set strict_na_values = {C_NA, np.nan, None} if hasattr(arr, "to_numpy"): arr = arr.to_numpy() @@ -721,16 +733,27 @@ cpdef ndarray[object] ensure_string_array( if copy and result is arr: result = result.copy() + if coerce == 'strict-null': + # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid + # If they are present, they are treated like a regular Python object + # and will either cause an exception to be raised or be coerced. 
+ check_null = strict_na_values.__contains__ + else: + check_null = checknull + for i in range(n): val = arr[i] if isinstance(val, str): continue - if not checknull(val): - result[i] = str(val) + if not check_null(val): + if coerce =="all" or coerce == "non-null": + result[i] = str(val) + else: + raise ValueError("Non-string element encountered in array.") else: - if convert_na_value: + if coerce=="all" or coerce == "null" or coerce == 'strict-null': val = na_value if skipna: result[i] = val @@ -1864,8 +1887,8 @@ cdef class StringValidator(Validator): return issubclass(self.dtype.type, np.str_) cdef bint is_valid_null(self, object value) except -1: - # We deliberately exclude None / NaN here since StringArray uses NA - return value is C_NA + # Override to exclude float('Nan') and complex NaN + return value is None or value is C_NA or value is np.nan cpdef bint is_string_array(ndarray values, bint skipna=False): diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 7d7074988e5f0..4c64a0e3d8479 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -92,6 +92,7 @@ from pandas._libs.khash cimport ( kh_strbox_t, khiter_t, ) +from pandas._libs.missing cimport C_NA from pandas.errors import ( EmptyDataError, @@ -109,6 +110,13 @@ from pandas.core.dtypes.common import ( ) from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.arrays import ( + BooleanArray, + FloatingArray, + IntegerArray, +) +from pandas.core.arrays.string_ import StringDtype + cdef: float64_t INF = np.inf float64_t NEGINF = -INF @@ -311,7 +319,7 @@ cdef class TextReader: object handle object orig_header bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns - bint mangle_dupe_cols, allow_leading_cols + bint mangle_dupe_cols, allow_leading_cols, use_nullable_dtypes uint64_t parser_start # this is modified after __init__ list clocks const char *encoding_errors @@ -369,6 +377,7 @@ cdef class TextReader: bint mangle_dupe_cols=True, 
float_precision=None, bint skip_blank_lines=True, + bint use_nullable_dtypes=False, encoding_errors=b"strict"): # set encoding for native Python and C library @@ -432,6 +441,7 @@ cdef class TextReader: # consistent with csv module semantics, cast all to float dtype_order = dtype_order[1:] self.dtype_cast_order = [np.dtype(x) for x in dtype_order] + self.use_nullable_dtypes = use_nullable_dtypes if comment is not None: if len(comment) > 1: @@ -1035,7 +1045,7 @@ cdef class TextReader: # don't try to upcast EAs try_upcast = upcast_na and na_count > 0 if try_upcast and not is_extension_array_dtype(col_dtype): - col_res = _maybe_upcast(col_res) + col_res = _maybe_upcast(col_res, self.use_nullable_dtypes) if col_res is None: raise ParserError(f'Unable to parse column {i}') @@ -1328,18 +1338,53 @@ STR_NA_VALUES = { _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) -def _maybe_upcast(arr): +def _maybe_upcast(arr, use_nullable_dtypes=False): """ + Tries to upcast null values or use nullable dtypes if set to True. + + + Parameters + ---------- + arr : ndarray + Array to upcast. + use_nullable_dtypes: bool, default False + Whether to use nullable datatypes instead of upcasting. 
+ If true, then: + - int w/ NaN -> IntegerArray + - bool w/ NaN -> BooleanArray + - float w/NaN -> FloatingArray + - object(strings) w/NaN -> StringArray """ + na_value = na_values[arr.dtype] if issubclass(arr.dtype.type, np.integer): - na_value = na_values[arr.dtype] - arr = arr.astype(float) - np.putmask(arr, arr == na_value, np.nan) + mask = arr == na_value + if use_nullable_dtypes: + # only convert to integer array if not all NAN + if not mask.all(): + arr = IntegerArray(arr, mask) + else: + arr = arr.astype(float) + arr = FloatingArray(arr, mask) + else: + arr = arr.astype(float) + np.putmask(arr, mask, np.nan) elif arr.dtype == np.bool_: - mask = arr.view(np.uint8) == na_values[np.uint8] - arr = arr.astype(object) - np.putmask(arr, mask, np.nan) + mask = arr.view(np.uint8) == na_value + if use_nullable_dtypes: + arr = BooleanArray(arr, mask) + else: + arr = arr.astype(object) + np.putmask(arr, mask, np.nan) + elif use_nullable_dtypes and arr.dtype == np.float64: + mask = np.isnan(arr) + arr = FloatingArray(arr, mask) + elif use_nullable_dtypes and arr.dtype == np.object_: + # Maybe convert StringArray & catch error for non-strings + try: + arr = StringDtype.construct_array_type()._from_sequence(arr) + except ValueError as e: + pass return arr diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ab1dadf4d2dfa..4d97035714ba3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -144,11 +144,18 @@ class StringArray(PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings or :attr:`pandas.NA`. + where the elements are Python strings + or nan-likes(``None``, ``nan``, ``NA``). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. + .. 
versionchanged:: 1.3 + + StringArray now accepts nan-likes in the constructor in addition + to strings, whereas it only accepted strings and :attr:`pandas.NA` + before. + copy : bool, default False Whether to copy the array of data. @@ -208,24 +215,33 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) + if not isinstance(values, type(self)): + self._validate() # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") NDArrayBacked.__init__(self, self._ndarray, StringDtype()) - if not isinstance(values, type(self)): - self._validate() def _validate(self): """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." 
) + try: + lib.ensure_string_array( + self._ndarray, + na_value=StringDtype.na_value, + coerce="strict-null", + copy=False, + ) + except ValueError: + raise ValueError("StringArray requires a sequence of strings or pandas.NA") @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True + ): if dtype: assert dtype == "string" @@ -233,15 +249,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype + if coerce: + coerce = "non-null" + else: + coerce = None na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result = lib.ensure_string_array(result, copy=copy, coerce=coerce) result[na_values] = StringDtype.na_value else: # convert non-na-likes to str, and nan-likes to StringDtype.na_value + if coerce: + coerce = "all" + else: + coerce = "strict-null" result = lib.ensure_string_array( - scalars, na_value=StringDtype.na_value, copy=copy + scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce ) # Manually creating new array avoids the validation step in the __init__, so is diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3cf471e381da9..f0af7a8a43594 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -237,7 +237,9 @@ def __init__(self, values): ) @classmethod - def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True + ): from pandas.core.arrays.masked import BaseMaskedArray _chk_pyarrow_available() @@ -247,11 +249,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) # numerical issues with Float32Dtype na_values = 
scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + if coerce: + coerce = "non-null" + else: + coerce = None + result = lib.ensure_string_array(result, copy=copy, coerce=coerce) return cls(pa.array(result, mask=na_values, type=pa.string())) # convert non-na-likes to str - result = lib.ensure_string_array(scalars, copy=copy) + if coerce: + coerce = "all" + else: + coerce = "strict-null" + result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce) return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4abb5d98202f6..16e603cb8ffa5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1105,7 +1105,7 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) if issubclass(dtype.type, str): - return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False) + return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null") elif is_datetime64_dtype(arr): if dtype == np.int64: diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 2a86ff13a2edc..46e0875ab61ec 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -41,6 +41,7 @@ is_dict_like, is_dtype_equal, is_extension_array_dtype, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -53,7 +54,17 @@ from pandas.core.dtypes.missing import isna from pandas.core import algorithms -from pandas.core.arrays import Categorical +from pandas.core.api import ( + NA, + array as pd_array, +) +from pandas.core.arrays import ( + BooleanArray, + Categorical, + FloatingArray, + IntegerArray, +) +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -108,6 +119,7 @@ "mangle_dupe_cols": True, "infer_datetime_format": False, "skip_blank_lines": True, + "use_nullable_dtypes": False, "encoding_errors": 
"strict", "on_bad_lines": "error", } @@ -208,6 +220,7 @@ def __init__(self, kwds): self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.use_nullable_dtypes = kwds.get("use_nullable_dtypes", False) self.handles: IOHandles | None = None # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) @@ -580,10 +593,7 @@ def _convert_to_ndarrays( ) # type specified in dtype param or cast_type is an EA - if cast_type and ( - not is_dtype_equal(cvals, cast_type) - or is_extension_array_dtype(cast_type) - ): + if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea): if not is_ea and na_count > 0: try: if is_bool_dtype(cast_type): @@ -678,12 +688,12 @@ def _infer_types(self, values, na_values, try_num_bool=True): ---------- values : ndarray na_values : set - try_num_bool : bool, default try + try_num_bool : bool, default True try to cast values to numeric (first preference) or boolean Returns ------- - converted : ndarray + converted : ndarray or ExtensionArray na_count : int """ na_count = 0 @@ -694,14 +704,29 @@ def _infer_types(self, values, na_values, try_num_bool=True): na_count = mask.sum() if na_count > 0: if is_integer_dtype(values): - values = values.astype(np.float64) + if self.use_nullable_dtypes: + values = pd_array(values, dtype="Int64") + values[mask] = NA # <- This is pd.NA + return values, na_count + else: + values = values.astype(np.float64) np.putmask(values, mask, np.nan) return values, na_count if try_num_bool and is_object_dtype(values.dtype): # exclude e.g DatetimeIndex here try: - result, _ = lib.maybe_convert_numeric(values, na_values, False) + result, mask = lib.maybe_convert_numeric( + values, + na_values, + False, + convert_to_masked_nullable=self.use_nullable_dtypes, + ) + if mask is not None: + if is_integer_dtype(result): + result = IntegerArray(result, mask) + elif is_float_dtype(result): + result = FloatingArray(result, mask) except (ValueError, TypeError): # e.g. 
encountering datetime string gets ValueError # TypeError can be raised in floatify @@ -715,11 +740,26 @@ na_count = parsers.sanitize_objects(values, na_values, False) if result.dtype == np.object_ and try_num_bool: - result, _ = libops.maybe_convert_bool( + result, mask = libops.maybe_convert_bool( np.asarray(values), true_values=self.true_values, false_values=self.false_values, + convert_to_masked_nullable=self.use_nullable_dtypes, ) + if mask is not None: + result = BooleanArray(result, mask) + # Maybe StringArray? Must have NA value to trigger + # Since it is called use_nullable_dtypes after all + # However, all NA -> Float64 not StringArray + if ( + result.dtype == np.object_ + and self.use_nullable_dtypes + and 0 < na_count < len(result) + ): + try: + result = StringDtype.construct_array_type()._from_sequence(result) + except ValueError: + pass return result, na_count diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a384846b7a063..875b15dad6d6d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -375,6 +375,13 @@ .. versionchanged:: 1.2 +use_nullable_dtypes : bool, default False + If True, use dtypes that use pd.NA as missing value indicator for + the resulting DataFrame. Currently supports reading data into the nullable boolean, + integer, floating and string array types. + + .. versionadded:: 1.3 + {storage_options} ..
versionadded:: 1.2 @@ -567,6 +574,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, + use_nullable_dtypes=False, storage_options: StorageOptions = None, ): # locals() should never be modified @@ -665,6 +673,7 @@ def read_table( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, + use_nullable_dtypes=False, ): # locals() should never be modified kwds = locals().copy() @@ -884,7 +893,6 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - if sep is None and not delim_whitespace: if engine == "c": fallback_reason = ( diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c9533e239abe0..26770fcc1bf62 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -292,13 +292,15 @@ def test_constructor_raises(cls): cls(np.array([])) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan], dtype=object)) + cls(np.array(["a", pd.NaT], dtype=object)) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None], dtype=object)) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", pd.NaT], dtype=object)) +@pytest.mark.parametrize("na", [np.nan, None, pd.NA]) +def test_constructor_nan_like(na): + expected = pd.arrays.StringArray(np.array(["a", pd.NA])) + tm.assert_extension_array_equal( + pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected + ) @pytest.mark.parametrize("copy", [True, False]) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 3c541a309e42a..c6bdc7c62ce4c 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1455,11 +1455,18 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", pd.NA], dtype=object), skipna=True ) - # NaN is not valid 
for string array, just NA - assert not lib.is_string_array( + assert lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) - + assert lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=False + ) + assert not lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=False + ) assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index fecba8bd81404..f63476b5a4dd6 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -10,9 +10,11 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas import ( + NA, DataFrame, Index, MultiIndex, + array as pd_array, ) import pandas._testing as tm @@ -146,20 +148,141 @@ def test_custom_na_values(all_parsers, na_values): tm.assert_frame_equal(result, expected) -def test_bool_na_values(all_parsers): - data = """A,B,C -True,False,True -NA,True,False -False,NA,True""" +@pytest.mark.parametrize( + "use_nullable_dtypes, expected", + [ + ( + True, + DataFrame( + { + "A": pd_array([True, NA, False], dtype="boolean"), + "B": pd_array([False, True, NA], dtype="boolean"), + "C": pd_array([np.nan, np.nan, np.nan], dtype="Float64"), + "D": np.array([True, False, True], dtype="bool"), + } + ), + ), + ( + False, + DataFrame( + { + "A": np.array([True, np.nan, False], dtype=object), + "B": np.array([False, True, np.nan], dtype=object), + "C": np.array([np.nan, np.nan, np.nan], dtype="float64"), + "D": np.array([True, False, True], dtype="bool"), + } + ), + ), + ], +) +def test_bool_na_values(all_parsers, use_nullable_dtypes, expected): + data = """A,B,C,D +True,False,NA,True +NA,True,NA,False +False,NA,NA,True""" parser = all_parsers - result = parser.read_csv(StringIO(data)) - expected = 
DataFrame( - { - "A": np.array([True, np.nan, False], dtype=object), - "B": np.array([False, True, np.nan], dtype=object), - "C": [True, False, True], - } - ) + result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "use_nullable_dtypes, expected", + [ + ( + True, + DataFrame( + { + "A": pd_array([1, NA, 2], dtype="Int64"), + "B": pd_array([3, 2, NA], dtype="Int64"), + "C": pd_array([NA, 1, 2], dtype="Int64"), + "D": np.array([1, 2, 3], dtype="int64"), + } + ), + ), + ( + False, + DataFrame( + { + "A": np.array([1.0, np.nan, 2.0], dtype="float64"), + "B": np.array([3.0, 2.0, np.nan], dtype="float64"), + "C": np.array([np.nan, 1.0, 2.0], dtype="float64"), + "D": np.array([1, 2, 3], dtype="int64"), + } + ), + ), + ], +) +def test_int_na_values(all_parsers, use_nullable_dtypes, expected): + data = """A,B,C,D +1,3,NA,1 +NA,2,1,2 +2,NA,2,3""" + parser = all_parsers + result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "use_nullable_dtypes, expected", + [ + ( + True, + DataFrame( + { + "A": pd_array([1.0, NA, 2.0], dtype="Float64"), + "B": np.array([3.0, 2.0, 1.0], dtype="float64"), + "C": pd_array([NA, 1.0, 2.0], dtype="Float64"), + "D": pd_array([NA, NA, NA], dtype="Float64"), + } + ), + ), + ( + False, + DataFrame( + { + "A": np.array([1.0, np.nan, 2.0], dtype="float64"), + "B": np.array([3.0, 2.0, 1.0], dtype="float64"), + "C": np.array([np.nan, 1.0, 2.0], dtype="float64"), + "D": np.array([np.nan, np.nan, np.nan], dtype="float64"), + } + ), + ), + ], +) +def test_float_na_values(all_parsers, use_nullable_dtypes, expected): + data = """A,B,C,D +1.0,3,NA,NA +NA,2,1.0,NA +2,1.0,2.0,NA""" + parser = all_parsers + result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) + print(result) + tm.assert_frame_equal(result, expected) + + 
+@pytest.mark.parametrize( + "use_nullable_dtypes, expected", + [ + ( + True, + DataFrame( + { + "A": np.array(["hi", "hello", "hey"], dtype=object), + "B": pd_array(["hi", NA, "hello"], dtype="string"), + "C": pd_array([NA, "hi", "hey"], dtype="string"), + "D": pd_array([np.nan, np.nan, np.nan], dtype="Float64"), + } + ), + ) + ], +) +def test_string_na_values(all_parsers, use_nullable_dtypes, expected): + data = """A,B,C,D +hi,hi,NA,NA +hello,NA,hi,NA +hey,hello,hey,NA""" + parser = all_parsers + result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) tm.assert_frame_equal(result, expected)