From d7760be711206098ce05d0d76ca7bd795dde60b6 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Fri, 11 Oct 2024 11:55:06 +0400 Subject: [PATCH 1/5] Add optional orjson support --- docs/guide/configuration.asciidoc | 18 ++++ elasticsearch_serverless/__init__.py | 7 ++ elasticsearch_serverless/serializer.py | 14 +++ noxfile.py | 4 +- pyproject.toml | 10 +-- .../test_serializer.py | 87 ++++++++++--------- 6 files changed, 89 insertions(+), 51 deletions(-) diff --git a/docs/guide/configuration.asciidoc b/docs/guide/configuration.asciidoc index 61bc44e..b5055bf 100644 --- a/docs/guide/configuration.asciidoc +++ b/docs/guide/configuration.asciidoc @@ -264,6 +264,24 @@ es = Elasticsearch( ) ------------------------------------ +If the `orjson` package is installed, you can use the faster ``OrjsonSerializer`` for the default mimetype (``application/json``): + +[source,python] +------------------------------------ +from elasticsearch import Elasticsearch, OrjsonSerializer + +es = Elasticsearch( + ..., + serializer=OrjsonSerializer() +) +------------------------------------ + +orjson is particularly fast when serializing vectors as it has native numpy support. This will be the default in a future release. Note that you can install orjson with the `orjson` extra: + +[source,sh] +-------------------------------------------- +$ python -m pip install elasticsearch[orjson] +-------------------------------------------- [discrete] [[nodes]] diff --git a/elasticsearch_serverless/__init__.py b/elasticsearch_serverless/__init__.py index 8b38b39..f4ac43c 100644 --- a/elasticsearch_serverless/__init__.py +++ b/elasticsearch_serverless/__init__.py @@ -63,6 +63,11 @@ ) from .serializer import JSONSerializer, JsonSerializer +try: + from .serializer import OrjsonSerializer +except ModuleNotFoundError: + OrjsonSerializer = None # type: ignore[assignment,misc] + # Only raise one warning per deprecation message so as not # to spam up the user if the same action is done multiple times. warnings.simplefilter("default", category=ElasticsearchWarning, append=True) @@ -86,6 +91,8 @@ "UnsupportedProductError", "ElasticsearchWarning", ] +if OrjsonSerializer is not None: + __all__.append("OrjsonSerializer") fixup_module_metadata(__name__, globals()) del fixup_module_metadata diff --git a/elasticsearch_serverless/serializer.py b/elasticsearch_serverless/serializer.py index 64592d2..727b3d5 100644 --- a/elasticsearch_serverless/serializer.py +++ b/elasticsearch_serverless/serializer.py @@ -41,6 +41,13 @@ "MapboxVectorTileSerializer", ] +try: + from elastic_transport import OrjsonSerializer as _OrjsonSerializer + + __all__.append("OrjsonSerializer") +except ModuleNotFoundError: + _OrjsonSerializer = None # type: ignore[assignment,misc] + class JsonSerializer(_JsonSerializer): mimetype: ClassVar[str] = "application/json" @@ -73,6 +80,13 @@ def default(self, data: Any) -> Any: raise TypeError(f"Unable to serialize {data!r} (type: {type(data)})") +if _OrjsonSerializer is not None: + + class OrjsonSerializer(JsonSerializer, _OrjsonSerializer): + def default(self, data: Any) -> Any: + return JsonSerializer.default(self, data) + + class NdjsonSerializer(JsonSerializer, _NdjsonSerializer): mimetype: ClassVar[str] = "application/x-ndjson" diff --git a/noxfile.py b/noxfile.py index 9360f74..e0a66f0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -49,7 +49,7 @@ def pytest_argv(): @nox.session(python=["3.9", "3.10", "3.11", "3.12"]) def test(session): - session.install(".[dev]", env=INSTALL_ENV) + session.install(".[async,requests,orjson]", env=INSTALL_ENV) session.run(*pytest_argv(), *(session.posargs)) @@ -86,7 +86,7 @@ def lint(session): session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES) # Workaround to make '-r' to still work despite uninstalling aiohttp below. - session.install(".[async,requests]", env=INSTALL_ENV) + session.install(".[async,requests,orjson]", env=INSTALL_ENV) # Run mypy on the package and then the type examples separately for # the two different mypy use-cases, ourselves and our users. diff --git a/pyproject.toml b/pyproject.toml index fcbbc29..ee1c98f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,12 +44,9 @@ dependencies = [ ] [project.optional-dependencies] -async = [ - "aiohttp>=3,<4", -] -requests = [ - "requests>=2.4.0, <3.0.0", -] +async = ["aiohttp>=3,<4"] +requests = ["requests>=2.4.0, <3.0.0" ] +orjson = ["orjson>=3"] dev = [ "requests>=2, <3", "aiohttp", @@ -66,6 +63,7 @@ dev = [ "twine", "build", "nox", + "orjson", "numpy", "pandas", "mapbox-vector-tile", diff --git a/test_elasticsearch_serverless/test_serializer.py b/test_elasticsearch_serverless/test_serializer.py index 4674e97..82f934e 100644 --- a/test_elasticsearch_serverless/test_serializer.py +++ b/test_elasticsearch_serverless/test_serializer.py @@ -16,7 +16,6 @@ # specific language governing permissions and limitations # under the License. -import sys import uuid from datetime import datetime from decimal import Decimal @@ -31,43 +30,44 @@ import re +from elasticsearch.serializer import JSONSerializer, OrjsonSerializer, TextSerializer + from elasticsearch_serverless import Elasticsearch from elasticsearch_serverless.exceptions import SerializationError -from elasticsearch_serverless.serializer import JSONSerializer, TextSerializer requires_numpy_and_pandas = pytest.mark.skipif( - np is None or pd is None, reason="Test requires numpy or pandas to be available" + np is None or pd is None, reason="Test requires numpy and pandas to be available" ) -def test_datetime_serialization(): - assert b'{"d":"2010-10-01T02:30:00"}' == JSONSerializer().dumps( +@pytest.fixture(params=[JSONSerializer, OrjsonSerializer]) +def json_serializer(request: pytest.FixtureRequest): + yield request.param() + + +def test_datetime_serialization(json_serializer): + assert b'{"d":"2010-10-01T02:30:00"}' == json_serializer.dumps( {"d": datetime(2010, 10, 1, 2, 30)} ) -def test_decimal_serialization(): - requires_numpy_and_pandas() +def test_decimal_serialization(json_serializer): + assert b'{"d":3.8}' == json_serializer.dumps({"d": Decimal("3.8")}) - if sys.version_info[:2] == (2, 6): - pytest.skip("Float rounding is broken in 2.6.") - assert b'{"d":3.8}' == JSONSerializer().dumps({"d": Decimal("3.8")}) - -def test_uuid_serialization(): - assert b'{"d":"00000000-0000-0000-0000-000000000003"}' == JSONSerializer().dumps( +def test_uuid_serialization(json_serializer): + assert b'{"d":"00000000-0000-0000-0000-000000000003"}' == json_serializer.dumps( {"d": uuid.UUID("00000000-0000-0000-0000-000000000003")} ) @requires_numpy_and_pandas -def test_serializes_numpy_bool(): - assert b'{"d":true}' == JSONSerializer().dumps({"d": np.bool_(True)}) +def test_serializes_numpy_bool(json_serializer): + assert b'{"d":true}' == json_serializer.dumps({"d": np.bool_(True)}) @requires_numpy_and_pandas -def test_serializes_numpy_integers(): - ser = JSONSerializer() +def test_serializes_numpy_integers(json_serializer): for np_type in ( np.int_, np.int8, @@ -75,7 +75,7 @@ def test_serializes_numpy_integers(): np.int32, np.int64, ): - assert ser.dumps({"d": np_type(-1)}) == b'{"d":-1}' + assert json_serializer.dumps({"d": np_type(-1)}) == b'{"d":-1}' for np_type in ( np.uint8, @@ -83,81 +83,82 @@ def test_serializes_numpy_integers(): np.uint32, np.uint64, ): - assert ser.dumps({"d": np_type(1)}) == b'{"d":1}' + assert json_serializer.dumps({"d": np_type(1)}) == b'{"d":1}' @requires_numpy_and_pandas -def test_serializes_numpy_floats(): - ser = JSONSerializer() +def test_serializes_numpy_floats(json_serializer): for np_type in ( np.float32, np.float64, ): - assert re.search(rb'^{"d":1\.2[\d]*}$', ser.dumps({"d": np_type(1.2)})) + assert re.search( + rb'^{"d":1\.2[\d]*}$', json_serializer.dumps({"d": np_type(1.2)}) + ) @requires_numpy_and_pandas -def test_serializes_numpy_datetime(): - assert b'{"d":"2010-10-01T02:30:00"}' == JSONSerializer().dumps( +def test_serializes_numpy_datetime(json_serializer): + assert b'{"d":"2010-10-01T02:30:00"}' == json_serializer.dumps( {"d": np.datetime64("2010-10-01T02:30:00")} ) @requires_numpy_and_pandas -def test_serializes_numpy_ndarray(): - assert b'{"d":[0,0,0,0,0]}' == JSONSerializer().dumps( +def test_serializes_numpy_ndarray(json_serializer): + assert b'{"d":[0,0,0,0,0]}' == json_serializer.dumps( {"d": np.zeros((5,), dtype=np.uint8)} ) # This isn't useful for Elasticsearch, just want to make sure it works. - assert b'{"d":[[0,0],[0,0]]}' == JSONSerializer().dumps( + assert b'{"d":[[0,0],[0,0]]}' == json_serializer.dumps( {"d": np.zeros((2, 2), dtype=np.uint8)} ) @requires_numpy_and_pandas def test_serializes_numpy_nan_to_nan(): - assert b'{"d":NaN}' == JSONSerializer().dumps({"d": np.nan}) + assert b'{"d":NaN}' == JSONSerializer().dumps({"d": float("NaN")}) + # NaN is invalid JSON, and orjson silently converts it to null + assert b'{"d":null}' == OrjsonSerializer().dumps({"d": float("NaN")}) @requires_numpy_and_pandas -def test_serializes_pandas_timestamp(): - assert b'{"d":"2010-10-01T02:30:00"}' == JSONSerializer().dumps( - {"d": pd.Timestamp("2010-10-01T02:30:00")} - ) +def test_serializes_pandas_timestamp(json_serializer): + assert b'{"d":"2010-10-01T02:30:00"}' == json_serializer.dumps() @requires_numpy_and_pandas -def test_serializes_pandas_series(): - assert b'{"d":["a","b","c","d"]}' == JSONSerializer().dumps( +def test_serializes_pandas_series(json_serializer): + assert b'{"d":["a","b","c","d"]}' == json_serializer.dumps( {"d": pd.Series(["a", "b", "c", "d"])} ) @requires_numpy_and_pandas @pytest.mark.skipif(not hasattr(pd, "NA"), reason="pandas.NA is required") -def test_serializes_pandas_na(): - assert b'{"d":null}' == JSONSerializer().dumps({"d": pd.NA}) +def test_serializes_pandas_na(json_serializer): + assert b'{"d":null}' == json_serializer.dumps({"d": pd.NA}) @requires_numpy_and_pandas @pytest.mark.skipif(not hasattr(pd, "NaT"), reason="pandas.NaT required") -def test_raises_serialization_error_pandas_nat(): +def test_raises_serialization_error_pandas_nat(json_serializer): with pytest.raises(SerializationError): - JSONSerializer().dumps({"d": pd.NaT}) + json_serializer.dumps({"d": pd.NaT}) @requires_numpy_and_pandas -def test_serializes_pandas_category(): +def test_serializes_pandas_category(json_serializer): cat = pd.Categorical(["a", "c", "b", "a"], categories=["a", "b", "c"]) - assert b'{"d":["a","c","b","a"]}' == JSONSerializer().dumps({"d": cat}) + assert b'{"d":["a","c","b","a"]}' == json_serializer.dumps({"d": cat}) cat = pd.Categorical([1, 2, 3], categories=[1, 2, 3]) - assert b'{"d":[1,2,3]}' == JSONSerializer().dumps({"d": cat}) + assert b'{"d":[1,2,3]}' == json_serializer.dumps({"d": cat}) -def test_json_raises_serialization_error_on_dump_error(): +def test_json_raises_serialization_error_on_dump_error(json_serializer): with pytest.raises(SerializationError): - JSONSerializer().dumps(object()) + json_serializer.dumps(object()) def test_raises_serialization_error_on_load_error(): From b88d093ad7f458edf5a981e1f91d2ed0f9709382 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Fri, 11 Oct 2024 12:01:37 +0400 Subject: [PATCH 2/5] Fix missing test dependency --- noxfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/noxfile.py b/noxfile.py index e0a66f0..01a1282 100644 --- a/noxfile.py +++ b/noxfile.py @@ -49,7 +49,7 @@ def pytest_argv(): @nox.session(python=["3.9", "3.10", "3.11", "3.12"]) def test(session): - session.install(".[async,requests,orjson]", env=INSTALL_ENV) + session.install(".[dev]", env=INSTALL_ENV) session.run(*pytest_argv(), *(session.posargs)) @@ -118,5 +118,5 @@ def lint(session): @nox.session() def docs(session): - session.install(".[docs]") + session.install(".[docs,orjson]") session.run("sphinx-build", "docs/sphinx/", "docs/sphinx/_build", "-b", "html") From 1a4273dbd2325d6c0567d34bd2b343d1b1fed534 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Fri, 11 Oct 2024 12:04:16 +0400 Subject: [PATCH 3/5] Fix import --- test_elasticsearch_serverless/test_serializer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test_elasticsearch_serverless/test_serializer.py b/test_elasticsearch_serverless/test_serializer.py index 82f934e..990ea3d 100644 --- a/test_elasticsearch_serverless/test_serializer.py +++ b/test_elasticsearch_serverless/test_serializer.py @@ -30,10 +30,13 @@ import re -from elasticsearch.serializer import JSONSerializer, OrjsonSerializer, TextSerializer - from elasticsearch_serverless import Elasticsearch from elasticsearch_serverless.exceptions import SerializationError +from elasticsearch_serverless.serializer import ( + JSONSerializer, + OrjsonSerializer, + TextSerializer, +) requires_numpy_and_pandas = pytest.mark.skipif( np is None or pd is None, reason="Test requires numpy and pandas to be available" From 1bb5a3691aa5b9735bf2ce66557f09bc4c088139 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Fri, 11 Oct 2024 12:38:12 +0400 Subject: [PATCH 4/5] Fix pandas timestamp test --- test_elasticsearch_serverless/test_serializer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_elasticsearch_serverless/test_serializer.py b/test_elasticsearch_serverless/test_serializer.py index 990ea3d..b9757a8 100644 --- a/test_elasticsearch_serverless/test_serializer.py +++ b/test_elasticsearch_serverless/test_serializer.py @@ -127,7 +127,9 @@ def test_serializes_numpy_nan_to_nan(): @requires_numpy_and_pandas def test_serializes_pandas_timestamp(json_serializer): - assert b'{"d":"2010-10-01T02:30:00"}' == json_serializer.dumps() + assert b'{"d":"2010-10-01T02:30:00"}' == json_serializer.dumps( + {"d": pd.Timestamp("2010-10-01T02:30:00")} + ) @requires_numpy_and_pandas From e4fbc567d90f0d99153c3a8231e55371ec5cbd9f Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Fri, 11 Oct 2024 12:51:19 +0400 Subject: [PATCH 5/5] Ensure optional dependencies are optional --- elasticsearch_serverless/__init__.py | 2 +- elasticsearch_serverless/serializer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/elasticsearch_serverless/__init__.py b/elasticsearch_serverless/__init__.py index f4ac43c..723b3a2 100644 --- a/elasticsearch_serverless/__init__.py +++ b/elasticsearch_serverless/__init__.py @@ -65,7 +65,7 @@ try: from .serializer import OrjsonSerializer -except ModuleNotFoundError: +except ImportError: OrjsonSerializer = None # type: ignore[assignment,misc] # Only raise one warning per deprecation message so as not diff --git a/elasticsearch_serverless/serializer.py b/elasticsearch_serverless/serializer.py index 727b3d5..37ad572 100644 --- a/elasticsearch_serverless/serializer.py +++ b/elasticsearch_serverless/serializer.py @@ -45,7 +45,7 @@ from elastic_transport import OrjsonSerializer as _OrjsonSerializer __all__.append("OrjsonSerializer") -except ModuleNotFoundError: +except ImportError: _OrjsonSerializer = None # type: ignore[assignment,misc]