diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt index 4b0a48c7..54df11ee 100644 --- a/tuple/CMakeLists.txt +++ b/tuple/CMakeLists.txt @@ -54,4 +54,6 @@ install(FILES include/array_tuple_intersection_impl.hpp include/array_tuple_a_not_b.hpp include/array_tuple_a_not_b_impl.hpp + include/array_of_strings_sketch.hpp + include/array_of_strings_sketch_impl.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp new file mode 100644 index 00000000..296c0a87 --- /dev/null +++ b/tuple/include/array_of_strings_sketch.hpp @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_HPP_ + +#include +#include + +#include "array_tuple_sketch.hpp" +#include "xxhash64.h" + +namespace datasketches { + +using array_of_strings = array; + +// default update policy for an array of strings +class default_array_of_strings_update_policy { +public: + default_array_of_strings_update_policy() = default; + + array_of_strings create() const; + + void update(array_of_strings& array, const array_of_strings& input) const; + + void update(array_of_strings& array, const array_of_strings* input) const; +}; + +/** + * Serializer/deserializer for an array of strings. + * + * Requirements: + * - Array size must be <= 127. + * + * This serde does not perform UTF-8 validation. Callers must ensure strings + * are valid UTF-8 before serialization to guarantee interoperability with + * Java, Go, and Rust implementations. + */ +template> +struct default_array_of_strings_serde { + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + + explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); + + void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const; + void deserialize(std::istream& is, array_of_strings* items, unsigned num) const; + size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const; + size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const; + size_t size_of_item(const array_of_strings& item) const; + +private: + summary_allocator summary_allocator_; + static void check_num_nodes(uint8_t num_nodes); + static uint32_t compute_total_bytes(const array_of_strings& item); +}; + +/** + * Hashes an array of strings using ArrayOfStrings-compatible hashing. + */ +uint64_t hash_array_of_strings_key(const array_of_strings& key); + +/** + * Extended class of compact_tuple_sketch for array of strings. 
+ * + * Requirements: + * - Array size must be <= 127. + * + * UTF-8 compatibility: + * Serialized sketches are intended to be language and platform independent. + * Other implementations (Java, Go, Rust) enforce UTF-8 encoding for strings. + * This C++ implementation does not validate UTF-8; it is the caller's + * responsibility to ensure all strings are valid UTF-8 before calling update(). + * Non-UTF-8 strings may serialize successfully but will fail to deserialize + * in other language implementations. + */ +template> +class compact_array_of_strings_tuple_sketch: + public compact_tuple_sketch { +public: + using Base = compact_tuple_sketch; + using vector_bytes = typename Base::vector_bytes; + using Base::serialize; + + /** + * Copy constructor. + * Constructs a compact sketch from another sketch (update or compact) + * @param sketch the sketch to be constructed from + * @param ordered if true make the resulting sketch ordered + */ + template + compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true); + + /** + * This method deserializes a sketch from a given stream. + * @param is input stream + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> + static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); + + /** + * This method deserializes a sketch from a given array of bytes. 
+ * @param bytes pointer to the array of bytes + * @param size the size of the array + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> + static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); + +private: + explicit compact_array_of_strings_tuple_sketch(Base&& base); +}; + +/** + * Convenience alias for update_tuple_sketch for array of strings + */ +template, + typename Policy = default_array_of_strings_update_policy> +using update_array_of_strings_tuple_sketch = update_tuple_sketch< + array_of_strings, + array_of_strings, + Policy, + Allocator +>; + +/** + * Converts an array of strings tuple sketch to a compact sketch (ordered or unordered). + * @param sketch input sketch + * @param ordered optional flag to specify if an ordered sketch should be produced + * @return compact array of strings sketch + */ +template, typename Policy = default_array_of_strings_update_policy> +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered = true); + +} /* namespace datasketches */ + +#include "array_of_strings_sketch_impl.hpp" + +#endif diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp new file mode 100644 index 00000000..26751d66 --- /dev/null +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ + +#include + +#include "common_defs.hpp" + +namespace datasketches { + +inline array_of_strings default_array_of_strings_update_policy::create() const { + return array_of_strings(0, ""); +}
+
+/**
+ * Replaces the destination array with a copy of the input array.
+ * @param array destination; its previous contents are discarded
+ * @param input array to copy from
+ */
+inline void default_array_of_strings_update_policy::update(
+  array_of_strings& array, const array_of_strings& input
+) const {
+  // Copy assignment constructs each element once, instead of building empty
+  // strings and overwriting them. It also keeps update(x, x) from wiping x:
+  // array's copy assignment begins by copy-constructing from its argument,
+  // so the source is read before the destination is replaced.
+  array = input;
+}
+
+/**
+ * Pointer overload of update().
+ * A null input resets the destination to an empty array; any other input is
+ * copied exactly as the reference overload does.
+ * @param array destination; its previous contents are discarded
+ * @param input array to copy from, or nullptr to clear the destination
+ */
+inline void default_array_of_strings_update_policy::update(
+  array_of_strings& array, const array_of_strings* input
+) const {
+  if (input == nullptr) {
+    array = array_of_strings(0, "");
+    return;
+  }
+  update(array, *input);
+}
+
+inline uint64_t hash_array_of_strings_key(const array_of_strings& key) { + // Matches Java Util.PRIME for ArrayOfStrings key hashing. 
+ static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; + XXHash64 hasher(STRING_ARR_HASH_SEED); + const auto size = static_cast(key.size()); + for (size_t i = 0; i < size; ++i) { + const auto& entry = key[i]; + hasher.add(entry.data(), entry.size()); + if (i + 1 < size) hasher.add(",", 1); + } + return hasher.hash(); +} + +template +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered +) { + return compact_array_of_strings_tuple_sketch(sketch, ordered); +} + +template +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + const Sketch& sketch, bool ordered +): Base(sketch, ordered) {} + +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + Base&& base +): Base(std::move(base)) {} + +template +template +auto compact_array_of_strings_tuple_sketch::deserialize( + std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + auto base = Base::deserialize(is, seed, sd, allocator); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +template +template +auto compact_array_of_strings_tuple_sketch::deserialize( + const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + auto base = Base::deserialize(bytes, size, seed, sd, allocator); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +template +default_array_of_strings_serde::default_array_of_strings_serde(const Allocator& allocator): + summary_allocator_(allocator) {} + +template +void default_array_of_strings_serde::serialize( + std::ostream& os, const array_of_strings* items, unsigned num +) const { + for (unsigned i = 0; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + write(os, 
total_bytes); + write(os, num_nodes); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = static_cast(data[j].size()); + write(os, length); + os.write(data[j].data(), length); + } + } +} + +template +void default_array_of_strings_serde::deserialize( + std::istream& is, array_of_strings* items, unsigned num +) const { + for (unsigned i = 0; i < num; ++i) { + read(is); // total_bytes + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + const uint8_t num_nodes = read(is); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, ""); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = read(is); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + std::string value(length, '\0'); + if (length != 0) { + is.read(&value[0], length); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + } + array[j] = std::move(value); + } + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); + } +} + +template +size_t default_array_of_strings_serde::serialize( + void* ptr, size_t capacity, const array_of_strings* items, unsigned num +) const { + uint8_t* ptr8 = static_cast(ptr); + size_t bytes_written = 0; + + for (unsigned i = 0; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + check_memory_size(bytes_written + total_bytes, capacity); + bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written); + bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = static_cast(data[j].size()); + + bytes_written += copy_to_mem(length, ptr8 + bytes_written); + bytes_written += 
copy_to_mem(data[j].data(), ptr8 + bytes_written, length); + } + } + return bytes_written; +}
+
+/**
+ * Deserializes num arrays of strings from a memory buffer.
+ *
+ * Per-item layout, as read below: uint32 total_bytes, uint8 num_nodes, then
+ * for each node a uint32 length followed by that many bytes of string data.
+ * Every field is bounds-checked against capacity before it is read, so a
+ * corrupt num_nodes or string length is rejected by check_memory_size
+ * instead of causing a read past the end of the buffer.
+ *
+ * @param ptr pointer to the serialized bytes
+ * @param capacity number of readable bytes starting at ptr
+ * @param items destination for num items; each one is constructed in place via the allocator
+ * @param num number of items to read
+ * @return number of bytes consumed from ptr
+ */
+template<typename Allocator>
+size_t default_array_of_strings_serde<Allocator>::deserialize(
+  const void* ptr, size_t capacity, array_of_strings* items, unsigned num
+) const {
+  const uint8_t* ptr8 = static_cast<const uint8_t*>(ptr);
+  size_t bytes_read = 0;
+
+  for (unsigned i = 0; i < num; ++i) {
+    check_memory_size(bytes_read + sizeof(uint32_t), capacity);
+    const size_t item_start = bytes_read;
+    uint32_t total_bytes;
+    bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes);
+    check_memory_size(item_start + total_bytes, capacity);
+    // total_bytes comes from the input and may understate the real item size,
+    // so it cannot stand in for the per-field checks that follow.
+    check_memory_size(bytes_read + sizeof(uint8_t), capacity);
+    uint8_t num_nodes;
+    bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
+    check_num_nodes(num_nodes);
+    array_of_strings array(num_nodes, "");
+    for (uint8_t j = 0; j < num_nodes; ++j) {
+      check_memory_size(bytes_read + sizeof(uint32_t), capacity);
+      uint32_t length;
+      bytes_read += copy_from_mem(ptr8 + bytes_read, length);
+      // bytes_read <= capacity holds here, so comparing against the remainder
+      // cannot overflow. Checking before the allocation also keeps a bogus
+      // length from requesting a huge string.
+      check_memory_size(length, capacity - bytes_read);
+      std::string value(length, '\0');
+      if (length != 0) {
+        bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
+      }
+      array[j] = std::move(value);
+    }
+    summary_allocator alloc(summary_allocator_);
+    std::allocator_traits<summary_allocator>::construct(alloc, &items[i], std::move(array));
+  }
+  return bytes_read;
+}
+
+template +size_t default_array_of_strings_serde::size_of_item(const array_of_strings& item) const { + return compute_total_bytes(item); +} + +template +void default_array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { + if (num_nodes > 127) { + throw std::runtime_error("array_of_strings size exceeds 127"); + } +} + +template +uint32_t default_array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { + const auto count = item.size(); + check_num_nodes(static_cast(count)); + size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); + const std::string* data = item.data(); + for (uint32_t j = 0; j < count; ++j) { + total += data[j].size(); + } + return static_cast(total); +} + +} /* namespace datasketches */ + +#endif diff --git 
a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 547b240c..d331f8b1 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -22,6 +22,8 @@ #include #include +#include +#include #include "serde.hpp" #include "tuple_sketch.hpp" @@ -34,17 +36,18 @@ class array { public: using value_type = T; using allocator_type = Allocator; + using alloc_traits = std::allocator_traits; - explicit array(uint8_t size, T value, const Allocator& allocator = Allocator()): + explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { - std::fill(array_, array_ + size_, value); + init_values(value, std::is_trivially_copyable()); } array(const array& other): allocator_(other.allocator_), size_(other.size_), array_(allocator_.allocate(size_)) { - std::copy(other.array_, other.array_ + size_, array_); + copy_from(other, std::is_trivially_copyable()); } array(array&& other) noexcept: allocator_(std::move(other.allocator_)), @@ -52,9 +55,13 @@ class array { array_(other.array_) { other.array_ = nullptr; + other.size_ = 0; } ~array() { - if (array_ != nullptr) allocator_.deallocate(array_, size_); + if (array_ != nullptr) { + destroy_values(std::is_trivially_destructible()); + allocator_.deallocate(array_, size_); + } } array& operator=(const array& other) { array copy(other); @@ -75,10 +82,34 @@ class array { T* data() { return array_; } const T* data() const { return array_; } bool operator==(const array& other) const { + if (size_ != other.size_) return false; for (uint8_t i = 0; i < size_; ++i) if (array_[i] != other.array_[i]) return false; return true; } private: + void init_values(const T& value, std::true_type) { + std::fill(array_, array_ + size_, value); + } + void init_values(const T& value, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, value); + } + 
} + void copy_from(const array& other, std::true_type) { + std::copy(other.array_, other.array_ + size_, array_); + } + void copy_from(const array& other, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, other.array_[i]); + } + } + void destroy_values(std::true_type) {} + void destroy_values(std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::destroy(allocator_, array_ + i); + } + } + Allocator allocator_; uint8_t size_; T* array_; diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt index 4ca6a503..3d7ccca3 100644 --- a/tuple/test/CMakeLists.txt +++ b/tuple/test/CMakeLists.txt @@ -44,6 +44,7 @@ target_sources(tuple_test tuple_a_not_b_test.cpp tuple_jaccard_similarity_test.cpp array_of_doubles_sketch_test.cpp + array_of_strings_sketch_test.cpp engagement_test.cpp ) @@ -52,6 +53,7 @@ target_sources(tuple_test PRIVATE aod_sketch_deserialize_from_java_test.cpp tuple_sketch_deserialize_from_java_test.cpp + aos_sketch_deserialize_from_java_test.cpp ) endif() @@ -60,5 +62,6 @@ target_sources(tuple_test PRIVATE aod_sketch_serialize_for_java.cpp tuple_sketch_serialize_for_java.cpp + aos_sketch_serialize_for_java.cpp ) endif() diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp new file mode 100644 index 00000000..af37d6c2 --- /dev/null +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -0,0 +1,283 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + // assume the binary sketches for this test have been generated by datasketches-java code + // in the subdirectory called "java" in the root directory of this project + static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; + + static std::vector read_binary_file(const std::string& path) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + is.seekg(0, std::ios::end); + const auto size = static_cast(is.tellg()); + is.seekg(0, std::ios::beg); + std::vector bytes(size); + if (size != 0) { + is.read(reinterpret_cast(bytes.data()), size); + } + return bytes; + } + + TEST_CASE("aos sketch one value", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + 
} + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + } + + TEST_CASE("aos sketch three values", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } + } + } + } + + TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") { + const auto path = testBinaryInputPath + 
"aos_1_non_empty_no_entries_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + } + + TEST_CASE("aos sketch multi keys strings", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + 
REQUIRE(entry.second.size() == 1); + } + } + } + } + + TEST_CASE("aos sketch unicode strings", "[serde_compat]") { + const auto path = testBinaryInputPath + "aos_unicode_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + + const std::vector> expected_values = { + {"밸류", "값"}, + {"📦", "🎁"}, + {"ценить1", "ценить2"} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 2); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + } + + TEST_CASE("aos sketch empty strings", "[serde_compat]") { + const auto path = testBinaryInputPath + "aos_empty_strings_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + 
REQUIRE(sketch.get_num_retained() == 3); + const std::vector> expected_values = { + {"empty_key_value"}, + {""}, + {"", ""} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + } +} diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp new file mode 100644 index 00000000..c6eb0dfc --- /dev/null +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +using aos_sketch = update_array_of_strings_tuple_sketch<>; +using array_of_strings = array; + +static array_of_strings make_array(std::initializer_list items) { + array_of_strings array(static_cast(items.size()), ""); + size_t i = 0; + for (const auto& item: items) { + array[static_cast(i)] = item; + ++i; + } + return array; +} + +TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(3, ""); + value[0] = "a" + 
std::to_string(i); + value[1] = "b" + std::to_string(i); + value[2] = "c" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { /* Builds a sketch with lg_k 12, resize factor X8 and p 0.01f, applies one update, requires the sketch to be non-empty while retaining zero entries, then serializes its compact form to aos_1_non_empty_no_entries_cpp.sk. */ + auto sketch = aos_sketch::builder() + .set_lg_k(12) + .set_resize_factor(resize_factor::X8) + .set_p(0.01f) + .build(); + array_of_strings key(1, ""); + key[0] = "key1"; + array_of_strings value(1, ""); + value[0] = "value1"; + sketch.update(hash_array_of_strings_key(key), value); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); /* NOTE(review): presumably the single key is screened out by the p 0.01f setting; confirm this holds for the hash of key1. */ + std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); /* NOTE(review): the stream is not checked for a successful open before serializing. */ + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { /* For every n in n_arr: feeds n updates whose key has two strings (key<i>, subkey<i mod 10>) and whose value has one string, checks emptiness and that the estimate is within n * 0.03 of n, then writes aos_multikey_n<n>_cpp.sk. */ + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(2, ""); + key[0] = "key" + std::to_string(i); + key[1] = "subkey" + std::to_string(i % 10); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { /* Applies three updates whose keys and values are non-ASCII u8 literals (Hangul, emoji, Cyrillic), requires three retained entries, then writes aos_unicode_cpp.sk. */ + auto sketch = aos_sketch::builder().build(); 
+ sketch.update( + hash_array_of_strings_key(make_array({u8"키", u8"열쇠"})), + make_array({u8"밸류", u8"값"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({u8"🔑", u8"🗝️"})), + make_array({u8"📦", u8"🎁"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({u8"ключ1", u8"ключ2"})), + make_array({u8"ценить1", u8"ценить2"}) + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { /* Covers empty strings: an empty key string, an empty value string, and two-element arrays of empty strings on both sides; requires three retained entries, then writes aos_empty_strings_cpp.sk. */ + auto sketch = aos_sketch::builder().build(); + sketch.update(hash_array_of_strings_key(make_array({""})), make_array({"empty_key_value"})); + sketch.update(hash_array_of_strings_key(make_array({"empty_value_key"})), make_array({""})); + sketch.update(hash_array_of_strings_key(make_array({"", ""})), make_array({"", ""})); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +} /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp new file mode 100644 index 00000000..5507c071 --- /dev/null +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +TEST_CASE("aos update policy", "[tuple_sketch]") { /* Exercises default_array_of_strings_update_policy: create() plus update() called with an array and with a pointer. */ + default_array_of_strings_update_policy policy; + + SECTION("create empty") { /* create() must return an array of size 0. */ + auto values = policy.create(); + REQUIRE(values.size() == 0); + } + + SECTION("replace array") { /* update() must make values mirror the input; changing input afterwards must leave values untouched, and a second update with a shorter array replaces the contents. */ + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "alpha"; + input[1] = "beta"; + policy.update(values, input); + REQUIRE(values.size() == 2); + REQUIRE(values[0] == "alpha"); + REQUIRE(values[1] == "beta"); + input[0] = "changed"; + REQUIRE(values[0] == "alpha"); + + array_of_strings input2(1, "", std::allocator()); + input2[0] = "gamma"; + policy.update(values, input2); + REQUIRE(values.size() == 1); + REQUIRE(values[0] == "gamma"); + } + + SECTION("nullptr clears") { /* update() with nullptr must leave values with size 0. */ + array_of_strings values(2, "", std::allocator()); + values[0] = "one"; + values[1] = "two"; + + policy.update(values, nullptr); + REQUIRE(values.size() == 0); + } + + SECTION("pointer input copies") { /* update() through a pointer must copy: a later change to input is not visible in values. */ + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "first"; + input[1] = "second"; + policy.update(values, &input); + REQUIRE(values.size() == 2); + REQUIRE(values[1] == "second"); + input[1] = "changed"; + REQUIRE(values[1] == "second"); + } +} + +TEST_CASE("aos sketch update", "[tuple_sketch]") { /* Checks update behavior of update_array_of_strings_tuple_sketch for repeated, distinct and empty keys. */ + auto make_array = [](std::initializer_list entries) { /* Helper: builds an array_of_strings with one slot per list entry and assigns the entries in order. */ + array_of_strings array(static_cast(entries.size()), "", 
std::allocator()); + uint8_t i = 0; + for (const auto* entry: entries) array[i++] = entry; + return array; + }; + + SECTION("same key replaces summary") { /* Two updates with the same key must retain one entry whose summary equals the second value. */ + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"first"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"second", "third"}) + ); + + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 2); + REQUIRE(it->second[0] == "second"); + REQUIRE(it->second[1] == "third"); + } + + SECTION("distinct keys retain multiple entries") { /* The keys (a, bc) and (ab, c) differ only in where the text is split; both must be retained as separate entries. */ + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update( + hash_array_of_strings_key(make_array({"a", "bc"})), + make_array({"one"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"ab", "c"})), + make_array({"two"}) + ); + + REQUIRE(sketch.get_num_retained() == 2); + + bool saw_one = false; + bool saw_two = false; + for (const auto& entry: sketch) { + REQUIRE(entry.second.size() == 1); + if (entry.second[0] == "one") saw_one = true; + if (entry.second[0] == "two") saw_two = true; + } + REQUIRE(saw_one); + REQUIRE(saw_two); + } + + SECTION("empty key") { /* A key array with no strings must still produce one retained entry carrying the value. */ + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(hash_array_of_strings_key(make_array({})), make_array({"value"})); + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 1); + REQUIRE(it->second[0] == "value"); + } +} + +TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { /* Round-trips compact sketches through stream and byte serialization and compares them with the original. */ + auto make_array = [](std::initializer_list entries) { /* Helper: builds an array_of_strings with one slot per list entry and assigns the entries in order. */ + array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto& entry: entries) array[i++] = entry; + return array; + }; + + auto 
collect_entries = [](const compact_array_of_strings_tuple_sketch<>& sketch) { /* Helper: copies every entry of the sketch into a vector and sorts it by the first member of each pair so that two sketches can be compared index by index. */ + typedef std::pair entry_type; + std::vector entries; + for (const auto& entry: sketch) entries.push_back(entry); + struct entry_less { + bool operator()(const entry_type& lhs, const entry_type& rhs) const { + return lhs.first < rhs.first; + } + }; + std::sort(entries.begin(), entries.end(), entry_less()); + return entries; + }; + + auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) { /* Helper: serializes compact_sketch to a stringstream and to a byte buffer, deserializes each with DEFAULT_SEED, and requires flags, retained count, theta, estimate, bounds and all entries to match the original. */ + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); /* make stream failures throw instead of passing silently */ + compact_sketch.serialize(ss, default_array_of_strings_serde<>()); + auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize( + ss, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + + auto bytes = compact_sketch.serialize(0, default_array_of_strings_serde<>()); + auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + + const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = { + &deserialized_stream, + &deserialized_bytes + }; + for (int list_index = 0; list_index < 2; ++list_index) { /* run the same comparisons against the stream result and the byte-buffer result */ + const compact_array_of_strings_tuple_sketch<>* deserialized = deserialized_list[list_index]; + REQUIRE(compact_sketch.is_empty() == deserialized->is_empty()); + REQUIRE(compact_sketch.is_estimation_mode() == deserialized->is_estimation_mode()); + REQUIRE(compact_sketch.is_ordered() == deserialized->is_ordered()); + REQUIRE(compact_sketch.get_num_retained() == deserialized->get_num_retained()); + REQUIRE(compact_sketch.get_theta() == Approx(deserialized->get_theta()).margin(1e-10)); + REQUIRE(compact_sketch.get_estimate() == Approx(deserialized->get_estimate()).margin(1e-10)); + REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized->get_lower_bound(1)).margin(1e-10)); + REQUIRE(compact_sketch.get_upper_bound(1) == 
Approx(deserialized->get_upper_bound(1)).margin(1e-10)); + + auto original_entries = collect_entries(compact_sketch); + auto round_trip_entries = collect_entries(*deserialized); + REQUIRE(original_entries.size() == round_trip_entries.size()); + for (size_t i = 0; i < original_entries.size(); ++i) { /* entry-by-entry comparison: first member, summary length, then each summary string */ + REQUIRE(original_entries[i].first == round_trip_entries[i].first); + REQUIRE(original_entries[i].second.size() == round_trip_entries[i].second.size()); + for (size_t j = 0; j < original_entries[i].second.size(); ++j) { + REQUIRE(original_entries[i].second[static_cast(j)] == + round_trip_entries[i].second[static_cast(j)]); + } + } + } + }; + + auto run_tests = [&](const update_array_of_strings_tuple_sketch<>& sketch) { /* Helper: runs check_round_trip on two compact forms built with the second argument true and false (named ordered and unordered here). */ + auto ordered = compact_array_of_strings_sketch(sketch, true); + auto unordered = compact_array_of_strings_sketch(sketch, false); + check_round_trip(ordered); + check_round_trip(unordered); + }; + + SECTION("empty sketch") { /* Round trip of a sketch that received no updates. */ + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + run_tests(sketch); + } + + SECTION("single entry sketch") { /* Round trip of a sketch that received exactly one update. */ + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + sketch.update(hash_array_of_strings_key(make_array({"key"})), make_array({"value"})); + run_tests(sketch); + } + + SECTION("multiple entries exact mode") { /* 50 distinct keys with lg_k 8; the sketch is required to stay out of estimation mode. */ + auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build(); + for (int i = 0; i < 50; ++i) { + sketch.update( + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), + make_array({std::string("value-") + std::to_string(i), "extra"}) + ); + } + REQUIRE_FALSE(sketch.is_estimation_mode()); + run_tests(sketch); + } + + SECTION("multiple entries estimation mode") { /* 10000 distinct keys with default builder settings; the sketch is required to be in estimation mode. */ + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + for (int i = 0; i < 10000; ++i) { + sketch.update( + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), + make_array({std::string("value-") + 
std::to_string(i)}) + ); + } + REQUIRE(sketch.is_estimation_mode()); + run_tests(sketch); + } +} + +TEST_CASE("aos serde validation", "[tuple_sketch]") { /* Checks the array size limit enforced by default_array_of_strings_serde during serialization. */ + default_array_of_strings_serde<> serde; + + SECTION("too many nodes rejected") { /* Serializing a 128-element array must throw with a message containing: size exceeds 127. */ + array_of_strings array(128, "", std::allocator()); + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + REQUIRE_THROWS_WITH( + serde.serialize(ss, &array, 1), + Catch::Matchers::Contains("size exceeds 127") + ); + } +} + +} /* namespace datasketches */