diff --git a/.github/workflows/build_and_test_cmake.yaml b/.github/workflows/build_and_test_cmake.yaml index 8c16ee4..f7332b4 100644 --- a/.github/workflows/build_and_test_cmake.yaml +++ b/.github/workflows/build_and_test_cmake.yaml @@ -27,52 +27,30 @@ jobs: run: | git submodule update --init --recursive - uses: lukka/get-cmake@latest - - name: Setup vcpkg - uses: lukka/run-vcpkg@v11 - id: runvcpkg - with: - # This one is not needed, as it is the default value anyway. - vcpkgDirectory: '${{ github.workspace }}/tools/vcpkg' - vcpkgJsonGlob: '**/cmakepresets/vcpkg.json' - - - name: Prints output of run-vcpkg's action. - run: echo "root='${{ steps.runvcpkg.outputs.RUNVCPKG_VCPKG_ROOT_OUT }}', triplet='${{ steps.runvcpkg.outputs.RUNVCPKG_VCPKG_DEFAULT_TRIPLET_OUT }}' " - - name: Run CMake+vcpkg+Ninja - uses: lukka/run-cmake@v10 - id: runcmake - env: - CC: clang-18 - CXX: clang++-18 - CMAKE_C_COMPILER: clang-18 - CMAKE_CXX_COMPILER: clang++-18 - MKLROOT: ${{ github.workspace }}/builds/debug/vcpkg_installed/x64-linux/lib/intel64 - with: - cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt' - configurePresetAdditionalArgs: "['-DOpenMP_CXX_FLAGS=-fopenmp=libiomp5', '-DOpenMP_CXX_LIB_NAMES=libiomp5', '-DOpenMP_libiomp5_LIBRARY=${{ github.workspace }}/builds/debug/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so']" - configurePreset: 'debug' - buildPreset: 'debug' - testPreset: 'debug' - testPresetCmdString: "['lintdb-tests', '--test-dir', 'builds/debug', '--output-on-failure']" - - uses: actions/setup-python@v4 with: python-version: 3.10.6 + - name: Install Conan + id: conan + uses: turtlebrowser/get-conan@main - name: Install Dependencies - env: - CC: clang-18 - CXX: clang++-18 - CMAKE_C_COMPILER: clang-18 - CMAKE_CXX_COMPILER: clang++-18 run: | - echo "CXX=${CXX}" - pip install pytest numpy - - sudo apt-get remove clang-14 clang-15 - sudo rm /usr/bin/clang++ - sudo ln -s /usr/bin/clang++-18 /usr/bin/clang++ + pip install pytest numpy conan + sudo apt install g++ 
gcc libopenblas libopenblas-openmp-dev liblapack-dev pip install . + + - name: Build + run: | + mkdir build && cd build \ + && conan profile new default --detect \ + && conan install .. --build=missing -s build_type=Release -o build_tests=True \ + && conan build .. - name: Run Tests + run: | + cd ./build/Release/tests \ + && ctest --output-on-failure lintdb-tests + - name: Run Python Tests run: | pytest lintdb/python/tests diff --git a/.gitmodules b/.gitmodules index a418ef8..9183f76 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,3 +11,6 @@ [submodule "third_party/dkm"] path = third_party/dkm url = https://github.com/genbattle/dkm.git +[submodule "third_party/faiss"] + path = third_party/faiss + url = https://github.com/facebookresearch/faiss.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 345a0bb..ef54ed0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,10 @@ cmake_minimum_required(VERSION 3.25) include(CMakeFindDependencyMacro) - +include(GNUInstallDirs) +include(ExternalProject) # allow faiss to build on m1 mac even though it's listed as unsupported. 
-set(VCPKG_INSTALL_OPTIONS "--allow-unsupported") -# Setup vcpkg script with CMake (note: should be placed before project() call) -set(CMAKE_TOOLCHAIN_FILE - ${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake - CACHE STRING "Vcpkg toolchain file") file(READ "version.txt" version) @@ -19,64 +15,155 @@ project( LANGUAGES CXX) set(LINTDB_VERSION ${version}) -include(GNUInstallDirs) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17 -fPIC -Wall -pthread") -set(CMAKE_CXX_STANDARD 17) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$") + set(CMAKE_CXX_FLAGS "-msse4.2 ${CMAKE_CXX_FLAGS}") +endif() + +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") -set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -std=c++17 -fPIC -O3 -D_LIBCPP_DISABLE_AVAILABILITY" -) if(MSVC OR LINUX) - set(BLA_VENDOR "Intel10_64lp") +# set(BLA_VENDOR "Intel10_64lp") +# set(BLA_VENDOR_THREADING "intel") + set(BLA_VENDOR "OpenBLAS") else() set(BLA_VENDOR "OpenBLAS") endif() +set(CMAKE_CXX_STANDARD 17) # the below is caused by github actions failing to build flatbuffers. therefore, # we set this value so that we use a higher sdk version to build it. set(CMAKE_OSX_DEPLOYMENT_TARGET 10.13) -set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - -# https://conda-forge.org/docs/maintainer/knowledge_base/#newer-c-features-with-old-sdk -# if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(CMAKE_CXX_FLAGS -# "${CMAKE_CXX_FLAGS} ") endif() list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -if(SKBUILD) - message("Building with scikit-build") - cmake_path(GET CMAKE_CURRENT_BINARY_DIR PARENT_PATH BUILD_PARENT_DIR) - set(ENV{MKLROOT} - "${BUILD_PARENT_DIR}/vcpkg_installed/x64-linux/lib/intel64") - set(OpenMP_libiomp5_LIBRARY - "${BUILD_PARENT_DIR}/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so" - ) - set(CMAKE_BUILD_TYPE Release) +# OpenMP is getting pulled in through a version of blas. This needs to get figured out. 
+find_package(OpenMP REQUIRED) + +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/libs/libfaiss.cmake) +include_directories(third_party/faiss) + +find_package(LAPACK REQUIRED) +find_package(BLAS REQUIRED) +find_package(RocksDB REQUIRED) +find_package(glog CONFIG REQUIRED) +find_package(Microsoft.GSL CONFIG REQUIRED) +find_package(jsoncpp CONFIG REQUIRED) +find_package(Bitsery CONFIG REQUIRED) +find_package(Arrow REQUIRED) +find_package(Drogon REQUIRED) + +if(OPENMP_FOUND) + message(STATUS "OpenMP flags: ${OpenMP_CXX_FLAGS} -- ${OpenMP_EXE_LINKER_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") endif() -set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}) +include_directories(third_party/tantivy) + +include_directories(lintdb) + +# Collect source files +file(GLOB LINT_DB_SRC + "lintdb/*.cpp" + "lintdb/datasources/*.cpp" + "lintdb/quantizers/*.cpp" + "lintdb/quantizers/impl/*.cpp" + "lintdb/invlists/*.cpp" + "lintdb/query/*.cpp" + "lintdb/query/physical/*.cpp" + "lintdb/schema/*.cpp" + "lintdb/scoring/*.cpp" +) -option(ENABLE_PYTHON "Build Python extension." ON) -option(ENABLE_BENCHMARKS "Build benchmarks." ON) -option(ENABLE_SERVER "Build the server." 
OFF) +# Collect header files +file(GLOB LINT_DB_HEADERS + "lintdb/*.h" + "lintdb/datasources/*.h" + "lintdb/quantizers/*.h" + "lintdb/quantizers/impl/*.h" + "lintdb/invlists/*.h" + "lintdb/query/*.h" + "lintdb/query/physical/*.h" + "lintdb/schema/*.h" + "lintdb/scoring/*.h" + "lintdb/utils/*.h" +) -add_subdirectory(lintdb) +add_library(lintdb ${LINT_DB_SRC}) + +target_compile_options( + lintdb + PRIVATE $<$: + -mfma + -mf16c + -mavx2 + -msse4.2 + -mpopcnt + -Wno-sign-compare + -Wno-unused-variable + -Wno-reorder + -Wno-unused-local-typedefs + -Wno-unused-function + -Wno-strict-aliasing>) + +target_link_libraries( + lintdb PUBLIC OpenMP::OpenMP_CXX ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} +) -if(ENABLE_PYTHON) +target_link_libraries(lintdb + PUBLIC + faiss + JsonCpp::JsonCpp + Bitsery::bitsery + glog::glog + Microsoft.GSL::GSL + RocksDB::rocksdb + arrow::arrow +) + +target_include_directories(lintdb + PUBLIC $) + + + +option(BUILD_PYTHON "Build Python extension." OFF) +option(BUILD_BENCHMARKS "Build benchmarks." ON) +option(BUILD_TESTS "Build tests." ON) +option(BUILD_SERVER "Build the server." 
OFF) + + +if(BUILD_PYTHON) + message(STATUS "building python extension") add_subdirectory(lintdb/python) endif() -IF(ENABLE_SERVER) +IF(BUILD_SERVER) add_subdirectory(lintdb/server) endif() include(CTest) -if(BUILD_TESTING) +if(BUILD_TESTS) + enable_testing() add_subdirectory(tests) -endif() + if(EXISTS "${PROJECT_SOURCE_DIR}/lintdb/query/test") + add_subdirectory(lintdb/query/test) + endif() +endif() -if(ENABLE_BENCHMARKS) +if(BUILD_BENCHMARKS) add_subdirectory(benchmarks) endif() + +install(TARGETS lintdb + DESTINATION ${CMAKE_INSTALL_LIBDIR}) +foreach(header ${LINT_DB_HEADERS}) + file(RELATIVE_PATH rel_header "${PROJECT_SOURCE_DIR}/lintdb" "${header}") + get_filename_component(dir "${rel_header}" DIRECTORY) + install(FILES ${header} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lintdb/${dir}) +endforeach() \ No newline at end of file diff --git a/CMakePresets.json b/CMakePresets.json index 8e88f9c..094df67 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -23,8 +23,7 @@ "BUILD_TESTING": "ON", "CMAKE_INSTALL_LIBDIR": "lib", "ENABLE_PYTHON": "ON", - "CMAKE_VERBOSE_MAKEFILE":"ON", - "BLA_VENDOR":"Intel10_64lp" + "CMAKE_VERBOSE_MAKEFILE":"ON" } }, { @@ -42,8 +41,7 @@ "CMAKE_INSTALL_LIBDIR": "lib", "CMAKE_VERBOSE_MAKEFILE":"ON", "ENABLE_PYTHON": "OFF", - "ENABLE_BENCHMARKS": "ON", - "BLA_VENDOR":"Intel10_64lp" + "ENABLE_BENCHMARKS": "ON" } }, { @@ -61,8 +59,7 @@ "CMAKE_INSTALL_LIBDIR": "lib", "CMAKE_VERBOSE_MAKEFILE":"ON", "ENABLE_PYTHON": "ON", - "BUILD_TESTING": "OFF", - "BLA_VENDOR":"Intel10_64lp" + "BUILD_TESTING": "OFF" } }, { @@ -80,8 +77,7 @@ "CMAKE_INSTALL_LIBDIR": "lib", "ENABLE_PYTHON": "OFF", "BUILD_TESTING": "OFF", - "ENABLE_SERVER": "ON", - "BLA_VENDOR":"Intel10_64lp" + "ENABLE_SERVER": "ON" } }, { @@ -100,8 +96,7 @@ "ENABLE_BENCHMARKS": "ON", "ENABLE_PYTHON": "OFF", "BUILD_TESTING": "OFF", - "ENABLE_SERVER": "OFF", - "BLA_VENDOR":"Intel10_64lp" + "ENABLE_SERVER": "OFF" } } ], diff --git a/CMakeUserPresets.json b/CMakeUserPresets.json new file mode 100644 index 0000000..71aeace --- /dev/null +++ 
b/CMakeUserPresets.json @@ -0,0 +1,9 @@ +{ + "version": 4, + "vendor": { + "conan": {} + }, + "include": [ + "build/Release/generators/CMakePresets.json" + ] +} \ No newline at end of file diff --git a/Makefile b/Makefile index 52baef6..be4ba18 100644 --- a/Makefile +++ b/Makefile @@ -1,60 +1,16 @@ ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) build-release: - MKLROOT=${ROOT_DIR}/builds/release/vcpkg_installed/x64-linux/lib/intel64 cmake \ - --preset release \ - -DCMAKE_CXX_COMPILER=clang++-18 \ - -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ - -DOpenMP_CXX_LIB_NAMES=libiomp5 \ - -DOpenMP_libiomp5_LIBRARY=${ROOT_DIR}/builds/release/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so \ - . + # MKLROOT=${ROOT_DIR}/builds/release/vcpkg_installed/x64-linux/lib/intel64 cmake \ + conan install . --build=missing cmake --build --preset release -j12 build-debug: - MKLROOT=${ROOT_DIR}/builds/debug/vcpkg_installed/x64-linux/lib/intel64 cmake \ - --preset debug \ - -DCMAKE_CXX_COMPILER=clang++-18 \ - -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ - -DOpenMP_CXX_LIB_NAMES=libiomp5 \ - -DOpenMP_libiomp5_LIBRARY=${ROOT_DIR}/builds/debug/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so \ - . + conan install . --build=missing -s build_type=Debug - cmake --build --preset debug -j12 - -build-python: - MKLROOT=${ROOT_DIR}/builds/python/vcpkg_installed/x64-linux/lib/intel64 cmake \ - --preset python \ - -DCMAKE_CXX_COMPILER=clang++-18 \ - -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ - -DOpenMP_CXX_LIB_NAMES=libiomp5 \ - -DOpenMP_libiomp5_LIBRARY=${ROOT_DIR}/builds/python/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so \ - . 
- - cmake --build --preset python -j12 - cd builds/python/lintdb/python - -build-server: - MKLROOT=${ROOT_DIR}/builds/server/vcpkg_installed/x64-linux/lib/intel64 cmake \ - --preset server \ - -DCMAKE_CXX_COMPILER=clang++-18 \ - -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ - -DOpenMP_CXX_LIB_NAMES=libiomp5 \ - -DOpenMP_libiomp5_LIBRARY=${ROOT_DIR}/builds/server/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so \ - . - - cmake --build --preset server -j12 - -build-benchmarks: - MKLROOT=${ROOT_DIR}/builds/benchmarks/vcpkg_installed/x64-linux/lib/intel64 cmake \ - --preset benchmarks \ - -DCMAKE_CXX_COMPILER=clang++-18 \ - -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ - -DOpenMP_CXX_LIB_NAMES=libiomp5 \ - -DOpenMP_libiomp5_LIBRARY=${ROOT_DIR}/builds/benchmarks/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so \ - . - - cmake --build --preset benchmarks -j12 --target bench_lintdb + cmake --preset conan-debug + cmake --build --preset conan-debug test: cd builds/debug && cmake -E env GLOG_v=5 GLOG_logtostderr=1 ctest --output-on-failure @@ -76,8 +32,8 @@ callgrind-py: dot -Tsvg out.dot -o graph.svg callgrind-cpp: build-benchmarks - GLOG_v=100 valgrind --tool=callgrind --instr-atstart=yes --dump-instr=yes --collect-jumps=yes ./builds/benchmarks/benchmarks/Release/bench_lintdb - python -m gprof2dot --format=callgrind --output=out.dot callgrind.out.* + valgrind --tool=callgrind --instr-atstart=yes --dump-instr=yes --collect-jumps=yes ./builds/benchmarks/benchmarks/Release/bench_lintdb + python -m gprof2dot -n0 -e0 --format=callgrind --output=out.dot --strip callgrind.out.* dot -Tsvg out.dot -o graph.svg callgrind-colbert: build-conda @@ -85,7 +41,7 @@ callgrind-colbert: build-conda run-perf: # make sure your system allows perf to run. 
ex: sudo sysctl -w kernel.perf_event_paranoid=1 - OMP_MAX_ACTIVE_LEVELS=2 OMP_THREAD_LIMIT=12 OMP_NUM_THREADS=6 perf record -g -- python -X perf benchmarks/bench_lintdb.py single-index + OMP_MAX_ACTIVE_LEVELS=2 OMP_THREAD_LIMIT=12 OMP_NUM_THREADS=6 perf record -g -- ./builds/benchmarks/benchmarks/Release/bench_lintdb perf script | ./debug/stackcollapse-perf.pl | ./debug/flamegraph.pl > perf.data.svg run-docs: diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 6e32469..78c7e2b 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,9 +1,11 @@ message(STATUS "Building lintdb benchmark") -add_executable(bench_lintdb EXCLUDE_FROM_ALL bench_lintdb.cpp) -target_link_libraries(bench_lintdb PRIVATE lintdb_lib) +add_executable(bench_lintdb bench_lintdb.cpp) + +target_link_libraries(bench_lintdb PRIVATE lintdb) find_package(benchmark CONFIG REQUIRED) target_link_libraries(bench_lintdb PRIVATE benchmark::benchmark benchmark::benchmark_main) +install(TARGETS bench_lintdb DESTINATION bin/benchmarks) \ No newline at end of file diff --git a/benchmarks/bench_lintdb.cpp b/benchmarks/bench_lintdb.cpp index fad46dc..8fc2a11 100644 --- a/benchmarks/bench_lintdb.cpp +++ b/benchmarks/bench_lintdb.cpp @@ -9,6 +9,7 @@ #include "lintdb/quantizers/Quantizer.h" #include "lintdb/query/Query.h" #include "lintdb/query/QueryNode.h" +#include lintdb::Document create_document(size_t num_tokens, size_t dim){ std::vector vector; @@ -85,7 +86,8 @@ static void BM_lintdb_add(benchmark::State& state) { } static void BM_lintdb_search(benchmark::State& state) { - lintdb::IndexIVF index = lintdb::IndexIVF("/home/matt/deployql/LintDB/benchmarks/lintdb-lifestyle-40k"); + omp_set_num_threads(6); + lintdb::IndexIVF index = lintdb::IndexIVF("/home/matt/deployql/LintDB/benchmarks/openblastest.db"); //lintdb-lifestyle-40k"); lintdb::FieldValue fv("colbert", std::vector(1280, 1), 10); std::unique_ptr root = std::make_unique(fv); @@ -93,10 +95,13 @@ static void 
BM_lintdb_search(benchmark::State& state) { lintdb::SearchOptions opts; opts.n_probe = 32; - opts.k_top_centroids = 2; + opts.k_top_centroids = 32; for(auto _ : state) { - index.search(0, query, 10, opts); + auto res = index.search(0, query, 10, opts); + if(res.size() == 0) { + std::cout << "No results" << std::endl; + } } } diff --git a/benchmarks/lotte/indexing_two.py b/benchmarks/lotte/indexing_two.py index f650be9..8bedd68 100644 --- a/benchmarks/lotte/indexing_two.py +++ b/benchmarks/lotte/indexing_two.py @@ -118,7 +118,7 @@ def run(index_path: str = "local_db.index", stop:int=40000, reuse_colbert_cluste 'indexing': [], 'per_doc': [], } - for b in tqdm(batch(list(zip(d.dids, d.collection)),n=1)): + for b in tqdm(batch(list(zip(d.dids, d.collection)),n=100)): ids = [i for i,_ in b] docs = [d for _, d in b] @@ -132,7 +132,7 @@ def run(index_path: str = "local_db.index", stop:int=40000, reuse_colbert_cluste for i, ee in zip(ids, e): start = time.perf_counter() - doc = Document(i, [TensorFieldValue("colbert", e)]) + doc = Document(i, [TensorFieldValue("colbert", ee)]) index.add(0, [doc]) end = time.perf_counter() latencies['per_doc'].append(end - start) @@ -148,7 +148,7 @@ def run(index_path: str = "local_db.index", stop:int=40000, reuse_colbert_cluste @app.command() -def eval(index_path = "local_db_2.index", dataset: str = 'lifestyle', split: str = 'dev', stop: int = 40000): +def eval(index_path = "local_db_2.index", dataset: str = 'lifestyle', failure_id=5, expected_doc=5462, split: str = 'dev', stop: int = 40000): checkpoint = "colbert-ir/colbertv2.0" experiment="" @@ -177,6 +177,7 @@ def eval(index_path = "local_db_2.index", dataset: str = 'lifestyle', split: str 100, { 'k_top_centroids': 32, + # 'expected_id': 5462, } ) diff --git a/benchmarks/lotte/main.py b/benchmarks/lotte/main.py index 9725483..aa5ca63 100644 --- a/benchmarks/lotte/main.py +++ b/benchmarks/lotte/main.py @@ -1,4 +1,3 @@ -import lintdb as ldb from datasets import load_dataset from 
collections import namedtuple from colbert import Indexer, Searcher diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake index 6c1345c..52b7ecc 100644 --- a/cmake/FindMKL.cmake +++ b/cmake/FindMKL.cmake @@ -73,7 +73,6 @@ include(FindPackageHandleStandardArgs) cmake_push_check_state() set(CMAKE_REQUIRED_QUIET ${BLAS_FIND_QUIETLY}) - set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) if(BLA_STATIC) if(WIN32) diff --git a/cmake/libs/libfaiss.cmake b/cmake/libs/libfaiss.cmake new file mode 100644 index 0000000..2903135 --- /dev/null +++ b/cmake/libs/libfaiss.cmake @@ -0,0 +1,55 @@ +#file(GLOB_RECURSE SOURCES "${PROJECT_SOURCE_DIR}/third_party/faiss/faiss/**/*.cpp") + +file( + GLOB FAISS_SOURCES ${PROJECT_SOURCE_DIR}/third_party/faiss/faiss/*.cpp + ${PROJECT_SOURCE_DIR}/third_party/faiss/faiss/impl/*.cpp + ${PROJECT_SOURCE_DIR}/third_party/faiss/faiss/invlists/*.cpp + ${PROJECT_SOURCE_DIR}/third_party/faiss/faiss/utils/*.cpp + ${PROJECT_SOURCE_DIR}/third_party/faiss/faiss/utils/distances_fused/*.cpp +) + +file(GLOB AVX512_SOURCES third_party/faiss/faiss/*avx512*.cpp) +list(REMOVE_ITEM FAISS_SOURCES ${AVX512_SOURCES}) + + +# remove RHNSW as per milvus +file(GLOB RHNSW_SOURCES third_party/faiss/faiss/impl/RHNSW.cpp) +list(REMOVE_ITEM FAISS_SOURCES ${RHNSW_SOURCES}) + +find_package(BLAS REQUIRED) + + +#if(LINUX) + +# list(REMOVE_ITEM SOURCES ${AVX2_SOURCES}) + + file(GLOB NEON_SOURCES third_party/faiss/faiss/impl/*neon*.cpp) + list(REMOVE_ITEM FAISS_SOURCES ${NEON_SOURCES}) + + add_library(faiss STATIC ${FAISS_SOURCES}) + + target_compile_options( + faiss + PRIVATE $<$: + -mfma + -mf16c + -mavx2 + -msse4.2 + -mpopcnt + -Wno-sign-compare + -Wno-unused-variable + -Wno-reorder + -Wno-unused-local-typedefs + -Wno-unused-function + -Wno-strict-aliasing>) + +#find_package(OpenMP REQUIRED) + + +message(STATUS "faiss using BLAS: ${BLAS_LIBRARIES}") +target_link_libraries( + faiss PUBLIC OpenMP::OpenMP_CXX ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} +) + + 
target_compile_definitions(faiss PRIVATE FINTEGER=int) +#endif() \ No newline at end of file diff --git a/conanfile.py b/conanfile.py new file mode 100644 index 0000000..0e4b787 --- /dev/null +++ b/conanfile.py @@ -0,0 +1,153 @@ +from conan.tools.microsoft import is_msvc, msvc_runtime_flag +from conan.tools.build import check_min_cppstd +from conan.tools.scm import Version +from conan.tools import files +from conan import ConanFile +from conan.tools.cmake import CMake, CMakeDeps, CMakeToolchain, cmake_layout +from conan.tools.gnu import PkgConfigDeps +from conan.errors import ConanInvalidConfiguration +from conan import tools +import os + +required_conan_version = ">=1.55.0" + +class LintDBConan(ConanFile): + name = "LintDB" + description = "LintDB is a late interaction database for vector retrieval." + topics = ("vector", "ann", "search", "retrieval") + license = "Apache-2.0" + + settings = "os", "arch", "compiler", "build_type" + + options = { + "shared": [True, False], + "fPIC": [True, False], + "build_tests": [True, False], + "build_benchmarks": [True, False], + "build_python": [True, False], + "build_server": [True, False], + } + + default_options = { + "shared": False, + "fPIC": True, + "build_tests": True, + "build_benchmarks": True, + "build_python": True, + "build_server": False, + } + + exports_sources = { + "lintdb/*", + "third_party/*", + "tests/*", + "benchmarks/*", + "CMakeLists.txt", + "version.txt", + "*.cmake", + "conanfile.py", + } + + @property + def _minimum_cpp_standard(self): + return 17 + + @property + def _minimum_compilers_version(self): + return { + "gcc": "7", + "clang": "15", + "apple-clang": "10", + } + + def requirements(self): + self.requires("rocksdb/9.5.2") + self.requires("glog/0.4.0") + self.requires("jsoncpp/1.9.5") + self.requires("ms-gsl/4.0.0") + self.requires("arrow/19.0.1") + self.requires("bitsery/5.2.4") + self.requires("drogon/1.9.10", options={"with_boost": False}) + self.requires("taywee-args/6.4.6") + + if self.options.build_tests: + 
self.requires("gtest/1.15.0") + + if self.options.build_benchmarks: + self.requires("benchmark/1.9.0") + + def validate(self): + if self.settings.os == "Windows": + raise ConanInvalidConfiguration("Windows not supported") + + if self.settings.compiler.get_safe("cppstd"): + check_min_cppstd(self, self._minimum_cpp_standard) + min_version = self._minimum_compilers_version.get(str(self.settings.compiler)) + if not min_version: + self.output.warn( + "{} recipe lacks information about the {} compiler support.".format( + self.name, self.settings.compiler + ) + ) + else: + if Version(self.settings.compiler.version) < min_version: + raise ConanInvalidConfiguration( + "{} requires C++{} support. The current compiler {} {} does not support it.".format( + self.name, + self._minimum_cpp_standard, + self.settings.compiler, + self.settings.compiler.version, + ) + ) + + def configure(self): + self.options["rocksdb/*"].use_rtti = True + + def layout(self): + cmake_layout(self) + + def generate(self): + tc = CMakeToolchain(self) + tc.variables["CMAKE_POSITION_INDEPENDENT_CODE"] = self.options.get_safe( + "fPIC", True + ) + + cxx_std_flag = tools.build.cppstd_flag(self) + cxx_std_value = ( + cxx_std_flag.split("=")[1] + if cxx_std_flag + else "c++{}".format(self._minimum_cpp_standard) + ) + tc.variables["CXX_STD"] = cxx_std_value + + tc.variables["BUILD_TESTS"] = self.options.build_tests + tc.variables["BUILD_BENCHMARKS"] = self.options.build_benchmarks + tc.variables["BUILD_PYTHON"] = self.options.build_python + tc.variables["BUILD_SERVER"] = self.options.build_server + + tc.generate() + + deps = CMakeDeps(self) + deps.generate() + + pc = PkgConfigDeps(self) + pc.generate() + + def build(self): + # files.apply_conandata_patches(self) + cmake = CMake(self) + cmake.configure() + cmake.build() + + def package(self): + cmake = CMake(self) + cmake.install() + files.rmdir(self, os.path.join(self.package_folder, "lib", "cmake")) + files.rmdir(self, os.path.join(self.package_folder, "lib", 
"pkgconfig")) + + def package_info(self): + self.cpp_info.set_property("cmake_file_name", "lintdb") + self.cpp_info.set_property("cmake_target_name", "lintdb::LintDB") + self.cpp_info.set_property("pkg_config_name", "liblintdb") + + self.cpp_info.components["liblintdb"].libs = ["lintdb"] \ No newline at end of file diff --git a/conda/benchmark_env.yaml b/conda/benchmark_env.yaml index 063e83a..f03bdcf 100644 --- a/conda/benchmark_env.yaml +++ b/conda/benchmark_env.yaml @@ -9,7 +9,6 @@ dependencies: - faiss-cpu - pytorch::pytorch - conda-forge::numpy - - conda-forge::onnxruntime-cpp==1.17.3 - pip: - chardet - typer diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..4d36e5e --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,51 @@ +FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 as base + +ENV DEBIAN_FRONTEND=noninteractive +ENV CMAKE_VERSION="v3.28.5" +ENV CMAKE_TAR="cmake-3.28.5-linux-x86_64.tar.gz" +ENV CCACHE_VERSION="v4.9.1" +ENV CCACHE_DIR="ccache-4.9.1-linux-x86_64" +ENV CCACHE_TAR="ccache-4.9.1-linux-x86_64.tar.xz" +ENV BFLOAT16_WHL="bfloat16-1.4.0-cp311-cp311-linux_x86_64.whl" + +RUN apt update \ + && apt install -y ca-certificates apt-transport-https software-properties-common lsb-release \ + && apt install -y --no-install-recommends wget curl git make gfortran gcc g++ swig \ + && apt install -y gcc-12 g++-12 \ + && apt install -y python3.11 python3.11-dev python3.11-distutils \ + && apt install -y python3-setuptools \ + && cd /usr/bin \ + && unlink gcc && ln -s gcc-12 gcc \ + && unlink g++ && ln -s g++-12 g++ \ + && unlink python3 && ln -s python3.11 python3 \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python3 \ + && export PATH=$PATH:$HOME/.local/bin \ + && pip3 install wheel \ + && apt remove --purge -y \ + && rm -rf /var/lib/apt/lists/* + +# install cmake, ccache and bfloat16 +RUN cd /tmp \ + && wget https://github.com/Kitware/CMake/releases/download/${CMAKE_VERSION}/${CMAKE_TAR} \ + && tar --strip-components=1 -xz 
-C /usr/local -f ${CMAKE_TAR} \ + && rm -f ${CMAKE_TAR} \ + && wget https://github.com/ccache/ccache/releases/download/${CCACHE_VERSION}/${CCACHE_TAR} \ + && tar -xf ${CCACHE_TAR} \ + && cp ${CCACHE_DIR}/ccache /usr/local/bin \ + && rm -f ${CCACHE_TAR} + +# install LintDB build dependencies (OpenBLAS, Conan) +RUN apt update \ + && apt install -y libopenblas-openmp-dev libcurl4-openssl-dev libaio-dev libevent-dev lcov \ + && pip3 install conan==1.61.0 \ + && conan profile new default --detect \ + && conan profile update settings.compiler.libcxx=libstdc++11 default + + +FROM base as build + +WORKDIR /lintdb + +COPY docker/entrypoint.sh /usr/local/bin/entrypoint.sh + +ENTRYPOINT ["/bin/bash", "/usr/local/bin/entrypoint.sh"] diff --git a/docker/Dockerfile.conda.build b/docker/Dockerfile.dev similarity index 73% rename from docker/Dockerfile.conda.build rename to docker/Dockerfile.dev index 55f1670..7dd591a 100644 --- a/docker/Dockerfile.conda.build +++ b/docker/Dockerfile.dev @@ -4,8 +4,10 @@ ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 ENV PATH /opt/conda/bin:$PATH RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ - build-essential curl \ - git + build-essential curl pkg-config \ + zip unzip tar bzip2 \ + git ninja-build gfortran libpthread-stubs0-dev \ + cmake libopenblas-dev # Get Rust RUN curl https://sh.rustup.rs -sSf | bash -s -- -y @@ -16,6 +18,10 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 /bin/bash /tmp/miniconda.sh -b -p /opt/miniconda && \ rm /tmp/miniconda.sh +RUN wget https://apt.llvm.org/llvm.sh && \ + chmod +x llvm.sh && \ + sh ./llvm.sh all + # Add Miniconda to the path ENV PATH="/opt/miniconda/bin:$PATH" @@ -23,7 +29,7 @@ ENV PATH="/opt/miniconda/bin:$PATH" RUN conda update -n base -c defaults conda -y # Create and activate a new conda environment -RUN conda create -y -n build_env python=3.11 +RUN conda create -y -n build_env python=3.12 SHELL ["conda", "run", "-n", "build_env", "/bin/bash", "-c"] RUN echo "source activate build_env" > 
~/.bashrc ENV PATH /opt/conda/envs/build_env/bin:$PATH @@ -33,4 +39,4 @@ RUN conda install -y conda-build RUN conda config --append channels conda-forge -CMD [ "conda", "build", "/lintdb/conda/lintdb" ] +# CMD [ "conda", "build", "/lintdb/conda/lintdb" ] diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 0000000..b954037 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail +mkdir -p build && cd build +conan install .. --build=missing -o build_tests=True -o build_benchmarks=True -o build_python=True -s compiler.libcxx=libstdc++11 -s build_type=Release +conan build .. \ No newline at end of file diff --git a/docs/development.md b/docs/development.md index f9098bf..4d44a5a 100644 --- a/docs/development.md +++ b/docs/development.md @@ -47,6 +47,16 @@ pip install nanobind python -m nanobind.stubgen -m lintdb.core -M py.typed -o core.pyi ``` +--- +# Conan + +LintDB is moving to Conan as a package manager. + +### Building +``` +conan install . --output-folder=build --build=missing +``` + --- # Makefile commands @@ -61,12 +71,6 @@ make build-debug # build a release target make build-release -# run tests -make tests - -# run benchmarks -make benchmarks - # profile LintDB (note some variables need to change in the Makefile) make callgrind ``` diff --git a/lintdb/CMakeLists.txt b/lintdb/CMakeLists.txt deleted file mode 100644 index 2cbcc5e..0000000 --- a/lintdb/CMakeLists.txt +++ /dev/null @@ -1,196 +0,0 @@ -set(LINT_DB_SRC - index.cpp - quantizers/Binarizer.cpp - quantizers/ProductEncoder.cpp - quantizers/io.cpp - util.cpp - invlists/RocksdbForwardIndex.cpp - invlists/EncodedDocument.cpp - invlists/IndexWriter.cpp - invlists/RocksdbInvertedList.cpp - invlists/ForwardIndexIterator.cpp - invlists/IndexWriter.cpp - invlists/InvertedIterator.cpp - quantizers/PQDistanceTables.cpp - quantizers/impl/kmeans.cpp - quantizers/CoarseQuantizer.cpp - query/DocIterator.cpp - query/Query.cpp - query/QueryNode.cpp - schema/DocEncoder.cpp - 
schema/DocProcessor.cpp - schema/Schema.cpp - schema/FieldMapper.cpp - query/QueryExecutor.cpp - scoring/Scorer.cpp - query/KnnNearestCentroids.cpp - quantizers/IdentityQuantizer.cpp - scoring/plaid.cpp - query/decode.cpp - scoring/ContextCollector.cpp - scoring/scoring_methods.h - scoring/scoring_methods.cpp -) - -set(LINT_DB_HEADERS - index.h - api.h - assert.h - exception.h - constants.h - cf.h - env.h - quantizers/Binarizer.h - quantizers/Quantizer.h - quantizers/ProductEncoder.h - quantizers/io.h - query/DocIterator.h - query/Query.h - query/QueryNode.h - SearchOptions.h - SearchResult.h - util.h - invlists/IndexWriter.h - invlists/InvertedList.h - invlists/Iterator.h - invlists/EncodedDocument.h - invlists/RocksdbForwardIndex.h - invlists/InvertedIterator.h - quantizers/PQDistanceTables.h - quantizers/impl/product_quantizer.h - quantizers/CoarseQuantizer.h - quantizers/impl/kmeans.h - quantizers/IdentityQuantizer.h - invlists/RocksdbInvertedList.h - invlists/ForwardIndexIterator.h - invlists/IndexWriter.h - invlists/PostingData.h - invlists/ContextIterator.h - schema/Schema.h - schema/DocEncoder.h - schema/DocProcessor.h - schema/Document.h - schema/DataTypes.h - schema/FieldMapper.h - schema/ProcessedData.h - query/QueryContext.h - query/QueryExecutor.h - scoring/ContextCollector.h - scoring/Scorer.h - scoring/plaid.h - query/KnnNearestCentroids.h - invlists/KeyBuilder.h - utils/endian.h - query/decode.h - utils/progress_bar.h - utils/half.h - version.h) - -set(LINT_DB_HEADERS - ${LINT_DB_HEADERS} - PARENT_SCOPE) - -add_library(lintdb_lib ${LINT_DB_SRC}) - -string(FIND "${CMAKE_CXX_FLAGS}" "FINTEGER" finteger_idx) -if(${finteger_idx} EQUAL -1) - target_compile_definitions(lintdb_lib PRIVATE FINTEGER=int) -endif() - -if(MSVC OR LINUX) - set(OpenMP_CXX_FLAGS "-fopenmp=libiomp5") - set(OpenMP_CXX_LIB_NAMES "libiomp5") - # the below is necessary in github actions when calling pip install . 
- cmake_path(GET CMAKE_CURRENT_BINARY_DIR PARENT_PATH BUILD_PARENT_DIR) - set(OpenMP_libiomp5_LIBRARY "${BUILD_PARENT_DIR}/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so") - message(STATUS "OpenMP_libiomp5_LIBRARY: ${OpenMP_libiomp5_LIBRARY}") - if(EXISTS ${OpenMP_libiomp5_LIBRARY}) - message(STATUS "OpenMP_libiomp5_LIBRARY exists") - else() - message(STATUS "OpenMP_libiomp5_LIBRARY does not exist") - endif() -endif() - -find_package(OpenMP REQUIRED) -if(OpenMP_CXX_FOUND) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - target_link_libraries(lintdb_lib PUBLIC OpenMP::OpenMP_CXX) -endif() - -message(STATUS "using MKL root: ${MKLROOT}") -find_package(MKL) -if(MKL_FOUND) - message(STATUS "libraries: ${MKL_LIBRARIES}") - target_link_libraries(lintdb_lib PRIVATE ${MKL_LIBRARIES}) -else() - find_package(OpenBLAS CONFIG REQUIRED) - target_link_libraries(lintdb_lib PRIVATE OpenBLAS::OpenBLAS) - - -# find_package(LAPACK CONFIG REQUIRED) -# target_link_libraries(lintdb_lib PRIVATE LAPACK::LAPACK) -endif() - -# this is heuristically generated, and may not be correct -find_package(faiss CONFIG REQUIRED) -target_link_libraries(lintdb_lib PRIVATE faiss) - -# this is heuristically generated, and may not be correct -find_package(RocksDB CONFIG REQUIRED) -target_link_libraries(lintdb_lib PRIVATE RocksDB::rocksdb) - -find_package(flatbuffers CONFIG REQUIRED) -target_link_libraries(lintdb_lib PRIVATE flatbuffers::flatbuffers) - -find_package(glog CONFIG REQUIRED) -target_link_libraries(lintdb_lib PRIVATE glog::glog) - -# this is heuristically generated, and may not be correct -find_package(Microsoft.GSL CONFIG REQUIRED) -target_link_libraries(lintdb_lib PRIVATE Microsoft.GSL::GSL) - -find_package(jsoncpp CONFIG REQUIRED) -target_link_libraries(lintdb_lib PUBLIC JsonCpp::JsonCpp) - -find_package(Bitsery CONFIG REQUIRED) -target_link_libraries(lintdb_lib PUBLIC Bitsery::bitsery) - -if(MSVC) - target_compile_options(lintdb_lib PRIVATE /W4 /WX) -else() - 
target_compile_options(lintdb_lib PRIVATE -Wall -Wpedantic) -endif() - -target_include_directories(lintdb_lib - PUBLIC $) - -set_target_properties(lintdb_lib PROPERTIES POSITION_INDEPENDENT_CODE ON - WINDOWS_EXPORT_ALL_SYMBOLS ON) - -foreach(header ${LINT_DB_HEADERS}) - get_filename_component(dir ${header} DIRECTORY) - install(FILES ${header} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lintdb/${dir}) -endforeach() - -install( - TARGETS lintdb_lib - EXPORT lintdb-targets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - INCLUDES - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - -include(CMakePackageConfigHelpers) -write_basic_package_version_file( - "${PROJECT_BINARY_DIR}/cmake/lintdb-config-version.cmake" - VERSION ${CMAKE_PROJECT_VERSION} - COMPATIBILITY AnyNewerVersion) - -configure_file(${PROJECT_SOURCE_DIR}/cmake/lintdb-config.cmake.in - ${PROJECT_BINARY_DIR}/cmake/lintdb-config.cmake COPYONLY) -install(FILES ${PROJECT_BINARY_DIR}/cmake/lintdb-config.cmake - ${PROJECT_BINARY_DIR}/cmake/lintdb-config-version.cmake - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/lintdb) - -install(EXPORT lintdb-targets DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/lintdb) diff --git a/lintdb/datasources/DataSource.h b/lintdb/datasources/DataSource.h new file mode 100644 index 0000000..dcc5413 --- /dev/null +++ b/lintdb/datasources/DataSource.h @@ -0,0 +1,76 @@ +#pragma once + +#include +#include +#include +#include "lintdb/schema/Schema.h" + +namespace lintdb { + +/** + * KeyValueIterator provides an interface for iterating over key-value pairs. + */ +class KeyValueIterator { +public: + virtual ~KeyValueIterator() = default; + + /** + * Returns true if the iterator is valid. + */ + virtual bool is_valid() const = 0; + + /** + * Advances the iterator to the next key-value pair. + */ + virtual void advance() = 0; + + /** + * Returns the current key. 
+ */ + virtual std::string key() const = 0; + + /** + * Returns the current value as a string. + */ + virtual std::string value() const = 0; +}; + +/** + * DataSource provides a read-only interface for accessing data. + * It abstracts the underlying storage mechanism (e.g., RocksDB) and provides + * a key-value interface for data access. + */ +class DataSource { +public: + virtual ~DataSource() = default; + + /** + * Get the schema of the data source. + */ + virtual Schema schema() const = 0; + + /** + * Get an iterator over all key-value pairs. + */ + virtual std::unique_ptr scan(uint64_t tenant_id) = 0; + + /** + * Get an iterator over key-value pairs with a prefix. + */ + virtual std::unique_ptr scan_prefix(const std::string& prefix) = 0; + + /** + * Get a value by key. + */ + virtual std::string get(const std::string& key) = 0; + + /** + * Get an iterator over key-value pairs for a specific tenant and column. + */ + virtual std::unique_ptr get_iterator(uint64_t tenant_id, const std::string& column_name) { + // Default implementation uses scan and filters by prefix + std::string prefix = std::to_string(tenant_id) + ":" + column_name + ":"; + return scan_prefix(prefix); + } +}; +} \ No newline at end of file diff --git a/lintdb/datasources/RocksDBDataSource.cpp b/lintdb/datasources/RocksDBDataSource.cpp new file mode 100644 index 0000000..17f9787 --- /dev/null +++ b/lintdb/datasources/RocksDBDataSource.cpp @@ -0,0 +1,45 @@ +#include "lintdb/datasources/RocksDBDataSource.h" +#include +#include +#include +#include +#include + +namespace lintdb { + +RocksDBDataSource::RocksDBDataSource(const std::string& path) { + tenant_id_ = 0; + + rocksdb::Options options; + options.create_if_missing = false; // We're read-only + rocksdb::DB* db = nullptr; + rocksdb::Status status = rocksdb::DB::Open(options, path, &db); + if (!status.ok()) { + throw std::runtime_error("Failed to open RocksDB: " + status.ToString()); + } + db_.reset(db); +} + 
+RocksDBDataSource::~RocksDBDataSource() { + db_.reset(); +} + +std::unique_ptr RocksDBDataSource::scan_prefix(const std::string& prefix) { + rocksdb::ReadOptions options; + auto iter = db_->NewIterator(options); + iter->Seek(prefix); + return std::make_unique(iter); +} + +std::string RocksDBDataSource::get(const std::string& key) { + std::string value; + rocksdb::Status status = db_->Get(rocksdb::ReadOptions(), key, &value); + if (status.IsNotFound()) { + return ""; + } + if (!status.ok()) { + throw std::runtime_error("Failed to get value: " + status.ToString()); + } + return value; +} +} \ No newline at end of file diff --git a/lintdb/datasources/RocksDBDataSource.h b/lintdb/datasources/RocksDBDataSource.h new file mode 100644 index 0000000..089dcae --- /dev/null +++ b/lintdb/datasources/RocksDBDataSource.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include +#include +#include "lintdb/datasources/DataSource.h" +#include "rocksdb/db.h" + +namespace lintdb { + +/** + * RocksDBKeyValueIterator implements the KeyValueIterator interface using RocksDB. + */ +class RocksDBKeyValueIterator : public KeyValueIterator { +public: + explicit RocksDBKeyValueIterator(rocksdb::Iterator* iter) : iter_(iter) {} + + bool is_valid() const override { + return iter_->Valid(); + } + + void advance() override { + iter_->Next(); + } + + std::string key() const override { + return iter_->key().ToString(); + } + + std::string value() const override { + return iter_->value().ToString(); + } + +private: + std::unique_ptr iter_; +}; + +/** + * RocksDBDataSource implements the DataSource interface using RocksDB. + */ +class RocksDBDataSource : public DataSource { +public: + /** + * Create a new RocksDB data source. 
+ * @param path The path to the RocksDB database + * @param schema The schema of the data + */ + RocksDBDataSource(const std::string& path); + + ~RocksDBDataSource() override; + + Schema schema() const override { + return schema_; + } + + std::unique_ptr scan_prefix(const std::string& prefix) override; + std::string get(const std::string& key) override; + +private: + std::unique_ptr db_; + Schema schema_; + uint64_t tenant_id_; +}; +} \ No newline at end of file diff --git a/lintdb/datasources/serialization.cpp b/lintdb/datasources/serialization.cpp new file mode 100644 index 0000000..e6f3a72 --- /dev/null +++ b/lintdb/datasources/serialization.cpp @@ -0,0 +1,74 @@ +#include "serialization.h" +#include +#include +#include +#include "lintdb/schema/DataTypes.h" +#include "lintdb/schema/Schema.h" + +namespace lintdb { + +using Buffer = std::vector; +using OutputAdapter = bitsery::OutputBufferAdapter; +using InputAdapter = bitsery::InputBufferAdapter; + +std::string serialize_field_value(const Field& field, const SupportedTypes& value) { + Buffer buf; + auto written = bitsery::quickSerialization(OutputAdapter{buf}, value); + return std::string(buf.begin(), buf.begin() + written); +} + +SupportedTypes deserialize_field_value(const Field& field, const std::string& data) { + SupportedTypes res; + auto state = bitsery::quickDeserialization( + {data.begin(), data.size()}, res); + if (state.first != bitsery::ReaderError::NoError || !state.second) { + throw std::runtime_error("Failed to deserialize field value"); + } + return res; +} + +std::string serialize_document(const Schema& schema, const std::map& document) { + // Create a map of field_id to value for serialization + std::map field_values; + + for (const auto& [field_name, value] : document) { + try { + const Field& field = schema.get_field(field_name); + field_values[field.field_id] = value; + } catch (const std::runtime_error& e) { + LOG(WARNING) << "Field " << field_name << " not found in schema, skipping"; + } + } 
+ + Buffer buf; + auto written = bitsery::quickSerialization(OutputAdapter{buf}, field_values); + return std::string(buf.begin(), buf.begin() + written); +} + +std::map deserialize_document( + const Schema& schema, + const std::string& data) { + std::map field_values; + auto state = bitsery::quickDeserialization( + {data.begin(), data.size()}, field_values); + + if (state.first != bitsery::ReaderError::NoError || !state.second) { + throw std::runtime_error("Failed to deserialize document"); + } + + // Convert field_id map back to field_name map + std::map result; + for (const auto& [field_id, value] : field_values) { + // Find the field name for this field_id + for (const auto& field : schema.fields) { + if (field.field_id == field_id) { + result[field.name] = value; + break; + } + } + } + + return result; +} + +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/datasources/serialization.h b/lintdb/datasources/serialization.h new file mode 100644 index 0000000..8715062 --- /dev/null +++ b/lintdb/datasources/serialization.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include "lintdb/schema/DataTypes.h" +#include "lintdb/schema/Schema.h" + +namespace lintdb { + +/** + * @brief Serialize a single field value + * @param field The field definition + * @param value The value to serialize + * @return Serialized string representation + */ +std::string serialize_field_value(const Field& field, const SupportedTypes& value); + +/** + * @brief Deserialize a single field value + * @param field The field definition + * @param data The serialized data + * @return The deserialized value + */ +SupportedTypes deserialize_field_value(const Field& field, const std::string& data); + +/** + * @brief Serialize an entire document according to a schema + * @param schema The schema defining the document structure + * @param document Map of field names to values + * @return Serialized string representation + */ +std::string serialize_document( + const Schema& 
schema, + const std::map& document); + +/** + * @brief Deserialize an entire document according to a schema + * @param schema The schema defining the document structure + * @param data The serialized data + * @return Map of field names to values + */ +std::map deserialize_document( + const Schema& schema, + const std::string& data); + +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/datasources/test/RocksDBDataSourceTest.cpp b/lintdb/datasources/test/RocksDBDataSourceTest.cpp new file mode 100644 index 0000000..afcb88d --- /dev/null +++ b/lintdb/datasources/test/RocksDBDataSourceTest.cpp @@ -0,0 +1,46 @@ +#include +#include +#include "lintdb/datasources/RocksDBDataSource.h" +#include "lintdb/query/Schema.h" + +using namespace lintdb; + +class RocksDBDataSourceTest : public ::testing::Test { +protected: + void SetUp() override { + // Create a temporary directory for the test database + db_path_ = std::filesystem::temp_directory_path() / "lintdb_test"; + std::filesystem::create_directories(db_path_); + + // Create a simple schema + Schema schema; + schema.add_field(Field(DataType::FLOAT, "vector")); + schema.add_field(Field(DataType::STRING, "text")); + + // Create the data source + data_source_ = std::make_unique(db_path_.string(), schema); + } + + void TearDown() override { + data_source_.reset(); + std::filesystem::remove_all(db_path_); + } + + std::filesystem::path db_path_; + std::unique_ptr data_source_; +}; + +TEST_F(RocksDBDataSourceTest, GetNonexistent) { + auto retrieved = data_source_->get("nonexistent_key"); + EXPECT_TRUE(retrieved.empty()); +} + +TEST_F(RocksDBDataSourceTest, ScanEmpty) { + auto iter = data_source_->scan(); + EXPECT_FALSE(iter->is_valid()); +} + +TEST_F(RocksDBDataSourceTest, ScanPrefixEmpty) { + auto iter = data_source_->scan_prefix("prefix"); + EXPECT_FALSE(iter->is_valid()); +} \ No newline at end of file diff --git a/lintdb/index.cpp b/lintdb/index.cpp index 1890058..a106b02 100644 --- a/lintdb/index.cpp +++ 
b/lintdb/index.cpp @@ -18,7 +18,7 @@ #include #include #include "lintdb/api.h" -#include "lintdb/assert.h" +#include "lintdb/utils/assert.h" #include "lintdb/cf.h" #include "lintdb/invlists/RocksdbForwardIndex.h" #include "lintdb/invlists/RocksdbInvertedList.h" diff --git a/lintdb/index.h b/lintdb/index.h index c2b9b07..1dcec2b 100644 --- a/lintdb/index.h +++ b/lintdb/index.h @@ -9,7 +9,7 @@ #include #include "lintdb/api.h" -#include "lintdb/exception.h" +#include "lintdb/utils/exception.h" #include "lintdb/invlists/IndexWriter.h" #include "lintdb/invlists/InvertedList.h" #include "lintdb/quantizers/CoarseQuantizer.h" diff --git a/lintdb/invlists/ContextIerator.cpp b/lintdb/invlists/ContextIerator.cpp new file mode 100644 index 0000000..0723a0e --- /dev/null +++ b/lintdb/invlists/ContextIerator.cpp @@ -0,0 +1,69 @@ +#include "ContextIterator.h" +#include "KeyBuilder.h" +#include +#include + +namespace lintdb { + +ContextIterator::ContextIterator( + const std::shared_ptr db, + rocksdb::ColumnFamilyHandle* column_family, + const uint64_t tenant, + const uint8_t field) + : tenant(tenant), field(field), has_read_key(false) { + if (!column_family) { + throw std::runtime_error("Column family not found"); + } + cf = column_family->GetID(); + KeyBuilder kb; + prefix = kb.add(tenant).add(field).build(); + + auto options = rocksdb::ReadOptions(); + + this->it = std::unique_ptr( + db->NewIterator(options, column_family)); + it->Seek(this->prefix); +} + +bool ContextIterator::is_valid() { + if (!has_read_key) { + bool is_valid = it->Valid(); + if (!is_valid) { + return false; + } + + auto key = it->key(); + std::string key_str = key.ToString(); + if (key_str.compare(0, prefix.size(), prefix) != 0) { + return false; + } + this->current_key = ContextKey(key_str); + } + + has_read_key = true; + return true; +} + +void ContextIterator::advance(const idx_t doc_id) { + KeyBuilder kb; + + std::string expected_key = + kb.add(tenant).add(field).add(doc_id).build(); + 
it->Seek(rocksdb::Slice(expected_key)); + has_read_key = false; +} + +void ContextIterator::next() { + it->Next(); + has_read_key = false; +} + +ContextKey ContextIterator::get_key() const { + return current_key; +} + +std::string ContextIterator::get_value() const { + return it->value().ToString(); +} + +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/invlists/ContextIterator.h b/lintdb/invlists/ContextIterator.h index 1ecd832..b5a7f5c 100644 --- a/lintdb/invlists/ContextIterator.h +++ b/lintdb/invlists/ContextIterator.h @@ -2,88 +2,39 @@ #include #include +#include #include #include "lintdb/constants.h" #include "lintdb/invlists/Iterator.h" #include "lintdb/invlists/KeyBuilder.h" namespace lintdb { + class ContextIterator { public: ContextIterator( const std::shared_ptr db, rocksdb::ColumnFamilyHandle* column_family, const uint64_t tenant, - const uint8_t field) - : tenant(tenant), field(field) { - if (!column_family) { - throw std::runtime_error("Column family not found"); - } - cf = column_family->GetID(); - KeyBuilder kb; - prefix = kb.add(tenant).add(field).build(); - - prefix_slice = rocksdb::Slice(this->prefix); - auto options = rocksdb::ReadOptions(); - - this->it = std::unique_ptr( - db->NewIterator(options, column_family)); - it->Seek(this->prefix); - } - - bool is_valid() { - if (!has_read_key) { - bool is_valid = it->Valid(); - if (!is_valid) { - return false; - } - - auto key = it->key(); - std::string key_str = key.ToString(); - if (key_str.compare(0, prefix.size(), prefix) != 0) { - return false; - } - this->current_key = ContextKey(key_str); - } - - has_read_key = true; - return true; - } - - void advance(const idx_t doc_id) { - KeyBuilder kb; + const uint8_t field); - std::string expected_key = - kb.add(tenant).add(field).add(doc_id).build(); - it->Seek(rocksdb::Slice(expected_key)); - has_read_key = false; - } - - void next() { - it->Next(); - has_read_key = false; - } - - ContextKey get_key() const { - return current_key; - 
} - - std::string get_value() const { - return it->value().ToString(); - } + bool is_valid(); + void advance(const idx_t doc_id); + void next(); + ContextKey get_key() const; + std::string get_value() const; + private: std::unique_ptr it; - - protected: lintdb::column_index_t cf; - string prefix; - string end_key; + std::string prefix; + std::string end_key; rocksdb::Slice prefix_slice; ContextKey current_key; - bool has_read_key; const uint64_t tenant; const uint8_t field; }; + } // namespace lintdb diff --git a/lintdb/invlists/EncodedDocument.cpp b/lintdb/invlists/EncodedDocument.cpp index d94c409..77374e1 100644 --- a/lintdb/invlists/EncodedDocument.cpp +++ b/lintdb/invlists/EncodedDocument.cpp @@ -10,7 +10,7 @@ #include #include #include "lintdb/api.h" -#include "lintdb/assert.h" +#include "lintdb/utils/assert.h" namespace bitsery { template diff --git a/lintdb/invlists/ForwardIndexIterator.cpp b/lintdb/invlists/ForwardIndexIterator.cpp index 79c80a3..1947abc 100644 --- a/lintdb/invlists/ForwardIndexIterator.cpp +++ b/lintdb/invlists/ForwardIndexIterator.cpp @@ -1,5 +1,7 @@ #include "ForwardIndexIterator.h" #include +#include "rocksdb/db.h" +#include "rocksdb/slice.h" namespace lintdb { ForwardIndexIterator::ForwardIndexIterator( @@ -12,7 +14,6 @@ ForwardIndexIterator::ForwardIndexIterator( prefix = kb.add(tenant).build(); - prefix_slice = rocksdb::Slice(this->prefix); auto options = rocksdb::ReadOptions(); this->it = std::unique_ptr( diff --git a/lintdb/invlists/ForwardIndexIterator.h b/lintdb/invlists/ForwardIndexIterator.h index 9ebb386..e11baf8 100644 --- a/lintdb/invlists/ForwardIndexIterator.h +++ b/lintdb/invlists/ForwardIndexIterator.h @@ -34,7 +34,6 @@ struct ForwardIndexIterator { lintdb::column_index_t cf; string prefix; string end_key; - rocksdb::Slice prefix_slice; ForwardIndexKey current_key; const idx_t tenant; diff --git a/lintdb/invlists/IndexWriter.cpp b/lintdb/invlists/IndexWriter.cpp index 489d379..c6a15f1 100644 --- 
a/lintdb/invlists/IndexWriter.cpp +++ b/lintdb/invlists/IndexWriter.cpp @@ -2,8 +2,9 @@ #include #include #include +#include #include "lintdb/api.h" -#include "lintdb/assert.h" +#include "lintdb/utils/assert.h" #include "lintdb/constants.h" #include "lintdb/invlists/PostingData.h" diff --git a/lintdb/invlists/IndexWriter.h b/lintdb/invlists/IndexWriter.h index be4f4b9..f228e8b 100644 --- a/lintdb/invlists/IndexWriter.h +++ b/lintdb/invlists/IndexWriter.h @@ -1,11 +1,15 @@ #pragma once -#include -#include #include +#include #include "lintdb/invlists/PostingData.h" #include "lintdb/version.h" +namespace rocksdb { + class DB; + class ColumnFamilyHandle; +} + namespace lintdb { class IIndexWriter { diff --git a/lintdb/invlists/InvertedIterator.h b/lintdb/invlists/InvertedIterator.h index cd48079..02f8c28 100644 --- a/lintdb/invlists/InvertedIterator.h +++ b/lintdb/invlists/InvertedIterator.h @@ -8,7 +8,7 @@ #include #include #include "lintdb/constants.h" -#include "lintdb/exception.h" +#include "lintdb/utils/exception.h" #include "lintdb/invlists/ContextIterator.h" #include "lintdb/invlists/InvertedList.h" #include "lintdb/invlists/Iterator.h" diff --git a/lintdb/invlists/RocksdbForwardIndex.cpp b/lintdb/invlists/RocksdbForwardIndex.cpp index 60ab556..7309a17 100644 --- a/lintdb/invlists/RocksdbForwardIndex.cpp +++ b/lintdb/invlists/RocksdbForwardIndex.cpp @@ -4,9 +4,9 @@ #include #include #include -#include "lintdb/assert.h" +#include "lintdb/utils/assert.h" #include "lintdb/constants.h" -#include "lintdb/exception.h" +#include "lintdb/utils/exception.h" #include "lintdb/invlists/ForwardIndexIterator.h" #include "lintdb/invlists/KeyBuilder.h" #include "lintdb/schema/DocEncoder.h" diff --git a/lintdb/invlists/RocksdbInvertedList.cpp b/lintdb/invlists/RocksdbInvertedList.cpp index 2e3c1e1..d8e0cd9 100644 --- a/lintdb/invlists/RocksdbInvertedList.cpp +++ b/lintdb/invlists/RocksdbInvertedList.cpp @@ -4,9 +4,9 @@ #include #include #include "InvertedIterator.h" -#include 
"lintdb/assert.h" +#include "lintdb/utils/assert.h" #include "lintdb/constants.h" -#include "lintdb/exception.h" +#include "lintdb/utils/exception.h" #include "lintdb/invlists/ContextIterator.h" #include "lintdb/invlists/RocksdbForwardIndex.h" #include "lintdb/schema/DocEncoder.h" diff --git a/lintdb/invlists/RocksdbInvertedList.h b/lintdb/invlists/RocksdbInvertedList.h index 4e599b6..d4fd347 100644 --- a/lintdb/invlists/RocksdbInvertedList.h +++ b/lintdb/invlists/RocksdbInvertedList.h @@ -9,7 +9,7 @@ #include #include #include "lintdb/constants.h" -#include "lintdb/exception.h" +#include "lintdb/utils/exception.h" #include "lintdb/invlists/ContextIterator.h" #include "lintdb/invlists/InvertedList.h" #include "lintdb/invlists/Iterator.h" diff --git a/lintdb/python/CMakeLists.txt b/lintdb/python/CMakeLists.txt index 956e896..aa3a079 100644 --- a/lintdb/python/CMakeLists.txt +++ b/lintdb/python/CMakeLists.txt @@ -2,10 +2,6 @@ cmake_minimum_required(VERSION 3.21 FATAL_ERROR) -project(lintdb - LANGUAGES CXX -) - set(CMAKE_CXX_STANDARD 17) find_package(Python @@ -24,7 +20,7 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() -add_subdirectory(${PROJECT_SOURCE_DIR}/../../third_party/nanobind nanobind EXCLUDE_FROM_ALL) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/nanobind nanobind EXCLUDE_FROM_ALL) find_package(nanobind CONFIG REQUIRED) nanobind_add_module( @@ -34,8 +30,11 @@ nanobind_add_module( ) target_link_libraries(core PRIVATE - lintdb_lib + lintdb + faiss ) +target_include_directories(core PRIVATE nanobind) + install(TARGETS core LIBRARY DESTINATION lintdb) diff --git a/lintdb/quantizers/Binarizer.cpp b/lintdb/quantizers/Binarizer.cpp index 07c970f..39e9ea9 100644 --- a/lintdb/quantizers/Binarizer.cpp +++ b/lintdb/quantizers/Binarizer.cpp @@ -8,9 +8,13 @@ #include #include #include -#include "lintdb/assert.h" +#include 
"lintdb/utils/assert.h" #include "lintdb/util.h" +#if defined(__AVX2__) +#include +#endif + namespace lintdb { Binarizer::Binarizer(size_t nbits, size_t dim) : Quantizer(), nbits(nbits), dim(dim) { @@ -277,7 +281,6 @@ std::vector Binarizer::bucketize(const std::vector& residuals) { // residuals is a vector of size dim. std::vector binarized(residuals.size() * nbits); -#pragma omp parallel for for (size_t i = 0; i < residuals.size(); ++i) { uint8_t bucket = 0; bool bucket_found = false; @@ -345,6 +348,7 @@ std::vector Binarizer::create_decompression_lut() { } void Binarizer::sa_encode(size_t n, const float* x, residual_t* codes) { +#pragma omp parallel for for (size_t i = 0; i < n; ++i) { // TODO (mbarta): stop making this copy. std::vector residuals(x + i * dim, x + (i + 1) * dim); @@ -358,11 +362,58 @@ void Binarizer::sa_encode(size_t n, const float* x, residual_t* codes) { } } +#ifdef __AVX2__ +void Binarizer::sa_decode_1bit(size_t n, const residual_t* residuals, float* x) { + constexpr size_t npacked_vals_per_byte = 8; // For 1-bit quantization + const size_t packed_dim = (dim / npacked_vals_per_byte); + + // Precompute the two possible values for 1-bit quantization + const float low_value = bucket_weights[0]; + const float high_value = bucket_weights[1]; + + // Create vectors of low and high values + const __m256 low_vec = _mm256_set1_ps(low_value); + const __m256 high_vec = _mm256_set1_ps(high_value); + +#pragma omp parallel for + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < packed_dim; ++k) { + uint8_t packed = residuals[i * packed_dim + k]; + + // Process 8 bits at a time using AVX2 instructions + __m256i bits = _mm256_set1_epi32(packed); + __m256i mask = _mm256_set_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); + __m256i unpacked = _mm256_cmpeq_epi32(_mm256_and_si256(bits, mask), mask); + + // Use the mask to select between low and high values + __m256 result = _mm256_blendv_ps(low_vec, high_vec, _mm256_castsi256_ps(unpacked)); + + // 
Store the result + _mm256_storeu_ps(&x[i * dim + k * npacked_vals_per_byte], result); + } + } +} + +#endif + void Binarizer::sa_decode(size_t n, const residual_t* residuals, float* x) { +// Use AVX2 instructions for 1-bit quantization +#ifdef __AVX2__ + if (nbits == 1) { + sa_decode_1bit(n, residuals, x); + return; + } +#endif + + sa_decode_generic(n, residuals, x); +} + +void Binarizer::sa_decode_generic(size_t n, const residual_t* residuals, float* x) { const size_t npacked_vals_per_byte = (8 / nbits); const size_t packed_dim = (dim / npacked_vals_per_byte); // for each token doc. +#pragma omp parallel for for (size_t i = 0; i < n; ++i) { // for each packed residual value for (int k = 0; k < packed_dim; ++k) { diff --git a/lintdb/quantizers/Binarizer.h b/lintdb/quantizers/Binarizer.h index 53e7328..f347025 100644 --- a/lintdb/quantizers/Binarizer.h +++ b/lintdb/quantizers/Binarizer.h @@ -42,6 +42,12 @@ struct Binarizer : public Quantizer { void sa_encode(size_t n, const float* x, residual_t* codes) override; void sa_decode(size_t n, const residual_t* codes, float* x) override; + + void sa_decode_generic (size_t n, const residual_t* codes, float* x); +#ifdef __AVX2__ + void sa_decode_1bit(size_t n, const residual_t* codes, float* x); +#endif + size_t code_size() override; size_t get_nbits() override { diff --git a/lintdb/quantizers/CoarseQuantizer.cpp b/lintdb/quantizers/CoarseQuantizer.cpp index cf0ae5b..33791b1 100644 --- a/lintdb/quantizers/CoarseQuantizer.cpp +++ b/lintdb/quantizers/CoarseQuantizer.cpp @@ -342,7 +342,7 @@ std::unique_ptr FaissCoarseQuantizer::deserialize( faiss::Index* index = faiss::read_index(filename.c_str()); auto faiss_quantizer = std::make_unique(index->d); - faiss_quantizer->index = *dynamic_cast(index); + faiss_quantizer->index = *static_cast(index); faiss_quantizer->k = faiss_quantizer->index.ntotal; faiss_quantizer->d = faiss_quantizer->index.d; diff --git a/lintdb/quantizers/InvertedListScanner.cpp 
b/lintdb/quantizers/InvertedListScanner.cpp deleted file mode 100644 index 680eeaf..0000000 --- a/lintdb/quantizers/InvertedListScanner.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include "InvertedListScanner.h" -#include -#include -#include -#include -#include - -namespace lintdb { -InvertedListScanner::InvertedListScanner( - std::shared_ptr& quantizer, - const float* query_data, - size_t num_tokens) - : quantizer(quantizer), code_size(quantizer->code_size()) { - distance_tables = quantizer->get_distance_tables(query_data, num_tokens); -} - -std::vector InvertedListScanner::scan( - const idx_t key, - const std::unique_ptr list_iterator, - const std::vector& query_tokens_to_score) { - std::vector query_token_ids; - query_token_ids.reserve(query_tokens_to_score.size()); - for (const auto& q : query_tokens_to_score) { - query_token_ids.push_back(q.query_token); - } - - std::vector precomputed_distances; - precomputed_distances.reserve(query_tokens_to_score.size()); - for (const auto& q : query_tokens_to_score) { - precomputed_distances.push_back(q.distance); - } - - std::vector results; - for (; list_iterator->has_next(); list_iterator->next()) { - auto partial_codes = list_iterator->get_value(); - size_t num_tokens = partial_codes.partial_residuals.size() / code_size; - if (num_tokens != 1) { - LOG(WARNING) - << "Codes found in inverted index are the wrong size. 
residual size: " - << partial_codes.partial_residuals.size() - << " code size: " << code_size; - } - - ScoredPartialDocumentCodes doc_results; - auto token_key = list_iterator->get_key(); - doc_results.doc_id = token_key.doc_id; - doc_results.doc_token_id = token_key.token_id; - - auto scores = distance_tables->calculate_query_distances( - query_token_ids, - precomputed_distances, - partial_codes.partial_residuals); - - for (idx_t i = 0; i < scores.size(); i++) { - const auto query_token_id = query_token_ids[i]; - doc_results.query_token_id = query_token_id; - doc_results.score = scores[i]; - } - - results.push_back(doc_results); - } - return results; -} - -} // namespace lintdb \ No newline at end of file diff --git a/lintdb/quantizers/InvertedListScanner.h b/lintdb/quantizers/InvertedListScanner.h deleted file mode 100644 index 5ff1acb..0000000 --- a/lintdb/quantizers/InvertedListScanner.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef LINTDB_INVERTEDLISTSCANNER_H -#define LINTDB_INVERTEDLISTSCANNER_H - -#include -#include -#include -#include "lintdb/api.h" -#include "lintdb/invlists/EncodedDocument.h" -#include "lintdb/invlists/Iterator.h" -#include "lintdb/quantizers/PQDistanceTables.h" -#include "lintdb/quantizers/Quantizer.h" -#include "ProductEncoder.h" - -namespace lintdb { - -/** - * ScoredPartialDocumentCodes holds per-token scores to help calculate - * sum-of-max scores. - * - * Each token in a document is scored across the query tokens, and we want to - * keep the max score per query token. - */ -struct ScoredPartialDocumentCodes { - idx_t doc_id; - idx_t doc_token_id; - idx_t query_token_id; - float score; - - ScoredPartialDocumentCodes() = default; - - ScoredPartialDocumentCodes( - idx_t doc_id, - idx_t doc_token_id, - idx_t query_token_id, - float score) - : doc_id(doc_id), - doc_token_id(doc_token_id), - query_token_id(query_token_id), - score(score) {} -}; - -/** - * QueryTokenCentroidScore holds the distance between a query token and a - centroid. 
- * - * This is passed to scan to help calculate the score of a token. - - */ -struct QueryTokenCentroidScore { - idx_t query_token; - idx_t centroid_id; - float distance; -}; - -/** - * InvertedListScanner helps us scan through an inverted list and score the - * results. - * - * The score is going to be a calculation between the stored codes, the - * centroid, and the query. - */ -class InvertedListScanner { - public: - InvertedListScanner( - std::shared_ptr& quantizer, - const float* query_data, - size_t num_tokens); - - std::vector scan( - idx_t key, - const std::unique_ptr list_iterator, - const std::vector& query_tokens_to_score); - - private: - std::unique_ptr distance_tables; - std::shared_ptr quantizer; - size_t code_size; -}; - -} // namespace lintdb - -#endif // LINTDB_INVERTEDLISTSCANNER_H diff --git a/lintdb/quantizers/PQDistanceTables.cpp b/lintdb/quantizers/PQDistanceTables.cpp index 5c35073..0b76d57 100644 --- a/lintdb/quantizers/PQDistanceTables.cpp +++ b/lintdb/quantizers/PQDistanceTables.cpp @@ -4,7 +4,7 @@ #include #include #include -#include "lintdb/assert.h" +#include "lintdb/utils/assert.h" namespace lintdb { PQDistanceTables::PQDistanceTables( @@ -36,7 +36,7 @@ std::vector PQDistanceTables::calculate_query_distances( auto query_token_id = query_tokens_to_score[j]; auto sim_table = distance_tables[query_token_id]; float score = faiss::distance_single_code( - ipq->pq, sim_table.data(), codes.data()); + ipq->pq.M, ipq->pq.nbits, sim_table.data(), codes.data()); results[j] += score; } return results; diff --git a/lintdb/quantizers/ProductEncoder.cpp b/lintdb/quantizers/ProductEncoder.cpp index 91c56d0..389116f 100644 --- a/lintdb/quantizers/ProductEncoder.cpp +++ b/lintdb/quantizers/ProductEncoder.cpp @@ -6,8 +6,8 @@ #include #include #include -#include "lintdb/assert.h" -#include "lintdb/exception.h" +#include "lintdb/utils/assert.h" +#include "lintdb/utils/exception.h" namespace lintdb { ProductEncoder::ProductEncoder( diff --git 
a/lintdb/quantizers/impl/kmeans.cpp b/lintdb/quantizers/impl/kmeans.cpp index d92038d..bae25c9 100644 --- a/lintdb/quantizers/impl/kmeans.cpp +++ b/lintdb/quantizers/impl/kmeans.cpp @@ -5,7 +5,7 @@ #include #include #include -#include "lintdb/assert.h" +#include "lintdb/utils/assert.h" namespace lintdb { std::vector kmeans( diff --git a/lintdb/quantizers/io.h b/lintdb/quantizers/io.h index 90e4fa9..82f6fc4 100644 --- a/lintdb/quantizers/io.h +++ b/lintdb/quantizers/io.h @@ -4,7 +4,7 @@ #include #include #include -#include "lintdb/exception.h" +#include "lintdb/utils/exception.h" #include "lintdb/quantizers/Binarizer.h" #include "lintdb/quantizers/IdentityQuantizer.h" #include "lintdb/quantizers/ProductEncoder.h" diff --git a/lintdb/query/DataFrame.h b/lintdb/query/DataFrame.h new file mode 100644 index 0000000..9b88b60 --- /dev/null +++ b/lintdb/query/DataFrame.h @@ -0,0 +1,143 @@ +#pragma once + +#include +#include +#include + +#include "lintdb/query/LogicalExpr.h" +#include "lintdb/query/LogicalPlan.h" +#include "lintdb/schema/Schema.h" +#include "lintdb/datasources/DataSource.h" + +namespace lintdb::query { + +class DataFrame { +public: + virtual ~DataFrame() = default; + + /** + * Project specific columns from the DataFrame. + * @param exprs The expressions to project + * @return A new DataFrame with the projected columns + */ + virtual std::shared_ptr project(const std::vector>& exprs) = 0; + + /** + * Filter rows from the DataFrame. + * @param expr The filter expression + * @return A new DataFrame with filtered rows + */ + virtual std::shared_ptr filter(const std::shared_ptr& expr) = 0; + + /** + * Perform vector search on the DataFrame. 
+ * @param query_vector The query vector + * @param k The number of results to return + * @param distance_metric The distance metric to use (default: L2) + * @return A new DataFrame with search results + */ + virtual std::shared_ptr vector_search( + const std::vector& query_vector, + size_t k, + const std::string& distance_metric = "L2") = 0; + + /** + * Get the schema of the DataFrame. + * @return The schema + */ + virtual std::shared_ptr schema() const = 0; + + /** + * Get the logical plan of the DataFrame. + * @return The logical plan + */ + virtual std::shared_ptr logical_plan() const = 0; + + /** + * Set the tenant ID for this DataFrame. + * @param tenant_id The tenant ID to set + * @return A new DataFrame with the updated tenant ID + */ + virtual std::shared_ptr tenant(uint64_t tenant_id) = 0; + + /** + * Create a new DataFrame with the given schema and data source. + * @param schema The schema + * @param data_source The data source + * @return A new DataFrame + */ + static std::shared_ptr create( + std::shared_ptr schema, + std::shared_ptr data_source); +}; + +class DataFrameImpl : public DataFrame { +public: + DataFrameImpl( + std::shared_ptr plan, + std::shared_ptr schema) + : plan_(plan), schema_(schema) {} + + std::shared_ptr project(const std::vector>& exprs) override { + auto new_plan = std::make_shared( + plan_->data_source(), + plan_->tenant_id(), + plan_, + exprs); + return std::make_shared(new_plan, schema_); + } + + std::shared_ptr filter(const std::shared_ptr& expr) override { + auto new_plan = std::make_shared( + plan_->data_source(), + plan_->tenant_id(), + plan_, + expr); + return std::make_shared(new_plan, schema_); + } + + std::shared_ptr vector_search( + const std::vector& query_vector, + size_t k, + const std::string& distance_metric = "L2") override { + auto new_plan = std::make_shared( + plan_->data_source(), + plan_->tenant_id(), + plan_, + query_vector, + k, + distance_metric); + return std::make_shared(new_plan, schema_); + } + + 
std::shared_ptr schema() const override { + return schema_; + } + + std::shared_ptr logical_plan() const override { + return plan_; + } + + std::shared_ptr tenant(uint64_t tenant_id) override { + // Create a new scan plan with the updated tenant ID + auto new_plan = std::make_shared( + plan_->data_source(), + tenant_id, + schema_); + return std::make_shared(new_plan, schema_); + } + +private: + std::shared_ptr plan_; + std::shared_ptr schema_; +}; + +inline std::shared_ptr DataFrame::create( + std::shared_ptr schema, + std::shared_ptr data_source) { + // Initialize with tenant_id 0, can be updated later with tenant() + auto plan = std::make_shared(data_source, 0, schema); + return std::make_shared(plan, schema); +} + +} // namespace lintdb::query \ No newline at end of file diff --git a/lintdb/query/DocIterator.cpp b/lintdb/query/DocIterator.cpp index 57b31fd..1500c9f 100644 --- a/lintdb/query/DocIterator.cpp +++ b/lintdb/query/DocIterator.cpp @@ -3,6 +3,7 @@ #include "DocValue.h" #include "lintdb/schema/DocEncoder.h" #include "lintdb/scoring/ScoredDocument.h" +#include "lintdb/scoring/scoring_methods.h" namespace lintdb { TermIterator::TermIterator( @@ -39,6 +40,7 @@ std::vector TermIterator::fields() const { } ScoredDocument TermIterator::score(std::vector fields) const { +// LOG(INFO) << "Term Scoring method"; score_t score = lintdb::score(this->scoring_method, fields); return ScoredDocument(score, doc_id(), fields); @@ -129,7 +131,6 @@ void ANNIterator::heapify(size_t idx) { ScoredDocument ANNIterator::score(std::vector fields) const { score_t score = lintdb::score_embeddings(this->scoring_method, fields, this->knn_); - return ScoredDocument(score, 0, fields); } diff --git a/lintdb/query/KnnNearestCentroids.h b/lintdb/query/KnnNearestCentroids.h index d901073..2a84efe 100644 --- a/lintdb/query/KnnNearestCentroids.h +++ b/lintdb/query/KnnNearestCentroids.h @@ -3,13 +3,13 @@ #include #include #include -#include "lintdb/assert.h" +#include "lintdb/utils/assert.h" 
#include "lintdb/quantizers/CoarseQuantizer.h" namespace lintdb { struct QueryTensor { - std::vector query; + const std::vector& query; size_t num_query_tokens; }; @@ -43,7 +43,7 @@ class KnnNearestCentroids { return coarse_idx[idx * total_centroids_to_calculate]; } - inline std::vector get_reordered_distances() const { + inline const std::vector& get_reordered_distances() const { return reordered_distances; } diff --git a/lintdb/query/LogicalExpr.h b/lintdb/query/LogicalExpr.h new file mode 100644 index 0000000..cf4e5b3 --- /dev/null +++ b/lintdb/query/LogicalExpr.h @@ -0,0 +1,182 @@ +#pragma once + +#include +#include +#include +#include "lintdb/schema/Schema.h" +#include "lintdb/schema/DataTypes.h" + +namespace lintdb { +namespace query { + +/** + * @brief LogicalExpr represents a logical expression that can be evaluated + * against data. + */ +class LogicalExpr { +public: + enum class Type { + COLUMN, + LITERAL, + BINARY, + VECTOR_SEARCH, + FILTER + }; + + virtual ~LogicalExpr() = default; + + /** + * @brief Get the type of this logical expression + * @return The type of the expression + */ + virtual Type type() const = 0; + + /** + * @brief Convert this expression to a field in the schema + * @param input_schema The input schema + * @return The field that represents this expression + */ + Field to_field(const lintdb::Schema& input_schema) const { + return input_schema.get_field(to_string()); + } + + /** + * @brief Get a string representation of this expression + * @return The string representation + */ + virtual std::string to_string() const = 0; +}; + +/** + * @brief Column represents a reference to a column in the data + */ +class Column : public LogicalExpr { +public: + explicit Column(const std::string& name) : name_(name) {} + + Type type() const override { return Type::COLUMN; } + + std::string to_string() const override { + return name_; + } + +private: + std::string name_; +}; + +/** + * @brief Literal represents a constant value + */ +class Literal : 
public LogicalExpr { +public: + explicit Literal(const std::string& value) : value_(value) {} + + Type type() const override { return Type::LITERAL; } + + std::string to_string() const override { + return value_; + } + +private: + std::string value_; +}; + +/** + * @brief BinaryExpr represents a binary operation between two expressions + */ +class BinaryExpr : public LogicalExpr { +public: + enum class Op { + EQ, + NEQ, + GT, + GTE, + LT, + LTE, + AND, + OR + }; + + BinaryExpr(std::shared_ptr left, + Op op, + std::shared_ptr right) + : left_(std::move(left)), op_(op), right_(std::move(right)) {} + + Type type() const override { return Type::BINARY; } + + std::string to_string() const override { + std::string op_str; + switch (op_) { + case Op::EQ: op_str = "="; break; + case Op::NEQ: op_str = "!="; break; + case Op::GT: op_str = ">"; break; + case Op::GTE: op_str = ">="; break; + case Op::LT: op_str = "<"; break; + case Op::LTE: op_str = "<="; break; + case Op::AND: op_str = "AND"; break; + case Op::OR: op_str = "OR"; break; + } + return left_->to_string() + " " + op_str + " " + right_->to_string(); + } + + BinaryExpr::Op op() const { return op_; } + + std::shared_ptr left() const { return left_; } + std::shared_ptr right() const { return right_; } + +private: + std::shared_ptr left_; + Op op_; + std::shared_ptr right_; +}; + +/** + * @brief Represents a vector search expression + */ +class VectorSearchExpr : public LogicalExpr { +public: + VectorSearchExpr(const std::string& column_name, const std::vector& query_vector, size_t k) + : column_name_(column_name), query_vector_(query_vector), k_(k) {} + + Type type() const override { return Type::VECTOR_SEARCH; } + + std::string to_string() const override { + return "VectorSearch(" + column_name_ + ", k=" + std::to_string(k_) + ")"; + } + + const std::string& column_name() const { return column_name_; } + const std::vector& query_vector() const { return query_vector_; } + size_t k() const { return k_; } + +private: + 
std::string column_name_; + std::vector query_vector_; + size_t k_; +}; + +/** + * @brief Represents a filter expression + */ +class FilterExpr : public LogicalExpr { +public: + FilterExpr(const std::string& column_name, const std::string& op, float value) + : column_name_(column_name), op_(op), value_(value) {} + + Type type() const override { return Type::FILTER; } + + std::string to_string() const override { + return "Filter(" + column_name_ + " " + op_ + " " + std::to_string(value_) + ")"; + } + + const std::string& column_name() const { return column_name_; } + const std::string& op() const { return op_; } + float value() const { return value_; } + +private: + std::string column_name_; + std::string op_; // ">", "<", ">=", "<=", "==" + float value_; +}; + +} // namespace query +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/query/LogicalPlan.h b/lintdb/query/LogicalPlan.h new file mode 100644 index 0000000..bdc2b27 --- /dev/null +++ b/lintdb/query/LogicalPlan.h @@ -0,0 +1,199 @@ +#pragma once + +#include +#include +#include +#include "lintdb/query/LogicalExpr.h" +#include "lintdb/schema/Schema.h" +#include "lintdb/datasources/DataSource.h" + +namespace lintdb { +namespace query { + +/** + * @brief Interface for all logical plans + */ +class LogicalPlan { +public: + virtual ~LogicalPlan() = default; + + /** + * @brief Get the schema of the plan + * @return The schema + */ + virtual std::shared_ptr schema() const = 0; + + /** + * @brief Get the child plans of this plan + * @return Vector of child plans + */ + virtual std::vector> children() const = 0; + + /** + * @brief Get a string representation of this plan + * @return The string representation + */ + virtual std::string to_string() const = 0; + + /** + * @brief Get the data source associated with this plan + * @return The data source + */ + virtual const std::shared_ptr& data_source() const = 0; + + /** + * @brief Get the tenant ID associated with this plan + * @return The tenant ID + 
*/ + virtual uint64_t tenant_id() const = 0; +}; + +/** + * @brief Abstract base class implementing common functionality for logical plans + */ +class AbstractLogicalPlan : public LogicalPlan { +public: + AbstractLogicalPlan(std::shared_ptr data_source, uint64_t tenant_id) + : data_source_(std::move(data_source)), tenant_id_(tenant_id) {} + + const std::shared_ptr& data_source() const override { return data_source_; } + uint64_t tenant_id() const override { return tenant_id_; } + +protected: + std::shared_ptr data_source_; + uint64_t tenant_id_; +}; + +/** + * @brief Scan plan that reads data from a data source + */ +class ScanPlan : public AbstractLogicalPlan { +public: + ScanPlan(std::shared_ptr data_source, uint64_t tenant_id, std::shared_ptr schema) + : AbstractLogicalPlan(std::move(data_source), tenant_id), schema_(std::move(schema)) {} + + std::shared_ptr schema() const override { return schema_; } + std::vector> children() const override { return {}; } + std::string to_string() const override { return "Scan"; } + +private: + std::shared_ptr schema_; +}; + +/** + * @brief Projection plan that selects specific columns + */ +class ProjectionPlan : public AbstractLogicalPlan { +public: + ProjectionPlan( + std::shared_ptr data_source, + uint64_t tenant_id, + std::shared_ptr child, + const std::vector>& projections) + : AbstractLogicalPlan(std::move(data_source), tenant_id), + child_(std::move(child)), + projections_(projections) {} + + std::shared_ptr schema() const override { + // TODO: Implement schema derivation from projections + return child_->schema(); + } + + std::vector> children() const override { + return {child_}; + } + + std::string to_string() const override { + std::string result = "Project("; + for (size_t i = 0; i < projections_.size(); ++i) { + if (i > 0) result += ", "; + result += projections_[i]->to_string(); + } + result += ")"; + return result; + } + + const std::vector>& projections() const { + return projections_; + } + +private: + 
std::shared_ptr child_; + std::vector> projections_; +}; + +/** + * @brief Filter plan that applies a predicate + */ +class FilterPlan : public AbstractLogicalPlan { +public: + FilterPlan( + std::shared_ptr data_source, + uint64_t tenant_id, + std::shared_ptr child, + std::shared_ptr filter) + : AbstractLogicalPlan(std::move(data_source), tenant_id), + child_(std::move(child)), + filter_(std::move(filter)) {} + + std::shared_ptr schema() const override { return child_->schema(); } + std::vector> children() const override { + return {child_}; + } + + std::string to_string() const override { + return "Filter(" + filter_->to_string() + ")"; + } + + const std::shared_ptr& filter() const { return filter_; } + +private: + std::shared_ptr child_; + std::shared_ptr filter_; +}; + +/** + * @brief Vector search plan that performs nearest neighbor search + */ +class VectorSearchPlan : public AbstractLogicalPlan { +public: + VectorSearchPlan( + const std::string& column_name, + std::shared_ptr data_source, + uint64_t tenant_id, + std::shared_ptr child, + const std::vector& query_vector, + size_t k, + const std::string& distance_metric = "L2") + : AbstractLogicalPlan(std::move(data_source), tenant_id), + column_name_(column_name), + child_(std::move(child)), + query_vector_(query_vector), + k_(k), + distance_metric_(distance_metric) {} + + std::shared_ptr schema() const override { return child_->schema(); } + std::vector> children() const override { + return {child_}; + } + + std::string to_string() const override { + return "VectorSearch(k=" + std::to_string(k_) + + ", metric=" + distance_metric_ + ")"; + } + + const std::string& column_name() const { return column_name_; } + const std::vector& query_vector() const { return query_vector_; } + size_t k() const { return k_; } + const std::string& distance_metric() const { return distance_metric_; } + +private: + std::string column_name_; + std::shared_ptr child_; + std::vector query_vector_; + size_t k_; + std::string 
distance_metric_; +}; + +} // namespace query +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/query/PhysicalExpr.h b/lintdb/query/PhysicalExpr.h new file mode 100644 index 0000000..c431824 --- /dev/null +++ b/lintdb/query/PhysicalExpr.h @@ -0,0 +1,399 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lintdb/query/LogicalExpr.h" + +namespace lintdb { +namespace query { + +/** + * @brief Base class for all physical expressions + */ +class PhysicalExpr { +public: + virtual ~PhysicalExpr() = default; + + /** + * @brief Evaluate the expression on a record batch + * @param batch The input record batch + * @return The resulting column vector + */ + virtual std::shared_ptr evaluate(const std::shared_ptr& batch) = 0; + + /** + * @brief Get a string representation of this expression + * @return The string representation + */ + virtual std::string to_string() const = 0; +}; + +/** + * @brief Physical expression for column references + */ +class ColumnPhysicalExpr : public PhysicalExpr { +public: + explicit ColumnPhysicalExpr(int index) : index_(index) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + if (index_ < 0 || index_ >= batch->num_columns()) { + throw std::runtime_error("Column index out of bounds: " + std::to_string(index_)); + } + return batch->column(index_); + } + + std::string to_string() const override { + return "#" + std::to_string(index_); + } + +private: + int index_; +}; + +/** + * @brief Physical expression for literal values + */ +class LiteralPhysicalExpr : public PhysicalExpr { +public: + explicit LiteralPhysicalExpr(const std::string& value) : value_(value) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + // Create a constant array with the literal value + arrow::StringBuilder builder; + auto status = builder.Append(value_); + if (!status.ok()) { + throw std::runtime_error(status.ToString()); + } + 
std::shared_ptr array; + status = builder.Finish(&array); + if (!status.ok()) { + throw std::runtime_error(status.ToString()); + } + return array; + } + + std::string to_string() const override { + return "'" + value_ + "'"; + } + +private: + std::string value_; +}; + +/** + * @brief Base class for binary physical expressions + */ +class BinaryPhysicalExpr : public PhysicalExpr { +public: + BinaryPhysicalExpr(std::shared_ptr left, + std::shared_ptr right, + BinaryExpr::Op op) + : left_(std::move(left)), right_(std::move(right)), op_(op) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + auto left_result = left_->evaluate(batch); + auto right_result = right_->evaluate(batch); + + // TODO: Implement binary operations based on op_ + // This will need to handle different types and operations + // For now, return the left result as a placeholder + return left_result; + } + + std::string to_string() const override { + std::string op_str; + switch (op_) { + case BinaryExpr::Op::EQ: op_str = "="; break; + case BinaryExpr::Op::GT: op_str = ">"; break; + case BinaryExpr::Op::GTE: op_str = ">="; break; + case BinaryExpr::Op::LT: op_str = "<"; break; + case BinaryExpr::Op::LTE: op_str = "<="; break; + case BinaryExpr::Op::AND: op_str = "AND"; break; + case BinaryExpr::Op::OR: op_str = "OR"; break; + default: op_str = "UNKNOWN"; break; + } + return "(" + left_->to_string() + " " + op_str + " " + right_->to_string() + ")"; + } + + BinaryExpr::Op op() const { return op_; } + +protected: + std::shared_ptr left_; + std::shared_ptr right_; + BinaryExpr::Op op_; +}; + +/** + * @brief Helper function to ensure arrays have the same type + */ +arrow::Result> ensure_same_type( + const std::shared_ptr& left, + const std::shared_ptr& right) { + if (left->type()->Equals(right->type())) { + return arrow::Result>(right); + } + // TODO: Implement type casting if needed + return arrow::Status::TypeError("Type mismatch in binary operation"); +} + +/** + * @brief Physical 
expression for equality comparison + */ +class EqPhysicalExpr : public BinaryPhysicalExpr { +public: + EqPhysicalExpr(std::shared_ptr left, + std::shared_ptr right) + : BinaryPhysicalExpr(std::move(left), std::move(right), BinaryExpr::Op::EQ) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + auto left_result = left_->evaluate(batch); + auto right_result = right_->evaluate(batch); + + // Ensure types match + auto right_cast = ensure_same_type(left_result, right_result); + if (!right_cast.ok()) { + throw std::runtime_error(right_cast.status().ToString()); + } + + // Use Arrow compute for efficient comparison + auto result = arrow::compute::CallFunction("equal", {left_result, right_cast.ValueOrDie()}); + if (!result.ok()) { + throw std::runtime_error(result.status().ToString()); + } + return result.ValueOrDie().make_array(); + } + + std::string to_string() const override { + return "(" + left_->to_string() + " = " + right_->to_string() + ")"; + } +}; + +/** + * @brief Physical expression for greater than comparison + */ +class GtPhysicalExpr : public BinaryPhysicalExpr { +public: + GtPhysicalExpr(std::shared_ptr left, + std::shared_ptr right) + : BinaryPhysicalExpr(std::move(left), std::move(right), BinaryExpr::Op::GT) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + auto left_result = left_->evaluate(batch); + auto right_result = right_->evaluate(batch); + + // Ensure types match + auto right_cast = ensure_same_type(left_result, right_result); + if (!right_cast.ok()) { + throw std::runtime_error(right_cast.status().ToString()); + } + + // Use Arrow compute for efficient comparison + auto result = arrow::compute::CallFunction("greater", {left_result, right_cast.ValueOrDie()}); + if (!result.ok()) { + throw std::runtime_error(result.status().ToString()); + } + return result.ValueOrDie().make_array(); + } + + std::string to_string() const override { + return "(" + left_->to_string() + " > " + right_->to_string() + ")"; 
+ } +}; + +/** + * @brief Physical expression for less than comparison + */ +class LtPhysicalExpr : public BinaryPhysicalExpr { +public: + LtPhysicalExpr(std::shared_ptr left, + std::shared_ptr right) + : BinaryPhysicalExpr(std::move(left), std::move(right), BinaryExpr::Op::LT) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + auto left_result = left_->evaluate(batch); + auto right_result = right_->evaluate(batch); + + // Ensure types match + auto right_cast = ensure_same_type(left_result, right_result); + if (!right_cast.ok()) { + throw std::runtime_error(right_cast.status().ToString()); + } + + // Use Arrow compute for efficient comparison + auto result = arrow::compute::CallFunction("less", {left_result, right_cast.ValueOrDie()}); + if (!result.ok()) { + throw std::runtime_error(result.status().ToString()); + } + return result.ValueOrDie().make_array(); + } + + std::string to_string() const override { + return "(" + left_->to_string() + " < " + right_->to_string() + ")"; + } +}; + +/** + * @brief Physical expression for greater than or equals comparison + */ +class GtePhysicalExpr : public BinaryPhysicalExpr { +public: + GtePhysicalExpr(std::shared_ptr left, + std::shared_ptr right) + : BinaryPhysicalExpr(std::move(left), std::move(right), BinaryExpr::Op::GTE) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + auto left_result = left_->evaluate(batch); + auto right_result = right_->evaluate(batch); + + // Ensure types match + auto right_cast = ensure_same_type(left_result, right_result); + if (!right_cast.ok()) { + throw std::runtime_error(right_cast.status().ToString()); + } + + // Use Arrow compute for efficient comparison + auto result = arrow::compute::CallFunction("greater_equal", {left_result, right_cast.ValueOrDie()}); + if (!result.ok()) { + throw std::runtime_error(result.status().ToString()); + } + return result.ValueOrDie().make_array(); + } + + std::string to_string() const override { + return "(" + 
left_->to_string() + " >= " + right_->to_string() + ")"; + } +}; + +/** + * @brief Physical expression for less than or equals comparison + */ +class LtePhysicalExpr : public BinaryPhysicalExpr { +public: + LtePhysicalExpr(std::shared_ptr left, + std::shared_ptr right) + : BinaryPhysicalExpr(std::move(left), std::move(right), BinaryExpr::Op::LTE) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + auto left_result = left_->evaluate(batch); + auto right_result = right_->evaluate(batch); + + // Ensure types match + auto right_cast = ensure_same_type(left_result, right_result); + if (!right_cast.ok()) { + throw std::runtime_error(right_cast.status().ToString()); + } + + // Use Arrow compute for efficient comparison + auto result = arrow::compute::CallFunction("less_equal", {left_result, right_cast.ValueOrDie()}); + if (!result.ok()) { + throw std::runtime_error(result.status().ToString()); + } + return result.ValueOrDie().make_array(); + } + + std::string to_string() const override { + return "(" + left_->to_string() + " <= " + right_->to_string() + ")"; + } +}; + +/** + * @brief Physical expression for logical AND + */ +class AndPhysicalExpr : public BinaryPhysicalExpr { +public: + AndPhysicalExpr(std::shared_ptr left, + std::shared_ptr right) + : BinaryPhysicalExpr(std::move(left), std::move(right), BinaryExpr::Op::AND) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + auto left_result = left_->evaluate(batch); + auto right_result = right_->evaluate(batch); + + // Ensure both arrays are boolean + if (left_result->type()->id() != arrow::Type::BOOL || + right_result->type()->id() != arrow::Type::BOOL) { + throw std::runtime_error("Logical AND requires boolean operands"); + } + + // Use Arrow compute for efficient logical AND + auto result = arrow::compute::CallFunction("and", {left_result, right_result}); + if (!result.ok()) { + throw std::runtime_error(result.status().ToString()); + } + return 
result.ValueOrDie().make_array(); + } + + std::string to_string() const override { + return "(" + left_->to_string() + " AND " + right_->to_string() + ")"; + } +}; + +/** + * @brief Physical expression for logical OR + */ +class OrPhysicalExpr : public BinaryPhysicalExpr { +public: + OrPhysicalExpr(std::shared_ptr left, + std::shared_ptr right) + : BinaryPhysicalExpr(std::move(left), std::move(right), BinaryExpr::Op::OR) {} + + std::shared_ptr evaluate(const std::shared_ptr& batch) override { + auto left_result = left_->evaluate(batch); + auto right_result = right_->evaluate(batch); + + // Ensure both arrays are boolean + if (left_result->type()->id() != arrow::Type::BOOL || + right_result->type()->id() != arrow::Type::BOOL) { + throw std::runtime_error("Logical OR requires boolean operands"); + } + + // Use Arrow compute for efficient logical OR + auto result = arrow::compute::CallFunction("or", {left_result, right_result}); + if (!result.ok()) { + throw std::runtime_error(result.status().ToString()); + } + return result.ValueOrDie().make_array(); + } + + std::string to_string() const override { + return "(" + left_->to_string() + " OR " + right_->to_string() + ")"; + } +}; + +/** + * @brief Factory function to create binary physical expressions + */ +std::shared_ptr create_binary_physical_expr( + std::shared_ptr left, + BinaryExpr::Op op, + std::shared_ptr right) { + switch (op) { + case BinaryExpr::Op::EQ: + return std::make_shared(std::move(left), std::move(right)); + case BinaryExpr::Op::GT: + return std::make_shared(std::move(left), std::move(right)); + case BinaryExpr::Op::GTE: + return std::make_shared(std::move(left), std::move(right)); + case BinaryExpr::Op::LT: + return std::make_shared(std::move(left), std::move(right)); + case BinaryExpr::Op::LTE: + return std::make_shared(std::move(left), std::move(right)); + case BinaryExpr::Op::AND: + return std::make_shared(std::move(left), std::move(right)); + case BinaryExpr::Op::OR: + return 
std::make_shared(std::move(left), std::move(right)); + default: + throw std::runtime_error("Unsupported binary operation"); + } +} + +} // namespace query +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/query/QueryExecutor.cpp b/lintdb/query/QueryExecutor.cpp index b853df9..37c7e74 100644 --- a/lintdb/query/QueryExecutor.cpp +++ b/lintdb/query/QueryExecutor.cpp @@ -26,35 +26,41 @@ std::vector QueryExecutor::execute( } std::vector results(documents.size()); -#pragma omp parallel for if(documents.size() > 100) - for(int i = 0; i < documents.size(); i++) { - auto doc = documents[i]; -// for (auto& dv : doc.second) { -// // ColBERT is a special case where we don't have a value to decode. -// if (dv.unread_value) { -// continue; -// } -// dv = decode_vectors(context, dv); -// } - ScoredDocument scored = doc_it->score(doc.second); - scored.doc_id = doc.first; + std::vector top_results_ranked; + size_t num_to_rank = std::min(results.size(), opts.num_second_pass); - if (opts.expected_id != -1 && doc.first == opts.expected_id) { - LOG(INFO) << "\tscore: " << scored.score; - } + top_results_ranked.resize(num_to_rank); - results[i] = scored; - } // end for +#pragma omp parallel + { +#pragma omp for + for (int i = 0; i < documents.size(); i++) { + auto doc = documents[i]; + // for (auto& dv : doc.second) { + // // ColBERT is a special case where we don't have a value to decode. 
if (dv.unread_value) { + // continue; + // } + // dv = decode_vectors(context, dv); + // } + ScoredDocument scored = doc_it->score(doc.second); + scored.doc_id = doc.first; - std::sort(results.begin(), results.end(), std::greater<>()); + if (opts.expected_id != -1 && doc.first == opts.expected_id) { + LOG(INFO) << "\tscore: " << scored.score; + } - size_t num_to_rank = std::min(results.size(), opts.num_second_pass); + results[i] = scored; + } // end for - std::vector top_results_ranked(num_to_rank); - for (size_t i = 0; i < num_to_rank; i++) { - top_results_ranked[i] = ranker.score( - context, results[i].doc_id, results[i].values); - } +#pragma omp single + std::sort(results.begin(), results.end(), std::greater<>()); + +#pragma omp for + for (size_t i = 0; i < num_to_rank; i++) { + top_results_ranked[i] = + ranker.score(context, results[i].doc_id, results[i].values); + } + } // end parallel std::sort( top_results_ranked.begin(), diff --git a/lintdb/query/QueryPlanner.cpp b/lintdb/query/QueryPlanner.cpp new file mode 100644 index 0000000..cf0fe55 --- /dev/null +++ b/lintdb/query/QueryPlanner.cpp @@ -0,0 +1,101 @@ +#include "lintdb/query/QueryPlanner.h" +#include "lintdb/query/LogicalExpr.h" +#include "lintdb/query/PhysicalExpr.h" +#include "lintdb/query/physical/ScanPhysicalPlan.h" +#include "lintdb/query/physical/FilterPhysicalPlan.h" +#include "lintdb/query/physical/ProjectionPhysicalPlan.h" +#include "lintdb/query/physical/VectorSearchPhysicalPlan.h" + +namespace lintdb::query { + +std::unique_ptr QueryPlanner::create_physical_plan(const std::shared_ptr& logical) { + if (auto scan = dynamic_cast(logical.get())) { + return plan_scan(*scan); + } + else if (auto filt = dynamic_cast(logical.get())) { + return plan_filter(*filt); + } + else if (auto proj = dynamic_cast(logical.get())) { + return plan_project(*proj); + } + else if (auto vs = dynamic_cast(logical.get())) { + return plan_vector_search(*vs); + } + else { + throw std::runtime_error("Unsupported logical 
plan node"); + } +} + +std::unique_ptr QueryPlanner::plan_scan(const ScanPlan& scan) { + return std::make_unique(scan.data_source(), scan.tenant_id(), scan.schema()); +} + +std::unique_ptr QueryPlanner::plan_filter(const FilterPlan& filt) { + auto children = filt.children(); + if (children.empty()) { + throw std::runtime_error("Filter plan must have a child plan"); + } + auto child_phys = create_physical_plan(children[0]); + + return std::make_unique( + std::move(child_phys), + filt.filter() + ); +} + +std::unique_ptr QueryPlanner::plan_project(const ProjectionPlan& proj) { + auto children = proj.children(); + if (children.empty()) { + throw std::runtime_error("Projection plan must have a child plan"); + } + auto child_phys = create_physical_plan(children[0]); + + return std::make_unique( + std::move(child_phys), + proj.projections() + ); +} + +std::unique_ptr QueryPlanner::plan_vector_search(const VectorSearchPlan& vs) { + return std::make_unique( + vs.column_name(), + vs.data_source(), + vs.tenant_id(), + vs.query_vector(), + vs.k(), + vs.distance_metric() + ); +} + +std::unique_ptr QueryPlanner::create_physical_expr(const std::shared_ptr& expr, const LogicalPlan& plan) { + if (!expr) { + return nullptr; + } + + switch (expr->type()) { + case LogicalExpr::Type::LITERAL: { + auto lit = std::static_pointer_cast(expr); + return std::make_unique(lit->to_string()); + } + case LogicalExpr::Type::BINARY: { + auto bin = std::static_pointer_cast(expr); + auto left = create_physical_expr(bin->left(), plan); + auto right = create_physical_expr(bin->right(), plan); + return std::make_unique( + std::move(left), + std::move(right), + bin->op() + ); + } + case LogicalExpr::Type::COLUMN: { + auto col = std::static_pointer_cast(expr); + auto schema = plan.schema(); + auto field_idx = schema->get_field_index(col->to_string()); + return std::make_unique(field_idx); + } + default: + throw std::runtime_error("Unsupported logical expression type"); + } +} + +} // namespace 
lintdb::query \ No newline at end of file diff --git a/lintdb/query/QueryPlanner.h b/lintdb/query/QueryPlanner.h new file mode 100644 index 0000000..8b38b48 --- /dev/null +++ b/lintdb/query/QueryPlanner.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include "lintdb/query/LogicalPlan.h" +#include "lintdb/query/physical/PhysicalPlan.h" +#include "lintdb/query/PhysicalExpr.h" + +namespace lintdb::query { + +/** + * @brief Query planner that converts logical plans to physical plans + */ +class QueryPlanner { +public: + QueryPlanner() = default; + ~QueryPlanner() = default; + + /** + * @brief Create a physical plan from a logical plan + * @param logical The logical plan to convert + * @return A unique pointer to the physical plan + */ + std::unique_ptr create_physical_plan(const std::shared_ptr& logical); + +private: + /** + * @brief Create a physical plan for a scan operation + * @param scan The logical scan plan + * @return A unique pointer to the physical scan plan + */ + std::unique_ptr plan_scan(const ScanPlan& scan); + + /** + * @brief Create a physical plan for a filter operation + * @param filt The logical filter plan + * @return A unique pointer to the physical filter plan + */ + std::unique_ptr plan_filter(const FilterPlan& filt); + + /** + * @brief Create a physical plan for a projection operation + * @param proj The logical projection plan + * @return A unique pointer to the physical projection plan + */ + std::unique_ptr plan_project(const ProjectionPlan& proj); + + /** + * @brief Create a physical plan for a vector search operation + * @param vs The logical vector search plan + * @return A unique pointer to the physical vector search plan + */ + std::unique_ptr plan_vector_search(const VectorSearchPlan& vs); + + std::unique_ptr create_physical_expr(const std::shared_ptr& expr, const LogicalPlan& plan); +}; + +} // namespace lintdb::query \ No newline at end of file diff --git a/lintdb/query/physical/FilterPhysicalPlan.h 
b/lintdb/query/physical/FilterPhysicalPlan.h new file mode 100644 index 0000000..60a0830 --- /dev/null +++ b/lintdb/query/physical/FilterPhysicalPlan.h @@ -0,0 +1,51 @@ +#pragma once + +#include "lintdb/query/physical/PhysicalPlan.h" +#include "lintdb/query/LogicalExpr.h" + +namespace lintdb { +namespace query { + +/** + * @brief Physical filter plan that applies a predicate + */ +class FilterPhysicalPlan : public PhysicalPlan { +public: + FilterPhysicalPlan( + std::shared_ptr child, + std::shared_ptr filter) + : filter_(std::move(filter)) { + add_child(std::move(child)); + } + + std::shared_ptr schema() const override { return get_child(0)->schema(); } + + std::vector> execute() override { + std::vector> result; + auto input_batches = get_child(0)->execute(); + + for (const auto& batch : input_batches) { + // TODO: Implement filter logic + // 1. Evaluate the filter expression on the input batch + // 2. Create a new RecordBatch with only the rows that match + + result.push_back(batch); // Placeholder + } + + return result; + } + + std::vector> children() const override { + return children_; + } + + std::string to_string() const override { + return "PhysicalFilter(" + filter_->to_string() + ")"; + } + +private: + std::shared_ptr filter_; +}; + +} // namespace query +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/query/physical/PhysicalPlan.h b/lintdb/query/physical/PhysicalPlan.h new file mode 100644 index 0000000..371eb06 --- /dev/null +++ b/lintdb/query/physical/PhysicalPlan.h @@ -0,0 +1,84 @@ +#pragma once + +#include +#include +#include +#include +#include "lintdb/schema/Schema.h" + +namespace lintdb { +namespace query { + +/** + * @brief Base interface for all physical plans + */ +class PhysicalPlan { +public: + virtual ~PhysicalPlan() = default; + + /** + * @brief Get the schema of the plan + * @return The schema + */ + virtual std::shared_ptr schema() const = 0; + + /** + * @brief Execute the plan and return a sequence of record batches 
+ * @return Sequence of record batches + */ + virtual std::vector> execute() = 0; + + /** + * @brief Get the child plans of this plan + * @return Vector of child plans + */ + virtual std::vector> children() const = 0; + + /** + * @brief Get a string representation of this plan + * @return The string representation + */ + virtual std::string to_string() const = 0; + +protected: + /** + * @brief Add a child plan + * @param child The child plan to add + */ + void add_child(std::shared_ptr child) { + children_.push_back(std::move(child)); + } + + /** + * @brief Get the number of children + * @return Number of children + */ + size_t num_children() const { + return children_.size(); + } + + /** + * @brief Get a specific child by index + * @param index The index of the child to get + * @return The child plan at the specified index + * @throws std::out_of_range if index is invalid + */ + std::shared_ptr get_child(size_t index) const { + if (index >= children_.size()) { + throw std::out_of_range("Child index out of range"); + } + return children_[index]; + } + + /** + * @brief Clear all children + */ + void clear_children() { + children_.clear(); + } + + std::vector> children_; +}; + +} // namespace query +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/query/physical/ProjectionPhysicalPlan.h b/lintdb/query/physical/ProjectionPhysicalPlan.h new file mode 100644 index 0000000..e4d77a5 --- /dev/null +++ b/lintdb/query/physical/ProjectionPhysicalPlan.h @@ -0,0 +1,61 @@ +#pragma once + +#include "lintdb/query/physical/PhysicalPlan.h" +#include "lintdb/query/LogicalExpr.h" + +namespace lintdb { +namespace query { + +/** + * @brief Physical projection plan that selects specific columns + */ +class ProjectionPhysicalPlan : public PhysicalPlan { +public: + ProjectionPhysicalPlan( + std::shared_ptr child, + const std::vector>& projections) + : projections_(projections) { + add_child(std::move(child)); + } + + std::shared_ptr schema() const override { + // TODO: 
Implement schema derivation from projections + return get_child(0)->schema(); + } + + std::vector> execute() override { + std::vector> result; + auto input_batches = get_child(0)->execute(); + + for (const auto& batch : input_batches) { + // TODO: Implement projection logic + // For each projection expression: + // 1. Evaluate the expression on the input batch + // 2. Create a new RecordBatch with the projected columns + + result.push_back(batch); // Placeholder + } + + return result; + } + + std::vector> children() const override { + return children_; + } + + std::string to_string() const override { + std::string result = "PhysicalProject("; + for (size_t i = 0; i < projections_.size(); ++i) { + if (i > 0) result += ", "; + result += projections_[i]->to_string(); + } + result += ")"; + return result; + } + +private: + std::vector> projections_; +}; + +} // namespace query +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/query/physical/ScanPhysicalPlan.h b/lintdb/query/physical/ScanPhysicalPlan.h new file mode 100644 index 0000000..9ba9632 --- /dev/null +++ b/lintdb/query/physical/ScanPhysicalPlan.h @@ -0,0 +1,52 @@ +#pragma once + +#include "lintdb/query/physical/PhysicalPlan.h" +#include "lintdb/datasources/DataSource.h" + +namespace lintdb { +namespace query { + +/** + * @brief Physical scan plan that reads data from a data source + */ +class ScanPhysicalPlan : public PhysicalPlan { +public: + explicit ScanPhysicalPlan( + std::shared_ptr data_source, + uint64_t tenant_id, + std::shared_ptr schema) + : data_source_(std::move(data_source)), tenant_id_(tenant_id), schema_(std::move(schema)) {} + + std::shared_ptr schema() const override { return schema_; } + + std::vector> execute() override { + std::vector> batches; + auto iterator = data_source_->scan(tenant_id_); + + // Process data in batches + const size_t BATCH_SIZE = 1024; // Configurable batch size + std::vector> columns; + columns.resize(schema_->fields.size()); + + // TODO: Implement 
conversion from key-value pairs to Arrow arrays + // This will depend on your data format and serialization scheme + + return batches; + } + + std::vector> children() const override { + return children_; + } + + std::string to_string() const override { + return "PhysicalScan"; + } + +private: + std::shared_ptr data_source_; + uint64_t tenant_id_; + std::shared_ptr schema_; +}; + +} // namespace query +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/query/physical/VectorSearchPhysicalPlan.h b/lintdb/query/physical/VectorSearchPhysicalPlan.h new file mode 100644 index 0000000..8369183 --- /dev/null +++ b/lintdb/query/physical/VectorSearchPhysicalPlan.h @@ -0,0 +1,173 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "lintdb/query/physical/PhysicalPlan.h" +#include "lintdb/datasources/DataSource.h" +#include "lintdb/datasources/serialization.h" +#include "lintdb/schema/Schema.h" +#include "lintdb/invlists/KeyBuilder.h" + +namespace lintdb { +namespace query { + +/** + * @brief Physical vector search plan that performs nearest neighbor search + */ +class VectorSearchPhysicalPlan : public PhysicalPlan { +public: + VectorSearchPhysicalPlan( + const std::string& column_name, + std::shared_ptr data_source, + uint64_t tenant_id, + const std::vector& query_vector, + size_t k, + const std::string& distance_metric = "L2") + : data_source_(data_source), + tenant_id_(tenant_id), + column_name_(column_name), + query_vector_(query_vector), + k_(k), + distance_metric_(distance_metric) { + } + + std::shared_ptr schema() const override { return get_child(0)->schema(); } + + std::vector> execute() override { + std::vector> result; + + auto schema = this->schema(); + + // Get the field definition for the column we're searching + const Field& field = schema->get_field(column_name_); + + std::string prefix = create_index_prefix(tenant_id_, field.field_id, field.data_type, 0); + auto iter = 
data_source_->scan_prefix(prefix); + + // Priority queue to maintain top-k results + // For L2 distance, we want smaller distances (max heap) + // For inner product, we want larger values (min heap) + bool is_similarity = (distance_metric_ == "cosine" || distance_metric_ == "ip"); + using Result = std::pair; + auto compare = [is_similarity](const Result& a, const Result& b) { + return is_similarity ? a.first > b.first : a.first < b.first; + }; + std::priority_queue, decltype(compare)> top_k(compare); + + // Process each vector in the iterator + while (iter->is_valid()) { + auto key = iter->key(); + auto value = iter->value(); + + // Decode the value using our serialization system + auto decoded_value = deserialize_field_value(field, value); + + // Get the vector from the decoded value + if (const auto* tensor = std::get_if(&decoded_value)) { + // Compute distance based on metric + float distance; + if (distance_metric_ == "L2") { + distance = faiss::fvec_L2sqr(query_vector_.data(), tensor->data(), tensor->size()); + } else if (distance_metric_ == "cosine") { + // For cosine similarity, we compute inner product of normalized vectors + float ip = faiss::fvec_inner_product(query_vector_.data(), tensor->data(), tensor->size()); + float norm1 = faiss::fvec_norm_L2sqr(query_vector_.data(), query_vector_.size()); + float norm2 = faiss::fvec_norm_L2sqr(tensor->data(), tensor->size()); + distance = ip / (std::sqrt(norm1) * std::sqrt(norm2)); + } else if (distance_metric_ == "ip") { + distance = faiss::fvec_inner_product(query_vector_.data(), tensor->data(), tensor->size()); + } else { + throw std::runtime_error("Unsupported distance metric: " + distance_metric_); + } + + // Add to top-k results + if (top_k.size() < k_ || compare({distance, std::stoll(key)}, top_k.top())) { + top_k.push({distance, std::stoll(key)}); + if (top_k.size() > k_) { + top_k.pop(); + } + } + } + + iter->advance(); + } + + // Create Arrow arrays for results + arrow::Int64Builder id_builder; + 
arrow::FloatBuilder distance_builder; + + // Convert priority queue to sorted vector + std::vector sorted_results; + while (!top_k.empty()) { + sorted_results.push_back(top_k.top()); + top_k.pop(); + } + std::reverse(sorted_results.begin(), sorted_results.end()); + + // Build arrays + for (const auto& [distance, id] : sorted_results) { + auto status = id_builder.Append(id); + if (!status.ok()) { + throw std::runtime_error("Failed to append ID: " + status.ToString()); + } + status = distance_builder.Append(distance); + if (!status.ok()) { + throw std::runtime_error("Failed to append distance: " + status.ToString()); + } + } + + // Create arrays + std::shared_ptr id_array; + std::shared_ptr distance_array; + auto status = id_builder.Finish(&id_array); + if (!status.ok()) { + throw std::runtime_error("Failed to finish ID array: " + status.ToString()); + } + status = distance_builder.Finish(&distance_array); + if (!status.ok()) { + throw std::runtime_error("Failed to finish distance array: " + status.ToString()); + } + + // Create schema for results + auto result_schema = arrow::schema({ + arrow::field("id", arrow::int64()), + arrow::field("distance", arrow::float32()) + }); + + // Create record batch + auto batch = arrow::RecordBatch::Make( + result_schema, + sorted_results.size(), + {id_array, distance_array} + ); + + result.push_back(batch); + return result; + } + + std::vector> children() const override { + return children_; + } + + std::string to_string() const override { + return "PhysicalVectorSearch(k=" + std::to_string(k_) + + ", metric=" + distance_metric_ + ")"; + } + +private: + std::shared_ptr data_source_; + uint64_t tenant_id_; + std::string column_name_; + std::vector query_vector_; + size_t k_; + std::string distance_metric_; + std::vector> children_; +}; + +} // namespace query +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/schema/DataTypes.h b/lintdb/schema/DataTypes.h index 447af01..5ad8451 100644 --- 
a/lintdb/schema/DataTypes.h +++ b/lintdb/schema/DataTypes.h @@ -16,7 +16,7 @@ #include #include #include "lintdb/api.h" -#include "lintdb/assert.h" +#include "lintdb/utils/assert.h" #include "lintdb/api.h" #include #include diff --git a/lintdb/schema/Schema.h b/lintdb/schema/Schema.h index 08fbd98..6f60949 100644 --- a/lintdb/schema/Schema.h +++ b/lintdb/schema/Schema.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include "lintdb/quantizers/Quantizer.h" #include "lintdb/schema/DataTypes.h" @@ -31,6 +33,7 @@ struct Field { std::vector field_types; /// the field types. e.g. indexed or /// stored in the database. FieldParameters parameters; /// parameters for the field. + int field_id = -1; /// unique identifier for the field within the schema Field() = default; Field(const std::string& name, @@ -40,7 +43,8 @@ struct Field { : name(name), data_type(data_type), field_types(field_types), - parameters(parameters) {} + parameters(parameters), + field_id(-1) {} Json::Value toJson() const; static Field fromJson(const Json::Value& json); @@ -88,16 +92,117 @@ struct ColbertField : public Field { */ struct Schema { std::vector fields; + int next_field_id = 0; /// counter for assigning field IDs Schema() = default; - explicit Schema(const std::vector& fields) : fields(fields) {} + explicit Schema(const std::vector& fields) : fields(fields) { + // Assign field IDs to all fields + for (auto& field : this->fields) { + field.field_id = next_field_id++; + } + } Json::Value toJson() const; static Schema fromJson(const Json::Value& json); + /** + * @brief Convert this schema to an Arrow schema + * @return std::shared_ptr The Arrow schema + */ + std::shared_ptr to_arrow() const { + std::vector> arrow_fields; + + for (const auto& field : fields) { + std::shared_ptr arrow_type; + + // Convert our DataType to Arrow DataType + switch (field.data_type) { + case DataType::INTEGER: + arrow_type = arrow::int64(); + break; + case DataType::FLOAT: + arrow_type = 
arrow::float64(); + break; + case DataType::TEXT: + arrow_type = arrow::utf8(); + break; + case DataType::TENSOR: + // For tensors, we create a fixed-size list of floats + arrow_type = arrow::fixed_size_list( + arrow::float32(), + field.parameters.dimensions + ); + break; + case DataType::QUANTIZED_TENSOR: + arrow_type = arrow::fixed_size_list( + arrow::int8(), + field.parameters.dimensions + ); + break; + // case DataType::BINARY: + // arrow_type = arrow::binary(); + // break; + default: + throw std::runtime_error("Unsupported data type for Arrow conversion: " + + std::to_string(static_cast(field.data_type))); + } + + arrow_fields.push_back(arrow::field(field.name, arrow_type)); + } + + return arrow::schema(arrow_fields); + } + + inline Field& get_field(const std::string& name) { + for (auto& field : fields) { + if (field.name == name) { + return field; + } + } + throw std::runtime_error("Field not found: " + name); + } + + inline const Field& get_field(const std::string& name) const { + for (const auto& field : fields) { + if (field.name == name) { + return field; + } + } + throw std::runtime_error("Field not found: " + name); + } + + inline int get_field_index(const std::string& name) const { + for (int i = 0; i < fields.size(); ++i) { + if (fields[i].name == name) { + return i; + } + } + throw std::runtime_error("Field not found: " + name); + } + + inline void add_field(Field& field) { + field.field_id = next_field_id++; fields.push_back(field); } + + inline Field& get_field_by_id(int field_id) { + for (auto& field : fields) { + if (field.field_id == field_id) { + return field; + } + } + throw std::runtime_error("Field not found with ID: " + std::to_string(field_id)); + } + + inline const Field& get_field_by_id(int field_id) const { + for (const auto& field : fields) { + if (field.field_id == field_id) { + return field; + } + } + throw std::runtime_error("Field not found with ID: " + std::to_string(field_id)); + } }; } // namespace lintdb diff --git 
a/lintdb/scoring/ContextCollector.cpp b/lintdb/scoring/ContextCollector.cpp index 402096d..4c1f25b 100644 --- a/lintdb/scoring/ContextCollector.cpp +++ b/lintdb/scoring/ContextCollector.cpp @@ -1,7 +1,63 @@ -// -// Copyright (c) 2024 ${ORGANIZATION_NAME}. All rights reserved. -// - #include "ContextCollector.h" +#include +#include +#include + +namespace lintdb { + +void ContextCollector::add_field(const QueryContext& context, const std::string& field) { + context_fields.push_back(field); + + uint8_t colbert_field_id = + context.getFieldMapper()->getFieldID(context.colbert_context); + context_field_ids.push_back(colbert_field_id); + + bool is_colbert = false; + auto field_types = context.getFieldMapper()->getFieldTypes(colbert_field_id); + /** + * This is a pretty big hack because we modify the ColBERT fields internally. A user passes in + * a tensor data type, and we process it distinctly for colbert and reset it to be datatype::colbert. + * + * A solution is to stop modifying datatypes internally, or we could expose ColBERT + * as a datatype. However, our colbert storage is meant to be internal. 
+ */ + if (std::find(field_types.begin(), field_types.end(), FieldType::Colbert) != field_types.end()) { + is_colbert = true; + } + if (!is_colbert) { + context_data_types.push_back(context.getFieldMapper()->getDataType(colbert_field_id)); + } else { + context_data_types.push_back(DataType::COLBERT); + } + + std::unique_ptr it = context.getIndex()->get_context_iterator( + context.getTenant(), colbert_field_id); + + context_iterators.push_back(std::move(it)); +} + +std::vector ContextCollector::get_context_values(const idx_t doc_id) const { + std::vector results; + results.reserve(context_iterators.size()); + + for(int i=0; i < context_iterators.size(); i++) { + auto it = context_iterators[i].get(); + it->advance(doc_id); + + if(it->is_valid() && it->get_key().doc_id() == doc_id) { + std::string context_str = it->get_value(); + SupportedTypes colbert_context = + DocEncoder::decode_supported_types(context_str); + + // create DocValues for the context info. + uint8_t colbert_field_id = context_field_ids[i]; + results.emplace_back(colbert_context, colbert_field_id, context_data_types[i]); + } else { + LOG(WARNING) << "No context found for doc_id: " << doc_id << " field: " << context_fields[i]; + } + } + + return results; +} -namespace lintdb {} // namespace lintdb \ No newline at end of file +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/scoring/ContextCollector.h b/lintdb/scoring/ContextCollector.h index a8ae568..f6ea4ec 100644 --- a/lintdb/scoring/ContextCollector.h +++ b/lintdb/scoring/ContextCollector.h @@ -7,77 +7,21 @@ #include "lintdb/query/DocValue.h" #include "lintdb/invlists/ContextIterator.h" #include "lintdb/schema/DocEncoder.h" -#include -#include namespace lintdb { -class ContextCollector { - public: - ContextCollector() = default; + class ContextCollector { + public: + ContextCollector() = default; - void add_field(const QueryContext& context, const std::string& field) { - context_fields.push_back(field); + void add_field(const 
QueryContext& context, const std::string& field); + std::vector get_context_values(const idx_t doc_id) const; - uint8_t colbert_field_id = - context.getFieldMapper()->getFieldID(context.colbert_context); - context_field_ids.push_back(colbert_field_id); + private: + std::vector context_fields; + std::vector context_field_ids; + std::vector context_data_types; + std::vector> context_iterators; + }; - bool is_colbert = false; - auto field_types = context.getFieldMapper()->getFieldTypes(colbert_field_id); - /** - * This is a pretty big hack because we modify the ColBERT fields internally. A user passes in - * a tensor data type, and we process it distinctly for colbert and reset it to be datatype::colbert. - * - * A solution is to stop modifying datatypes internally, or we could expose ColBERT - * as a datatype. However, our colbert storage is meant to be internal. - */ - if (std::find(field_types.begin(), field_types.end(), FieldType::Colbert) != field_types.end()) { - is_colbert = true; - } - if (!is_colbert) { - context_data_types.push_back(context.getFieldMapper()->getDataType(colbert_field_id)); - } else { - context_data_types.push_back(DataType::COLBERT); - } - - auto it = context.getIndex()->get_context_iterator( - context.getTenant(), colbert_field_id); - - context_iterators.push_back(std::move(it)); - } - - std::vector get_context_values(const idx_t doc_id) const { - std::vector results; - results.reserve(context_iterators.size()); - - for(int i=0; i < context_iterators.size(); i++) { - auto it = context_iterators[i].get(); - it->advance(doc_id); - - if(it->is_valid() && it->get_key().doc_id() == doc_id) { - std::string context_str = it->get_value(); - SupportedTypes colbert_context = - DocEncoder::decode_supported_types(context_str); - - // create DocValues for the context info. 
- uint8_t colbert_field_id = context_field_ids[i]; - results.emplace_back(colbert_context, colbert_field_id, context_data_types[i]); - } else { - LOG(WARNING) << "No context found for doc_id: " << doc_id << " field: " << context_fields[i]; - } - } - - return results; - } - - - private: - std::vector context_fields; - std::vector context_field_ids; - std::vector context_data_types; - std::vector> context_iterators; - -}; - -} // namespace lintdb +} // namespace lintdb \ No newline at end of file diff --git a/lintdb/scoring/Scorer.cpp b/lintdb/scoring/Scorer.cpp index 27f272e..72739b7 100644 --- a/lintdb/scoring/Scorer.cpp +++ b/lintdb/scoring/Scorer.cpp @@ -30,8 +30,7 @@ ScoredDocument ColBERTScorer::score( context.getFieldMapper()->getFieldID(context.colbert_context); size_t dim = context.getFieldMapper()->getFieldDimensions(colbert_field_id); - SupportedTypes colbert_data = dvs[colbert_data_idx].value; - ColBERTContextData colbert = std::get(colbert_data); + ColBERTContextData colbert = std::get(dvs[colbert_data_idx].value); size_t num_tensors = colbert.doc_codes.size(); @@ -47,7 +46,7 @@ ScoredDocument ColBERTScorer::score( context.getOrCreateNearestCentroids(context.colbert_context) ->get_query_tensor(); - auto query_span = gsl::span(query.query); + auto query_span = gsl::span(query.query); DocumentScore score = score_document_by_residuals( query_span, diff --git a/lintdb/scoring/plaid.cpp b/lintdb/scoring/plaid.cpp index 8df63a0..0b460dd 100644 --- a/lintdb/scoring/plaid.cpp +++ b/lintdb/scoring/plaid.cpp @@ -163,6 +163,7 @@ DocumentScore score_document_by_residuals( DocumentScore doc; // find the max score for each doc_token. 
std::vector max_scores(n, 0); +#pragma omp parallel for for (size_t i = 0; i < m; i++) { // per num_doc_tokens float max_token_score = 0.0; for (size_t j = 0; j < n; j++) { // per num_query_tokens diff --git a/lintdb/scoring/scoring_methods.cpp b/lintdb/scoring/scoring_methods.cpp index ee7e85d..fe38292 100644 --- a/lintdb/scoring/scoring_methods.cpp +++ b/lintdb/scoring/scoring_methods.cpp @@ -21,7 +21,7 @@ score_t plaid_similarity(const std::vector& values, std::shared_ptrget_reordered_distances(); +// auto reordered_distances = knn->get_reordered_distances(); // gives us a potentially quantized vector SupportedTypes colbert_context = values[colbert_idx].value; @@ -31,11 +31,10 @@ score_t plaid_similarity(const std::vector& values, std::shared_ptrget_query_tensor(); float score = colbert_centroid_score( codes.doc_codes, - reordered_distances, + knn->get_reordered_distances(), query.num_query_tokens, knn->get_num_centroids(), -1); - return score; } diff --git a/lintdb/scoring/scoring_methods.h b/lintdb/scoring/scoring_methods.h index 4da20ab..0e30821 100644 --- a/lintdb/scoring/scoring_methods.h +++ b/lintdb/scoring/scoring_methods.h @@ -27,8 +27,8 @@ enum class UnaryScoringMethod { score_t score(const UnaryScoringMethod method, const std::vector& values); enum class EmbeddingScoringMethod { - PLAID = 1, - COLBERT = 2 + PLAID = 0, + COLBERT = 1 }; score_t score_embeddings(const EmbeddingScoringMethod method, const std::vector& values, std::shared_ptr knn); diff --git a/lintdb/util.cpp b/lintdb/util.cpp index 9d4fb99..b1df37e 100644 --- a/lintdb/util.cpp +++ b/lintdb/util.cpp @@ -3,7 +3,7 @@ #include #include #include "lintdb/api.h" -#include "lintdb/exception.h" +#include "lintdb/utils/exception.h" #include "lintdb/SearchOptions.h" namespace lintdb { diff --git a/lintdb/assert.h b/lintdb/utils/assert.h similarity index 98% rename from lintdb/assert.h rename to lintdb/utils/assert.h index 4f22589..221056f 100644 --- a/lintdb/assert.h +++ b/lintdb/utils/assert.h @@ 
-6,7 +6,7 @@ #include #include #include -#include "lintdb/exception.h" +#include "exception.h" // #define __PRETTY_FUNCTION__ __FUNCSIG__ diff --git a/lintdb/exception.h b/lintdb/utils/exception.h similarity index 100% rename from lintdb/exception.h rename to lintdb/utils/exception.h diff --git a/ports/bitsery/portfile.cmake b/ports/bitsery/portfile.cmake deleted file mode 100644 index 2fc9262..0000000 --- a/ports/bitsery/portfile.cmake +++ /dev/null @@ -1,20 +0,0 @@ -vcpkg_from_github( - OUT_SOURCE_PATH SOURCE_PATH - REPO fraillt/bitsery - REF "v${VERSION}" - SHA512 26e525d799d1777e182753c6c970765be8695a557e0fef35224ab8f4629a094c04fd8d7e456da369938d74acb0ca84084f394f212ae1343fa62a27256dba971f - HEAD_REF master -) - -vcpkg_cmake_configure( - SOURCE_PATH "${SOURCE_PATH}" -) - -vcpkg_cmake_install() - -vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/${PORT}) - -file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/lib") -file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug") - -file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) diff --git a/ports/bitsery/vcpkg.json b/ports/bitsery/vcpkg.json deleted file mode 100644 index 4908314..0000000 --- a/ports/bitsery/vcpkg.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": "bitsery", - "version": "5.2.4", - "description": "Header only C++ binary serialization library", - "homepage": "https://github.com/fraillt/bitsery", - "dependencies": [ - { - "name": "vcpkg-cmake", - "host": true - }, - { - "name": "vcpkg-cmake-config", - "host": true - } - ] -} diff --git a/ports/faiss/faiss.patch b/ports/faiss/faiss.patch deleted file mode 100644 index 88edac4..0000000 --- a/ports/faiss/faiss.patch +++ /dev/null @@ -1,38 +0,0 @@ -diff --git a/cmake/faiss-config.cmake.in b/cmake/faiss-config.cmake.in -index 43ea9d4c..a7beff69 100644 ---- a/cmake/faiss-config.cmake.in -+++ b/cmake/faiss-config.cmake.in -@@ -4,4 +4,6 @@ - # This source code is licensed under the BSD-style license found in the - # 
LICENSE file in the root directory of this source tree. - -+find_dependency(MKL REQUIRED) -+ - include("${CMAKE_CURRENT_LIST_DIR}/faiss-targets.cmake") -diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt -index 1fea676c..8723be27 100644 ---- a/faiss/CMakeLists.txt -+++ b/faiss/CMakeLists.txt -@@ -269,16 +269,16 @@ target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX) - - find_package(MKL) - if(MKL_FOUND) -- target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES}) -- target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES}) -+ target_link_libraries(faiss PRIVATE MKL::MKL) -+ target_link_libraries(faiss_avx2 PRIVATE MKL::MKL) - else() - find_package(BLAS REQUIRED) -- target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES}) -- target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES}) -+ target_link_libraries(faiss PRIVATE BLAS::BLAS) -+ target_link_libraries(faiss_avx2 PRIVATE BLAS::BLAS) - - find_package(LAPACK REQUIRED) -- target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES}) -- target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES}) -+ target_link_libraries(faiss PRIVATE lAPACK::LAPACK) -+ target_link_libraries(faiss_avx2 PRIVATE LAPACK::LAPACK) - endif() - - install(TARGETS faiss diff --git a/ports/faiss/fix-dependencies.patch b/ports/faiss/fix-dependencies.patch deleted file mode 100644 index 0894544..0000000 --- a/ports/faiss/fix-dependencies.patch +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cmake/faiss-config.cmake.in b/cmake/faiss-config.cmake.in -index 43ea9d4..437a7f8 100644 ---- a/cmake/faiss-config.cmake.in -+++ b/cmake/faiss-config.cmake.in -@@ -4,4 +4,7 @@ - # This source code is licensed under the BSD-style license found in the - # LICENSE file in the root directory of this source tree. 
- -+find_dependency(OpenMP REQUIRED) -+find_dependency(BLAS REQUIRED) -+find_dependency(LAPACK REQUIRED) - include("${CMAKE_CURRENT_LIST_DIR}/faiss-targets.cmake") -diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt -index 30d573f..9af8baf 100644 ---- a/faiss/CMakeLists.txt -+++ b/faiss/CMakeLists.txt -@@ -212,17 +212,17 @@ target_link_libraries(faiss PRIVATE OpenMP::OpenMP_CXX) - target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX) - - find_package(MKL) --if(MKL_FOUND) -+if(MKL_FOUND) - target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES}) - target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES}) - else() - find_package(BLAS REQUIRED) -- target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES}) -- target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES}) -+ target_link_libraries(faiss PRIVATE BLAS::BLAS) -+ target_link_libraries(faiss_avx2 PRIVATE BLAS::BLAS) - - find_package(LAPACK REQUIRED) -- target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES}) -- target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES}) -+ target_link_libraries(faiss PRIVATE LAPACK::LAPACK) -+ target_link_libraries(faiss_avx2 PRIVATE LAPACK::LAPACK) - endif() - - install(TARGETS faiss diff --git a/ports/faiss/portfile.cmake b/ports/faiss/portfile.cmake deleted file mode 100644 index 5cd4fd8..0000000 --- a/ports/faiss/portfile.cmake +++ /dev/null @@ -1,46 +0,0 @@ -vcpkg_from_github( - OUT_SOURCE_PATH SOURCE_PATH - REPO facebookresearch/faiss -# REF v1.8.0 -# SHA512 38d4215e3e019915d8b367ff0e8d14901b1495f6f45b835e9248276567a422b0370baab6bd887045442dd1e268b7fe7c347107162e66bb3ec6b1a53be4b2e441 - REF v1.7.4 - SHA512 9622fb989cb2e1879450c2ad257cb55d0c0c639f54f0815e4781f4e4b2ae2f01779f5c8c0738ae9a29fde7e418587e6a92e91240d36c1ca051a6228bfb777638 - HEAD_REF master -) - -vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS - FEATURES - gpu FAISS_ENABLE_GPU -) - -if ("${FAISS_ENABLE_GPU}") - if (NOT VCPKG_CMAKE_SYSTEM_NAME AND NOT ENV{CUDACXX}) - set(ENV{CUDACXX} 
"$ENV{CUDA_PATH}/bin/nvcc.exe") - endif() -endif() - - -vcpkg_cmake_configure( - SOURCE_PATH "${SOURCE_PATH}" - OPTIONS - ${FEATURE_OPTIONS} - -DFAISS_ENABLE_PYTHON=OFF # Requires SWIG - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release - # -DBLA_VENDOR=Intel10_64lp - # -DCMAKE_TOOLCHAIN_FILE="${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake" -) - -# # Setup vcpkg script with CMake (note: should be placed before project() call) -# set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_SOURCE_DIR}/tools/vcpkg/scripts/buildsystems/vcpkg.cmake CACHE STRING "Vcpkg toolchain file") - - -vcpkg_cmake_install() - -vcpkg_cmake_config_fixup() - -vcpkg_copy_pdbs() - -file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) - -file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") diff --git a/ports/faiss/vcpkg.json b/ports/faiss/vcpkg.json deleted file mode 100644 index 95345e8..0000000 --- a/ports/faiss/vcpkg.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "faiss", - "version": "1.7.4", - "description": "Faiss is a library for efficient similarity search and clustering of dense vectors.", - "homepage": "https://github.com/facebookresearch/faiss", - "license": "MIT", - "supports": "!uwp & !osx & !x86", - "dependencies": [ - "lapack", - "openblas", - { - "name": "vcpkg-cmake", - "host": true - }, - { - "name": "vcpkg-cmake-config", - "host": true - } - ], - "features": { - "gpu": { - "description": "Whether to enable GPU support", - "dependencies": [ - "cuda" - ] - } - } -} diff --git a/ports/intel-mkl/copy-from-dmg.cmake b/ports/intel-mkl/copy-from-dmg.cmake deleted file mode 100644 index a5aa67c..0000000 --- a/ports/intel-mkl/copy-from-dmg.cmake +++ /dev/null @@ -1,53 +0,0 @@ -find_program(HDIUTIL NAMES hdiutil REQUIRED) -set(dmg_path "NOTFOUND" CACHE FILEPATH "Where to find the DMG") -set(output_dir "output_dir" CACHE FILEPATH "Where to put the packages") - -if(NOT EXISTS "${dmg_path}") - message(FATAL_ERROR 
"'dmg_path' (${dmg_path}) does not exist.") -endif() -if(NOT IS_DIRECTORY "${output_dir}") - message(FATAL_ERROR "'output_dir' (${output_dir}) is not a directory.") -endif() - -execute_process( - COMMAND mktemp -d - RESULT_VARIABLE mktemp_result - OUTPUT_VARIABLE mount_point - OUTPUT_STRIP_TRAILING_WHITESPACE -) -if(NOT mktemp_result STREQUAL "0") - message(FATAL_ERROR "mktemp -d failed: ${mktemp_result}") -elseif(NOT IS_DIRECTORY "${mount_point}") - message(FATAL_ERROR "'mount_point' (${mount_point}) is not a directory.") -endif() - -execute_process( - COMMAND "${HDIUTIL}" attach "${dmg_path}" -mountpoint "${mount_point}" -readonly - RESULT_VARIABLE mount_result -) -if(mount_result STREQUAL "0") - set(dmg_packages_dir "${mount_point}/bootstrapper.app/Contents/Resources/packages") - file(GLOB packages - "${dmg_packages_dir}/intel.oneapi.mac.mkl.devel,*" - "${dmg_packages_dir}/intel.oneapi.mac.mkl.runtime,*" - "${dmg_packages_dir}/intel.oneapi.mac.mkl.product,*" - "${dmg_packages_dir}/intel.oneapi.mac.openmp,*" - ) - # Using execute_process to avoid direct errors - execute_process( - COMMAND cp -R ${packages} "${output_dir}/" - RESULT_VARIABLE copy_result - ) -endif() -execute_process( - COMMAND "${HDIUTIL}" detach "${mount_point}" - RESULT_VARIABLE unmount_result -) - -if(NOT mount_result STREQUAL "0") - message(FATAL_ERROR "Mounting ${dmg_path} failed: ${mount_result}") -elseif(NOT copy_result STREQUAL "0") - message(FATAL_ERROR "Coyping packages failed: ${copy_result}") -elseif(NOT unmount_result STREQUAL "0") - message(FATAL_ERROR "Unounting ${dmg_path} failed: ${unmount_result}") -endif() diff --git a/ports/intel-mkl/portfile.cmake b/ports/intel-mkl/portfile.cmake deleted file mode 100644 index 508e86a..0000000 --- a/ports/intel-mkl/portfile.cmake +++ /dev/null @@ -1,254 +0,0 @@ -# This package installs Intel MKL on Linux, macOS and Windows for x64. 
-# Configuration: -# - ilp64 -# - dynamic CRT: intel_thread, static CRT: sequential - -set(VCPKG_POLICY_EMPTY_PACKAGE enabled) - -# https://registrationcenter-download.intel.com/akdlm/IRC_NAS/19150/w_onemkl_p_2023.0.0.25930_offline.exe # windows -# https://registrationcenter-download.intel.com/akdlm/IRC_NAS/19116/m_onemkl_p_2023.0.0.25376_offline.dmg # macos -# https://registrationcenter-download.intel.com/akdlm/irc_nas/19138/l_onemkl_p_2023.0.0.25398_offline.sh # linux -set(sha "") -if(NOT VCPKG_TARGET_ARCHITECTURE STREQUAL "x64") - # nop -elseif(VCPKG_TARGET_IS_WINDOWS) - set(filename w_onemkl_p_2023.0.0.25930_offline.exe) - set(magic_number 19150) - set(sha a3eb6b75241a2eccb73ed73035ff111172c55d3fa51f545c7542277a155df84ff72fc826621711153e683f84058e64cb549c030968f9f964531db76ca8a3ed46) - set(package_infix "win") -elseif(VCPKG_TARGET_IS_OSX) - set(filename m_onemkl_p_2023.0.0.25376_offline.dmg) - set(magic_number 19116) - set(sha 7b9b8c004054603e6830fb9b9c049d5a4cfc0990c224cb182ac5262ab9f1863775a67491413040e3349c590e2cca58edcfc704db9f3b9f9faa8b5b09022cd2af) - set(package_infix "mac") - set(package_libdir "lib") - set(compiler_libdir "mac/compiler/lib") -elseif(VCPKG_TARGET_IS_LINUX) - set(filename l_onemkl_p_2023.0.0.25398_offline.sh) - set(magic_number 19138) - set(sha b5f2f464675f0fd969dde2faf2e622b834eb1cc406c4a867148116f6c24ba5c709d98b678840f4a89a1778e12cde0ff70ce2ef59faeef3d3f3aa1d0329c71af1) - set(package_infix "lin") - set(package_libdir "lib/intel64") - set(compiler_libdir "linux/compiler/lib/intel64_lin") -endif() - -if(NOT sha) - message(WARNING "${PORT} is empty for ${TARGET_TRIPLET}.") - return() -endif() - -vcpkg_download_distfile(installer_path - URLS "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/${magic_number}/${filename}" - FILENAME "${filename}" - SHA512 "${sha}" -) - -# Note: intel_thread and lp64 are the defaults. 
-set(interface "lp64") # or ilp64; ilp == 64 bit int api -#https://www.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/linking-your-application-with-onemkl/linking-in-detail/linking-with-interface-libraries/using-the-ilp64-interface-vs-lp64-interface.html -if(VCPKG_CRT_LINKAGE STREQUAL "dynamic") - set(threading "intel_thread") #sequential or intel_thread or tbb_thread or pgi_thread -else() - set(threading "sequential") -endif() -if(threading STREQUAL "intel_thread") - set(short_thread "iomp") -else() - string(SUBSTRING "${threading}" "0" "3" short_thread) -endif() -set(main_pc_file "mkl-${VCPKG_LIBRARY_LINKAGE}-${interface}-${short_thread}.pc") - -# First extraction level: packages (from offline installer) -set(extract_0_dir "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-extract") -file(REMOVE_RECURSE "${extract_0_dir}") -file(MAKE_DIRECTORY "${extract_0_dir}") - -# Second extraction level: actual files (from packages) -set(extract_1_dir "${CURRENT_PACKAGES_DIR}/intel-extract") -file(REMOVE_RECURSE "${extract_1_dir}") -file(MAKE_DIRECTORY "${extract_1_dir}") - -file(MAKE_DIRECTORY "${CURRENT_PACKAGES_DIR}/lib/pkgconfig") - -if(VCPKG_TARGET_IS_WINDOWS) - vcpkg_find_acquire_program(7Z) - message(STATUS "Extracting offline installer") - vcpkg_execute_required_process( - COMMAND "${7Z}" x "${installer_path}" "-o${extract_0_dir}" "-y" "-bso0" "-bsp0" - WORKING_DIRECTORY "${extract_0_dir}" - LOGNAME "extract-${TARGET_TRIPLET}-0" - ) - - set(packages - "intel.oneapi.win.mkl.devel,v=2023.0.0-25930/oneapi-mkl-devel-for-installer_p_2023.0.0.25930.msi" # has the required libs. 
- "intel.oneapi.win.mkl.runtime,v=2023.0.0-25930/oneapi-mkl-for-installer_p_2023.0.0.25930.msi" # has the required DLLs - #"intel.oneapi.win.compilers-common-runtime,v=2023.0.0-25922" # SVML - "intel.oneapi.win.openmp,v=2023.0.0-25922/oneapi-comp-openmp-for-installer_p_2023.0.0.25922.msi" # OpenMP - #"intel.oneapi.win.tbb.runtime,v=2021.8.0-25874" #TBB - ) - - foreach(pack IN LISTS packages) - set(package_path "${extract_0_dir}/packages/${pack}") - cmake_path(GET pack STEM LAST_ONLY packstem) - cmake_path(NATIVE_PATH package_path package_path_native) - vcpkg_execute_required_process( - COMMAND "${LESSMSI}" x "${package_path_native}" - WORKING_DIRECTORY "${extract_1_dir}" - LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" - ) - file(COPY "${extract_1_dir}/${packstem}/SourceDir/" DESTINATION "${extract_1_dir}") - file(REMOVE_RECURSE "${extract_1_dir}/${packstem}") - endforeach() - - set(mkl_dir "${extract_1_dir}/Intel/Compiler/12.0/mkl/2023.0.0") - file(COPY "${mkl_dir}/include/" DESTINATION "${CURRENT_PACKAGES_DIR}/include") - # see https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-link-line-advisor.html for linking - if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") - set(files "mkl_core_dll.lib" "mkl_${threading}_dll.lib" "mkl_intel_${interface}_dll.lib" "mkl_blas95_${interface}.lib" "mkl_lapack95_${interface}.lib") # "mkl_rt.lib" single dynamic lib with dynamic dispatch - file(COPY "${mkl_dir}/redist/intel64/" DESTINATION "${CURRENT_PACKAGES_DIR}/bin") # Could probably be reduced instead of copying all - if(NOT VCPKG_BUILD_TYPE) - file(COPY "${mkl_dir}/redist/intel64/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/bin") - endif() - else() - set(files "mkl_core.lib" "mkl_${threading}.lib" "mkl_intel_${interface}.lib" "mkl_blas95_${interface}.lib" "mkl_lapack95_${interface}.lib") - endif() - foreach(file IN LISTS files) - file(COPY "${mkl_dir}/lib/intel64/${file}" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") # instead of manual-link keep normal 
structure - if(NOT VCPKG_BUILD_TYPE) - file(COPY "${mkl_dir}/lib/intel64/${file}" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib/intel64") - endif() - endforeach() - file(COPY_FILE "${mkl_dir}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}") - - set(compiler_dir "${extract_1_dir}/Intel/Compiler/12.0/compiler/2023.0.0") - if(threading STREQUAL "intel_thread") - file(COPY "${compiler_dir}/windows/redist/intel64_win/compiler/" DESTINATION "${CURRENT_PACKAGES_DIR}/bin") - file(COPY "${compiler_dir}/windows/compiler/lib/intel64_win/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") - file(COPY_FILE "${compiler_dir}/lib/pkgconfig/openmp.pc" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc") - vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "/windows/compiler/lib/intel64_win/" "/lib/intel64/") - vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "-I \${includedir}" "-I\"\${includedir}\"") - vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "openmp" "libiomp5") - if(NOT VCPKG_BUILD_TYPE) - file(COPY "${compiler_dir}/windows/redist/intel64_win/compiler/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/bin") - file(COPY "${compiler_dir}/windows/compiler/lib/intel64_win/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib/intel64") - endif() - endif() -else() - message(STATUS "Warning: This port is still a work on progress. - E.g. it is not correctly filtering the libraries in accordance with - VCPKG_LIBRARY_LINKAGE. It is using the default threading (Intel OpenMP) - which is known to segfault when used together with GNU OpenMP. 
-") - - message(STATUS "Extracting offline installer") - if(VCPKG_TARGET_IS_LINUX) - vcpkg_execute_required_process( - COMMAND "bash" "--verbose" "--noprofile" "${installer_path}" "--extract-only" "--extract-folder" "${extract_0_dir}" - WORKING_DIRECTORY "${extract_0_dir}" - LOGNAME "extract-${TARGET_TRIPLET}-0" - ) - file(RENAME "${extract_0_dir}/l_onemkl_p_2023.0.0.25398_offline/packages" "${extract_0_dir}/packages") - elseif(VCPKG_TARGET_IS_OSX) - find_program(HDIUTIL NAMES hdiutil REQUIRED) - file(MAKE_DIRECTORY "${extract_0_dir}/packages") - message(STATUS "... Don't interrupt.") - vcpkg_execute_required_process( - COMMAND "${CMAKE_COMMAND}" "-Ddmg_path=${installer_path}" - "-Doutput_dir=${extract_0_dir}/packages" - "-DHDIUTIL=${HDIUTIL}" - -P "${CMAKE_CURRENT_LIST_DIR}/copy-from-dmg.cmake" - WORKING_DIRECTORY "${extract_0_dir}" - LOGNAME "extract-${TARGET_TRIPLET}-0" - ) - message(STATUS "... Done.") - endif() - - file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.runtime,v=2023.0.0-*") - cmake_path(GET package_path STEM LAST_ONLY packstem) - message(STATUS "Extracting ${packstem}") - vcpkg_execute_required_process( - COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" - "_installdir/mkl/2023.0.0/lib" - "_installdir/mkl/2023.0.0/licensing" - WORKING_DIRECTORY "${extract_1_dir}" - LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" - ) - file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.devel,v=2023.0.0-*") - cmake_path(GET package_path STEM LAST_ONLY packstem) - message(STATUS "Extracting ${packstem}") - vcpkg_execute_required_process( - COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" - "_installdir/mkl/2023.0.0/bin" - "_installdir/mkl/2023.0.0/include" - "_installdir/mkl/2023.0.0/lib" - WORKING_DIRECTORY "${extract_1_dir}" - LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" - ) - file(GLOB package_path 
"${extract_0_dir}/packages/intel.oneapi.${package_infix}.openmp,v=2023.0.0-*") - cmake_path(GET package_path STEM LAST_ONLY packstem) - message(STATUS "Extracting ${packstem}") - vcpkg_execute_required_process( - COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" - "_installdir/compiler/2023.0.0" - WORKING_DIRECTORY "${extract_1_dir}" - LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" - ) - - set(mkl_dir "${extract_1_dir}/_installdir/mkl/2023.0.0") - file(COPY "${mkl_dir}/include/" DESTINATION "${CURRENT_PACKAGES_DIR}/include") - file(COPY "${mkl_dir}/${package_libdir}/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") - if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") - set(to_remove_suffix .a) - elseif(VCPKG_TARGET_IS_OSX) - set(to_remove_suffix .dylib) - else() - set(to_remove_suffix .so) - endif() - file(GLOB_RECURSE files_to_remove - "${CURRENT_PACKAGES_DIR}/lib/intel64/*${to_remove_suffix}" - "${CURRENT_PACKAGES_DIR}/lib/intel64/*${to_remove_suffix}.?" - ) - file(REMOVE ${files_to_remove}) - file(COPY_FILE "${mkl_dir}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}") - vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "\${exec_prefix}/${package_libdir}" "\${exec_prefix}/lib/intel64") - - set(compiler_dir "${extract_1_dir}/_installdir/compiler/2023.0.0") - if(threading STREQUAL "intel_thread") - file(COPY "${compiler_dir}/${compiler_libdir}/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") - file(COPY_FILE "${compiler_dir}/lib/pkgconfig/openmp.pc" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc") - vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "/${compiler_libdir}/" "/lib/intel64/") - vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "openmp" "libiomp5") - endif() -endif() - -file(COPY_FILE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/mkl.pc") -if(NOT 
VCPKG_BUILD_TYPE) - file(MAKE_DIRECTORY "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig") - file(GLOB pc_files RELATIVE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/*.pc") - foreach(file IN LISTS pc_files) - file(COPY_FILE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${file}" "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}") - vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}" "/include" "/../include") - if(NOT VCPKG_TARGET_IS_WINDOWS) - vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}" "/lib/intel64" "/../lib/intel64") - endif() - endforeach() -endif() - -file(COPY "${mkl_dir}/lib/cmake/" DESTINATION "${CURRENT_PACKAGES_DIR}/share/") -vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/share/mkl/MKLConfig.cmake" "MKL_CMAKE_PATH}/../../../" "MKL_CMAKE_PATH}/../../") -vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/share/mkl/MKLConfig.cmake" "redist/\${MKL_ARCH}" "bin") -#TODO: Hardcode settings from portfile in config.cmake -#TODO: Give lapack/blas information about the correct BLA_VENDOR depending on settings. 
- -file(INSTALL "${mkl_dir}/licensing" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") -file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.product,v=2023.0.0-*") -vcpkg_install_copyright(FILE_LIST "${package_path}/licenses/license.htm") - -file(REMOVE_RECURSE - "${extract_0_dir}" - "${extract_1_dir}" - "${CURRENT_PACKAGES_DIR}/lib/intel64/cmake" - "${CURRENT_PACKAGES_DIR}/lib/intel64/pkgconfig" -) - -file(INSTALL "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") diff --git a/ports/intel-mkl/usage b/ports/intel-mkl/usage deleted file mode 100644 index b8ee798..0000000 --- a/ports/intel-mkl/usage +++ /dev/null @@ -1,4 +0,0 @@ -intel-mkl provides CMake targets: - - find_package(MKL CONFIG REQUIRED) - target_link_libraries(main PRIVATE MKL::MKL) diff --git a/ports/intel-mkl/vcpkg.json b/ports/intel-mkl/vcpkg.json deleted file mode 100644 index e68c386..0000000 --- a/ports/intel-mkl/vcpkg.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": "intel-mkl", - "version": "2023.0.0", - "port-version": 3, - "description": "Intel® Math Kernel Library (Intel® MKL) accelerates math processing routines, increases application performance, and reduces development time on Intel® processors.", - "homepage": "https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html", - "license": null, - "supports": "(windows | linux | osx) & x64", - "dependencies": [ - { - "name": "vcpkg-tool-lessmsi", - "host": true, - "platform": "windows" - } - ] -} diff --git a/ports/onnxruntime/portfile.cmake b/ports/onnxruntime/portfile.cmake deleted file mode 100644 index b031239..0000000 --- a/ports/onnxruntime/portfile.cmake +++ /dev/null @@ -1,166 +0,0 @@ -vcpkg_check_linkage(ONLY_DYNAMIC_LIBRARY) - -set(VERSION 1.17.3) -set(ONNX_FILENAME onnxruntime-win-x64-gpu-${VERSION}) -if (MSVC) -vcpkg_download_distfile(ARCHIVE - URLS 
"https://github.com/microsoft/onnxruntime/releases/download/v${VERSION}/onnxruntime-win-x64-${VERSION}.zip" - FILENAME "onnxruntime-win-x64-gpu-${VERSION}.zip" - SHA512 d9f7c21b0e4ee64e84923904e05d04686231ab9240724dad7e7efc05d890d73404d92e9d07f14d14507d897da846a754b474b7b036e8416a06daaf200e1ec488 -) -elseif(UNIX AND NOT APPLE) -vcpkg_download_distfile(ARCHIVE - URLS "https://github.com/microsoft/onnxruntime/releases/download/v${VERSION}/onnxruntime-linux-x64-${VERSION}.tgz" - FILENAME "onnxruntime-linux-x64-${VERSION}.tgz" - SHA512 c13273acb7730f0f5eed569cff479d34c9674f5f39d2a76a2c960835560e9706fd92e07071dd66fe242738c31f0df19d830b7e5083378c9e0657685727725ca0 -) -set(ONNX_FILENAME onnxruntime-linux-x64-${VERSION}) -elseif(APPLE AND VCPKG_TARGET_ARCHITECTURE MATCHES "x64") -vcpkg_download_distfile(ARCHIVE - URLS "https://github.com/microsoft/onnxruntime/releases/download/v${VERSION}/onnxruntime-osx-x86_64-${VERSION}.tgz" - FILENAME "onnxruntime-osx-x86_64-${VERSION}.tgz" - SHA512 175712dccb8d57cf4f0e7668f3e7ed42329ace19c54f3a5670e8cf13a335faf90889b6c248e855ab3d8ebb1254c6484fe91f7bb732f959816c02463c6b9a9626 -) -set(ONNX_FILENAME onnxruntime-osx-x86_64-${VERSION}) -elseif(APPLE AND VCPKG_TARGET_ARCHITECTURE MATCHES "arm64") -vcpkg_download_distfile(ARCHIVE - URLS "https://github.com/microsoft/onnxruntime/releases/download/v${VERSION}/onnxruntime-osx-arm64-${VERSION}.tgz" - FILENAME "onnxruntime-osx-arm64-${VERSION}.tgz" - SHA512 1e002f8d2d89cb99d2bd9c2c61ef7cfe4e72724f21a6a3d5df6524f92cc9dd5096754e871b2ee7e5588d5f09a320f5eb0f484a95ff70d4b05990dfa388c344bf -) -set(ONNX_FILENAME onnxruntime-osx-arm64-${VERSION}) -endif() - -vcpkg_extract_source_archive( - SOURCE_PATH - ARCHIVE "${ARCHIVE}" - NO_REMOVE_ONE_LEVEL -) - -file(MAKE_DIRECTORY - ${CURRENT_PACKAGES_DIR}/include - ${CURRENT_PACKAGES_DIR}/lib - ${CURRENT_PACKAGES_DIR}/bin - ${CURRENT_PACKAGES_DIR}/debug/lib - ${CURRENT_PACKAGES_DIR}/debug/bin - ) - -# copy the include dir to our package -file(COPY - 
${SOURCE_PATH}/${ONNX_FILENAME}/include - DESTINATION ${CURRENT_PACKAGES_DIR} - ) - -# now copy the lib files depending on the platform -if (MSVC) - file(GLOB_RECURSE ONNX_LIBS ${SOURCE_PATH}/${ONNX_FILENAME}/lib/*.lib) - file(GLOB_RECURSE ONNX_PDB ${SOURCE_PATH}/${ONNX_FILENAME}/lib/*.pdb) - file(GLOB_RECURSE ONNX_DLLS ${SOURCE_PATH}/${ONNX_FILENAME}/lib/*.dll) - - file(COPY - ${SOURCE_PATH}/${ONNX_FILENAME}/include - DESTINATION ${CURRENT_PACKAGES_DIR} - ) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime.lib - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime.lib - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime.pdb - DESTINATION ${CURRENT_PACKAGES_DIR}/bin) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime.pdb - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/bin) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_cuda.lib - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_cuda.lib - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_cuda.pdb - DESTINATION ${CURRENT_PACKAGES_DIR}/bin) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_cuda.pdb - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/bin) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_shared.pdb - DESTINATION ${CURRENT_PACKAGES_DIR}/bin) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_shared.pdb - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/bin) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_tensorrt.pdb - DESTINATION ${CURRENT_PACKAGES_DIR}/bin) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_tensorrt.pdb - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/bin) - - file(COPY 
${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_shared.lib - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_shared.lib - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_tensorrt.lib - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_tensorrt.lib - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_shared.dll - DESTINATION ${CURRENT_PACKAGES_DIR}/bin) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_shared.dll - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/bin) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime.dll - DESTINATION ${CURRENT_PACKAGES_DIR}/bin) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime.dll - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/bin) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_tensorrt.dll - DESTINATION ${CURRENT_PACKAGES_DIR}/bin) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_tensorrt.dll - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/bin) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_cuda.dll - DESTINATION ${CURRENT_PACKAGES_DIR}/bin) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/onnxruntime_providers_cuda.dll - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/bin) -elseif(UNIX AND NOT APPLE) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.so - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.so - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.so.${VERSION} - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.so.${VERSION} - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) 
-elseif(APPLE AND VCPKG_TARGET_ARCHITECTURE MATCHES "x64") - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.dylib - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.dylib - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.${VERSION}.dylib - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.${VERSION}.dylib - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.${VERSION}.dylib.dSYM - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.${VERSION}.dylib.dSYM - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) -elseif(APPLE AND VCPKG_TARGET_ARCHITECTURE MATCHES "arm64") - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.dylib - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.dylib - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.${VERSION}.dylib - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.${VERSION}.dylib - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) - - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.${VERSION}.dylib.dSYM - DESTINATION ${CURRENT_PACKAGES_DIR}/lib) - file(COPY ${SOURCE_PATH}/${ONNX_FILENAME}/lib/libonnxruntime.${VERSION}.dylib.dSYM - DESTINATION ${CURRENT_PACKAGES_DIR}/debug/lib) -endif() -# # Handle copyright -vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/${ONNX_FILENAME}/LICENSE") diff --git a/ports/onnxruntime/vcpkg.json b/ports/onnxruntime/vcpkg.json deleted file mode 100644 index 11fcc99..0000000 --- a/ports/onnxruntime/vcpkg.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "onnxruntime", - "version": "1.17.3", - "description": "onnxruntime", - 
"homepage": "https://github.com/microsoft/onnxruntime", - "license": "MIT", - "supports": "(windows | linux | osx)" -} diff --git a/ports/rocksdb/0001-fix-dependencies.patch b/ports/rocksdb/0001-fix-dependencies.patch deleted file mode 100644 index 7c16654..0000000 --- a/ports/rocksdb/0001-fix-dependencies.patch +++ /dev/null @@ -1,122 +0,0 @@ - CMakeLists.txt | 33 +++++++++++++++------------------ - cmake/RocksDBConfig.cmake.in | 11 ++++++++--- - 2 files changed, 23 insertions(+), 21 deletions(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 23a4014bc..045f5a36d 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -87,7 +87,7 @@ endif() - - include(CMakeDependentOption) - --if(MSVC) -+if(0) - option(WITH_GFLAGS "build with GFlags" OFF) - option(WITH_XPRESS "build with windows built in compression" OFF) - option(ROCKSDB_SKIP_THIRDPARTY "skip thirdparty.inc" OFF) -@@ -136,10 +136,7 @@ else() - endif() - - if(WITH_SNAPPY) -- find_package(Snappy CONFIG) -- if(NOT Snappy_FOUND) -- find_package(Snappy REQUIRED) -- endif() -+ find_package(Snappy CONFIG REQUIRED) - add_definitions(-DSNAPPY) - list(APPEND THIRDPARTY_LIBS Snappy::snappy) - endif() -@@ -163,16 +160,19 @@ else() - endif() - - if(WITH_LZ4) -- find_package(lz4 REQUIRED) -+ find_package(lz4 CONFIG REQUIRED) - add_definitions(-DLZ4) - list(APPEND THIRDPARTY_LIBS lz4::lz4) - endif() - - if(WITH_ZSTD) -- find_package(zstd REQUIRED) -+ find_package(zstd CONFIG REQUIRED) - add_definitions(-DZSTD) -- include_directories(${ZSTD_INCLUDE_DIR}) -- list(APPEND THIRDPARTY_LIBS zstd::zstd) -+ if(TARGET zstd::libzstd_shared) -+ list(APPEND THIRDPARTY_LIBS zstd::libzstd_shared) -+ elseif(TARGET zstd::libzstd_static) -+ list(APPEND THIRDPARTY_LIBS zstd::libzstd_static) -+ endif() - endif() - endif() - -@@ -312,11 +312,10 @@ int main() { - endif() - - if (WITH_LIBURING) -- find_package(uring) -- if (uring_FOUND) -- add_definitions(-DROCKSDB_IOURING_PRESENT) -- list(APPEND THIRDPARTY_LIBS uring::uring) -- endif() -+ 
find_package(PkgConfig) -+ pkg_check_modules(liburing REQUIRED IMPORTED_TARGET GLOBAL liburing>=2.0) -+ add_definitions(-DROCKSDB_IOURING_PRESENT) -+ list(APPEND THIRDPARTY_LIBS PkgConfig::liburing) - endif() - - # Reset the required flags -@@ -382,9 +381,9 @@ endif() - - option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF) - if(WITH_TBB) -- find_package(TBB REQUIRED) -+ find_package(TBB CONFIG REQUIRED) - add_definitions(-DTBB) -- list(APPEND THIRDPARTY_LIBS TBB::TBB) -+ list(APPEND THIRDPARTY_LIBS TBB::tbb) - endif() - - # Stall notifications eat some performance from inserts -@@ -1202,8 +1201,6 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) - endforeach() - endforeach() - -- install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" COMPONENT devel DESTINATION ${package_config_destination}) -- - install( - TARGETS ${ROCKSDB_STATIC_LIB} - EXPORT RocksDBTargets -diff --git a/cmake/RocksDBConfig.cmake.in b/cmake/RocksDBConfig.cmake.in -index 0bd14be11..a420d8bfe 100644 ---- a/cmake/RocksDBConfig.cmake.in -+++ b/cmake/RocksDBConfig.cmake.in -@@ -33,11 +33,11 @@ if(@WITH_BZ2@) - endif() - - if(@WITH_LZ4@) -- find_dependency(lz4) -+ find_dependency(lz4 CONFIG) - endif() - - if(@WITH_ZSTD@) -- find_dependency(zstd) -+ find_dependency(zstd CONFIG) - endif() - - if(@WITH_NUMA@) -@@ -45,7 +45,12 @@ if(@WITH_NUMA@) - endif() - - if(@WITH_TBB@) -- find_dependency(TBB) -+ find_dependency(TBB CONFIG) -+endif() -+ -+if(@WITH_LIBURING@) -+ find_dependency(PkgConfig) -+ pkg_check_modules(liburing REQUIRED IMPORTED_TARGET GLOBAL liburing>=2.0) - endif() - - find_dependency(Threads) diff --git a/ports/rocksdb/portfile.cmake b/ports/rocksdb/portfile.cmake deleted file mode 100644 index deb1cf0..0000000 --- a/ports/rocksdb/portfile.cmake +++ /dev/null @@ -1,68 +0,0 @@ -vcpkg_from_github( - OUT_SOURCE_PATH SOURCE_PATH - REPO facebook/rocksdb - REF "v${VERSION}" - SHA512 
524e3e70ed2b1d2e6c61a7b401946e50473cc95684ce4efc6250062f5bc945e443e96f7907fcc3ee1ab98c71179a8b56a654383cf2c0bbe1bb20907ab1ac7523 - HEAD_REF main - PATCHES - 0001-fix-dependencies.patch -) - -string(COMPARE EQUAL "${VCPKG_CRT_LINKAGE}" "dynamic" WITH_MD_LIBRARY) -string(COMPARE EQUAL "${VCPKG_LIBRARY_LINKAGE}" "dynamic" ROCKSDB_BUILD_SHARED) - -vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS - FEATURES - "liburing" WITH_LIBURING - "snappy" WITH_SNAPPY - "lz4" WITH_LZ4 - "zlib" WITH_ZLIB - "zstd" WITH_ZSTD - "bzip2" WITH_BZ2 - "numa" WITH_NUMA - "tbb" WITH_TBB -) - -vcpkg_cmake_configure( - SOURCE_PATH "${SOURCE_PATH}" - OPTIONS - -DWITH_GFLAGS=OFF - -DWITH_TESTS=OFF - -DWITH_BENCHMARK_TOOLS=OFF - -DWITH_TOOLS=OFF - -DUSE_RTTI=ON - -DROCKSDB_INSTALL_ON_WINDOWS=ON - -DFAIL_ON_WARNINGS=OFF - -DWITH_MD_LIBRARY=${WITH_MD_LIBRARY} - -DPORTABLE=1 # Minimum CPU arch to support, or 0 = current CPU, 1 = baseline CPU - -DROCKSDB_BUILD_SHARED=${ROCKSDB_BUILD_SHARED} - -DCMAKE_DISABLE_FIND_PACKAGE_Git=TRUE - ${FEATURE_OPTIONS} - OPTIONS_DEBUG - -DCMAKE_DEBUG_POSTFIX=d - -DWITH_RUNTIME_DEBUG=ON - OPTIONS_RELEASE - -DWITH_RUNTIME_DEBUG=OFF -) - -vcpkg_cmake_install() - -vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/rocksdb) - -vcpkg_copy_pdbs() - -file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") -file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share") - -vcpkg_fixup_pkgconfig() - -vcpkg_install_copyright(COMMENT [[ -RocksDB is dual-licensed under both the GPLv2 (found in COPYING) -and Apache 2.0 License (found in LICENSE.Apache). You may select, -at your option, one of the above-listed licenses. 
-]] - FILE_LIST - "${SOURCE_PATH}/LICENSE.leveldb" - "${SOURCE_PATH}/LICENSE.Apache" - "${SOURCE_PATH}/COPYING" -) diff --git a/ports/rocksdb/vcpkg.json b/ports/rocksdb/vcpkg.json deleted file mode 100644 index 5d1bf37..0000000 --- a/ports/rocksdb/vcpkg.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "name": "rocksdb", - "version": "8.10.0", - "description": "A library that provides an embeddable, persistent key-value store for fast storage", - "homepage": "https://github.com/facebook/rocksdb", - "license": "GPL-2.0-only OR Apache-2.0", - "supports": "!uwp & !(arm & !arm64 & android)", - "dependencies": [ - { - "name": "vcpkg-cmake", - "host": true - }, - { - "name": "vcpkg-cmake-config", - "host": true - } - ], - "default-features": [ - "zlib" - ], - "features": { - "bzip2": { - "description": "build with bzip2", - "dependencies": [ - "bzip2" - ] - }, - "liburing": { - "description": "build with liburing", - "supports": "linux", - "dependencies": [ - { - "name": "liburing", - "platform": "linux" - } - ] - }, - "lz4": { - "description": "build with lz4", - "dependencies": [ - "lz4" - ] - }, - "numa": { - "description": "build with NUMA policy support", - "supports": "linux" - }, - "snappy": { - "description": "build with SNAPPY", - "dependencies": [ - "snappy" - ] - }, - "tbb": { - "description": "build with Threading Building Blocks (TBB)", - "dependencies": [ - "tbb" - ] - }, - "zlib": { - "description": "build with zlib", - "dependencies": [ - "zlib" - ] - }, - "zstd": { - "description": "build with zstd", - "dependencies": [ - "zstd" - ] - } - } -} diff --git a/pyproject.toml b/pyproject.toml index 12e2a5c..9b249f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,9 @@ [build-system] -requires = ["scikit-build-core >=0.4.3", "nanobind >=1.3.2"] -build-backend = "scikit_build_core.build" +requires = [ + "scikit-build-core-conan >=0.4.0", + "nanobind >=1.3.2" +] +build-backend = "scikit_build_core_conan.build" [project] name = "lintdb" @@ -23,7 +26,6 @@ 
dependencies = [ Homepage = "https://github.com/deployQL/lintdb" [tool.pytest.ini_options] -#pythonpath = [ "./builds/python/lintdb/python/Release", "lintdb/python" ] testpaths = [ "lintdb/python/tests" ] @@ -33,24 +35,22 @@ testpaths = [ minimum-version = "0.4" # Setuptools-style build caching in a local directory -build-dir = "builds/{wheel_tag}" - -# Build stable ABI wheels for CPython 3.12+ -#wheel.py-api = "cp310" +build-dir = "build/{wheel_tag}" cmake.build-type = "Release" sdist.cmake = true wheel.packages = ["lintdb/python"] -cmake.targets = ['lintdb_lib', 'core'] - -[tool.scikit-build.cmake.define] -# to use a shared library, we need to jump through some hoops for python: https://github.com/scikit-build/scikit-build/issues/272 -# SO link: https://stackoverflow.com/questions/70044257/packaging-executable-shared-library-and-python-bindings-not-finding-library -BUILD_SHARED_LIBS = "OFF" -CMAKE_CXX_COMPILER = "clang++" -#CMAKE_INSTALL_LIBDIR = "lib" -CMAKE_VERBOSE_MAKEFILE = "OFF" -ENABLE_PYTHON = "ON" -BUILD_TESTING = "OFF" -#BLA_VENDOR = "Intel10_64lp" \ No newline at end of file +cmake.targets = ['lintdb', 'core'] + +[tool.scikit-build-core-conan] +path = "." 
+build = "missing" +options = ["build_python=True", "build_tests=False", "build_benchmarks=False"] +settings = [] +config = [] +output_folder = "build" + +#[[tool.scikit-build-core-conan.overrides]] +#if.platform-system = "linux" +#options= ["build_python=true"] diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 28679bf..2913165 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,33 +11,52 @@ set(LINT_DB_TESTS binarizer_test.cpp inverted_list_test.cpp doc_processor_test.cpp - product_quantizer_test.cpp) + product_quantizer_test.cpp + query/DataFrameTest.cpp + query/physical/PhysicalPlanTest.cpp + query/physical/VectorSearchPhysicalPlanTest.cpp + ) -add_executable(lintdb-tests ${LINT_DB_TESTS}) +add_executable(lintdb_tests ${LINT_DB_TESTS}) -target_link_libraries(lintdb-tests PRIVATE lintdb_lib) +# Add test data directory to include path +target_include_directories(lintdb_tests PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/..) find_package(Bitsery CONFIG REQUIRED) -target_link_libraries(lintdb-tests PRIVATE Bitsery::bitsery) - -enable_testing() - +find_package(RocksDB REQUIRED) find_package(GTest CONFIG REQUIRED) -include(FetchContent) -set(BUILD_GMOCK CACHE BOOL OFF) -set(INSTALL_GTEST CACHE BOOL OFF) -FetchContent_Declare( - googletest - URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip -) - -# target_include_directories(lintdb-tests PUBLIC -# $) - -target_link_libraries(lintdb-tests PRIVATE GTest::gtest GTest::gtest_main - GTest::gmock GTest::gmock_main) - +target_link_libraries(lintdb_tests PRIVATE + Bitsery::bitsery + lintdb + RocksDB::rocksdb + GTest::gtest + GTest::gtest_main + GTest::gmock + GTest::gmock_main) + +target_compile_options( + lintdb_tests + PRIVATE $<$: + -mfma + -mf16c + -mavx2 + -msse4.2 + -mpopcnt + -Wno-sign-compare + -Wno-unused-variable + -Wno-reorder + -Wno-unused-local-typedefs + -Wno-unused-function + -Wno-strict-aliasing>) + +# Configure test 
discovery include(GoogleTest) -gtest_discover_tests(lintdb-tests - WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/tests") +gtest_discover_tests(lintdb_tests + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + PROPERTIES + ENVIRONMENT "GTEST_COLOR=1" + TIMEOUT 30 +) diff --git a/tests/binarizer_test.cpp b/tests/binarizer_test.cpp index 6b85888..ac868f1 100644 --- a/tests/binarizer_test.cpp +++ b/tests/binarizer_test.cpp @@ -3,6 +3,7 @@ #include #define private public #include +#include #include "lintdb/quantizers/Binarizer.h" #include "lintdb/utils/endian.h" @@ -141,4 +142,118 @@ TEST(BinarizerTests, EncodingTest) { binarizer.sa_decode(1, output.data(), decoded.data()); ASSERT_EQ(input, decoded); -} \ No newline at end of file +} + +// Helper function to generate random residuals +std::vector generateRandomResiduals(size_t n, size_t packed_dim) { + std::vector residuals(n * packed_dim); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(0, 255); + + for (size_t i = 0; i < residuals.size(); ++i) { + residuals[i] = dis(gen); + } + + return residuals; +} + +TEST(BinarizerTests, CompareOriginalAndOptimized) { + lintdb::Binarizer binarizer(1, 128); + binarizer.bucket_weights = {-1.0f, 1.0f}; + binarizer.reverse_bitmap = binarizer.create_reverse_bitmap(); + binarizer.decompression_lut = binarizer.create_decompression_lut(); + + const size_t n = 1000; // Number of vectors to decode + const size_t packed_dim = binarizer.dim / 8; // 1 bit per value, so 8 values per byte + + auto residuals = generateRandomResiduals(n, packed_dim); + + std::vector original_output(n * binarizer.dim); + std::vector optimized_output(n * binarizer.dim); + + // Call the original method + binarizer.sa_decode_generic(n, residuals.data(), original_output.data()); + + // Call the optimized method + binarizer.sa_decode_1bit(n, residuals.data(), optimized_output.data()); + + // Compare results + for (size_t i = 0; i < n * binarizer.dim; ++i) { + 
EXPECT_FLOAT_EQ(original_output[i], optimized_output[i]) + << "Mismatch at index " << i; + } +} + +// Test case: Check if all possible bit patterns are handled correctly +TEST(BinarizerTests, DISABLED_AllBitPatterns) { + lintdb::Binarizer binarizer(1, 128); + binarizer.bucket_weights = {-1.0f, 1.0f}; + binarizer.reverse_bitmap = binarizer.create_reverse_bitmap(); + binarizer.decompression_lut = binarizer.create_decompression_lut(); + + const size_t n = 256; // Test all possible byte values + const size_t packed_dim = 1; // One byte per vector + + std::vector residuals(n); + for (int i = 0; i < 256; ++i) { + residuals[i] = static_cast(i); + } + + std::vector original_output(n * 8); // 8 values per byte + std::vector optimized_output(n * 8); + + binarizer.sa_decode_generic(n, residuals.data(), original_output.data()); + binarizer.sa_decode_1bit(n, residuals.data(), optimized_output.data()); + + for (size_t i = 0; i < n * 8; ++i) { + EXPECT_FLOAT_EQ(original_output[i], optimized_output[i]) + << "Mismatch at index " << i; + } +} + +// Test case: Edge case with all zeros +TEST(BinarizerTests, AllZeros) { + lintdb::Binarizer binarizer(1, 128); + binarizer.bucket_weights = {-1.0f, 1.0f}; + binarizer.reverse_bitmap = binarizer.create_reverse_bitmap(); + binarizer.decompression_lut = binarizer.create_decompression_lut(); + + const size_t n = 100; + const size_t packed_dim = binarizer.dim / 8; + + std::vector residuals(n * packed_dim, 0); + std::vector original_output(n * binarizer.dim); + std::vector optimized_output(n * binarizer.dim); + + binarizer.sa_decode_generic(n, residuals.data(), original_output.data()); + binarizer.sa_decode_1bit(n, residuals.data(), optimized_output.data()); + + for (size_t i = 0; i < n * binarizer.dim; ++i) { + EXPECT_FLOAT_EQ(original_output[i], optimized_output[i]) + << "Mismatch at index " << i; + } +} + +// Test case: Edge case with all ones +TEST(BinarizerTests, AllOnes) { + lintdb::Binarizer binarizer(1, 128); + binarizer.bucket_weights = 
{-1.0f, 1.0f}; + binarizer.reverse_bitmap = binarizer.create_reverse_bitmap(); + binarizer.decompression_lut = binarizer.create_decompression_lut(); + + const size_t n = 100; + const size_t packed_dim = binarizer.dim / 8; + + std::vector residuals(n * packed_dim, 255); + std::vector original_output(n * binarizer.dim); + std::vector optimized_output(n * binarizer.dim); + + binarizer.sa_decode_generic(n, residuals.data(), original_output.data()); + binarizer.sa_decode_1bit(n, residuals.data(), optimized_output.data()); + + for (size_t i = 0; i < n * binarizer.dim; ++i) { + EXPECT_FLOAT_EQ(original_output[i], optimized_output[i]) + << "Mismatch at index " << i; + } +} diff --git a/tests/data/colbert_test.db/000176.log b/tests/data/colbert_test.db/000176.log deleted file mode 100644 index e69de29..0000000 diff --git a/tests/data/colbert_test.db/CURRENT b/tests/data/colbert_test.db/CURRENT index 0e27482..f10934b 100644 --- a/tests/data/colbert_test.db/CURRENT +++ b/tests/data/colbert_test.db/CURRENT @@ -1 +1 @@ -MANIFEST-000177 +MANIFEST-000265 diff --git a/tests/data/colbert_test.db/LOG b/tests/data/colbert_test.db/LOG index 0256855..b073690 100644 --- a/tests/data/colbert_test.db/LOG +++ b/tests/data/colbert_test.db/LOG @@ -1,120 +1,121 @@ -2024/08/14-23:25:07.318487 631181 RocksDB version: 8.10.0 -2024/08/14-23:25:07.318576 631181 Git sha 0 -2024/08/14-23:25:07.318584 631181 Compile date 2024-06-20 15:29:42 -2024/08/14-23:25:07.318588 631181 DB SUMMARY -2024/08/14-23:25:07.318593 631181 Host name (Env): devbox -2024/08/14-23:25:07.318597 631181 DB Session ID: LQL48X0GF8RA2NAO082H -2024/08/14-23:25:07.318708 631181 CURRENT file: CURRENT -2024/08/14-23:25:07.318715 631181 IDENTITY file: IDENTITY -2024/08/14-23:25:07.318731 631181 MANIFEST file: MANIFEST-000173 size: 1087 Bytes -2024/08/14-23:25:07.318737 631181 SST files in data/colbert_test.db dir, Total Num: 4, files: 000008.sst 000009.sst 000010.sst 000011.sst -2024/08/14-23:25:07.318742 631181 Write Ahead Log file 
in data/colbert_test.db: 000172.log size: 0 ; -2024/08/14-23:25:07.318748 631181 Options.error_if_exists: 0 -2024/08/14-23:25:07.318752 631181 Options.create_if_missing: 1 -2024/08/14-23:25:07.318756 631181 Options.paranoid_checks: 1 -2024/08/14-23:25:07.318759 631181 Options.flush_verify_memtable_count: 1 -2024/08/14-23:25:07.318762 631181 Options.compaction_verify_record_count: 1 -2024/08/14-23:25:07.318766 631181 Options.track_and_verify_wals_in_manifest: 0 -2024/08/14-23:25:07.318769 631181 Options.verify_sst_unique_id_in_manifest: 1 -2024/08/14-23:25:07.318772 631181 Options.env: 0x642fd90525b0 -2024/08/14-23:25:07.318776 631181 Options.fs: PosixFileSystem -2024/08/14-23:25:07.318779 631181 Options.info_log: 0x642fd90aa3e0 -2024/08/14-23:25:07.318783 631181 Options.max_file_opening_threads: 16 -2024/08/14-23:25:07.318786 631181 Options.statistics: (nil) -2024/08/14-23:25:07.318789 631181 Options.use_fsync: 0 -2024/08/14-23:25:07.318793 631181 Options.max_log_file_size: 0 -2024/08/14-23:25:07.318796 631181 Options.max_manifest_file_size: 1073741824 -2024/08/14-23:25:07.318799 631181 Options.log_file_time_to_roll: 0 -2024/08/14-23:25:07.318803 631181 Options.keep_log_file_num: 1000 -2024/08/14-23:25:07.318806 631181 Options.recycle_log_file_num: 0 -2024/08/14-23:25:07.318809 631181 Options.allow_fallocate: 1 -2024/08/14-23:25:07.318812 631181 Options.allow_mmap_reads: 0 -2024/08/14-23:25:07.318815 631181 Options.allow_mmap_writes: 0 -2024/08/14-23:25:07.318818 631181 Options.use_direct_reads: 0 -2024/08/14-23:25:07.318822 631181 Options.use_direct_io_for_flush_and_compaction: 0 -2024/08/14-23:25:07.318825 631181 Options.create_missing_column_families: 1 -2024/08/14-23:25:07.318828 631181 Options.db_log_dir: -2024/08/14-23:25:07.318831 631181 Options.wal_dir: -2024/08/14-23:25:07.318835 631181 Options.table_cache_numshardbits: 6 -2024/08/14-23:25:07.318838 631181 Options.WAL_ttl_seconds: 0 -2024/08/14-23:25:07.318841 631181 Options.WAL_size_limit_MB: 0 
-2024/08/14-23:25:07.318844 631181 Options.max_write_batch_group_size_bytes: 1048576 -2024/08/14-23:25:07.318847 631181 Options.manifest_preallocation_size: 4194304 -2024/08/14-23:25:07.318851 631181 Options.is_fd_close_on_exec: 1 -2024/08/14-23:25:07.318854 631181 Options.advise_random_on_open: 1 -2024/08/14-23:25:07.318857 631181 Options.db_write_buffer_size: 0 -2024/08/14-23:25:07.318860 631181 Options.write_buffer_manager: 0x642fd90aac70 -2024/08/14-23:25:07.318864 631181 Options.access_hint_on_compaction_start: 1 -2024/08/14-23:25:07.318867 631181 Options.random_access_max_buffer_size: 1048576 -2024/08/14-23:25:07.318870 631181 Options.use_adaptive_mutex: 0 -2024/08/14-23:25:07.318873 631181 Options.rate_limiter: (nil) -2024/08/14-23:25:07.318877 631181 Options.sst_file_manager.rate_bytes_per_sec: 0 -2024/08/14-23:25:07.318880 631181 Options.wal_recovery_mode: 2 -2024/08/14-23:25:07.318883 631181 Options.enable_thread_tracking: 0 -2024/08/14-23:25:07.318887 631181 Options.enable_pipelined_write: 0 -2024/08/14-23:25:07.318890 631181 Options.unordered_write: 0 -2024/08/14-23:25:07.318893 631181 Options.allow_concurrent_memtable_write: 1 -2024/08/14-23:25:07.318896 631181 Options.enable_write_thread_adaptive_yield: 1 -2024/08/14-23:25:07.318900 631181 Options.write_thread_max_yield_usec: 100 -2024/08/14-23:25:07.318903 631181 Options.write_thread_slow_yield_usec: 3 -2024/08/14-23:25:07.318906 631181 Options.row_cache: None -2024/08/14-23:25:07.318910 631181 Options.wal_filter: None -2024/08/14-23:25:07.318913 631181 Options.avoid_flush_during_recovery: 0 -2024/08/14-23:25:07.318916 631181 Options.allow_ingest_behind: 0 -2024/08/14-23:25:07.318919 631181 Options.two_write_queues: 0 -2024/08/14-23:25:07.318922 631181 Options.manual_wal_flush: 0 -2024/08/14-23:25:07.318925 631181 Options.wal_compression: 0 -2024/08/14-23:25:07.318929 631181 Options.atomic_flush: 0 -2024/08/14-23:25:07.318932 631181 Options.avoid_unnecessary_blocking_io: 0 -2024/08/14-23:25:07.318935 
631181 Options.persist_stats_to_disk: 0 -2024/08/14-23:25:07.318938 631181 Options.write_dbid_to_manifest: 0 -2024/08/14-23:25:07.318941 631181 Options.log_readahead_size: 0 -2024/08/14-23:25:07.318945 631181 Options.file_checksum_gen_factory: Unknown -2024/08/14-23:25:07.318948 631181 Options.best_efforts_recovery: 0 -2024/08/14-23:25:07.318951 631181 Options.max_bgerror_resume_count: 2147483647 -2024/08/14-23:25:07.318954 631181 Options.bgerror_resume_retry_interval: 1000000 -2024/08/14-23:25:07.318957 631181 Options.allow_data_in_errors: 0 -2024/08/14-23:25:07.318961 631181 Options.db_host_id: __hostname__ -2024/08/14-23:25:07.318964 631181 Options.enforce_single_del_contracts: true -2024/08/14-23:25:07.318967 631181 Options.max_background_jobs: 2 -2024/08/14-23:25:07.318970 631181 Options.max_background_compactions: -1 -2024/08/14-23:25:07.318973 631181 Options.max_subcompactions: 1 -2024/08/14-23:25:07.318977 631181 Options.avoid_flush_during_shutdown: 0 -2024/08/14-23:25:07.318985 631181 Options.writable_file_max_buffer_size: 1048576 -2024/08/14-23:25:07.318988 631181 Options.delayed_write_rate : 16777216 -2024/08/14-23:25:07.318991 631181 Options.max_total_wal_size: 0 -2024/08/14-23:25:07.318995 631181 Options.delete_obsolete_files_period_micros: 21600000000 -2024/08/14-23:25:07.318998 631181 Options.stats_dump_period_sec: 600 -2024/08/14-23:25:07.319001 631181 Options.stats_persist_period_sec: 600 -2024/08/14-23:25:07.319004 631181 Options.stats_history_buffer_size: 1048576 -2024/08/14-23:25:07.319007 631181 Options.max_open_files: -1 -2024/08/14-23:25:07.319011 631181 Options.bytes_per_sync: 0 -2024/08/14-23:25:07.319014 631181 Options.wal_bytes_per_sync: 0 -2024/08/14-23:25:07.319017 631181 Options.strict_bytes_per_sync: 0 -2024/08/14-23:25:07.319020 631181 Options.compaction_readahead_size: 2097152 -2024/08/14-23:25:07.319023 631181 Options.max_background_flushes: -1 -2024/08/14-23:25:07.319027 631181 Options.daily_offpeak_time_utc: 
-2024/08/14-23:25:07.319030 631181 Compression algorithms supported: -2024/08/14-23:25:07.319034 631181 kZSTD supported: 0 -2024/08/14-23:25:07.319038 631181 kXpressCompression supported: 0 -2024/08/14-23:25:07.319041 631181 kBZip2Compression supported: 0 -2024/08/14-23:25:07.319044 631181 kZSTDNotFinalCompression supported: 0 -2024/08/14-23:25:07.319048 631181 kLZ4Compression supported: 0 -2024/08/14-23:25:07.319051 631181 kZlibCompression supported: 1 -2024/08/14-23:25:07.319055 631181 kLZ4HCCompression supported: 0 -2024/08/14-23:25:07.319058 631181 kSnappyCompression supported: 0 -2024/08/14-23:25:07.319063 631181 Fast CRC32 supported: Not supported on x86 -2024/08/14-23:25:07.319066 631181 DMutex implementation: pthread_mutex_t -2024/08/14-23:25:07.319178 631181 [db/version_set.cc:5941] Recovering from manifest file: data/colbert_test.db/MANIFEST-000173 -2024/08/14-23:25:07.319472 631181 [db/column_family.cc:616] --------------- Options for column family [default]: -2024/08/14-23:25:07.319484 631181 Options.comparator: leveldb.BytewiseComparator -2024/08/14-23:25:07.319488 631181 Options.merge_operator: None -2024/08/14-23:25:07.319491 631181 Options.compaction_filter: None -2024/08/14-23:25:07.319495 631181 Options.compaction_filter_factory: None -2024/08/14-23:25:07.319498 631181 Options.sst_partitioner_factory: None -2024/08/14-23:25:07.319502 631181 Options.memtable_factory: SkipListFactory -2024/08/14-23:25:07.319505 631181 Options.table_factory: BlockBasedTable -2024/08/14-23:25:07.319555 631181 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x642fd9072820) +2024/09/13-17:21:56.396991 1505759 RocksDB version: 8.10.0 +2024/09/13-17:21:56.397104 1505759 Git sha 0 +2024/09/13-17:21:56.397114 1505759 Compile date 2024-09-11 18:06:13 +2024/09/13-17:21:56.397120 1505759 DB SUMMARY +2024/09/13-17:21:56.397127 1505759 Host name (Env): devbox +2024/09/13-17:21:56.397132 1505759 DB Session ID: ZXZVJNQMK0L578WMPIVE 
+2024/09/13-17:21:56.397298 1505759 CURRENT file: CURRENT +2024/09/13-17:21:56.397310 1505759 IDENTITY file: IDENTITY +2024/09/13-17:21:56.397326 1505759 MANIFEST file: MANIFEST-000177 size: 1087 Bytes +2024/09/13-17:21:56.397336 1505759 MANIFEST file: MANIFEST-000261 size: 1087 Bytes +2024/09/13-17:21:56.397345 1505759 SST files in data/colbert_test.db dir, Total Num: 4, files: 000008.sst 000009.sst 000010.sst 000011.sst +2024/09/13-17:21:56.397351 1505759 Write Ahead Log file in data/colbert_test.db: 000176.log size: 0 ; 000260.log size: 0 ; +2024/09/13-17:21:56.397358 1505759 Options.error_if_exists: 0 +2024/09/13-17:21:56.397362 1505759 Options.create_if_missing: 1 +2024/09/13-17:21:56.397366 1505759 Options.paranoid_checks: 1 +2024/09/13-17:21:56.397370 1505759 Options.flush_verify_memtable_count: 1 +2024/09/13-17:21:56.397375 1505759 Options.compaction_verify_record_count: 1 +2024/09/13-17:21:56.397379 1505759 Options.track_and_verify_wals_in_manifest: 0 +2024/09/13-17:21:56.397383 1505759 Options.verify_sst_unique_id_in_manifest: 1 +2024/09/13-17:21:56.397387 1505759 Options.env: 0x59e4fb319d90 +2024/09/13-17:21:56.397391 1505759 Options.fs: PosixFileSystem +2024/09/13-17:21:56.397395 1505759 Options.info_log: 0x59e4fb379640 +2024/09/13-17:21:56.397399 1505759 Options.max_file_opening_threads: 16 +2024/09/13-17:21:56.397404 1505759 Options.statistics: (nil) +2024/09/13-17:21:56.397408 1505759 Options.use_fsync: 0 +2024/09/13-17:21:56.397412 1505759 Options.max_log_file_size: 0 +2024/09/13-17:21:56.397416 1505759 Options.max_manifest_file_size: 1073741824 +2024/09/13-17:21:56.397420 1505759 Options.log_file_time_to_roll: 0 +2024/09/13-17:21:56.397424 1505759 Options.keep_log_file_num: 1000 +2024/09/13-17:21:56.397428 1505759 Options.recycle_log_file_num: 0 +2024/09/13-17:21:56.397432 1505759 Options.allow_fallocate: 1 +2024/09/13-17:21:56.397436 1505759 Options.allow_mmap_reads: 0 +2024/09/13-17:21:56.397440 1505759 Options.allow_mmap_writes: 0 
+2024/09/13-17:21:56.397445 1505759 Options.use_direct_reads: 0 +2024/09/13-17:21:56.397449 1505759 Options.use_direct_io_for_flush_and_compaction: 0 +2024/09/13-17:21:56.397453 1505759 Options.create_missing_column_families: 1 +2024/09/13-17:21:56.397457 1505759 Options.db_log_dir: +2024/09/13-17:21:56.397461 1505759 Options.wal_dir: +2024/09/13-17:21:56.397466 1505759 Options.table_cache_numshardbits: 6 +2024/09/13-17:21:56.397470 1505759 Options.WAL_ttl_seconds: 0 +2024/09/13-17:21:56.397474 1505759 Options.WAL_size_limit_MB: 0 +2024/09/13-17:21:56.397478 1505759 Options.max_write_batch_group_size_bytes: 1048576 +2024/09/13-17:21:56.397482 1505759 Options.manifest_preallocation_size: 4194304 +2024/09/13-17:21:56.397486 1505759 Options.is_fd_close_on_exec: 1 +2024/09/13-17:21:56.397489 1505759 Options.advise_random_on_open: 1 +2024/09/13-17:21:56.397493 1505759 Options.db_write_buffer_size: 0 +2024/09/13-17:21:56.397497 1505759 Options.write_buffer_manager: 0x59e4fb372550 +2024/09/13-17:21:56.397501 1505759 Options.access_hint_on_compaction_start: 1 +2024/09/13-17:21:56.397505 1505759 Options.random_access_max_buffer_size: 1048576 +2024/09/13-17:21:56.397509 1505759 Options.use_adaptive_mutex: 0 +2024/09/13-17:21:56.397513 1505759 Options.rate_limiter: (nil) +2024/09/13-17:21:56.397518 1505759 Options.sst_file_manager.rate_bytes_per_sec: 0 +2024/09/13-17:21:56.397522 1505759 Options.wal_recovery_mode: 2 +2024/09/13-17:21:56.397526 1505759 Options.enable_thread_tracking: 0 +2024/09/13-17:21:56.397529 1505759 Options.enable_pipelined_write: 0 +2024/09/13-17:21:56.397533 1505759 Options.unordered_write: 0 +2024/09/13-17:21:56.397537 1505759 Options.allow_concurrent_memtable_write: 1 +2024/09/13-17:21:56.397541 1505759 Options.enable_write_thread_adaptive_yield: 1 +2024/09/13-17:21:56.397545 1505759 Options.write_thread_max_yield_usec: 100 +2024/09/13-17:21:56.397549 1505759 Options.write_thread_slow_yield_usec: 3 +2024/09/13-17:21:56.397553 1505759 
Options.row_cache: None +2024/09/13-17:21:56.397557 1505759 Options.wal_filter: None +2024/09/13-17:21:56.397561 1505759 Options.avoid_flush_during_recovery: 0 +2024/09/13-17:21:56.397565 1505759 Options.allow_ingest_behind: 0 +2024/09/13-17:21:56.397569 1505759 Options.two_write_queues: 0 +2024/09/13-17:21:56.397572 1505759 Options.manual_wal_flush: 0 +2024/09/13-17:21:56.397576 1505759 Options.wal_compression: 0 +2024/09/13-17:21:56.397580 1505759 Options.atomic_flush: 0 +2024/09/13-17:21:56.397584 1505759 Options.avoid_unnecessary_blocking_io: 0 +2024/09/13-17:21:56.397588 1505759 Options.persist_stats_to_disk: 0 +2024/09/13-17:21:56.397592 1505759 Options.write_dbid_to_manifest: 0 +2024/09/13-17:21:56.397596 1505759 Options.log_readahead_size: 0 +2024/09/13-17:21:56.397600 1505759 Options.file_checksum_gen_factory: Unknown +2024/09/13-17:21:56.397604 1505759 Options.best_efforts_recovery: 0 +2024/09/13-17:21:56.397608 1505759 Options.max_bgerror_resume_count: 2147483647 +2024/09/13-17:21:56.397612 1505759 Options.bgerror_resume_retry_interval: 1000000 +2024/09/13-17:21:56.397616 1505759 Options.allow_data_in_errors: 0 +2024/09/13-17:21:56.397620 1505759 Options.db_host_id: __hostname__ +2024/09/13-17:21:56.397624 1505759 Options.enforce_single_del_contracts: true +2024/09/13-17:21:56.397628 1505759 Options.max_background_jobs: 2 +2024/09/13-17:21:56.397632 1505759 Options.max_background_compactions: -1 +2024/09/13-17:21:56.397636 1505759 Options.max_subcompactions: 1 +2024/09/13-17:21:56.397640 1505759 Options.avoid_flush_during_shutdown: 0 +2024/09/13-17:21:56.397643 1505759 Options.writable_file_max_buffer_size: 1048576 +2024/09/13-17:21:56.397647 1505759 Options.delayed_write_rate : 16777216 +2024/09/13-17:21:56.397651 1505759 Options.max_total_wal_size: 0 +2024/09/13-17:21:56.397655 1505759 Options.delete_obsolete_files_period_micros: 21600000000 +2024/09/13-17:21:56.397659 1505759 Options.stats_dump_period_sec: 600 +2024/09/13-17:21:56.397663 1505759 
Options.stats_persist_period_sec: 600 +2024/09/13-17:21:56.397667 1505759 Options.stats_history_buffer_size: 1048576 +2024/09/13-17:21:56.397671 1505759 Options.max_open_files: -1 +2024/09/13-17:21:56.397675 1505759 Options.bytes_per_sync: 0 +2024/09/13-17:21:56.397679 1505759 Options.wal_bytes_per_sync: 0 +2024/09/13-17:21:56.397683 1505759 Options.strict_bytes_per_sync: 0 +2024/09/13-17:21:56.397687 1505759 Options.compaction_readahead_size: 2097152 +2024/09/13-17:21:56.397691 1505759 Options.max_background_flushes: -1 +2024/09/13-17:21:56.397695 1505759 Options.daily_offpeak_time_utc: +2024/09/13-17:21:56.397699 1505759 Compression algorithms supported: +2024/09/13-17:21:56.397704 1505759 kZSTD supported: 0 +2024/09/13-17:21:56.397709 1505759 kXpressCompression supported: 0 +2024/09/13-17:21:56.397713 1505759 kBZip2Compression supported: 0 +2024/09/13-17:21:56.397717 1505759 kZSTDNotFinalCompression supported: 0 +2024/09/13-17:21:56.397721 1505759 kLZ4Compression supported: 0 +2024/09/13-17:21:56.397725 1505759 kZlibCompression supported: 1 +2024/09/13-17:21:56.397730 1505759 kLZ4HCCompression supported: 0 +2024/09/13-17:21:56.397734 1505759 kSnappyCompression supported: 0 +2024/09/13-17:21:56.397740 1505759 Fast CRC32 supported: Not supported on x86 +2024/09/13-17:21:56.397744 1505759 DMutex implementation: pthread_mutex_t +2024/09/13-17:21:56.397933 1505759 [db/version_set.cc:5941] Recovering from manifest file: data/colbert_test.db/MANIFEST-000177 +2024/09/13-17:21:56.398318 1505759 [db/column_family.cc:616] --------------- Options for column family [default]: +2024/09/13-17:21:56.398333 1505759 Options.comparator: leveldb.BytewiseComparator +2024/09/13-17:21:56.398338 1505759 Options.merge_operator: None +2024/09/13-17:21:56.398343 1505759 Options.compaction_filter: None +2024/09/13-17:21:56.398347 1505759 Options.compaction_filter_factory: None +2024/09/13-17:21:56.398351 1505759 Options.sst_partitioner_factory: None +2024/09/13-17:21:56.398356 1505759 
Options.memtable_factory: SkipListFactory +2024/09/13-17:21:56.398360 1505759 Options.table_factory: BlockBasedTable +2024/09/13-17:21:56.398422 1505759 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x59e4fb339f70) cache_index_and_filter_blocks: 0 cache_index_and_filter_blocks_with_high_priority: 1 pin_l0_filter_and_index_blocks_in_cache: 0 @@ -125,7 +126,7 @@ data_block_hash_table_util_ratio: 0.750000 checksum: 4 no_block_cache: 0 - block_cache: 0x642fd9072b10 + block_cache: 0x59e4fb33a260 block_cache_name: LRUCache block_cache_options: capacity : 33554432 @@ -153,103 +154,103 @@ prepopulate_block_cache: 0 initial_auto_readahead_size: 8192 num_file_reads_for_auto_readahead: 2 -2024/08/14-23:25:07.319571 631181 Options.write_buffer_size: 67108864 -2024/08/14-23:25:07.319575 631181 Options.max_write_buffer_number: 2 -2024/08/14-23:25:07.319580 631181 Options.compression: NoCompression -2024/08/14-23:25:07.319583 631181 Options.bottommost_compression: Disabled -2024/08/14-23:25:07.319587 631181 Options.prefix_extractor: nullptr -2024/08/14-23:25:07.319590 631181 Options.memtable_insert_with_hint_prefix_extractor: nullptr -2024/08/14-23:25:07.319593 631181 Options.num_levels: 7 -2024/08/14-23:25:07.319597 631181 Options.min_write_buffer_number_to_merge: 1 -2024/08/14-23:25:07.319600 631181 Options.max_write_buffer_number_to_maintain: 0 -2024/08/14-23:25:07.319603 631181 Options.max_write_buffer_size_to_maintain: 0 -2024/08/14-23:25:07.319607 631181 Options.bottommost_compression_opts.window_bits: -14 -2024/08/14-23:25:07.319610 631181 Options.bottommost_compression_opts.level: 32767 -2024/08/14-23:25:07.319613 631181 Options.bottommost_compression_opts.strategy: 0 -2024/08/14-23:25:07.319617 631181 Options.bottommost_compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.319620 631181 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.319623 631181 Options.bottommost_compression_opts.parallel_threads: 
1 -2024/08/14-23:25:07.319627 631181 Options.bottommost_compression_opts.enabled: false -2024/08/14-23:25:07.319630 631181 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.319633 631181 Options.bottommost_compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.319636 631181 Options.compression_opts.window_bits: -14 -2024/08/14-23:25:07.319640 631181 Options.compression_opts.level: 32767 -2024/08/14-23:25:07.319643 631181 Options.compression_opts.strategy: 0 -2024/08/14-23:25:07.319646 631181 Options.compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.319649 631181 Options.compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.319652 631181 Options.compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.319656 631181 Options.compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.319659 631181 Options.compression_opts.enabled: false -2024/08/14-23:25:07.319662 631181 Options.compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.319665 631181 Options.level0_file_num_compaction_trigger: 4 -2024/08/14-23:25:07.319668 631181 Options.level0_slowdown_writes_trigger: 20 -2024/08/14-23:25:07.319671 631181 Options.level0_stop_writes_trigger: 36 -2024/08/14-23:25:07.319675 631181 Options.target_file_size_base: 67108864 -2024/08/14-23:25:07.319678 631181 Options.target_file_size_multiplier: 1 -2024/08/14-23:25:07.319681 631181 Options.max_bytes_for_level_base: 268435456 -2024/08/14-23:25:07.319684 631181 Options.level_compaction_dynamic_level_bytes: 1 -2024/08/14-23:25:07.319687 631181 Options.max_bytes_for_level_multiplier: 10.000000 -2024/08/14-23:25:07.319692 631181 Options.max_bytes_for_level_multiplier_addtl[0]: 1 -2024/08/14-23:25:07.319696 631181 Options.max_bytes_for_level_multiplier_addtl[1]: 1 -2024/08/14-23:25:07.319699 631181 Options.max_bytes_for_level_multiplier_addtl[2]: 1 -2024/08/14-23:25:07.319702 631181 Options.max_bytes_for_level_multiplier_addtl[3]: 1 -2024/08/14-23:25:07.319706 
631181 Options.max_bytes_for_level_multiplier_addtl[4]: 1 -2024/08/14-23:25:07.319709 631181 Options.max_bytes_for_level_multiplier_addtl[5]: 1 -2024/08/14-23:25:07.319712 631181 Options.max_bytes_for_level_multiplier_addtl[6]: 1 -2024/08/14-23:25:07.319715 631181 Options.max_sequential_skip_in_iterations: 8 -2024/08/14-23:25:07.319718 631181 Options.max_compaction_bytes: 1677721600 -2024/08/14-23:25:07.319722 631181 Options.ignore_max_compaction_bytes_for_input: true -2024/08/14-23:25:07.319725 631181 Options.arena_block_size: 1048576 -2024/08/14-23:25:07.319728 631181 Options.soft_pending_compaction_bytes_limit: 68719476736 -2024/08/14-23:25:07.319731 631181 Options.hard_pending_compaction_bytes_limit: 274877906944 -2024/08/14-23:25:07.319734 631181 Options.disable_auto_compactions: 0 -2024/08/14-23:25:07.319739 631181 Options.compaction_style: kCompactionStyleLevel -2024/08/14-23:25:07.319743 631181 Options.compaction_pri: kMinOverlappingRatio -2024/08/14-23:25:07.319746 631181 Options.compaction_options_universal.size_ratio: 1 -2024/08/14-23:25:07.319750 631181 Options.compaction_options_universal.min_merge_width: 2 -2024/08/14-23:25:07.319753 631181 Options.compaction_options_universal.max_merge_width: 4294967295 -2024/08/14-23:25:07.319756 631181 Options.compaction_options_universal.max_size_amplification_percent: 200 -2024/08/14-23:25:07.319759 631181 Options.compaction_options_universal.compression_size_percent: -1 -2024/08/14-23:25:07.319763 631181 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize -2024/08/14-23:25:07.319766 631181 Options.compaction_options_fifo.max_table_files_size: 1073741824 -2024/08/14-23:25:07.319770 631181 Options.compaction_options_fifo.allow_compaction: 0 -2024/08/14-23:25:07.319775 631181 Options.table_properties_collectors: -2024/08/14-23:25:07.319778 631181 Options.inplace_update_support: 0 -2024/08/14-23:25:07.319781 631181 Options.inplace_update_num_locks: 10000 -2024/08/14-23:25:07.319785 631181 
Options.memtable_prefix_bloom_size_ratio: 0.000000 -2024/08/14-23:25:07.319788 631181 Options.memtable_whole_key_filtering: 0 -2024/08/14-23:25:07.319791 631181 Options.memtable_huge_page_size: 0 -2024/08/14-23:25:07.319795 631181 Options.bloom_locality: 0 -2024/08/14-23:25:07.319798 631181 Options.max_successive_merges: 0 -2024/08/14-23:25:07.319801 631181 Options.optimize_filters_for_hits: 0 -2024/08/14-23:25:07.319804 631181 Options.paranoid_file_checks: 0 -2024/08/14-23:25:07.319807 631181 Options.force_consistency_checks: 1 -2024/08/14-23:25:07.319810 631181 Options.report_bg_io_stats: 0 -2024/08/14-23:25:07.319814 631181 Options.ttl: 2592000 -2024/08/14-23:25:07.319817 631181 Options.periodic_compaction_seconds: 0 -2024/08/14-23:25:07.319821 631181 Options.default_temperature: kUnknown -2024/08/14-23:25:07.319824 631181 Options.preclude_last_level_data_seconds: 0 -2024/08/14-23:25:07.319827 631181 Options.preserve_internal_time_seconds: 0 -2024/08/14-23:25:07.319830 631181 Options.enable_blob_files: false -2024/08/14-23:25:07.319833 631181 Options.min_blob_size: 0 -2024/08/14-23:25:07.319837 631181 Options.blob_file_size: 268435456 -2024/08/14-23:25:07.319840 631181 Options.blob_compression_type: NoCompression -2024/08/14-23:25:07.319843 631181 Options.enable_blob_garbage_collection: false -2024/08/14-23:25:07.319847 631181 Options.blob_garbage_collection_age_cutoff: 0.250000 -2024/08/14-23:25:07.319850 631181 Options.blob_garbage_collection_force_threshold: 1.000000 -2024/08/14-23:25:07.319854 631181 Options.blob_compaction_readahead_size: 0 -2024/08/14-23:25:07.319857 631181 Options.blob_file_starting_level: 0 -2024/08/14-23:25:07.319860 631181 Options.experimental_mempurge_threshold: 0.000000 -2024/08/14-23:25:07.319864 631181 Options.memtable_max_range_deletions: 0 -2024/08/14-23:25:07.320208 631181 [db/column_family.cc:616] --------------- Options for column family [index]: -2024/08/14-23:25:07.320219 631181 Options.comparator: leveldb.BytewiseComparator 
-2024/08/14-23:25:07.320222 631181 Options.merge_operator: None -2024/08/14-23:25:07.320225 631181 Options.compaction_filter: None -2024/08/14-23:25:07.320229 631181 Options.compaction_filter_factory: None -2024/08/14-23:25:07.320232 631181 Options.sst_partitioner_factory: None -2024/08/14-23:25:07.320235 631181 Options.memtable_factory: SkipListFactory -2024/08/14-23:25:07.320239 631181 Options.table_factory: BlockBasedTable -2024/08/14-23:25:07.320271 631181 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x642fd9081ef0) +2024/09/13-17:21:56.398447 1505759 Options.write_buffer_size: 67108864 +2024/09/13-17:21:56.398453 1505759 Options.max_write_buffer_number: 2 +2024/09/13-17:21:56.398458 1505759 Options.compression: NoCompression +2024/09/13-17:21:56.398462 1505759 Options.bottommost_compression: Disabled +2024/09/13-17:21:56.398467 1505759 Options.prefix_extractor: nullptr +2024/09/13-17:21:56.398471 1505759 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2024/09/13-17:21:56.398475 1505759 Options.num_levels: 7 +2024/09/13-17:21:56.398478 1505759 Options.min_write_buffer_number_to_merge: 1 +2024/09/13-17:21:56.398482 1505759 Options.max_write_buffer_number_to_maintain: 0 +2024/09/13-17:21:56.398486 1505759 Options.max_write_buffer_size_to_maintain: 0 +2024/09/13-17:21:56.398491 1505759 Options.bottommost_compression_opts.window_bits: -14 +2024/09/13-17:21:56.398495 1505759 Options.bottommost_compression_opts.level: 32767 +2024/09/13-17:21:56.398499 1505759 Options.bottommost_compression_opts.strategy: 0 +2024/09/13-17:21:56.398503 1505759 Options.bottommost_compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.398507 1505759 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.398511 1505759 Options.bottommost_compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.398514 1505759 Options.bottommost_compression_opts.enabled: false +2024/09/13-17:21:56.398519 1505759 
Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.398523 1505759 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.398526 1505759 Options.compression_opts.window_bits: -14 +2024/09/13-17:21:56.398530 1505759 Options.compression_opts.level: 32767 +2024/09/13-17:21:56.398534 1505759 Options.compression_opts.strategy: 0 +2024/09/13-17:21:56.398538 1505759 Options.compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.398542 1505759 Options.compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.398546 1505759 Options.compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.398550 1505759 Options.compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.398554 1505759 Options.compression_opts.enabled: false +2024/09/13-17:21:56.398558 1505759 Options.compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.398562 1505759 Options.level0_file_num_compaction_trigger: 4 +2024/09/13-17:21:56.398566 1505759 Options.level0_slowdown_writes_trigger: 20 +2024/09/13-17:21:56.398570 1505759 Options.level0_stop_writes_trigger: 36 +2024/09/13-17:21:56.398574 1505759 Options.target_file_size_base: 67108864 +2024/09/13-17:21:56.398577 1505759 Options.target_file_size_multiplier: 1 +2024/09/13-17:21:56.398581 1505759 Options.max_bytes_for_level_base: 268435456 +2024/09/13-17:21:56.398586 1505759 Options.level_compaction_dynamic_level_bytes: 1 +2024/09/13-17:21:56.398589 1505759 Options.max_bytes_for_level_multiplier: 10.000000 +2024/09/13-17:21:56.398596 1505759 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2024/09/13-17:21:56.398600 1505759 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2024/09/13-17:21:56.398604 1505759 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2024/09/13-17:21:56.398608 1505759 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2024/09/13-17:21:56.398612 1505759 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2024/09/13-17:21:56.398616 1505759 
Options.max_bytes_for_level_multiplier_addtl[5]: 1 +2024/09/13-17:21:56.398620 1505759 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2024/09/13-17:21:56.398624 1505759 Options.max_sequential_skip_in_iterations: 8 +2024/09/13-17:21:56.398628 1505759 Options.max_compaction_bytes: 1677721600 +2024/09/13-17:21:56.398632 1505759 Options.ignore_max_compaction_bytes_for_input: true +2024/09/13-17:21:56.398636 1505759 Options.arena_block_size: 1048576 +2024/09/13-17:21:56.398639 1505759 Options.soft_pending_compaction_bytes_limit: 68719476736 +2024/09/13-17:21:56.398643 1505759 Options.hard_pending_compaction_bytes_limit: 274877906944 +2024/09/13-17:21:56.398647 1505759 Options.disable_auto_compactions: 0 +2024/09/13-17:21:56.398653 1505759 Options.compaction_style: kCompactionStyleLevel +2024/09/13-17:21:56.398658 1505759 Options.compaction_pri: kMinOverlappingRatio +2024/09/13-17:21:56.398662 1505759 Options.compaction_options_universal.size_ratio: 1 +2024/09/13-17:21:56.398666 1505759 Options.compaction_options_universal.min_merge_width: 2 +2024/09/13-17:21:56.398670 1505759 Options.compaction_options_universal.max_merge_width: 4294967295 +2024/09/13-17:21:56.398674 1505759 Options.compaction_options_universal.max_size_amplification_percent: 200 +2024/09/13-17:21:56.398678 1505759 Options.compaction_options_universal.compression_size_percent: -1 +2024/09/13-17:21:56.398682 1505759 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2024/09/13-17:21:56.398686 1505759 Options.compaction_options_fifo.max_table_files_size: 1073741824 +2024/09/13-17:21:56.398690 1505759 Options.compaction_options_fifo.allow_compaction: 0 +2024/09/13-17:21:56.398698 1505759 Options.table_properties_collectors: +2024/09/13-17:21:56.398703 1505759 Options.inplace_update_support: 0 +2024/09/13-17:21:56.398706 1505759 Options.inplace_update_num_locks: 10000 +2024/09/13-17:21:56.398711 1505759 Options.memtable_prefix_bloom_size_ratio: 0.000000 
+2024/09/13-17:21:56.398715 1505759 Options.memtable_whole_key_filtering: 0 +2024/09/13-17:21:56.398719 1505759 Options.memtable_huge_page_size: 0 +2024/09/13-17:21:56.398724 1505759 Options.bloom_locality: 0 +2024/09/13-17:21:56.398728 1505759 Options.max_successive_merges: 0 +2024/09/13-17:21:56.398732 1505759 Options.optimize_filters_for_hits: 0 +2024/09/13-17:21:56.398736 1505759 Options.paranoid_file_checks: 0 +2024/09/13-17:21:56.398740 1505759 Options.force_consistency_checks: 1 +2024/09/13-17:21:56.398744 1505759 Options.report_bg_io_stats: 0 +2024/09/13-17:21:56.398748 1505759 Options.ttl: 2592000 +2024/09/13-17:21:56.398752 1505759 Options.periodic_compaction_seconds: 0 +2024/09/13-17:21:56.398757 1505759 Options.default_temperature: kUnknown +2024/09/13-17:21:56.398761 1505759 Options.preclude_last_level_data_seconds: 0 +2024/09/13-17:21:56.398765 1505759 Options.preserve_internal_time_seconds: 0 +2024/09/13-17:21:56.398769 1505759 Options.enable_blob_files: false +2024/09/13-17:21:56.398772 1505759 Options.min_blob_size: 0 +2024/09/13-17:21:56.398776 1505759 Options.blob_file_size: 268435456 +2024/09/13-17:21:56.398780 1505759 Options.blob_compression_type: NoCompression +2024/09/13-17:21:56.398784 1505759 Options.enable_blob_garbage_collection: false +2024/09/13-17:21:56.398788 1505759 Options.blob_garbage_collection_age_cutoff: 0.250000 +2024/09/13-17:21:56.398793 1505759 Options.blob_garbage_collection_force_threshold: 1.000000 +2024/09/13-17:21:56.398797 1505759 Options.blob_compaction_readahead_size: 0 +2024/09/13-17:21:56.398801 1505759 Options.blob_file_starting_level: 0 +2024/09/13-17:21:56.398805 1505759 Options.experimental_mempurge_threshold: 0.000000 +2024/09/13-17:21:56.398809 1505759 Options.memtable_max_range_deletions: 0 +2024/09/13-17:21:56.399291 1505759 [db/column_family.cc:616] --------------- Options for column family [index]: +2024/09/13-17:21:56.399304 1505759 Options.comparator: leveldb.BytewiseComparator 
+2024/09/13-17:21:56.399309 1505759 Options.merge_operator: None +2024/09/13-17:21:56.399313 1505759 Options.compaction_filter: None +2024/09/13-17:21:56.399317 1505759 Options.compaction_filter_factory: None +2024/09/13-17:21:56.399321 1505759 Options.sst_partitioner_factory: None +2024/09/13-17:21:56.399325 1505759 Options.memtable_factory: SkipListFactory +2024/09/13-17:21:56.399329 1505759 Options.table_factory: BlockBasedTable +2024/09/13-17:21:56.399373 1505759 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x59e4fb349640) cache_index_and_filter_blocks: 0 cache_index_and_filter_blocks_with_high_priority: 1 pin_l0_filter_and_index_blocks_in_cache: 0 @@ -260,7 +261,7 @@ data_block_hash_table_util_ratio: 0.750000 checksum: 4 no_block_cache: 0 - block_cache: 0x642fd9082240 + block_cache: 0x59e4fb349990 block_cache_name: LRUCache block_cache_options: capacity : 33554432 @@ -288,103 +289,103 @@ prepopulate_block_cache: 0 initial_auto_readahead_size: 8192 num_file_reads_for_auto_readahead: 2 -2024/08/14-23:25:07.320280 631181 Options.write_buffer_size: 67108864 -2024/08/14-23:25:07.320284 631181 Options.max_write_buffer_number: 2 -2024/08/14-23:25:07.320287 631181 Options.compression: NoCompression -2024/08/14-23:25:07.320291 631181 Options.bottommost_compression: Disabled -2024/08/14-23:25:07.320295 631181 Options.prefix_extractor: rocksdb.CappedPrefix -2024/08/14-23:25:07.320298 631181 Options.memtable_insert_with_hint_prefix_extractor: nullptr -2024/08/14-23:25:07.320301 631181 Options.num_levels: 7 -2024/08/14-23:25:07.320304 631181 Options.min_write_buffer_number_to_merge: 1 -2024/08/14-23:25:07.320308 631181 Options.max_write_buffer_number_to_maintain: 0 -2024/08/14-23:25:07.320311 631181 Options.max_write_buffer_size_to_maintain: 0 -2024/08/14-23:25:07.320314 631181 Options.bottommost_compression_opts.window_bits: -14 -2024/08/14-23:25:07.320317 631181 Options.bottommost_compression_opts.level: 32767 
-2024/08/14-23:25:07.320320 631181 Options.bottommost_compression_opts.strategy: 0 -2024/08/14-23:25:07.320323 631181 Options.bottommost_compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.320327 631181 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.320330 631181 Options.bottommost_compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.320333 631181 Options.bottommost_compression_opts.enabled: false -2024/08/14-23:25:07.320336 631181 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.320339 631181 Options.bottommost_compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.320343 631181 Options.compression_opts.window_bits: -14 -2024/08/14-23:25:07.320346 631181 Options.compression_opts.level: 32767 -2024/08/14-23:25:07.320349 631181 Options.compression_opts.strategy: 0 -2024/08/14-23:25:07.320352 631181 Options.compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.320355 631181 Options.compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.320359 631181 Options.compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.320362 631181 Options.compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.320365 631181 Options.compression_opts.enabled: false -2024/08/14-23:25:07.320368 631181 Options.compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.320371 631181 Options.level0_file_num_compaction_trigger: 4 -2024/08/14-23:25:07.320374 631181 Options.level0_slowdown_writes_trigger: 20 -2024/08/14-23:25:07.320378 631181 Options.level0_stop_writes_trigger: 36 -2024/08/14-23:25:07.320381 631181 Options.target_file_size_base: 67108864 -2024/08/14-23:25:07.320384 631181 Options.target_file_size_multiplier: 1 -2024/08/14-23:25:07.320387 631181 Options.max_bytes_for_level_base: 268435456 -2024/08/14-23:25:07.320390 631181 Options.level_compaction_dynamic_level_bytes: 1 -2024/08/14-23:25:07.320393 631181 Options.max_bytes_for_level_multiplier: 10.000000 
-2024/08/14-23:25:07.320398 631181 Options.max_bytes_for_level_multiplier_addtl[0]: 1 -2024/08/14-23:25:07.320401 631181 Options.max_bytes_for_level_multiplier_addtl[1]: 1 -2024/08/14-23:25:07.320404 631181 Options.max_bytes_for_level_multiplier_addtl[2]: 1 -2024/08/14-23:25:07.320408 631181 Options.max_bytes_for_level_multiplier_addtl[3]: 1 -2024/08/14-23:25:07.320411 631181 Options.max_bytes_for_level_multiplier_addtl[4]: 1 -2024/08/14-23:25:07.320414 631181 Options.max_bytes_for_level_multiplier_addtl[5]: 1 -2024/08/14-23:25:07.320417 631181 Options.max_bytes_for_level_multiplier_addtl[6]: 1 -2024/08/14-23:25:07.320420 631181 Options.max_sequential_skip_in_iterations: 8 -2024/08/14-23:25:07.320424 631181 Options.max_compaction_bytes: 1677721600 -2024/08/14-23:25:07.320427 631181 Options.ignore_max_compaction_bytes_for_input: true -2024/08/14-23:25:07.320433 631181 Options.arena_block_size: 1048576 -2024/08/14-23:25:07.320436 631181 Options.soft_pending_compaction_bytes_limit: 68719476736 -2024/08/14-23:25:07.320439 631181 Options.hard_pending_compaction_bytes_limit: 274877906944 -2024/08/14-23:25:07.320443 631181 Options.disable_auto_compactions: 0 -2024/08/14-23:25:07.320447 631181 Options.compaction_style: kCompactionStyleLevel -2024/08/14-23:25:07.320451 631181 Options.compaction_pri: kMinOverlappingRatio -2024/08/14-23:25:07.320455 631181 Options.compaction_options_universal.size_ratio: 1 -2024/08/14-23:25:07.320458 631181 Options.compaction_options_universal.min_merge_width: 2 -2024/08/14-23:25:07.320461 631181 Options.compaction_options_universal.max_merge_width: 4294967295 -2024/08/14-23:25:07.320464 631181 Options.compaction_options_universal.max_size_amplification_percent: 200 -2024/08/14-23:25:07.320467 631181 Options.compaction_options_universal.compression_size_percent: -1 -2024/08/14-23:25:07.320471 631181 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize -2024/08/14-23:25:07.320475 631181 
Options.compaction_options_fifo.max_table_files_size: 1073741824 -2024/08/14-23:25:07.320478 631181 Options.compaction_options_fifo.allow_compaction: 0 -2024/08/14-23:25:07.320483 631181 Options.table_properties_collectors: -2024/08/14-23:25:07.320486 631181 Options.inplace_update_support: 0 -2024/08/14-23:25:07.320489 631181 Options.inplace_update_num_locks: 10000 -2024/08/14-23:25:07.320493 631181 Options.memtable_prefix_bloom_size_ratio: 0.000000 -2024/08/14-23:25:07.320496 631181 Options.memtable_whole_key_filtering: 0 -2024/08/14-23:25:07.320499 631181 Options.memtable_huge_page_size: 0 -2024/08/14-23:25:07.320503 631181 Options.bloom_locality: 0 -2024/08/14-23:25:07.320506 631181 Options.max_successive_merges: 0 -2024/08/14-23:25:07.320509 631181 Options.optimize_filters_for_hits: 0 -2024/08/14-23:25:07.320512 631181 Options.paranoid_file_checks: 0 -2024/08/14-23:25:07.320515 631181 Options.force_consistency_checks: 1 -2024/08/14-23:25:07.320518 631181 Options.report_bg_io_stats: 0 -2024/08/14-23:25:07.320521 631181 Options.ttl: 2592000 -2024/08/14-23:25:07.320525 631181 Options.periodic_compaction_seconds: 0 -2024/08/14-23:25:07.320528 631181 Options.default_temperature: kUnknown -2024/08/14-23:25:07.320532 631181 Options.preclude_last_level_data_seconds: 0 -2024/08/14-23:25:07.320535 631181 Options.preserve_internal_time_seconds: 0 -2024/08/14-23:25:07.320538 631181 Options.enable_blob_files: false -2024/08/14-23:25:07.320541 631181 Options.min_blob_size: 0 -2024/08/14-23:25:07.320544 631181 Options.blob_file_size: 268435456 -2024/08/14-23:25:07.320548 631181 Options.blob_compression_type: NoCompression -2024/08/14-23:25:07.320551 631181 Options.enable_blob_garbage_collection: false -2024/08/14-23:25:07.320554 631181 Options.blob_garbage_collection_age_cutoff: 0.250000 -2024/08/14-23:25:07.320558 631181 Options.blob_garbage_collection_force_threshold: 1.000000 -2024/08/14-23:25:07.320561 631181 Options.blob_compaction_readahead_size: 0 
-2024/08/14-23:25:07.320564 631181 Options.blob_file_starting_level: 0 -2024/08/14-23:25:07.320568 631181 Options.experimental_mempurge_threshold: 0.000000 -2024/08/14-23:25:07.320571 631181 Options.memtable_max_range_deletions: 0 -2024/08/14-23:25:07.320874 631181 [db/column_family.cc:616] --------------- Options for column family [forward]: -2024/08/14-23:25:07.320884 631181 Options.comparator: leveldb.BytewiseComparator -2024/08/14-23:25:07.320888 631181 Options.merge_operator: None -2024/08/14-23:25:07.320901 631181 Options.compaction_filter: None -2024/08/14-23:25:07.320905 631181 Options.compaction_filter_factory: None -2024/08/14-23:25:07.320908 631181 Options.sst_partitioner_factory: None -2024/08/14-23:25:07.320912 631181 Options.memtable_factory: SkipListFactory -2024/08/14-23:25:07.320915 631181 Options.table_factory: BlockBasedTable -2024/08/14-23:25:07.320951 631181 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x642fd9081e40) +2024/09/13-17:21:56.399399 1505759 Options.write_buffer_size: 67108864 +2024/09/13-17:21:56.399405 1505759 Options.max_write_buffer_number: 2 +2024/09/13-17:21:56.399409 1505759 Options.compression: NoCompression +2024/09/13-17:21:56.399413 1505759 Options.bottommost_compression: Disabled +2024/09/13-17:21:56.399418 1505759 Options.prefix_extractor: rocksdb.CappedPrefix +2024/09/13-17:21:56.399422 1505759 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2024/09/13-17:21:56.399426 1505759 Options.num_levels: 7 +2024/09/13-17:21:56.399430 1505759 Options.min_write_buffer_number_to_merge: 1 +2024/09/13-17:21:56.399434 1505759 Options.max_write_buffer_number_to_maintain: 0 +2024/09/13-17:21:56.399438 1505759 Options.max_write_buffer_size_to_maintain: 0 +2024/09/13-17:21:56.399442 1505759 Options.bottommost_compression_opts.window_bits: -14 +2024/09/13-17:21:56.399446 1505759 Options.bottommost_compression_opts.level: 32767 +2024/09/13-17:21:56.399450 1505759 
Options.bottommost_compression_opts.strategy: 0 +2024/09/13-17:21:56.399454 1505759 Options.bottommost_compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.399458 1505759 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.399462 1505759 Options.bottommost_compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.399467 1505759 Options.bottommost_compression_opts.enabled: false +2024/09/13-17:21:56.399470 1505759 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.399474 1505759 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.399478 1505759 Options.compression_opts.window_bits: -14 +2024/09/13-17:21:56.399482 1505759 Options.compression_opts.level: 32767 +2024/09/13-17:21:56.399486 1505759 Options.compression_opts.strategy: 0 +2024/09/13-17:21:56.399490 1505759 Options.compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.399494 1505759 Options.compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.399498 1505759 Options.compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.399502 1505759 Options.compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.399506 1505759 Options.compression_opts.enabled: false +2024/09/13-17:21:56.399510 1505759 Options.compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.399514 1505759 Options.level0_file_num_compaction_trigger: 4 +2024/09/13-17:21:56.399518 1505759 Options.level0_slowdown_writes_trigger: 20 +2024/09/13-17:21:56.399521 1505759 Options.level0_stop_writes_trigger: 36 +2024/09/13-17:21:56.399525 1505759 Options.target_file_size_base: 67108864 +2024/09/13-17:21:56.399529 1505759 Options.target_file_size_multiplier: 1 +2024/09/13-17:21:56.399533 1505759 Options.max_bytes_for_level_base: 268435456 +2024/09/13-17:21:56.399537 1505759 Options.level_compaction_dynamic_level_bytes: 1 +2024/09/13-17:21:56.399541 1505759 Options.max_bytes_for_level_multiplier: 10.000000 +2024/09/13-17:21:56.399547 
1505759 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2024/09/13-17:21:56.399551 1505759 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2024/09/13-17:21:56.399555 1505759 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2024/09/13-17:21:56.399559 1505759 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2024/09/13-17:21:56.399563 1505759 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2024/09/13-17:21:56.399572 1505759 Options.max_bytes_for_level_multiplier_addtl[5]: 1 +2024/09/13-17:21:56.399576 1505759 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2024/09/13-17:21:56.399580 1505759 Options.max_sequential_skip_in_iterations: 8 +2024/09/13-17:21:56.399584 1505759 Options.max_compaction_bytes: 1677721600 +2024/09/13-17:21:56.399588 1505759 Options.ignore_max_compaction_bytes_for_input: true +2024/09/13-17:21:56.399592 1505759 Options.arena_block_size: 1048576 +2024/09/13-17:21:56.399596 1505759 Options.soft_pending_compaction_bytes_limit: 68719476736 +2024/09/13-17:21:56.399600 1505759 Options.hard_pending_compaction_bytes_limit: 274877906944 +2024/09/13-17:21:56.399604 1505759 Options.disable_auto_compactions: 0 +2024/09/13-17:21:56.399609 1505759 Options.compaction_style: kCompactionStyleLevel +2024/09/13-17:21:56.399614 1505759 Options.compaction_pri: kMinOverlappingRatio +2024/09/13-17:21:56.399618 1505759 Options.compaction_options_universal.size_ratio: 1 +2024/09/13-17:21:56.399622 1505759 Options.compaction_options_universal.min_merge_width: 2 +2024/09/13-17:21:56.399626 1505759 Options.compaction_options_universal.max_merge_width: 4294967295 +2024/09/13-17:21:56.399630 1505759 Options.compaction_options_universal.max_size_amplification_percent: 200 +2024/09/13-17:21:56.399634 1505759 Options.compaction_options_universal.compression_size_percent: -1 +2024/09/13-17:21:56.399639 1505759 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2024/09/13-17:21:56.399643 1505759 
Options.compaction_options_fifo.max_table_files_size: 1073741824 +2024/09/13-17:21:56.399647 1505759 Options.compaction_options_fifo.allow_compaction: 0 +2024/09/13-17:21:56.399654 1505759 Options.table_properties_collectors: +2024/09/13-17:21:56.399658 1505759 Options.inplace_update_support: 0 +2024/09/13-17:21:56.399662 1505759 Options.inplace_update_num_locks: 10000 +2024/09/13-17:21:56.399666 1505759 Options.memtable_prefix_bloom_size_ratio: 0.000000 +2024/09/13-17:21:56.399670 1505759 Options.memtable_whole_key_filtering: 0 +2024/09/13-17:21:56.399674 1505759 Options.memtable_huge_page_size: 0 +2024/09/13-17:21:56.399678 1505759 Options.bloom_locality: 0 +2024/09/13-17:21:56.399682 1505759 Options.max_successive_merges: 0 +2024/09/13-17:21:56.399686 1505759 Options.optimize_filters_for_hits: 0 +2024/09/13-17:21:56.399690 1505759 Options.paranoid_file_checks: 0 +2024/09/13-17:21:56.399694 1505759 Options.force_consistency_checks: 1 +2024/09/13-17:21:56.399698 1505759 Options.report_bg_io_stats: 0 +2024/09/13-17:21:56.399702 1505759 Options.ttl: 2592000 +2024/09/13-17:21:56.399706 1505759 Options.periodic_compaction_seconds: 0 +2024/09/13-17:21:56.399710 1505759 Options.default_temperature: kUnknown +2024/09/13-17:21:56.399714 1505759 Options.preclude_last_level_data_seconds: 0 +2024/09/13-17:21:56.399718 1505759 Options.preserve_internal_time_seconds: 0 +2024/09/13-17:21:56.399722 1505759 Options.enable_blob_files: false +2024/09/13-17:21:56.399726 1505759 Options.min_blob_size: 0 +2024/09/13-17:21:56.399730 1505759 Options.blob_file_size: 268435456 +2024/09/13-17:21:56.399734 1505759 Options.blob_compression_type: NoCompression +2024/09/13-17:21:56.399738 1505759 Options.enable_blob_garbage_collection: false +2024/09/13-17:21:56.399742 1505759 Options.blob_garbage_collection_age_cutoff: 0.250000 +2024/09/13-17:21:56.399747 1505759 Options.blob_garbage_collection_force_threshold: 1.000000 +2024/09/13-17:21:56.399751 1505759 
Options.blob_compaction_readahead_size: 0 +2024/09/13-17:21:56.399755 1505759 Options.blob_file_starting_level: 0 +2024/09/13-17:21:56.399759 1505759 Options.experimental_mempurge_threshold: 0.000000 +2024/09/13-17:21:56.399767 1505759 Options.memtable_max_range_deletions: 0 +2024/09/13-17:21:56.400183 1505759 [db/column_family.cc:616] --------------- Options for column family [forward]: +2024/09/13-17:21:56.400197 1505759 Options.comparator: leveldb.BytewiseComparator +2024/09/13-17:21:56.400201 1505759 Options.merge_operator: None +2024/09/13-17:21:56.400205 1505759 Options.compaction_filter: None +2024/09/13-17:21:56.400209 1505759 Options.compaction_filter_factory: None +2024/09/13-17:21:56.400213 1505759 Options.sst_partitioner_factory: None +2024/09/13-17:21:56.400218 1505759 Options.memtable_factory: SkipListFactory +2024/09/13-17:21:56.400222 1505759 Options.table_factory: BlockBasedTable +2024/09/13-17:21:56.400267 1505759 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x59e4fb3495b0) cache_index_and_filter_blocks: 0 cache_index_and_filter_blocks_with_high_priority: 1 pin_l0_filter_and_index_blocks_in_cache: 0 @@ -395,7 +396,7 @@ data_block_hash_table_util_ratio: 0.750000 checksum: 4 no_block_cache: 0 - block_cache: 0x642fd907a710 + block_cache: 0x59e4fb341e80 block_cache_name: LRUCache block_cache_options: capacity : 33554432 @@ -423,103 +424,103 @@ prepopulate_block_cache: 0 initial_auto_readahead_size: 8192 num_file_reads_for_auto_readahead: 2 -2024/08/14-23:25:07.320969 631181 Options.write_buffer_size: 67108864 -2024/08/14-23:25:07.320973 631181 Options.max_write_buffer_number: 2 -2024/08/14-23:25:07.320976 631181 Options.compression: NoCompression -2024/08/14-23:25:07.320980 631181 Options.bottommost_compression: Disabled -2024/08/14-23:25:07.320983 631181 Options.prefix_extractor: nullptr -2024/08/14-23:25:07.320986 631181 Options.memtable_insert_with_hint_prefix_extractor: nullptr -2024/08/14-23:25:07.320990 
631181 Options.num_levels: 7 -2024/08/14-23:25:07.320993 631181 Options.min_write_buffer_number_to_merge: 1 -2024/08/14-23:25:07.320996 631181 Options.max_write_buffer_number_to_maintain: 0 -2024/08/14-23:25:07.320999 631181 Options.max_write_buffer_size_to_maintain: 0 -2024/08/14-23:25:07.321002 631181 Options.bottommost_compression_opts.window_bits: -14 -2024/08/14-23:25:07.321006 631181 Options.bottommost_compression_opts.level: 32767 -2024/08/14-23:25:07.321009 631181 Options.bottommost_compression_opts.strategy: 0 -2024/08/14-23:25:07.321012 631181 Options.bottommost_compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.321015 631181 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.321019 631181 Options.bottommost_compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.321022 631181 Options.bottommost_compression_opts.enabled: false -2024/08/14-23:25:07.321025 631181 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.321028 631181 Options.bottommost_compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.321031 631181 Options.compression_opts.window_bits: -14 -2024/08/14-23:25:07.321035 631181 Options.compression_opts.level: 32767 -2024/08/14-23:25:07.321038 631181 Options.compression_opts.strategy: 0 -2024/08/14-23:25:07.321041 631181 Options.compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.321044 631181 Options.compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.321047 631181 Options.compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.321051 631181 Options.compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.321054 631181 Options.compression_opts.enabled: false -2024/08/14-23:25:07.321057 631181 Options.compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.321063 631181 Options.level0_file_num_compaction_trigger: 4 -2024/08/14-23:25:07.321067 631181 Options.level0_slowdown_writes_trigger: 20 -2024/08/14-23:25:07.321070 631181 
Options.level0_stop_writes_trigger: 36 -2024/08/14-23:25:07.321073 631181 Options.target_file_size_base: 67108864 -2024/08/14-23:25:07.321076 631181 Options.target_file_size_multiplier: 1 -2024/08/14-23:25:07.321079 631181 Options.max_bytes_for_level_base: 268435456 -2024/08/14-23:25:07.321083 631181 Options.level_compaction_dynamic_level_bytes: 1 -2024/08/14-23:25:07.321086 631181 Options.max_bytes_for_level_multiplier: 10.000000 -2024/08/14-23:25:07.321090 631181 Options.max_bytes_for_level_multiplier_addtl[0]: 1 -2024/08/14-23:25:07.321094 631181 Options.max_bytes_for_level_multiplier_addtl[1]: 1 -2024/08/14-23:25:07.321097 631181 Options.max_bytes_for_level_multiplier_addtl[2]: 1 -2024/08/14-23:25:07.321100 631181 Options.max_bytes_for_level_multiplier_addtl[3]: 1 -2024/08/14-23:25:07.321103 631181 Options.max_bytes_for_level_multiplier_addtl[4]: 1 -2024/08/14-23:25:07.321107 631181 Options.max_bytes_for_level_multiplier_addtl[5]: 1 -2024/08/14-23:25:07.321110 631181 Options.max_bytes_for_level_multiplier_addtl[6]: 1 -2024/08/14-23:25:07.321113 631181 Options.max_sequential_skip_in_iterations: 8 -2024/08/14-23:25:07.321116 631181 Options.max_compaction_bytes: 1677721600 -2024/08/14-23:25:07.321119 631181 Options.ignore_max_compaction_bytes_for_input: true -2024/08/14-23:25:07.321123 631181 Options.arena_block_size: 1048576 -2024/08/14-23:25:07.321126 631181 Options.soft_pending_compaction_bytes_limit: 68719476736 -2024/08/14-23:25:07.321129 631181 Options.hard_pending_compaction_bytes_limit: 274877906944 -2024/08/14-23:25:07.321132 631181 Options.disable_auto_compactions: 0 -2024/08/14-23:25:07.321137 631181 Options.compaction_style: kCompactionStyleLevel -2024/08/14-23:25:07.321141 631181 Options.compaction_pri: kMinOverlappingRatio -2024/08/14-23:25:07.321144 631181 Options.compaction_options_universal.size_ratio: 1 -2024/08/14-23:25:07.321147 631181 Options.compaction_options_universal.min_merge_width: 2 -2024/08/14-23:25:07.321150 631181 
Options.compaction_options_universal.max_merge_width: 4294967295 -2024/08/14-23:25:07.321154 631181 Options.compaction_options_universal.max_size_amplification_percent: 200 -2024/08/14-23:25:07.321157 631181 Options.compaction_options_universal.compression_size_percent: -1 -2024/08/14-23:25:07.321161 631181 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize -2024/08/14-23:25:07.321164 631181 Options.compaction_options_fifo.max_table_files_size: 1073741824 -2024/08/14-23:25:07.321167 631181 Options.compaction_options_fifo.allow_compaction: 0 -2024/08/14-23:25:07.321172 631181 Options.table_properties_collectors: -2024/08/14-23:25:07.321175 631181 Options.inplace_update_support: 0 -2024/08/14-23:25:07.321179 631181 Options.inplace_update_num_locks: 10000 -2024/08/14-23:25:07.321182 631181 Options.memtable_prefix_bloom_size_ratio: 0.000000 -2024/08/14-23:25:07.321185 631181 Options.memtable_whole_key_filtering: 0 -2024/08/14-23:25:07.321189 631181 Options.memtable_huge_page_size: 0 -2024/08/14-23:25:07.321192 631181 Options.bloom_locality: 0 -2024/08/14-23:25:07.321195 631181 Options.max_successive_merges: 0 -2024/08/14-23:25:07.321198 631181 Options.optimize_filters_for_hits: 0 -2024/08/14-23:25:07.321201 631181 Options.paranoid_file_checks: 0 -2024/08/14-23:25:07.321204 631181 Options.force_consistency_checks: 1 -2024/08/14-23:25:07.321208 631181 Options.report_bg_io_stats: 0 -2024/08/14-23:25:07.321211 631181 Options.ttl: 2592000 -2024/08/14-23:25:07.321217 631181 Options.periodic_compaction_seconds: 0 -2024/08/14-23:25:07.321221 631181 Options.default_temperature: kUnknown -2024/08/14-23:25:07.321224 631181 Options.preclude_last_level_data_seconds: 0 -2024/08/14-23:25:07.321227 631181 Options.preserve_internal_time_seconds: 0 -2024/08/14-23:25:07.321230 631181 Options.enable_blob_files: false -2024/08/14-23:25:07.321234 631181 Options.min_blob_size: 0 -2024/08/14-23:25:07.321237 631181 Options.blob_file_size: 268435456 
-2024/08/14-23:25:07.321240 631181 Options.blob_compression_type: NoCompression -2024/08/14-23:25:07.321243 631181 Options.enable_blob_garbage_collection: false -2024/08/14-23:25:07.321247 631181 Options.blob_garbage_collection_age_cutoff: 0.250000 -2024/08/14-23:25:07.321250 631181 Options.blob_garbage_collection_force_threshold: 1.000000 -2024/08/14-23:25:07.321254 631181 Options.blob_compaction_readahead_size: 0 -2024/08/14-23:25:07.321257 631181 Options.blob_file_starting_level: 0 -2024/08/14-23:25:07.321260 631181 Options.experimental_mempurge_threshold: 0.000000 -2024/08/14-23:25:07.321264 631181 Options.memtable_max_range_deletions: 0 -2024/08/14-23:25:07.321529 631181 [db/column_family.cc:616] --------------- Options for column family [codes]: -2024/08/14-23:25:07.321539 631181 Options.comparator: leveldb.BytewiseComparator -2024/08/14-23:25:07.321542 631181 Options.merge_operator: None -2024/08/14-23:25:07.321545 631181 Options.compaction_filter: None -2024/08/14-23:25:07.321549 631181 Options.compaction_filter_factory: None -2024/08/14-23:25:07.321552 631181 Options.sst_partitioner_factory: None -2024/08/14-23:25:07.321555 631181 Options.memtable_factory: SkipListFactory -2024/08/14-23:25:07.321559 631181 Options.table_factory: BlockBasedTable -2024/08/14-23:25:07.321597 631181 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x642fd907a9b0) +2024/09/13-17:21:56.400287 1505759 Options.write_buffer_size: 67108864 +2024/09/13-17:21:56.400292 1505759 Options.max_write_buffer_number: 2 +2024/09/13-17:21:56.400297 1505759 Options.compression: NoCompression +2024/09/13-17:21:56.400301 1505759 Options.bottommost_compression: Disabled +2024/09/13-17:21:56.400305 1505759 Options.prefix_extractor: nullptr +2024/09/13-17:21:56.400309 1505759 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2024/09/13-17:21:56.400313 1505759 Options.num_levels: 7 +2024/09/13-17:21:56.400317 1505759 Options.min_write_buffer_number_to_merge: 
1 +2024/09/13-17:21:56.400321 1505759 Options.max_write_buffer_number_to_maintain: 0 +2024/09/13-17:21:56.400325 1505759 Options.max_write_buffer_size_to_maintain: 0 +2024/09/13-17:21:56.400329 1505759 Options.bottommost_compression_opts.window_bits: -14 +2024/09/13-17:21:56.400333 1505759 Options.bottommost_compression_opts.level: 32767 +2024/09/13-17:21:56.400336 1505759 Options.bottommost_compression_opts.strategy: 0 +2024/09/13-17:21:56.400340 1505759 Options.bottommost_compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.400344 1505759 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.400348 1505759 Options.bottommost_compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.400352 1505759 Options.bottommost_compression_opts.enabled: false +2024/09/13-17:21:56.400356 1505759 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.400360 1505759 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.400364 1505759 Options.compression_opts.window_bits: -14 +2024/09/13-17:21:56.400368 1505759 Options.compression_opts.level: 32767 +2024/09/13-17:21:56.400372 1505759 Options.compression_opts.strategy: 0 +2024/09/13-17:21:56.400376 1505759 Options.compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.400384 1505759 Options.compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.400388 1505759 Options.compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.400392 1505759 Options.compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.400396 1505759 Options.compression_opts.enabled: false +2024/09/13-17:21:56.400400 1505759 Options.compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.400404 1505759 Options.level0_file_num_compaction_trigger: 4 +2024/09/13-17:21:56.400408 1505759 Options.level0_slowdown_writes_trigger: 20 +2024/09/13-17:21:56.400412 1505759 Options.level0_stop_writes_trigger: 36 +2024/09/13-17:21:56.400416 1505759 
Options.target_file_size_base: 67108864 +2024/09/13-17:21:56.400420 1505759 Options.target_file_size_multiplier: 1 +2024/09/13-17:21:56.400424 1505759 Options.max_bytes_for_level_base: 268435456 +2024/09/13-17:21:56.400428 1505759 Options.level_compaction_dynamic_level_bytes: 1 +2024/09/13-17:21:56.400432 1505759 Options.max_bytes_for_level_multiplier: 10.000000 +2024/09/13-17:21:56.400437 1505759 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2024/09/13-17:21:56.400441 1505759 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2024/09/13-17:21:56.400445 1505759 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2024/09/13-17:21:56.400449 1505759 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2024/09/13-17:21:56.400453 1505759 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2024/09/13-17:21:56.400457 1505759 Options.max_bytes_for_level_multiplier_addtl[5]: 1 +2024/09/13-17:21:56.400461 1505759 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2024/09/13-17:21:56.400465 1505759 Options.max_sequential_skip_in_iterations: 8 +2024/09/13-17:21:56.400469 1505759 Options.max_compaction_bytes: 1677721600 +2024/09/13-17:21:56.400473 1505759 Options.ignore_max_compaction_bytes_for_input: true +2024/09/13-17:21:56.400477 1505759 Options.arena_block_size: 1048576 +2024/09/13-17:21:56.400481 1505759 Options.soft_pending_compaction_bytes_limit: 68719476736 +2024/09/13-17:21:56.400485 1505759 Options.hard_pending_compaction_bytes_limit: 274877906944 +2024/09/13-17:21:56.400489 1505759 Options.disable_auto_compactions: 0 +2024/09/13-17:21:56.400494 1505759 Options.compaction_style: kCompactionStyleLevel +2024/09/13-17:21:56.400499 1505759 Options.compaction_pri: kMinOverlappingRatio +2024/09/13-17:21:56.400503 1505759 Options.compaction_options_universal.size_ratio: 1 +2024/09/13-17:21:56.400507 1505759 Options.compaction_options_universal.min_merge_width: 2 +2024/09/13-17:21:56.400511 1505759 Options.compaction_options_universal.max_merge_width: 4294967295 
+2024/09/13-17:21:56.400515 1505759 Options.compaction_options_universal.max_size_amplification_percent: 200 +2024/09/13-17:21:56.400519 1505759 Options.compaction_options_universal.compression_size_percent: -1 +2024/09/13-17:21:56.400524 1505759 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2024/09/13-17:21:56.400528 1505759 Options.compaction_options_fifo.max_table_files_size: 1073741824 +2024/09/13-17:21:56.400532 1505759 Options.compaction_options_fifo.allow_compaction: 0 +2024/09/13-17:21:56.400539 1505759 Options.table_properties_collectors: +2024/09/13-17:21:56.400543 1505759 Options.inplace_update_support: 0 +2024/09/13-17:21:56.400547 1505759 Options.inplace_update_num_locks: 10000 +2024/09/13-17:21:56.400551 1505759 Options.memtable_prefix_bloom_size_ratio: 0.000000 +2024/09/13-17:21:56.400555 1505759 Options.memtable_whole_key_filtering: 0 +2024/09/13-17:21:56.400559 1505759 Options.memtable_huge_page_size: 0 +2024/09/13-17:21:56.400563 1505759 Options.bloom_locality: 0 +2024/09/13-17:21:56.400567 1505759 Options.max_successive_merges: 0 +2024/09/13-17:21:56.400575 1505759 Options.optimize_filters_for_hits: 0 +2024/09/13-17:21:56.400579 1505759 Options.paranoid_file_checks: 0 +2024/09/13-17:21:56.400583 1505759 Options.force_consistency_checks: 1 +2024/09/13-17:21:56.400587 1505759 Options.report_bg_io_stats: 0 +2024/09/13-17:21:56.400591 1505759 Options.ttl: 2592000 +2024/09/13-17:21:56.400595 1505759 Options.periodic_compaction_seconds: 0 +2024/09/13-17:21:56.400599 1505759 Options.default_temperature: kUnknown +2024/09/13-17:21:56.400604 1505759 Options.preclude_last_level_data_seconds: 0 +2024/09/13-17:21:56.400608 1505759 Options.preserve_internal_time_seconds: 0 +2024/09/13-17:21:56.400612 1505759 Options.enable_blob_files: false +2024/09/13-17:21:56.400616 1505759 Options.min_blob_size: 0 +2024/09/13-17:21:56.400620 1505759 Options.blob_file_size: 268435456 +2024/09/13-17:21:56.400624 1505759 
Options.blob_compression_type: NoCompression +2024/09/13-17:21:56.400628 1505759 Options.enable_blob_garbage_collection: false +2024/09/13-17:21:56.400632 1505759 Options.blob_garbage_collection_age_cutoff: 0.250000 +2024/09/13-17:21:56.400636 1505759 Options.blob_garbage_collection_force_threshold: 1.000000 +2024/09/13-17:21:56.400641 1505759 Options.blob_compaction_readahead_size: 0 +2024/09/13-17:21:56.400644 1505759 Options.blob_file_starting_level: 0 +2024/09/13-17:21:56.400648 1505759 Options.experimental_mempurge_threshold: 0.000000 +2024/09/13-17:21:56.400653 1505759 Options.memtable_max_range_deletions: 0 +2024/09/13-17:21:56.401006 1505759 [db/column_family.cc:616] --------------- Options for column family [codes]: +2024/09/13-17:21:56.401020 1505759 Options.comparator: leveldb.BytewiseComparator +2024/09/13-17:21:56.401024 1505759 Options.merge_operator: None +2024/09/13-17:21:56.401028 1505759 Options.compaction_filter: None +2024/09/13-17:21:56.401032 1505759 Options.compaction_filter_factory: None +2024/09/13-17:21:56.401036 1505759 Options.sst_partitioner_factory: None +2024/09/13-17:21:56.401041 1505759 Options.memtable_factory: SkipListFactory +2024/09/13-17:21:56.401045 1505759 Options.table_factory: BlockBasedTable +2024/09/13-17:21:56.401096 1505759 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x59e4fb342130) cache_index_and_filter_blocks: 0 cache_index_and_filter_blocks_with_high_priority: 1 pin_l0_filter_and_index_blocks_in_cache: 0 @@ -530,7 +531,7 @@ data_block_hash_table_util_ratio: 0.750000 checksum: 4 no_block_cache: 0 - block_cache: 0x642fd907ace0 + block_cache: 0x59e4fb342460 block_cache_name: LRUCache block_cache_options: capacity : 33554432 @@ -558,103 +559,103 @@ prepopulate_block_cache: 0 initial_auto_readahead_size: 8192 num_file_reads_for_auto_readahead: 2 -2024/08/14-23:25:07.321613 631181 Options.write_buffer_size: 67108864 -2024/08/14-23:25:07.321617 631181 Options.max_write_buffer_number: 2 
-2024/08/14-23:25:07.321621 631181 Options.compression: NoCompression -2024/08/14-23:25:07.321624 631181 Options.bottommost_compression: Disabled -2024/08/14-23:25:07.321627 631181 Options.prefix_extractor: nullptr -2024/08/14-23:25:07.321630 631181 Options.memtable_insert_with_hint_prefix_extractor: nullptr -2024/08/14-23:25:07.321634 631181 Options.num_levels: 7 -2024/08/14-23:25:07.321637 631181 Options.min_write_buffer_number_to_merge: 1 -2024/08/14-23:25:07.321640 631181 Options.max_write_buffer_number_to_maintain: 0 -2024/08/14-23:25:07.321643 631181 Options.max_write_buffer_size_to_maintain: 0 -2024/08/14-23:25:07.321647 631181 Options.bottommost_compression_opts.window_bits: -14 -2024/08/14-23:25:07.321653 631181 Options.bottommost_compression_opts.level: 32767 -2024/08/14-23:25:07.321656 631181 Options.bottommost_compression_opts.strategy: 0 -2024/08/14-23:25:07.321659 631181 Options.bottommost_compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.321663 631181 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.321666 631181 Options.bottommost_compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.321669 631181 Options.bottommost_compression_opts.enabled: false -2024/08/14-23:25:07.321672 631181 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.321675 631181 Options.bottommost_compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.321679 631181 Options.compression_opts.window_bits: -14 -2024/08/14-23:25:07.321682 631181 Options.compression_opts.level: 32767 -2024/08/14-23:25:07.321685 631181 Options.compression_opts.strategy: 0 -2024/08/14-23:25:07.321688 631181 Options.compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.321691 631181 Options.compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.321694 631181 Options.compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.321698 631181 Options.compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.321701 
631181 Options.compression_opts.enabled: false -2024/08/14-23:25:07.321704 631181 Options.compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.321707 631181 Options.level0_file_num_compaction_trigger: 4 -2024/08/14-23:25:07.321710 631181 Options.level0_slowdown_writes_trigger: 20 -2024/08/14-23:25:07.321713 631181 Options.level0_stop_writes_trigger: 36 -2024/08/14-23:25:07.321717 631181 Options.target_file_size_base: 67108864 -2024/08/14-23:25:07.321720 631181 Options.target_file_size_multiplier: 1 -2024/08/14-23:25:07.321723 631181 Options.max_bytes_for_level_base: 268435456 -2024/08/14-23:25:07.321726 631181 Options.level_compaction_dynamic_level_bytes: 1 -2024/08/14-23:25:07.321729 631181 Options.max_bytes_for_level_multiplier: 10.000000 -2024/08/14-23:25:07.321734 631181 Options.max_bytes_for_level_multiplier_addtl[0]: 1 -2024/08/14-23:25:07.321737 631181 Options.max_bytes_for_level_multiplier_addtl[1]: 1 -2024/08/14-23:25:07.321740 631181 Options.max_bytes_for_level_multiplier_addtl[2]: 1 -2024/08/14-23:25:07.321744 631181 Options.max_bytes_for_level_multiplier_addtl[3]: 1 -2024/08/14-23:25:07.321747 631181 Options.max_bytes_for_level_multiplier_addtl[4]: 1 -2024/08/14-23:25:07.321750 631181 Options.max_bytes_for_level_multiplier_addtl[5]: 1 -2024/08/14-23:25:07.321753 631181 Options.max_bytes_for_level_multiplier_addtl[6]: 1 -2024/08/14-23:25:07.321756 631181 Options.max_sequential_skip_in_iterations: 8 -2024/08/14-23:25:07.321760 631181 Options.max_compaction_bytes: 1677721600 -2024/08/14-23:25:07.321763 631181 Options.ignore_max_compaction_bytes_for_input: true -2024/08/14-23:25:07.321766 631181 Options.arena_block_size: 1048576 -2024/08/14-23:25:07.321769 631181 Options.soft_pending_compaction_bytes_limit: 68719476736 -2024/08/14-23:25:07.321772 631181 Options.hard_pending_compaction_bytes_limit: 274877906944 -2024/08/14-23:25:07.321776 631181 Options.disable_auto_compactions: 0 -2024/08/14-23:25:07.321780 631181 Options.compaction_style: 
kCompactionStyleLevel -2024/08/14-23:25:07.321784 631181 Options.compaction_pri: kMinOverlappingRatio -2024/08/14-23:25:07.321787 631181 Options.compaction_options_universal.size_ratio: 1 -2024/08/14-23:25:07.321790 631181 Options.compaction_options_universal.min_merge_width: 2 -2024/08/14-23:25:07.321793 631181 Options.compaction_options_universal.max_merge_width: 4294967295 -2024/08/14-23:25:07.321796 631181 Options.compaction_options_universal.max_size_amplification_percent: 200 -2024/08/14-23:25:07.321802 631181 Options.compaction_options_universal.compression_size_percent: -1 -2024/08/14-23:25:07.321806 631181 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize -2024/08/14-23:25:07.321810 631181 Options.compaction_options_fifo.max_table_files_size: 1073741824 -2024/08/14-23:25:07.321813 631181 Options.compaction_options_fifo.allow_compaction: 0 -2024/08/14-23:25:07.321818 631181 Options.table_properties_collectors: -2024/08/14-23:25:07.321821 631181 Options.inplace_update_support: 0 -2024/08/14-23:25:07.321824 631181 Options.inplace_update_num_locks: 10000 -2024/08/14-23:25:07.321828 631181 Options.memtable_prefix_bloom_size_ratio: 0.000000 -2024/08/14-23:25:07.321831 631181 Options.memtable_whole_key_filtering: 0 -2024/08/14-23:25:07.321834 631181 Options.memtable_huge_page_size: 0 -2024/08/14-23:25:07.321838 631181 Options.bloom_locality: 0 -2024/08/14-23:25:07.321841 631181 Options.max_successive_merges: 0 -2024/08/14-23:25:07.321844 631181 Options.optimize_filters_for_hits: 0 -2024/08/14-23:25:07.321847 631181 Options.paranoid_file_checks: 0 -2024/08/14-23:25:07.321850 631181 Options.force_consistency_checks: 1 -2024/08/14-23:25:07.321853 631181 Options.report_bg_io_stats: 0 -2024/08/14-23:25:07.321856 631181 Options.ttl: 2592000 -2024/08/14-23:25:07.321859 631181 Options.periodic_compaction_seconds: 0 -2024/08/14-23:25:07.321863 631181 Options.default_temperature: kUnknown -2024/08/14-23:25:07.321866 631181 
Options.preclude_last_level_data_seconds: 0 -2024/08/14-23:25:07.321870 631181 Options.preserve_internal_time_seconds: 0 -2024/08/14-23:25:07.321873 631181 Options.enable_blob_files: false -2024/08/14-23:25:07.321876 631181 Options.min_blob_size: 0 -2024/08/14-23:25:07.321879 631181 Options.blob_file_size: 268435456 -2024/08/14-23:25:07.321882 631181 Options.blob_compression_type: NoCompression -2024/08/14-23:25:07.321886 631181 Options.enable_blob_garbage_collection: false -2024/08/14-23:25:07.321889 631181 Options.blob_garbage_collection_age_cutoff: 0.250000 -2024/08/14-23:25:07.321892 631181 Options.blob_garbage_collection_force_threshold: 1.000000 -2024/08/14-23:25:07.321896 631181 Options.blob_compaction_readahead_size: 0 -2024/08/14-23:25:07.321899 631181 Options.blob_file_starting_level: 0 -2024/08/14-23:25:07.321902 631181 Options.experimental_mempurge_threshold: 0.000000 -2024/08/14-23:25:07.321906 631181 Options.memtable_max_range_deletions: 0 -2024/08/14-23:25:07.322183 631181 [db/column_family.cc:616] --------------- Options for column family [residuals]: -2024/08/14-23:25:07.322194 631181 Options.comparator: leveldb.BytewiseComparator -2024/08/14-23:25:07.322197 631181 Options.merge_operator: None -2024/08/14-23:25:07.322200 631181 Options.compaction_filter: None -2024/08/14-23:25:07.322204 631181 Options.compaction_filter_factory: None -2024/08/14-23:25:07.322207 631181 Options.sst_partitioner_factory: None -2024/08/14-23:25:07.322210 631181 Options.memtable_factory: SkipListFactory -2024/08/14-23:25:07.322213 631181 Options.table_factory: BlockBasedTable -2024/08/14-23:25:07.322252 631181 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x642fd907d4e0) +2024/09/13-17:21:56.401118 1505759 Options.write_buffer_size: 67108864 +2024/09/13-17:21:56.401124 1505759 Options.max_write_buffer_number: 2 +2024/09/13-17:21:56.401128 1505759 Options.compression: NoCompression +2024/09/13-17:21:56.401132 1505759 
Options.bottommost_compression: Disabled +2024/09/13-17:21:56.401137 1505759 Options.prefix_extractor: nullptr +2024/09/13-17:21:56.401146 1505759 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2024/09/13-17:21:56.401150 1505759 Options.num_levels: 7 +2024/09/13-17:21:56.401154 1505759 Options.min_write_buffer_number_to_merge: 1 +2024/09/13-17:21:56.401158 1505759 Options.max_write_buffer_number_to_maintain: 0 +2024/09/13-17:21:56.401162 1505759 Options.max_write_buffer_size_to_maintain: 0 +2024/09/13-17:21:56.401166 1505759 Options.bottommost_compression_opts.window_bits: -14 +2024/09/13-17:21:56.401170 1505759 Options.bottommost_compression_opts.level: 32767 +2024/09/13-17:21:56.401174 1505759 Options.bottommost_compression_opts.strategy: 0 +2024/09/13-17:21:56.401178 1505759 Options.bottommost_compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.401182 1505759 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.401186 1505759 Options.bottommost_compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.401190 1505759 Options.bottommost_compression_opts.enabled: false +2024/09/13-17:21:56.401194 1505759 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.401198 1505759 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.401202 1505759 Options.compression_opts.window_bits: -14 +2024/09/13-17:21:56.401206 1505759 Options.compression_opts.level: 32767 +2024/09/13-17:21:56.401209 1505759 Options.compression_opts.strategy: 0 +2024/09/13-17:21:56.401213 1505759 Options.compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.401217 1505759 Options.compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.401221 1505759 Options.compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.401225 1505759 Options.compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.401229 1505759 Options.compression_opts.enabled: false +2024/09/13-17:21:56.401233 1505759 
Options.compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.401237 1505759 Options.level0_file_num_compaction_trigger: 4 +2024/09/13-17:21:56.401241 1505759 Options.level0_slowdown_writes_trigger: 20 +2024/09/13-17:21:56.401245 1505759 Options.level0_stop_writes_trigger: 36 +2024/09/13-17:21:56.401248 1505759 Options.target_file_size_base: 67108864 +2024/09/13-17:21:56.401252 1505759 Options.target_file_size_multiplier: 1 +2024/09/13-17:21:56.401256 1505759 Options.max_bytes_for_level_base: 268435456 +2024/09/13-17:21:56.401260 1505759 Options.level_compaction_dynamic_level_bytes: 1 +2024/09/13-17:21:56.401264 1505759 Options.max_bytes_for_level_multiplier: 10.000000 +2024/09/13-17:21:56.401270 1505759 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2024/09/13-17:21:56.401274 1505759 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2024/09/13-17:21:56.401278 1505759 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2024/09/13-17:21:56.401282 1505759 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2024/09/13-17:21:56.401286 1505759 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2024/09/13-17:21:56.401290 1505759 Options.max_bytes_for_level_multiplier_addtl[5]: 1 +2024/09/13-17:21:56.401294 1505759 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2024/09/13-17:21:56.401298 1505759 Options.max_sequential_skip_in_iterations: 8 +2024/09/13-17:21:56.401302 1505759 Options.max_compaction_bytes: 1677721600 +2024/09/13-17:21:56.401306 1505759 Options.ignore_max_compaction_bytes_for_input: true +2024/09/13-17:21:56.401309 1505759 Options.arena_block_size: 1048576 +2024/09/13-17:21:56.401313 1505759 Options.soft_pending_compaction_bytes_limit: 68719476736 +2024/09/13-17:21:56.401317 1505759 Options.hard_pending_compaction_bytes_limit: 274877906944 +2024/09/13-17:21:56.401321 1505759 Options.disable_auto_compactions: 0 +2024/09/13-17:21:56.401330 1505759 Options.compaction_style: kCompactionStyleLevel +2024/09/13-17:21:56.401335 1505759 
Options.compaction_pri: kMinOverlappingRatio +2024/09/13-17:21:56.401339 1505759 Options.compaction_options_universal.size_ratio: 1 +2024/09/13-17:21:56.401343 1505759 Options.compaction_options_universal.min_merge_width: 2 +2024/09/13-17:21:56.401347 1505759 Options.compaction_options_universal.max_merge_width: 4294967295 +2024/09/13-17:21:56.401351 1505759 Options.compaction_options_universal.max_size_amplification_percent: 200 +2024/09/13-17:21:56.401355 1505759 Options.compaction_options_universal.compression_size_percent: -1 +2024/09/13-17:21:56.401360 1505759 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2024/09/13-17:21:56.401364 1505759 Options.compaction_options_fifo.max_table_files_size: 1073741824 +2024/09/13-17:21:56.401368 1505759 Options.compaction_options_fifo.allow_compaction: 0 +2024/09/13-17:21:56.401375 1505759 Options.table_properties_collectors: +2024/09/13-17:21:56.401379 1505759 Options.inplace_update_support: 0 +2024/09/13-17:21:56.401383 1505759 Options.inplace_update_num_locks: 10000 +2024/09/13-17:21:56.401387 1505759 Options.memtable_prefix_bloom_size_ratio: 0.000000 +2024/09/13-17:21:56.401391 1505759 Options.memtable_whole_key_filtering: 0 +2024/09/13-17:21:56.401395 1505759 Options.memtable_huge_page_size: 0 +2024/09/13-17:21:56.401399 1505759 Options.bloom_locality: 0 +2024/09/13-17:21:56.401403 1505759 Options.max_successive_merges: 0 +2024/09/13-17:21:56.401407 1505759 Options.optimize_filters_for_hits: 0 +2024/09/13-17:21:56.401411 1505759 Options.paranoid_file_checks: 0 +2024/09/13-17:21:56.401415 1505759 Options.force_consistency_checks: 1 +2024/09/13-17:21:56.401419 1505759 Options.report_bg_io_stats: 0 +2024/09/13-17:21:56.401423 1505759 Options.ttl: 2592000 +2024/09/13-17:21:56.401427 1505759 Options.periodic_compaction_seconds: 0 +2024/09/13-17:21:56.401431 1505759 Options.default_temperature: kUnknown +2024/09/13-17:21:56.401435 1505759 Options.preclude_last_level_data_seconds: 0 
+2024/09/13-17:21:56.401439 1505759 Options.preserve_internal_time_seconds: 0 +2024/09/13-17:21:56.401443 1505759 Options.enable_blob_files: false +2024/09/13-17:21:56.401447 1505759 Options.min_blob_size: 0 +2024/09/13-17:21:56.401451 1505759 Options.blob_file_size: 268435456 +2024/09/13-17:21:56.401455 1505759 Options.blob_compression_type: NoCompression +2024/09/13-17:21:56.401459 1505759 Options.enable_blob_garbage_collection: false +2024/09/13-17:21:56.401463 1505759 Options.blob_garbage_collection_age_cutoff: 0.250000 +2024/09/13-17:21:56.401467 1505759 Options.blob_garbage_collection_force_threshold: 1.000000 +2024/09/13-17:21:56.401472 1505759 Options.blob_compaction_readahead_size: 0 +2024/09/13-17:21:56.401475 1505759 Options.blob_file_starting_level: 0 +2024/09/13-17:21:56.401479 1505759 Options.experimental_mempurge_threshold: 0.000000 +2024/09/13-17:21:56.401484 1505759 Options.memtable_max_range_deletions: 0 +2024/09/13-17:21:56.401834 1505759 [db/column_family.cc:616] --------------- Options for column family [residuals]: +2024/09/13-17:21:56.401848 1505759 Options.comparator: leveldb.BytewiseComparator +2024/09/13-17:21:56.401852 1505759 Options.merge_operator: None +2024/09/13-17:21:56.401866 1505759 Options.compaction_filter: None +2024/09/13-17:21:56.401871 1505759 Options.compaction_filter_factory: None +2024/09/13-17:21:56.401875 1505759 Options.sst_partitioner_factory: None +2024/09/13-17:21:56.401879 1505759 Options.memtable_factory: SkipListFactory +2024/09/13-17:21:56.401889 1505759 Options.table_factory: BlockBasedTable +2024/09/13-17:21:56.401935 1505759 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x59e4fb344cd0) cache_index_and_filter_blocks: 0 cache_index_and_filter_blocks_with_high_priority: 1 pin_l0_filter_and_index_blocks_in_cache: 0 @@ -665,7 +666,7 @@ data_block_hash_table_util_ratio: 0.750000 checksum: 4 no_block_cache: 0 - block_cache: 0x642fd907d810 + block_cache: 0x59e4fb345000 
block_cache_name: LRUCache block_cache_options: capacity : 33554432 @@ -693,103 +694,103 @@ prepopulate_block_cache: 0 initial_auto_readahead_size: 8192 num_file_reads_for_auto_readahead: 2 -2024/08/14-23:25:07.322270 631181 Options.write_buffer_size: 67108864 -2024/08/14-23:25:07.322274 631181 Options.max_write_buffer_number: 2 -2024/08/14-23:25:07.322278 631181 Options.compression: NoCompression -2024/08/14-23:25:07.322281 631181 Options.bottommost_compression: Disabled -2024/08/14-23:25:07.322285 631181 Options.prefix_extractor: nullptr -2024/08/14-23:25:07.322288 631181 Options.memtable_insert_with_hint_prefix_extractor: nullptr -2024/08/14-23:25:07.322291 631181 Options.num_levels: 7 -2024/08/14-23:25:07.322294 631181 Options.min_write_buffer_number_to_merge: 1 -2024/08/14-23:25:07.322298 631181 Options.max_write_buffer_number_to_maintain: 0 -2024/08/14-23:25:07.322301 631181 Options.max_write_buffer_size_to_maintain: 0 -2024/08/14-23:25:07.322304 631181 Options.bottommost_compression_opts.window_bits: -14 -2024/08/14-23:25:07.322307 631181 Options.bottommost_compression_opts.level: 32767 -2024/08/14-23:25:07.322310 631181 Options.bottommost_compression_opts.strategy: 0 -2024/08/14-23:25:07.322313 631181 Options.bottommost_compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.322317 631181 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.322320 631181 Options.bottommost_compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.322323 631181 Options.bottommost_compression_opts.enabled: false -2024/08/14-23:25:07.322326 631181 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.322329 631181 Options.bottommost_compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.322333 631181 Options.compression_opts.window_bits: -14 -2024/08/14-23:25:07.322336 631181 Options.compression_opts.level: 32767 -2024/08/14-23:25:07.322339 631181 Options.compression_opts.strategy: 0 -2024/08/14-23:25:07.322342 
631181 Options.compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.322345 631181 Options.compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.322348 631181 Options.compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.322352 631181 Options.compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.322355 631181 Options.compression_opts.enabled: false -2024/08/14-23:25:07.322358 631181 Options.compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.322361 631181 Options.level0_file_num_compaction_trigger: 4 -2024/08/14-23:25:07.322364 631181 Options.level0_slowdown_writes_trigger: 20 -2024/08/14-23:25:07.322367 631181 Options.level0_stop_writes_trigger: 36 -2024/08/14-23:25:07.322371 631181 Options.target_file_size_base: 67108864 -2024/08/14-23:25:07.322374 631181 Options.target_file_size_multiplier: 1 -2024/08/14-23:25:07.322377 631181 Options.max_bytes_for_level_base: 268435456 -2024/08/14-23:25:07.322380 631181 Options.level_compaction_dynamic_level_bytes: 1 -2024/08/14-23:25:07.322383 631181 Options.max_bytes_for_level_multiplier: 10.000000 -2024/08/14-23:25:07.322388 631181 Options.max_bytes_for_level_multiplier_addtl[0]: 1 -2024/08/14-23:25:07.322391 631181 Options.max_bytes_for_level_multiplier_addtl[1]: 1 -2024/08/14-23:25:07.322394 631181 Options.max_bytes_for_level_multiplier_addtl[2]: 1 -2024/08/14-23:25:07.322401 631181 Options.max_bytes_for_level_multiplier_addtl[3]: 1 -2024/08/14-23:25:07.322404 631181 Options.max_bytes_for_level_multiplier_addtl[4]: 1 -2024/08/14-23:25:07.322408 631181 Options.max_bytes_for_level_multiplier_addtl[5]: 1 -2024/08/14-23:25:07.322411 631181 Options.max_bytes_for_level_multiplier_addtl[6]: 1 -2024/08/14-23:25:07.322414 631181 Options.max_sequential_skip_in_iterations: 8 -2024/08/14-23:25:07.322417 631181 Options.max_compaction_bytes: 1677721600 -2024/08/14-23:25:07.322420 631181 Options.ignore_max_compaction_bytes_for_input: true -2024/08/14-23:25:07.322424 631181 Options.arena_block_size: 
1048576 -2024/08/14-23:25:07.322427 631181 Options.soft_pending_compaction_bytes_limit: 68719476736 -2024/08/14-23:25:07.322430 631181 Options.hard_pending_compaction_bytes_limit: 274877906944 -2024/08/14-23:25:07.322433 631181 Options.disable_auto_compactions: 0 -2024/08/14-23:25:07.322437 631181 Options.compaction_style: kCompactionStyleLevel -2024/08/14-23:25:07.322441 631181 Options.compaction_pri: kMinOverlappingRatio -2024/08/14-23:25:07.322445 631181 Options.compaction_options_universal.size_ratio: 1 -2024/08/14-23:25:07.322448 631181 Options.compaction_options_universal.min_merge_width: 2 -2024/08/14-23:25:07.322451 631181 Options.compaction_options_universal.max_merge_width: 4294967295 -2024/08/14-23:25:07.322454 631181 Options.compaction_options_universal.max_size_amplification_percent: 200 -2024/08/14-23:25:07.322457 631181 Options.compaction_options_universal.compression_size_percent: -1 -2024/08/14-23:25:07.322461 631181 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize -2024/08/14-23:25:07.322464 631181 Options.compaction_options_fifo.max_table_files_size: 1073741824 -2024/08/14-23:25:07.322467 631181 Options.compaction_options_fifo.allow_compaction: 0 -2024/08/14-23:25:07.322472 631181 Options.table_properties_collectors: -2024/08/14-23:25:07.322476 631181 Options.inplace_update_support: 0 -2024/08/14-23:25:07.322479 631181 Options.inplace_update_num_locks: 10000 -2024/08/14-23:25:07.322482 631181 Options.memtable_prefix_bloom_size_ratio: 0.000000 -2024/08/14-23:25:07.322486 631181 Options.memtable_whole_key_filtering: 0 -2024/08/14-23:25:07.322489 631181 Options.memtable_huge_page_size: 0 -2024/08/14-23:25:07.322492 631181 Options.bloom_locality: 0 -2024/08/14-23:25:07.322495 631181 Options.max_successive_merges: 0 -2024/08/14-23:25:07.322498 631181 Options.optimize_filters_for_hits: 0 -2024/08/14-23:25:07.322502 631181 Options.paranoid_file_checks: 0 -2024/08/14-23:25:07.322505 631181 Options.force_consistency_checks: 1 
-2024/08/14-23:25:07.322508 631181 Options.report_bg_io_stats: 0 -2024/08/14-23:25:07.322511 631181 Options.ttl: 2592000 -2024/08/14-23:25:07.322514 631181 Options.periodic_compaction_seconds: 0 -2024/08/14-23:25:07.322518 631181 Options.default_temperature: kUnknown -2024/08/14-23:25:07.322521 631181 Options.preclude_last_level_data_seconds: 0 -2024/08/14-23:25:07.322524 631181 Options.preserve_internal_time_seconds: 0 -2024/08/14-23:25:07.322527 631181 Options.enable_blob_files: false -2024/08/14-23:25:07.322531 631181 Options.min_blob_size: 0 -2024/08/14-23:25:07.322534 631181 Options.blob_file_size: 268435456 -2024/08/14-23:25:07.322537 631181 Options.blob_compression_type: NoCompression -2024/08/14-23:25:07.322540 631181 Options.enable_blob_garbage_collection: false -2024/08/14-23:25:07.322543 631181 Options.blob_garbage_collection_age_cutoff: 0.250000 -2024/08/14-23:25:07.322547 631181 Options.blob_garbage_collection_force_threshold: 1.000000 -2024/08/14-23:25:07.322553 631181 Options.blob_compaction_readahead_size: 0 -2024/08/14-23:25:07.322557 631181 Options.blob_file_starting_level: 0 -2024/08/14-23:25:07.322560 631181 Options.experimental_mempurge_threshold: 0.000000 -2024/08/14-23:25:07.322563 631181 Options.memtable_max_range_deletions: 0 -2024/08/14-23:25:07.322824 631181 [db/column_family.cc:616] --------------- Options for column family [mapping]: -2024/08/14-23:25:07.322834 631181 Options.comparator: leveldb.BytewiseComparator -2024/08/14-23:25:07.322837 631181 Options.merge_operator: None -2024/08/14-23:25:07.322841 631181 Options.compaction_filter: None -2024/08/14-23:25:07.322844 631181 Options.compaction_filter_factory: None -2024/08/14-23:25:07.322847 631181 Options.sst_partitioner_factory: None -2024/08/14-23:25:07.322850 631181 Options.memtable_factory: SkipListFactory -2024/08/14-23:25:07.322854 631181 Options.table_factory: BlockBasedTable -2024/08/14-23:25:07.322895 631181 table_factory options: flush_block_policy_factory: 
FlushBlockBySizePolicyFactory (0x642fd90993c0) +2024/09/13-17:21:56.401955 1505759 Options.write_buffer_size: 67108864 +2024/09/13-17:21:56.401960 1505759 Options.max_write_buffer_number: 2 +2024/09/13-17:21:56.401965 1505759 Options.compression: NoCompression +2024/09/13-17:21:56.401969 1505759 Options.bottommost_compression: Disabled +2024/09/13-17:21:56.401973 1505759 Options.prefix_extractor: nullptr +2024/09/13-17:21:56.401977 1505759 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2024/09/13-17:21:56.401981 1505759 Options.num_levels: 7 +2024/09/13-17:21:56.401985 1505759 Options.min_write_buffer_number_to_merge: 1 +2024/09/13-17:21:56.401989 1505759 Options.max_write_buffer_number_to_maintain: 0 +2024/09/13-17:21:56.401993 1505759 Options.max_write_buffer_size_to_maintain: 0 +2024/09/13-17:21:56.401997 1505759 Options.bottommost_compression_opts.window_bits: -14 +2024/09/13-17:21:56.402001 1505759 Options.bottommost_compression_opts.level: 32767 +2024/09/13-17:21:56.402005 1505759 Options.bottommost_compression_opts.strategy: 0 +2024/09/13-17:21:56.402009 1505759 Options.bottommost_compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.402013 1505759 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.402016 1505759 Options.bottommost_compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.402020 1505759 Options.bottommost_compression_opts.enabled: false +2024/09/13-17:21:56.402024 1505759 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.402028 1505759 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.402032 1505759 Options.compression_opts.window_bits: -14 +2024/09/13-17:21:56.402036 1505759 Options.compression_opts.level: 32767 +2024/09/13-17:21:56.402040 1505759 Options.compression_opts.strategy: 0 +2024/09/13-17:21:56.402044 1505759 Options.compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.402048 1505759 
Options.compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.402052 1505759 Options.compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.402056 1505759 Options.compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.402060 1505759 Options.compression_opts.enabled: false +2024/09/13-17:21:56.402063 1505759 Options.compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.402067 1505759 Options.level0_file_num_compaction_trigger: 4 +2024/09/13-17:21:56.402071 1505759 Options.level0_slowdown_writes_trigger: 20 +2024/09/13-17:21:56.402075 1505759 Options.level0_stop_writes_trigger: 36 +2024/09/13-17:21:56.402083 1505759 Options.target_file_size_base: 67108864 +2024/09/13-17:21:56.402087 1505759 Options.target_file_size_multiplier: 1 +2024/09/13-17:21:56.402091 1505759 Options.max_bytes_for_level_base: 268435456 +2024/09/13-17:21:56.402095 1505759 Options.level_compaction_dynamic_level_bytes: 1 +2024/09/13-17:21:56.402099 1505759 Options.max_bytes_for_level_multiplier: 10.000000 +2024/09/13-17:21:56.402105 1505759 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2024/09/13-17:21:56.402109 1505759 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2024/09/13-17:21:56.402113 1505759 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2024/09/13-17:21:56.402117 1505759 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2024/09/13-17:21:56.402121 1505759 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2024/09/13-17:21:56.402125 1505759 Options.max_bytes_for_level_multiplier_addtl[5]: 1 +2024/09/13-17:21:56.402129 1505759 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2024/09/13-17:21:56.402132 1505759 Options.max_sequential_skip_in_iterations: 8 +2024/09/13-17:21:56.402136 1505759 Options.max_compaction_bytes: 1677721600 +2024/09/13-17:21:56.402140 1505759 Options.ignore_max_compaction_bytes_for_input: true +2024/09/13-17:21:56.402144 1505759 Options.arena_block_size: 1048576 +2024/09/13-17:21:56.402148 1505759 
Options.soft_pending_compaction_bytes_limit: 68719476736 +2024/09/13-17:21:56.402152 1505759 Options.hard_pending_compaction_bytes_limit: 274877906944 +2024/09/13-17:21:56.402156 1505759 Options.disable_auto_compactions: 0 +2024/09/13-17:21:56.402161 1505759 Options.compaction_style: kCompactionStyleLevel +2024/09/13-17:21:56.402166 1505759 Options.compaction_pri: kMinOverlappingRatio +2024/09/13-17:21:56.402170 1505759 Options.compaction_options_universal.size_ratio: 1 +2024/09/13-17:21:56.402174 1505759 Options.compaction_options_universal.min_merge_width: 2 +2024/09/13-17:21:56.402178 1505759 Options.compaction_options_universal.max_merge_width: 4294967295 +2024/09/13-17:21:56.402182 1505759 Options.compaction_options_universal.max_size_amplification_percent: 200 +2024/09/13-17:21:56.402186 1505759 Options.compaction_options_universal.compression_size_percent: -1 +2024/09/13-17:21:56.402191 1505759 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2024/09/13-17:21:56.402195 1505759 Options.compaction_options_fifo.max_table_files_size: 1073741824 +2024/09/13-17:21:56.402199 1505759 Options.compaction_options_fifo.allow_compaction: 0 +2024/09/13-17:21:56.402205 1505759 Options.table_properties_collectors: +2024/09/13-17:21:56.402210 1505759 Options.inplace_update_support: 0 +2024/09/13-17:21:56.402214 1505759 Options.inplace_update_num_locks: 10000 +2024/09/13-17:21:56.402218 1505759 Options.memtable_prefix_bloom_size_ratio: 0.000000 +2024/09/13-17:21:56.402222 1505759 Options.memtable_whole_key_filtering: 0 +2024/09/13-17:21:56.402226 1505759 Options.memtable_huge_page_size: 0 +2024/09/13-17:21:56.402230 1505759 Options.bloom_locality: 0 +2024/09/13-17:21:56.402234 1505759 Options.max_successive_merges: 0 +2024/09/13-17:21:56.402238 1505759 Options.optimize_filters_for_hits: 0 +2024/09/13-17:21:56.402241 1505759 Options.paranoid_file_checks: 0 +2024/09/13-17:21:56.402245 1505759 Options.force_consistency_checks: 1 
+2024/09/13-17:21:56.402249 1505759 Options.report_bg_io_stats: 0 +2024/09/13-17:21:56.402253 1505759 Options.ttl: 2592000 +2024/09/13-17:21:56.402257 1505759 Options.periodic_compaction_seconds: 0 +2024/09/13-17:21:56.402262 1505759 Options.default_temperature: kUnknown +2024/09/13-17:21:56.402265 1505759 Options.preclude_last_level_data_seconds: 0 +2024/09/13-17:21:56.402269 1505759 Options.preserve_internal_time_seconds: 0 +2024/09/13-17:21:56.402277 1505759 Options.enable_blob_files: false +2024/09/13-17:21:56.402281 1505759 Options.min_blob_size: 0 +2024/09/13-17:21:56.402285 1505759 Options.blob_file_size: 268435456 +2024/09/13-17:21:56.402290 1505759 Options.blob_compression_type: NoCompression +2024/09/13-17:21:56.402293 1505759 Options.enable_blob_garbage_collection: false +2024/09/13-17:21:56.402297 1505759 Options.blob_garbage_collection_age_cutoff: 0.250000 +2024/09/13-17:21:56.402302 1505759 Options.blob_garbage_collection_force_threshold: 1.000000 +2024/09/13-17:21:56.402306 1505759 Options.blob_compaction_readahead_size: 0 +2024/09/13-17:21:56.402310 1505759 Options.blob_file_starting_level: 0 +2024/09/13-17:21:56.402314 1505759 Options.experimental_mempurge_threshold: 0.000000 +2024/09/13-17:21:56.402318 1505759 Options.memtable_max_range_deletions: 0 +2024/09/13-17:21:56.402655 1505759 [db/column_family.cc:616] --------------- Options for column family [mapping]: +2024/09/13-17:21:56.402668 1505759 Options.comparator: leveldb.BytewiseComparator +2024/09/13-17:21:56.402672 1505759 Options.merge_operator: None +2024/09/13-17:21:56.402676 1505759 Options.compaction_filter: None +2024/09/13-17:21:56.402680 1505759 Options.compaction_filter_factory: None +2024/09/13-17:21:56.402684 1505759 Options.sst_partitioner_factory: None +2024/09/13-17:21:56.402689 1505759 Options.memtable_factory: SkipListFactory +2024/09/13-17:21:56.402693 1505759 Options.table_factory: BlockBasedTable +2024/09/13-17:21:56.402743 1505759 table_factory options: 
flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x59e4fb360ca0) cache_index_and_filter_blocks: 0 cache_index_and_filter_blocks_with_high_priority: 1 pin_l0_filter_and_index_blocks_in_cache: 0 @@ -800,7 +801,7 @@ data_block_hash_table_util_ratio: 0.750000 checksum: 4 no_block_cache: 0 - block_cache: 0x642fd90996f0 + block_cache: 0x59e4fb360fd0 block_cache_name: LRUCache block_cache_options: capacity : 33554432 @@ -828,103 +829,103 @@ prepopulate_block_cache: 0 initial_auto_readahead_size: 8192 num_file_reads_for_auto_readahead: 2 -2024/08/14-23:25:07.322914 631181 Options.write_buffer_size: 67108864 -2024/08/14-23:25:07.322918 631181 Options.max_write_buffer_number: 2 -2024/08/14-23:25:07.322922 631181 Options.compression: NoCompression -2024/08/14-23:25:07.322925 631181 Options.bottommost_compression: Disabled -2024/08/14-23:25:07.322928 631181 Options.prefix_extractor: nullptr -2024/08/14-23:25:07.322932 631181 Options.memtable_insert_with_hint_prefix_extractor: nullptr -2024/08/14-23:25:07.322935 631181 Options.num_levels: 7 -2024/08/14-23:25:07.322938 631181 Options.min_write_buffer_number_to_merge: 1 -2024/08/14-23:25:07.322941 631181 Options.max_write_buffer_number_to_maintain: 0 -2024/08/14-23:25:07.322945 631181 Options.max_write_buffer_size_to_maintain: 0 -2024/08/14-23:25:07.322948 631181 Options.bottommost_compression_opts.window_bits: -14 -2024/08/14-23:25:07.322951 631181 Options.bottommost_compression_opts.level: 32767 -2024/08/14-23:25:07.322954 631181 Options.bottommost_compression_opts.strategy: 0 -2024/08/14-23:25:07.322957 631181 Options.bottommost_compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.322961 631181 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.322964 631181 Options.bottommost_compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.322967 631181 Options.bottommost_compression_opts.enabled: false -2024/08/14-23:25:07.322970 631181 
Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.322973 631181 Options.bottommost_compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.322977 631181 Options.compression_opts.window_bits: -14 -2024/08/14-23:25:07.322980 631181 Options.compression_opts.level: 32767 -2024/08/14-23:25:07.322986 631181 Options.compression_opts.strategy: 0 -2024/08/14-23:25:07.322989 631181 Options.compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.322993 631181 Options.compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.322996 631181 Options.compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.322999 631181 Options.compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.323002 631181 Options.compression_opts.enabled: false -2024/08/14-23:25:07.323005 631181 Options.compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.323008 631181 Options.level0_file_num_compaction_trigger: 4 -2024/08/14-23:25:07.323012 631181 Options.level0_slowdown_writes_trigger: 20 -2024/08/14-23:25:07.323015 631181 Options.level0_stop_writes_trigger: 36 -2024/08/14-23:25:07.323018 631181 Options.target_file_size_base: 67108864 -2024/08/14-23:25:07.323021 631181 Options.target_file_size_multiplier: 1 -2024/08/14-23:25:07.323024 631181 Options.max_bytes_for_level_base: 268435456 -2024/08/14-23:25:07.323027 631181 Options.level_compaction_dynamic_level_bytes: 1 -2024/08/14-23:25:07.323031 631181 Options.max_bytes_for_level_multiplier: 10.000000 -2024/08/14-23:25:07.323035 631181 Options.max_bytes_for_level_multiplier_addtl[0]: 1 -2024/08/14-23:25:07.323038 631181 Options.max_bytes_for_level_multiplier_addtl[1]: 1 -2024/08/14-23:25:07.323042 631181 Options.max_bytes_for_level_multiplier_addtl[2]: 1 -2024/08/14-23:25:07.323045 631181 Options.max_bytes_for_level_multiplier_addtl[3]: 1 -2024/08/14-23:25:07.323048 631181 Options.max_bytes_for_level_multiplier_addtl[4]: 1 -2024/08/14-23:25:07.323051 631181 
Options.max_bytes_for_level_multiplier_addtl[5]: 1 -2024/08/14-23:25:07.323054 631181 Options.max_bytes_for_level_multiplier_addtl[6]: 1 -2024/08/14-23:25:07.323058 631181 Options.max_sequential_skip_in_iterations: 8 -2024/08/14-23:25:07.323061 631181 Options.max_compaction_bytes: 1677721600 -2024/08/14-23:25:07.323064 631181 Options.ignore_max_compaction_bytes_for_input: true -2024/08/14-23:25:07.323067 631181 Options.arena_block_size: 1048576 -2024/08/14-23:25:07.323070 631181 Options.soft_pending_compaction_bytes_limit: 68719476736 -2024/08/14-23:25:07.323074 631181 Options.hard_pending_compaction_bytes_limit: 274877906944 -2024/08/14-23:25:07.323077 631181 Options.disable_auto_compactions: 0 -2024/08/14-23:25:07.323081 631181 Options.compaction_style: kCompactionStyleLevel -2024/08/14-23:25:07.323085 631181 Options.compaction_pri: kMinOverlappingRatio -2024/08/14-23:25:07.323088 631181 Options.compaction_options_universal.size_ratio: 1 -2024/08/14-23:25:07.323091 631181 Options.compaction_options_universal.min_merge_width: 2 -2024/08/14-23:25:07.323095 631181 Options.compaction_options_universal.max_merge_width: 4294967295 -2024/08/14-23:25:07.323098 631181 Options.compaction_options_universal.max_size_amplification_percent: 200 -2024/08/14-23:25:07.323101 631181 Options.compaction_options_universal.compression_size_percent: -1 -2024/08/14-23:25:07.323105 631181 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize -2024/08/14-23:25:07.323108 631181 Options.compaction_options_fifo.max_table_files_size: 1073741824 -2024/08/14-23:25:07.323111 631181 Options.compaction_options_fifo.allow_compaction: 0 -2024/08/14-23:25:07.323116 631181 Options.table_properties_collectors: -2024/08/14-23:25:07.323119 631181 Options.inplace_update_support: 0 -2024/08/14-23:25:07.323123 631181 Options.inplace_update_num_locks: 10000 -2024/08/14-23:25:07.323126 631181 Options.memtable_prefix_bloom_size_ratio: 0.000000 -2024/08/14-23:25:07.323129 631181 
Options.memtable_whole_key_filtering: 0 -2024/08/14-23:25:07.323133 631181 Options.memtable_huge_page_size: 0 -2024/08/14-23:25:07.323204 631181 Options.bloom_locality: 0 -2024/08/14-23:25:07.323208 631181 Options.max_successive_merges: 0 -2024/08/14-23:25:07.323211 631181 Options.optimize_filters_for_hits: 0 -2024/08/14-23:25:07.323214 631181 Options.paranoid_file_checks: 0 -2024/08/14-23:25:07.323218 631181 Options.force_consistency_checks: 1 -2024/08/14-23:25:07.323221 631181 Options.report_bg_io_stats: 0 -2024/08/14-23:25:07.323224 631181 Options.ttl: 2592000 -2024/08/14-23:25:07.323227 631181 Options.periodic_compaction_seconds: 0 -2024/08/14-23:25:07.323231 631181 Options.default_temperature: kUnknown -2024/08/14-23:25:07.323234 631181 Options.preclude_last_level_data_seconds: 0 -2024/08/14-23:25:07.323237 631181 Options.preserve_internal_time_seconds: 0 -2024/08/14-23:25:07.323240 631181 Options.enable_blob_files: false -2024/08/14-23:25:07.323244 631181 Options.min_blob_size: 0 -2024/08/14-23:25:07.323247 631181 Options.blob_file_size: 268435456 -2024/08/14-23:25:07.323250 631181 Options.blob_compression_type: NoCompression -2024/08/14-23:25:07.323254 631181 Options.enable_blob_garbage_collection: false -2024/08/14-23:25:07.323257 631181 Options.blob_garbage_collection_age_cutoff: 0.250000 -2024/08/14-23:25:07.323261 631181 Options.blob_garbage_collection_force_threshold: 1.000000 -2024/08/14-23:25:07.323264 631181 Options.blob_compaction_readahead_size: 0 -2024/08/14-23:25:07.323268 631181 Options.blob_file_starting_level: 0 -2024/08/14-23:25:07.323271 631181 Options.experimental_mempurge_threshold: 0.000000 -2024/08/14-23:25:07.323274 631181 Options.memtable_max_range_deletions: 0 -2024/08/14-23:25:07.323596 631181 [db/column_family.cc:616] --------------- Options for column family [doc]: -2024/08/14-23:25:07.323606 631181 Options.comparator: leveldb.BytewiseComparator -2024/08/14-23:25:07.323610 631181 Options.merge_operator: None 
-2024/08/14-23:25:07.323613 631181 Options.compaction_filter: None -2024/08/14-23:25:07.323616 631181 Options.compaction_filter_factory: None -2024/08/14-23:25:07.323619 631181 Options.sst_partitioner_factory: None -2024/08/14-23:25:07.323622 631181 Options.memtable_factory: SkipListFactory -2024/08/14-23:25:07.323625 631181 Options.table_factory: BlockBasedTable -2024/08/14-23:25:07.323647 631181 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x642fd90a0f70) +2024/09/13-17:21:56.402763 1505759 Options.write_buffer_size: 67108864 +2024/09/13-17:21:56.402769 1505759 Options.max_write_buffer_number: 2 +2024/09/13-17:21:56.402773 1505759 Options.compression: NoCompression +2024/09/13-17:21:56.402777 1505759 Options.bottommost_compression: Disabled +2024/09/13-17:21:56.402782 1505759 Options.prefix_extractor: nullptr +2024/09/13-17:21:56.402786 1505759 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2024/09/13-17:21:56.402790 1505759 Options.num_levels: 7 +2024/09/13-17:21:56.402794 1505759 Options.min_write_buffer_number_to_merge: 1 +2024/09/13-17:21:56.402797 1505759 Options.max_write_buffer_number_to_maintain: 0 +2024/09/13-17:21:56.402801 1505759 Options.max_write_buffer_size_to_maintain: 0 +2024/09/13-17:21:56.402805 1505759 Options.bottommost_compression_opts.window_bits: -14 +2024/09/13-17:21:56.402809 1505759 Options.bottommost_compression_opts.level: 32767 +2024/09/13-17:21:56.402813 1505759 Options.bottommost_compression_opts.strategy: 0 +2024/09/13-17:21:56.402817 1505759 Options.bottommost_compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.402826 1505759 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.402830 1505759 Options.bottommost_compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.402834 1505759 Options.bottommost_compression_opts.enabled: false +2024/09/13-17:21:56.402838 1505759 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 
+2024/09/13-17:21:56.402842 1505759 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.402846 1505759 Options.compression_opts.window_bits: -14 +2024/09/13-17:21:56.402849 1505759 Options.compression_opts.level: 32767 +2024/09/13-17:21:56.402853 1505759 Options.compression_opts.strategy: 0 +2024/09/13-17:21:56.402874 1505759 Options.compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.402878 1505759 Options.compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.402882 1505759 Options.compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.402886 1505759 Options.compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.402890 1505759 Options.compression_opts.enabled: false +2024/09/13-17:21:56.402894 1505759 Options.compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.402898 1505759 Options.level0_file_num_compaction_trigger: 4 +2024/09/13-17:21:56.402902 1505759 Options.level0_slowdown_writes_trigger: 20 +2024/09/13-17:21:56.402906 1505759 Options.level0_stop_writes_trigger: 36 +2024/09/13-17:21:56.402910 1505759 Options.target_file_size_base: 67108864 +2024/09/13-17:21:56.402914 1505759 Options.target_file_size_multiplier: 1 +2024/09/13-17:21:56.402918 1505759 Options.max_bytes_for_level_base: 268435456 +2024/09/13-17:21:56.402922 1505759 Options.level_compaction_dynamic_level_bytes: 1 +2024/09/13-17:21:56.402925 1505759 Options.max_bytes_for_level_multiplier: 10.000000 +2024/09/13-17:21:56.402931 1505759 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2024/09/13-17:21:56.402935 1505759 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2024/09/13-17:21:56.402939 1505759 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2024/09/13-17:21:56.402943 1505759 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2024/09/13-17:21:56.402947 1505759 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2024/09/13-17:21:56.402951 1505759 Options.max_bytes_for_level_multiplier_addtl[5]: 1 
+2024/09/13-17:21:56.402955 1505759 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2024/09/13-17:21:56.402959 1505759 Options.max_sequential_skip_in_iterations: 8 +2024/09/13-17:21:56.402963 1505759 Options.max_compaction_bytes: 1677721600 +2024/09/13-17:21:56.402967 1505759 Options.ignore_max_compaction_bytes_for_input: true +2024/09/13-17:21:56.402970 1505759 Options.arena_block_size: 1048576 +2024/09/13-17:21:56.402974 1505759 Options.soft_pending_compaction_bytes_limit: 68719476736 +2024/09/13-17:21:56.402978 1505759 Options.hard_pending_compaction_bytes_limit: 274877906944 +2024/09/13-17:21:56.402982 1505759 Options.disable_auto_compactions: 0 +2024/09/13-17:21:56.402988 1505759 Options.compaction_style: kCompactionStyleLevel +2024/09/13-17:21:56.402992 1505759 Options.compaction_pri: kMinOverlappingRatio +2024/09/13-17:21:56.402997 1505759 Options.compaction_options_universal.size_ratio: 1 +2024/09/13-17:21:56.403001 1505759 Options.compaction_options_universal.min_merge_width: 2 +2024/09/13-17:21:56.403004 1505759 Options.compaction_options_universal.max_merge_width: 4294967295 +2024/09/13-17:21:56.403008 1505759 Options.compaction_options_universal.max_size_amplification_percent: 200 +2024/09/13-17:21:56.403012 1505759 Options.compaction_options_universal.compression_size_percent: -1 +2024/09/13-17:21:56.403017 1505759 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2024/09/13-17:21:56.403139 1505759 Options.compaction_options_fifo.max_table_files_size: 1073741824 +2024/09/13-17:21:56.403145 1505759 Options.compaction_options_fifo.allow_compaction: 0 +2024/09/13-17:21:56.403152 1505759 Options.table_properties_collectors: +2024/09/13-17:21:56.403157 1505759 Options.inplace_update_support: 0 +2024/09/13-17:21:56.403161 1505759 Options.inplace_update_num_locks: 10000 +2024/09/13-17:21:56.403165 1505759 Options.memtable_prefix_bloom_size_ratio: 0.000000 +2024/09/13-17:21:56.403170 1505759 
Options.memtable_whole_key_filtering: 0 +2024/09/13-17:21:56.403174 1505759 Options.memtable_huge_page_size: 0 +2024/09/13-17:21:56.403178 1505759 Options.bloom_locality: 0 +2024/09/13-17:21:56.403182 1505759 Options.max_successive_merges: 0 +2024/09/13-17:21:56.403186 1505759 Options.optimize_filters_for_hits: 0 +2024/09/13-17:21:56.403190 1505759 Options.paranoid_file_checks: 0 +2024/09/13-17:21:56.403193 1505759 Options.force_consistency_checks: 1 +2024/09/13-17:21:56.403197 1505759 Options.report_bg_io_stats: 0 +2024/09/13-17:21:56.403201 1505759 Options.ttl: 2592000 +2024/09/13-17:21:56.403205 1505759 Options.periodic_compaction_seconds: 0 +2024/09/13-17:21:56.403210 1505759 Options.default_temperature: kUnknown +2024/09/13-17:21:56.403214 1505759 Options.preclude_last_level_data_seconds: 0 +2024/09/13-17:21:56.403218 1505759 Options.preserve_internal_time_seconds: 0 +2024/09/13-17:21:56.403222 1505759 Options.enable_blob_files: false +2024/09/13-17:21:56.403226 1505759 Options.min_blob_size: 0 +2024/09/13-17:21:56.403230 1505759 Options.blob_file_size: 268435456 +2024/09/13-17:21:56.403234 1505759 Options.blob_compression_type: NoCompression +2024/09/13-17:21:56.403238 1505759 Options.enable_blob_garbage_collection: false +2024/09/13-17:21:56.403242 1505759 Options.blob_garbage_collection_age_cutoff: 0.250000 +2024/09/13-17:21:56.403246 1505759 Options.blob_garbage_collection_force_threshold: 1.000000 +2024/09/13-17:21:56.403251 1505759 Options.blob_compaction_readahead_size: 0 +2024/09/13-17:21:56.403255 1505759 Options.blob_file_starting_level: 0 +2024/09/13-17:21:56.403258 1505759 Options.experimental_mempurge_threshold: 0.000000 +2024/09/13-17:21:56.403263 1505759 Options.memtable_max_range_deletions: 0 +2024/09/13-17:21:56.403572 1505759 [db/column_family.cc:616] --------------- Options for column family [doc]: +2024/09/13-17:21:56.403586 1505759 Options.comparator: leveldb.BytewiseComparator +2024/09/13-17:21:56.403590 1505759 Options.merge_operator: 
None +2024/09/13-17:21:56.403594 1505759 Options.compaction_filter: None +2024/09/13-17:21:56.403598 1505759 Options.compaction_filter_factory: None +2024/09/13-17:21:56.403602 1505759 Options.sst_partitioner_factory: None +2024/09/13-17:21:56.403606 1505759 Options.memtable_factory: SkipListFactory +2024/09/13-17:21:56.403611 1505759 Options.table_factory: BlockBasedTable +2024/09/13-17:21:56.403637 1505759 table_factory options: flush_block_policy_factory: FlushBlockBySizePolicyFactory (0x59e4fb3688c0) cache_index_and_filter_blocks: 0 cache_index_and_filter_blocks_with_high_priority: 1 pin_l0_filter_and_index_blocks_in_cache: 0 @@ -935,7 +936,7 @@ data_block_hash_table_util_ratio: 0.750000 checksum: 4 no_block_cache: 0 - block_cache: 0x642fd90a12a0 + block_cache: 0x59e4fb368bf0 block_cache_name: LRUCache block_cache_options: capacity : 33554432 @@ -963,114 +964,122 @@ prepopulate_block_cache: 0 initial_auto_readahead_size: 8192 num_file_reads_for_auto_readahead: 2 -2024/08/14-23:25:07.323651 631181 Options.write_buffer_size: 67108864 -2024/08/14-23:25:07.323654 631181 Options.max_write_buffer_number: 2 -2024/08/14-23:25:07.323657 631181 Options.compression: NoCompression -2024/08/14-23:25:07.323660 631181 Options.bottommost_compression: Disabled -2024/08/14-23:25:07.323670 631181 Options.prefix_extractor: nullptr -2024/08/14-23:25:07.323673 631181 Options.memtable_insert_with_hint_prefix_extractor: nullptr -2024/08/14-23:25:07.323676 631181 Options.num_levels: 7 -2024/08/14-23:25:07.323679 631181 Options.min_write_buffer_number_to_merge: 1 -2024/08/14-23:25:07.323682 631181 Options.max_write_buffer_number_to_maintain: 0 -2024/08/14-23:25:07.323685 631181 Options.max_write_buffer_size_to_maintain: 0 -2024/08/14-23:25:07.323688 631181 Options.bottommost_compression_opts.window_bits: -14 -2024/08/14-23:25:07.323691 631181 Options.bottommost_compression_opts.level: 32767 -2024/08/14-23:25:07.323694 631181 Options.bottommost_compression_opts.strategy: 0 
-2024/08/14-23:25:07.323697 631181 Options.bottommost_compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.323700 631181 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.323703 631181 Options.bottommost_compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.323707 631181 Options.bottommost_compression_opts.enabled: false -2024/08/14-23:25:07.323710 631181 Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.323713 631181 Options.bottommost_compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.323716 631181 Options.compression_opts.window_bits: -14 -2024/08/14-23:25:07.323719 631181 Options.compression_opts.level: 32767 -2024/08/14-23:25:07.323722 631181 Options.compression_opts.strategy: 0 -2024/08/14-23:25:07.323725 631181 Options.compression_opts.max_dict_bytes: 0 -2024/08/14-23:25:07.323728 631181 Options.compression_opts.zstd_max_train_bytes: 0 -2024/08/14-23:25:07.323731 631181 Options.compression_opts.use_zstd_dict_trainer: true -2024/08/14-23:25:07.323734 631181 Options.compression_opts.parallel_threads: 1 -2024/08/14-23:25:07.323737 631181 Options.compression_opts.enabled: false -2024/08/14-23:25:07.323740 631181 Options.compression_opts.max_dict_buffer_bytes: 0 -2024/08/14-23:25:07.323743 631181 Options.level0_file_num_compaction_trigger: 4 -2024/08/14-23:25:07.323746 631181 Options.level0_slowdown_writes_trigger: 20 -2024/08/14-23:25:07.323749 631181 Options.level0_stop_writes_trigger: 36 -2024/08/14-23:25:07.323752 631181 Options.target_file_size_base: 67108864 -2024/08/14-23:25:07.323755 631181 Options.target_file_size_multiplier: 1 -2024/08/14-23:25:07.323758 631181 Options.max_bytes_for_level_base: 268435456 -2024/08/14-23:25:07.323761 631181 Options.level_compaction_dynamic_level_bytes: 1 -2024/08/14-23:25:07.323764 631181 Options.max_bytes_for_level_multiplier: 10.000000 -2024/08/14-23:25:07.323768 631181 Options.max_bytes_for_level_multiplier_addtl[0]: 1 
-2024/08/14-23:25:07.323772 631181 Options.max_bytes_for_level_multiplier_addtl[1]: 1 -2024/08/14-23:25:07.323775 631181 Options.max_bytes_for_level_multiplier_addtl[2]: 1 -2024/08/14-23:25:07.323778 631181 Options.max_bytes_for_level_multiplier_addtl[3]: 1 -2024/08/14-23:25:07.323781 631181 Options.max_bytes_for_level_multiplier_addtl[4]: 1 -2024/08/14-23:25:07.323784 631181 Options.max_bytes_for_level_multiplier_addtl[5]: 1 -2024/08/14-23:25:07.323787 631181 Options.max_bytes_for_level_multiplier_addtl[6]: 1 -2024/08/14-23:25:07.323790 631181 Options.max_sequential_skip_in_iterations: 8 -2024/08/14-23:25:07.323793 631181 Options.max_compaction_bytes: 1677721600 -2024/08/14-23:25:07.323796 631181 Options.ignore_max_compaction_bytes_for_input: true -2024/08/14-23:25:07.323799 631181 Options.arena_block_size: 1048576 -2024/08/14-23:25:07.323802 631181 Options.soft_pending_compaction_bytes_limit: 68719476736 -2024/08/14-23:25:07.323805 631181 Options.hard_pending_compaction_bytes_limit: 274877906944 -2024/08/14-23:25:07.323809 631181 Options.disable_auto_compactions: 0 -2024/08/14-23:25:07.323815 631181 Options.compaction_style: kCompactionStyleLevel -2024/08/14-23:25:07.323819 631181 Options.compaction_pri: kMinOverlappingRatio -2024/08/14-23:25:07.323823 631181 Options.compaction_options_universal.size_ratio: 1 -2024/08/14-23:25:07.323826 631181 Options.compaction_options_universal.min_merge_width: 2 -2024/08/14-23:25:07.323829 631181 Options.compaction_options_universal.max_merge_width: 4294967295 -2024/08/14-23:25:07.323832 631181 Options.compaction_options_universal.max_size_amplification_percent: 200 -2024/08/14-23:25:07.323835 631181 Options.compaction_options_universal.compression_size_percent: -1 -2024/08/14-23:25:07.323838 631181 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize -2024/08/14-23:25:07.323842 631181 Options.compaction_options_fifo.max_table_files_size: 1073741824 -2024/08/14-23:25:07.323845 631181 
Options.compaction_options_fifo.allow_compaction: 0 -2024/08/14-23:25:07.323849 631181 Options.table_properties_collectors: -2024/08/14-23:25:07.323853 631181 Options.inplace_update_support: 0 -2024/08/14-23:25:07.323856 631181 Options.inplace_update_num_locks: 10000 -2024/08/14-23:25:07.323859 631181 Options.memtable_prefix_bloom_size_ratio: 0.000000 -2024/08/14-23:25:07.323862 631181 Options.memtable_whole_key_filtering: 0 -2024/08/14-23:25:07.323865 631181 Options.memtable_huge_page_size: 0 -2024/08/14-23:25:07.323868 631181 Options.bloom_locality: 0 -2024/08/14-23:25:07.323871 631181 Options.max_successive_merges: 0 -2024/08/14-23:25:07.323875 631181 Options.optimize_filters_for_hits: 0 -2024/08/14-23:25:07.323878 631181 Options.paranoid_file_checks: 0 -2024/08/14-23:25:07.323881 631181 Options.force_consistency_checks: 1 -2024/08/14-23:25:07.323884 631181 Options.report_bg_io_stats: 0 -2024/08/14-23:25:07.323887 631181 Options.ttl: 2592000 -2024/08/14-23:25:07.323890 631181 Options.periodic_compaction_seconds: 0 -2024/08/14-23:25:07.323893 631181 Options.default_temperature: kUnknown -2024/08/14-23:25:07.323896 631181 Options.preclude_last_level_data_seconds: 0 -2024/08/14-23:25:07.323899 631181 Options.preserve_internal_time_seconds: 0 -2024/08/14-23:25:07.323902 631181 Options.enable_blob_files: false -2024/08/14-23:25:07.323905 631181 Options.min_blob_size: 0 -2024/08/14-23:25:07.323908 631181 Options.blob_file_size: 268435456 -2024/08/14-23:25:07.323912 631181 Options.blob_compression_type: NoCompression -2024/08/14-23:25:07.323915 631181 Options.enable_blob_garbage_collection: false -2024/08/14-23:25:07.323918 631181 Options.blob_garbage_collection_age_cutoff: 0.250000 -2024/08/14-23:25:07.323921 631181 Options.blob_garbage_collection_force_threshold: 1.000000 -2024/08/14-23:25:07.323925 631181 Options.blob_compaction_readahead_size: 0 -2024/08/14-23:25:07.323928 631181 Options.blob_file_starting_level: 0 -2024/08/14-23:25:07.323931 631181 
Options.experimental_mempurge_threshold: 0.000000 -2024/08/14-23:25:07.323934 631181 Options.memtable_max_range_deletions: 0 -2024/08/14-23:25:07.331370 631181 [db/version_set.cc:5984] Recovered from manifest file:data/colbert_test.db/MANIFEST-000173 succeeded,manifest_file_number is 173, next_file_number is 175, last_sequence is 82824, log_number is 169,prev_log_number is 0,max_column_family is 6,min_log_number_to_keep is 169 -2024/08/14-23:25:07.331386 631181 [db/version_set.cc:5999] Column family [default] (ID 0), log number is 169 -2024/08/14-23:25:07.331390 631181 [db/version_set.cc:5999] Column family [index] (ID 1), log number is 169 -2024/08/14-23:25:07.331394 631181 [db/version_set.cc:5999] Column family [forward] (ID 2), log number is 169 -2024/08/14-23:25:07.331403 631181 [db/version_set.cc:5999] Column family [codes] (ID 3), log number is 169 -2024/08/14-23:25:07.331406 631181 [db/version_set.cc:5999] Column family [residuals] (ID 4), log number is 169 -2024/08/14-23:25:07.331410 631181 [db/version_set.cc:5999] Column family [mapping] (ID 5), log number is 169 -2024/08/14-23:25:07.331413 631181 [db/version_set.cc:5999] Column family [doc] (ID 6), log number is 169 -2024/08/14-23:25:07.331468 631181 [db/db_impl/db_impl_open.cc:646] DB ID: ca27e180-c0ac-40a6-8ba1-abc5931d9ca6 -2024/08/14-23:25:07.331806 631181 EVENT_LOG_v1 {"time_micros": 1723677907331796, "job": 1, "event": "recovery_started", "wal_files": [172]} -2024/08/14-23:25:07.331821 631181 [db/db_impl/db_impl_open.cc:1143] Recovering log #172 mode 2 -2024/08/14-23:25:07.331960 631181 EVENT_LOG_v1 {"time_micros": 1723677907331956, "job": 1, "event": "recovery_finished"} -2024/08/14-23:25:07.332305 631181 [db/version_set.cc:5438] Creating manifest 177 -2024/08/14-23:25:07.525733 631181 [file/delete_scheduler.cc:73] Deleted file data/colbert_test.db/000172.log immediately, rate_bytes_per_sec 0, total_trash_size 0 max_trash_db_ratio 0.250000 -2024/08/14-23:25:07.525768 631181 [DEBUG] 
[db/db_impl/db_impl_files.cc:375] [JOB 2] Delete data/colbert_test.db/000172.log type=0 #172 -- OK -2024/08/14-23:25:07.525935 631181 [DEBUG] [db/db_impl/db_impl_files.cc:375] [JOB 2] Delete data/colbert_test.db/MANIFEST-000173 type=3 #173 -- OK -2024/08/14-23:25:07.526037 631181 [db/db_impl/db_impl_open.cc:2156] SstFileManager instance 0x642fd90b08e0 -2024/08/14-23:25:07.526472 631181 DB pointer 0x642fd90aaf00 -2024/08/14-23:25:07.530703 631289 [db/db_impl/db_impl.cc:1139] ------- DUMPING STATS ------- -2024/08/14-23:25:07.530818 631289 [db/db_impl/db_impl.cc:1141] +2024/09/13-17:21:56.403651 1505759 Options.write_buffer_size: 67108864 +2024/09/13-17:21:56.403655 1505759 Options.max_write_buffer_number: 2 +2024/09/13-17:21:56.403660 1505759 Options.compression: NoCompression +2024/09/13-17:21:56.403664 1505759 Options.bottommost_compression: Disabled +2024/09/13-17:21:56.403668 1505759 Options.prefix_extractor: nullptr +2024/09/13-17:21:56.403672 1505759 Options.memtable_insert_with_hint_prefix_extractor: nullptr +2024/09/13-17:21:56.403676 1505759 Options.num_levels: 7 +2024/09/13-17:21:56.403680 1505759 Options.min_write_buffer_number_to_merge: 1 +2024/09/13-17:21:56.403684 1505759 Options.max_write_buffer_number_to_maintain: 0 +2024/09/13-17:21:56.403688 1505759 Options.max_write_buffer_size_to_maintain: 0 +2024/09/13-17:21:56.403692 1505759 Options.bottommost_compression_opts.window_bits: -14 +2024/09/13-17:21:56.403696 1505759 Options.bottommost_compression_opts.level: 32767 +2024/09/13-17:21:56.403700 1505759 Options.bottommost_compression_opts.strategy: 0 +2024/09/13-17:21:56.403704 1505759 Options.bottommost_compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.403708 1505759 Options.bottommost_compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.403712 1505759 Options.bottommost_compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.403716 1505759 Options.bottommost_compression_opts.enabled: false +2024/09/13-17:21:56.403720 1505759 
Options.bottommost_compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.403724 1505759 Options.bottommost_compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.403728 1505759 Options.compression_opts.window_bits: -14 +2024/09/13-17:21:56.403731 1505759 Options.compression_opts.level: 32767 +2024/09/13-17:21:56.403735 1505759 Options.compression_opts.strategy: 0 +2024/09/13-17:21:56.403739 1505759 Options.compression_opts.max_dict_bytes: 0 +2024/09/13-17:21:56.403743 1505759 Options.compression_opts.zstd_max_train_bytes: 0 +2024/09/13-17:21:56.403747 1505759 Options.compression_opts.use_zstd_dict_trainer: true +2024/09/13-17:21:56.403751 1505759 Options.compression_opts.parallel_threads: 1 +2024/09/13-17:21:56.403755 1505759 Options.compression_opts.enabled: false +2024/09/13-17:21:56.403759 1505759 Options.compression_opts.max_dict_buffer_bytes: 0 +2024/09/13-17:21:56.403763 1505759 Options.level0_file_num_compaction_trigger: 4 +2024/09/13-17:21:56.403767 1505759 Options.level0_slowdown_writes_trigger: 20 +2024/09/13-17:21:56.403771 1505759 Options.level0_stop_writes_trigger: 36 +2024/09/13-17:21:56.403775 1505759 Options.target_file_size_base: 67108864 +2024/09/13-17:21:56.403779 1505759 Options.target_file_size_multiplier: 1 +2024/09/13-17:21:56.403783 1505759 Options.max_bytes_for_level_base: 268435456 +2024/09/13-17:21:56.403786 1505759 Options.level_compaction_dynamic_level_bytes: 1 +2024/09/13-17:21:56.403790 1505759 Options.max_bytes_for_level_multiplier: 10.000000 +2024/09/13-17:21:56.403796 1505759 Options.max_bytes_for_level_multiplier_addtl[0]: 1 +2024/09/13-17:21:56.403800 1505759 Options.max_bytes_for_level_multiplier_addtl[1]: 1 +2024/09/13-17:21:56.403804 1505759 Options.max_bytes_for_level_multiplier_addtl[2]: 1 +2024/09/13-17:21:56.403808 1505759 Options.max_bytes_for_level_multiplier_addtl[3]: 1 +2024/09/13-17:21:56.403815 1505759 Options.max_bytes_for_level_multiplier_addtl[4]: 1 +2024/09/13-17:21:56.403820 1505759 
Options.max_bytes_for_level_multiplier_addtl[5]: 1 +2024/09/13-17:21:56.403824 1505759 Options.max_bytes_for_level_multiplier_addtl[6]: 1 +2024/09/13-17:21:56.403828 1505759 Options.max_sequential_skip_in_iterations: 8 +2024/09/13-17:21:56.403832 1505759 Options.max_compaction_bytes: 1677721600 +2024/09/13-17:21:56.403836 1505759 Options.ignore_max_compaction_bytes_for_input: true +2024/09/13-17:21:56.403840 1505759 Options.arena_block_size: 1048576 +2024/09/13-17:21:56.403843 1505759 Options.soft_pending_compaction_bytes_limit: 68719476736 +2024/09/13-17:21:56.403847 1505759 Options.hard_pending_compaction_bytes_limit: 274877906944 +2024/09/13-17:21:56.403851 1505759 Options.disable_auto_compactions: 0 +2024/09/13-17:21:56.403867 1505759 Options.compaction_style: kCompactionStyleLevel +2024/09/13-17:21:56.403873 1505759 Options.compaction_pri: kMinOverlappingRatio +2024/09/13-17:21:56.403877 1505759 Options.compaction_options_universal.size_ratio: 1 +2024/09/13-17:21:56.403882 1505759 Options.compaction_options_universal.min_merge_width: 2 +2024/09/13-17:21:56.403886 1505759 Options.compaction_options_universal.max_merge_width: 4294967295 +2024/09/13-17:21:56.403890 1505759 Options.compaction_options_universal.max_size_amplification_percent: 200 +2024/09/13-17:21:56.403894 1505759 Options.compaction_options_universal.compression_size_percent: -1 +2024/09/13-17:21:56.403898 1505759 Options.compaction_options_universal.stop_style: kCompactionStopStyleTotalSize +2024/09/13-17:21:56.403903 1505759 Options.compaction_options_fifo.max_table_files_size: 1073741824 +2024/09/13-17:21:56.403907 1505759 Options.compaction_options_fifo.allow_compaction: 0 +2024/09/13-17:21:56.403913 1505759 Options.table_properties_collectors: +2024/09/13-17:21:56.403918 1505759 Options.inplace_update_support: 0 +2024/09/13-17:21:56.403922 1505759 Options.inplace_update_num_locks: 10000 +2024/09/13-17:21:56.403926 1505759 Options.memtable_prefix_bloom_size_ratio: 0.000000 
+2024/09/13-17:21:56.403930 1505759 Options.memtable_whole_key_filtering: 0 +2024/09/13-17:21:56.403934 1505759 Options.memtable_huge_page_size: 0 +2024/09/13-17:21:56.403938 1505759 Options.bloom_locality: 0 +2024/09/13-17:21:56.403942 1505759 Options.max_successive_merges: 0 +2024/09/13-17:21:56.403946 1505759 Options.optimize_filters_for_hits: 0 +2024/09/13-17:21:56.403950 1505759 Options.paranoid_file_checks: 0 +2024/09/13-17:21:56.403954 1505759 Options.force_consistency_checks: 1 +2024/09/13-17:21:56.403958 1505759 Options.report_bg_io_stats: 0 +2024/09/13-17:21:56.403962 1505759 Options.ttl: 2592000 +2024/09/13-17:21:56.403966 1505759 Options.periodic_compaction_seconds: 0 +2024/09/13-17:21:56.403970 1505759 Options.default_temperature: kUnknown +2024/09/13-17:21:56.403974 1505759 Options.preclude_last_level_data_seconds: 0 +2024/09/13-17:21:56.403978 1505759 Options.preserve_internal_time_seconds: 0 +2024/09/13-17:21:56.403982 1505759 Options.enable_blob_files: false +2024/09/13-17:21:56.403986 1505759 Options.min_blob_size: 0 +2024/09/13-17:21:56.403990 1505759 Options.blob_file_size: 268435456 +2024/09/13-17:21:56.403994 1505759 Options.blob_compression_type: NoCompression +2024/09/13-17:21:56.403998 1505759 Options.enable_blob_garbage_collection: false +2024/09/13-17:21:56.404002 1505759 Options.blob_garbage_collection_age_cutoff: 0.250000 +2024/09/13-17:21:56.404007 1505759 Options.blob_garbage_collection_force_threshold: 1.000000 +2024/09/13-17:21:56.404011 1505759 Options.blob_compaction_readahead_size: 0 +2024/09/13-17:21:56.404020 1505759 Options.blob_file_starting_level: 0 +2024/09/13-17:21:56.404024 1505759 Options.experimental_mempurge_threshold: 0.000000 +2024/09/13-17:21:56.404029 1505759 Options.memtable_max_range_deletions: 0 +2024/09/13-17:21:56.427267 1505759 [db/version_set.cc:5984] Recovered from manifest file:data/colbert_test.db/MANIFEST-000177 succeeded,manifest_file_number is 177, next_file_number is 179, last_sequence is 82824, 
log_number is 173,prev_log_number is 0,max_column_family is 6,min_log_number_to_keep is 173 +2024/09/13-17:21:56.427287 1505759 [db/version_set.cc:5999] Column family [default] (ID 0), log number is 173 +2024/09/13-17:21:56.427293 1505759 [db/version_set.cc:5999] Column family [index] (ID 1), log number is 173 +2024/09/13-17:21:56.427298 1505759 [db/version_set.cc:5999] Column family [forward] (ID 2), log number is 173 +2024/09/13-17:21:56.427302 1505759 [db/version_set.cc:5999] Column family [codes] (ID 3), log number is 173 +2024/09/13-17:21:56.427306 1505759 [db/version_set.cc:5999] Column family [residuals] (ID 4), log number is 173 +2024/09/13-17:21:56.427310 1505759 [db/version_set.cc:5999] Column family [mapping] (ID 5), log number is 173 +2024/09/13-17:21:56.427315 1505759 [db/version_set.cc:5999] Column family [doc] (ID 6), log number is 173 +2024/09/13-17:21:56.427530 1505759 [db/db_impl/db_impl_open.cc:646] DB ID: ca27e180-c0ac-40a6-8ba1-abc5931d9ca6 +2024/09/13-17:21:56.427945 1505759 EVENT_LOG_v1 {"time_micros": 1726248116427934, "job": 1, "event": "recovery_started", "wal_files": [176, 260]} +2024/09/13-17:21:56.427965 1505759 [db/db_impl/db_impl_open.cc:1143] Recovering log #176 mode 2 +2024/09/13-17:21:56.428025 1505759 [db/db_impl/db_impl_open.cc:1143] Recovering log #260 mode 2 +2024/09/13-17:21:56.428210 1505759 EVENT_LOG_v1 {"time_micros": 1726248116428205, "job": 1, "event": "recovery_finished"} +2024/09/13-17:21:56.428673 1505759 [db/version_set.cc:5438] Creating manifest 265 +2024/09/13-17:21:56.580426 1505759 [file/delete_scheduler.cc:73] Deleted file data/colbert_test.db/000176.log immediately, rate_bytes_per_sec 0, total_trash_size 0 max_trash_db_ratio 0.250000 +2024/09/13-17:21:56.580461 1505759 [DEBUG] [db/db_impl/db_impl_files.cc:375] [JOB 2] Delete data/colbert_test.db/000176.log type=0 #176 -- OK +2024/09/13-17:21:56.580517 1505759 [file/delete_scheduler.cc:73] Deleted file data/colbert_test.db/000260.log immediately, 
rate_bytes_per_sec 0, total_trash_size 0 max_trash_db_ratio 0.250000 +2024/09/13-17:21:56.580535 1505759 [DEBUG] [db/db_impl/db_impl_files.cc:375] [JOB 2] Delete data/colbert_test.db/000260.log type=0 #260 -- OK +2024/09/13-17:21:56.580665 1505759 [DEBUG] [db/db_impl/db_impl_files.cc:375] [JOB 2] Delete data/colbert_test.db/MANIFEST-000177 type=3 #177 -- OK +2024/09/13-17:21:56.580746 1505759 [DEBUG] [db/db_impl/db_impl_files.cc:375] [JOB 2] Delete data/colbert_test.db/MANIFEST-000261 type=3 #261 -- OK +2024/09/13-17:21:56.580853 1505759 [db/db_impl/db_impl_open.cc:2156] SstFileManager instance 0x59e4fb372ae0 +2024/09/13-17:21:56.581213 1505760 (Original Log Time 2024/09/13-17:21:56.581158) [db/db_impl/db_impl_compaction_flush.cc:3675] Compaction nothing to do +2024/09/13-17:21:56.581280 1505759 DB pointer 0x59e4fb377ac0 +2024/09/13-17:21:56.581329 1505760 (Original Log Time 2024/09/13-17:21:56.581311) [db/db_impl/db_impl_compaction_flush.cc:3675] Compaction nothing to do +2024/09/13-17:21:56.581380 1505760 (Original Log Time 2024/09/13-17:21:56.581365) [db/db_impl/db_impl_compaction_flush.cc:3675] Compaction nothing to do +2024/09/13-17:21:56.581429 1505760 (Original Log Time 2024/09/13-17:21:56.581414) [db/db_impl/db_impl_compaction_flush.cc:3675] Compaction nothing to do +2024/09/13-17:21:56.585345 1505881 [db/db_impl/db_impl.cc:1139] ------- DUMPING STATS ------- +2024/09/13-17:21:56.585453 1505881 [db/db_impl/db_impl.cc:1141] ** DB Stats ** Uptime(secs): 0.2 total, 0.2 interval Cumulative writes: 0 writes, 0 keys, 0 commit groups, 0.0 writes per commit group, ingest: 0.00 GB, 0.00 MB/s @@ -1102,7 +1111,7 @@ AddFile(Keys): cumulative 0, interval 0 Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, 
cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 -Block cache LRUCache@0x642fd9072b10#631181 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000633 secs_since: 0 +Block cache LRUCache@0x59e4fb33a260#1505759 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000395 secs_since: 0 Block cache entry stats(count,size,portion): Misc(1,0.00 KB,0%) ** File Read Latency Histogram By Level [default] ** @@ -1129,7 +1138,7 @@ AddFile(Keys): cumulative 0, interval 0 Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 -Block cache LRUCache@0x642fd9082240#631181 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000515 secs_since: 0 +Block cache LRUCache@0x59e4fb349990#1505759 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000571 secs_since: 0 Block cache entry stats(count,size,portion): Misc(1,0.00 KB,0%) ** File Read Latency Histogram By Level [index] ** @@ -1155,7 +1164,7 @@ AddFile(Keys): cumulative 0, interval 0 Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 
0.00 MB/s read, 0.0 seconds Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 -Block cache LRUCache@0x642fd907a710#631181 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000336 secs_since: 0 +Block cache LRUCache@0x59e4fb341e80#1505759 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000339 secs_since: 0 Block cache entry stats(count,size,portion): Misc(1,0.00 KB,0%) ** File Read Latency Histogram By Level [forward] ** @@ -1182,7 +1191,7 @@ AddFile(Keys): cumulative 0, interval 0 Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 -Block cache LRUCache@0x642fd907ace0#631181 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000333 secs_since: 0 +Block cache LRUCache@0x59e4fb342460#1505759 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000343 secs_since: 0 Block cache entry stats(count,size,portion): 
Misc(1,0.00 KB,0%) ** File Read Latency Histogram By Level [codes] ** @@ -1208,7 +1217,7 @@ AddFile(Keys): cumulative 0, interval 0 Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 -Block cache LRUCache@0x642fd907d810#631181 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000336 secs_since: 0 +Block cache LRUCache@0x59e4fb345000#1505759 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.00034 secs_since: 0 Block cache entry stats(count,size,portion): Misc(1,0.00 KB,0%) ** File Read Latency Histogram By Level [residuals] ** @@ -1235,7 +1244,7 @@ AddFile(Keys): cumulative 0, interval 0 Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 -Block cache LRUCache@0x642fd90996f0#631181 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000332 secs_since: 0 +Block cache 
LRUCache@0x59e4fb360fd0#1505759 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000338 secs_since: 0 Block cache entry stats(count,size,portion): Misc(1,0.00 KB,0%) ** File Read Latency Histogram By Level [mapping] ** @@ -1262,9 +1271,9 @@ AddFile(Keys): cumulative 0, interval 0 Cumulative compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Interval compaction: 0.00 GB write, 0.00 MB/s write, 0.00 GB read, 0.00 MB/s read, 0.0 seconds Write Stall (count): cf-l0-file-count-limit-delays-with-ongoing-compaction: 0, cf-l0-file-count-limit-stops-with-ongoing-compaction: 0, l0-file-count-limit-delays: 0, l0-file-count-limit-stops: 0, memtable-limit-delays: 0, memtable-limit-stops: 0, pending-compaction-bytes-delays: 0, pending-compaction-bytes-stops: 0, total-delays: 0, total-stops: 0 -Block cache LRUCache@0x642fd90a12a0#631181 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000333 secs_since: 0 +Block cache LRUCache@0x59e4fb368bf0#1505759 capacity: 32.00 MB seed: 1411059435 usage: 0.09 KB table_size: 1024 occupancy: 1 collections: 1 last_copies: 0 last_secs: 0.000348 secs_since: 0 Block cache entry stats(count,size,portion): Misc(1,0.00 KB,0%) ** File Read Latency Histogram By Level [doc] ** -2024/08/14-23:25:07.905610 631181 [db/db_impl/db_impl.cc:485] Shutdown: canceling all background work -2024/08/14-23:25:07.907280 631181 [db/db_impl/db_impl.cc:667] Shutdown complete +2024/09/13-17:21:57.111197 1505759 [db/db_impl/db_impl.cc:485] Shutdown: canceling all background work +2024/09/13-17:21:57.112740 1505759 [db/db_impl/db_impl.cc:667] Shutdown complete diff --git a/tests/data/colbert_test.db/MANIFEST-000177 b/tests/data/colbert_test.db/MANIFEST-000177 deleted file mode 100644 index 4f26f27..0000000 Binary files a/tests/data/colbert_test.db/MANIFEST-000177 and /dev/null differ diff --git 
a/tests/data/colbert_test.db/OPTIONS-000175 b/tests/data/colbert_test.db/OPTIONS-000175 deleted file mode 100644 index 78c9aec..0000000 --- a/tests/data/colbert_test.db/OPTIONS-000175 +++ /dev/null @@ -1,870 +0,0 @@ -# This is a RocksDB option file. -# -# For detailed file format spec, please refer to the example file -# in examples/rocksdb_option_file_example.ini -# - -[Version] - rocksdb_version=8.10.0 - options_file_version=1.1 - -[DBOptions] - compaction_readahead_size=2097152 - strict_bytes_per_sync=false - bytes_per_sync=0 - max_background_jobs=2 - avoid_flush_during_shutdown=false - max_background_flushes=-1 - delayed_write_rate=16777216 - max_open_files=-1 - max_subcompactions=1 - writable_file_max_buffer_size=1048576 - wal_bytes_per_sync=0 - max_background_compactions=-1 - max_total_wal_size=0 - delete_obsolete_files_period_micros=21600000000 - stats_dump_period_sec=600 - stats_history_buffer_size=1048576 - stats_persist_period_sec=600 - enforce_single_del_contracts=true - lowest_used_cache_tier=kNonVolatileBlockTier - bgerror_resume_retry_interval=1000000 - best_efforts_recovery=false - log_readahead_size=0 - write_dbid_to_manifest=false - wal_compression=kNoCompression - manual_wal_flush=false - db_host_id=__hostname__ - two_write_queues=false - random_access_max_buffer_size=1048576 - avoid_unnecessary_blocking_io=false - skip_checking_sst_file_sizes_on_db_open=false - flush_verify_memtable_count=true - fail_if_options_file_error=true - atomic_flush=false - verify_sst_unique_id_in_manifest=true - skip_stats_update_on_db_open=false - track_and_verify_wals_in_manifest=false - compaction_verify_record_count=true - paranoid_checks=true - create_if_missing=true - max_write_batch_group_size_bytes=1048576 - avoid_flush_during_recovery=false - file_checksum_gen_factory=nullptr - enable_thread_tracking=false - allow_fallocate=true - allow_data_in_errors=false - error_if_exists=false - use_direct_io_for_flush_and_compaction=false - 
create_missing_column_families=true - WAL_size_limit_MB=0 - use_direct_reads=false - persist_stats_to_disk=false - allow_2pc=false - is_fd_close_on_exec=true - max_log_file_size=0 - access_hint_on_compaction_start=NORMAL - max_file_opening_threads=16 - wal_filter=nullptr - allow_mmap_reads=false - allow_mmap_writes=false - use_adaptive_mutex=false - use_fsync=false - table_cache_numshardbits=6 - dump_malloc_stats=false - db_write_buffer_size=0 - allow_ingest_behind=false - keep_log_file_num=1000 - max_bgerror_resume_count=2147483647 - allow_concurrent_memtable_write=true - recycle_log_file_num=0 - log_file_time_to_roll=0 - manifest_preallocation_size=4194304 - enable_write_thread_adaptive_yield=true - WAL_ttl_seconds=0 - max_manifest_file_size=1073741824 - wal_recovery_mode=kPointInTimeRecovery - enable_pipelined_write=false - write_thread_slow_yield_usec=3 - unordered_write=false - write_thread_max_yield_usec=100 - advise_random_on_open=true - info_log_level=DEBUG_LEVEL - - -[CFOptions "default"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - 
memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - 
optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "default"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "index"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - 
report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=rocksdb.CappedPrefix.12 - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - 
enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "index"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=bloomfilter:10:false - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - 
metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "forward"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - 
max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "forward"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - 
data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "codes"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - 
level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - 
compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "codes"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "residuals"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - 
bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 
- preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "residuals"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "mapping"] - memtable_max_range_deletions=0 - 
compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - 
compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "mapping"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false 
- block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "doc"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - 
disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "doc"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - 
block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - diff --git a/tests/data/colbert_test.db/OPTIONS-000179 b/tests/data/colbert_test.db/OPTIONS-000179 deleted file mode 100644 index 78c9aec..0000000 --- a/tests/data/colbert_test.db/OPTIONS-000179 +++ /dev/null @@ -1,870 +0,0 @@ -# This is a RocksDB option file. 
-# -# For detailed file format spec, please refer to the example file -# in examples/rocksdb_option_file_example.ini -# - -[Version] - rocksdb_version=8.10.0 - options_file_version=1.1 - -[DBOptions] - compaction_readahead_size=2097152 - strict_bytes_per_sync=false - bytes_per_sync=0 - max_background_jobs=2 - avoid_flush_during_shutdown=false - max_background_flushes=-1 - delayed_write_rate=16777216 - max_open_files=-1 - max_subcompactions=1 - writable_file_max_buffer_size=1048576 - wal_bytes_per_sync=0 - max_background_compactions=-1 - max_total_wal_size=0 - delete_obsolete_files_period_micros=21600000000 - stats_dump_period_sec=600 - stats_history_buffer_size=1048576 - stats_persist_period_sec=600 - enforce_single_del_contracts=true - lowest_used_cache_tier=kNonVolatileBlockTier - bgerror_resume_retry_interval=1000000 - best_efforts_recovery=false - log_readahead_size=0 - write_dbid_to_manifest=false - wal_compression=kNoCompression - manual_wal_flush=false - db_host_id=__hostname__ - two_write_queues=false - random_access_max_buffer_size=1048576 - avoid_unnecessary_blocking_io=false - skip_checking_sst_file_sizes_on_db_open=false - flush_verify_memtable_count=true - fail_if_options_file_error=true - atomic_flush=false - verify_sst_unique_id_in_manifest=true - skip_stats_update_on_db_open=false - track_and_verify_wals_in_manifest=false - compaction_verify_record_count=true - paranoid_checks=true - create_if_missing=true - max_write_batch_group_size_bytes=1048576 - avoid_flush_during_recovery=false - file_checksum_gen_factory=nullptr - enable_thread_tracking=false - allow_fallocate=true - allow_data_in_errors=false - error_if_exists=false - use_direct_io_for_flush_and_compaction=false - create_missing_column_families=true - WAL_size_limit_MB=0 - use_direct_reads=false - persist_stats_to_disk=false - allow_2pc=false - is_fd_close_on_exec=true - max_log_file_size=0 - access_hint_on_compaction_start=NORMAL - max_file_opening_threads=16 - wal_filter=nullptr - 
allow_mmap_reads=false - allow_mmap_writes=false - use_adaptive_mutex=false - use_fsync=false - table_cache_numshardbits=6 - dump_malloc_stats=false - db_write_buffer_size=0 - allow_ingest_behind=false - keep_log_file_num=1000 - max_bgerror_resume_count=2147483647 - allow_concurrent_memtable_write=true - recycle_log_file_num=0 - log_file_time_to_roll=0 - manifest_preallocation_size=4194304 - enable_write_thread_adaptive_yield=true - WAL_ttl_seconds=0 - max_manifest_file_size=1073741824 - wal_recovery_mode=kPointInTimeRecovery - enable_pipelined_write=false - write_thread_slow_yield_usec=3 - unordered_write=false - write_thread_max_yield_usec=100 - advise_random_on_open=true - info_log_level=DEBUG_LEVEL - - -[CFOptions "default"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - 
hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - 
bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "default"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "index"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=rocksdb.CappedPrefix.12 - 
bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 
- preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "index"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=bloomfilter:10:false - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "forward"] - memtable_max_range_deletions=0 - 
compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - 
compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "forward"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false 
- block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "codes"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - 
disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "codes"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true 
- block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "residuals"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - 
blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - 
max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "residuals"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "mapping"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - 
target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - 
enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "mapping"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - 
metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - - -[CFOptions "doc"] - memtable_max_range_deletions=0 - compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - block_protection_bytes_per_key=0 - bottommost_file_compaction_delay=0 - memtable_protection_bytes_per_key=0 - target_file_size_multiplier=1 - report_bg_io_stats=false - write_buffer_size=67108864 - memtable_huge_page_size=0 - max_successive_merges=0 - max_write_buffer_number=2 - prefix_extractor=nullptr - bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;} - paranoid_file_checks=false - blob_garbage_collection_force_threshold=1.000000 - enable_blob_files=false - blob_file_starting_level=0 - memtable_prefix_bloom_size_ratio=0.000000 - inplace_update_num_locks=10000 - blob_compaction_readahead_size=0 - ignore_max_compaction_bytes_for_input=true - arena_block_size=1048576 - level0_stop_writes_trigger=36 - blob_compression_type=kNoCompression - level0_slowdown_writes_trigger=20 - hard_pending_compaction_bytes_limit=274877906944 - soft_pending_compaction_bytes_limit=68719476736 - target_file_size_base=67108864 - level0_file_num_compaction_trigger=4 - max_compaction_bytes=1677721600 - disable_auto_compactions=false - check_flush_compaction_key_order=true - min_blob_size=0 - memtable_whole_key_filtering=false - max_bytes_for_level_base=268435456 - last_level_temperature=kUnknown - compaction_options_fifo={file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;} - 
max_bytes_for_level_multiplier=10.000000 - max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1 - max_sequential_skip_in_iterations=8 - prepopulate_blob_cache=kDisable - compression=kNoCompression - compaction_options_universal={incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;size_ratio=1;} - blob_garbage_collection_age_cutoff=0.250000 - ttl=2592000 - periodic_compaction_seconds=0 - sample_for_compression=0 - blob_file_size=268435456 - enable_blob_garbage_collection=false - experimental_mempurge_threshold=0.000000 - bottommost_compression=kDisableCompressionOption - persist_user_defined_timestamps=true - preserve_internal_time_seconds=0 - preclude_last_level_data_seconds=0 - sst_partitioner_factory=nullptr - num_levels=7 - force_consistency_checks=true - memtable_insert_with_hint_prefix_extractor=nullptr - memtable_factory=SkipListFactory - compaction_pri=kMinOverlappingRatio - max_write_buffer_size_to_maintain=0 - level_compaction_dynamic_file_size=true - max_write_buffer_number_to_maintain=0 - optimize_filters_for_hits=false - level_compaction_dynamic_level_bytes=true - default_temperature=kUnknown - inplace_update_support=false - merge_operator=nullptr - table_factory=BlockBasedTable - min_write_buffer_number_to_merge=1 - compaction_filter=nullptr - compaction_style=kCompactionStyleLevel - bloom_locality=0 - comparator=leveldb.BytewiseComparator - compaction_filter_factory=nullptr - -[TableOptions/BlockBasedTable "doc"] - initial_auto_readahead_size=8192 - pin_top_level_index_and_filter=true - block_align=false - block_size_deviation=10 - checksum=kXXH3 - index_shortening=kShortenSeparators - num_file_reads_for_auto_readahead=2 - whole_key_filtering=true - data_block_index_type=kDataBlockBinarySearch - index_type=kBinarySearch - no_block_cache=false - index_block_restart_interval=1 - 
data_block_hash_table_util_ratio=0.750000 - prepopulate_block_cache=kDisable - pin_l0_filter_and_index_blocks_in_cache=false - filter_policy=nullptr - cache_index_and_filter_blocks_with_high_priority=true - verify_compression=false - block_restart_interval=16 - max_auto_readahead_size=262144 - flush_block_policy_factory=FlushBlockBySizePolicyFactory - partition_filters=false - cache_index_and_filter_blocks=false - block_size=4096 - metadata_block_size=4096 - optimize_filters_for_memory=false - detect_filter_construct_corruption=false - format_version=5 - metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;} - read_amp_bytes_per_bit=0 - enable_index_compression=true - diff --git a/tests/query/DataFrameTest.cpp b/tests/query/DataFrameTest.cpp new file mode 100644 index 0000000..2e9082f --- /dev/null +++ b/tests/query/DataFrameTest.cpp @@ -0,0 +1,129 @@ +#include +#include "lintdb/query/DataFrame.h" +#include "lintdb/query/LogicalExpr.h" +#include "lintdb/schema/Schema.h" +#include "lintdb/schema/DataTypes.h" + +using namespace lintdb; +using namespace lintdb::query; + +class DataFrameTest : public ::testing::Test { +protected: + void SetUp() override { + // Create a simple schema for testing + schema_ = std::make_shared(); + + // Create field parameters + FieldParameters params; + + // Add fields with proper construction + Field id_field("id", DataType::INTEGER, {FieldType::Stored}, params); + Field name_field("name", DataType::TEXT, {FieldType::Stored}, params); + Field age_field("age", DataType::INTEGER, {FieldType::Stored}, params); + Field vector_field("vector", DataType::FLOAT, {FieldType::Stored}, params); + + schema_->add_field(id_field); + schema_->add_field(name_field); + schema_->add_field(age_field); + schema_->add_field(vector_field); + } + + std::shared_ptr schema_; +}; + +TEST_F(DataFrameTest, TestProjection) { + auto plan = std::make_shared(); + auto df = std::make_shared(plan, schema_); 
+ + // Create projection expressions + auto id_col = std::make_shared("id"); + auto name_col = std::make_shared("name"); + + // Apply projection + auto projected_df = df->project({id_col, name_col}); + auto result_plan = projected_df->logical_plan(); + + // Verify projection was added to plan + EXPECT_EQ(result_plan->projections().size(), 2); + EXPECT_EQ(result_plan->projections()[0]->to_string(), "id"); + EXPECT_EQ(result_plan->projections()[1]->to_string(), "name"); +} + +TEST_F(DataFrameTest, TestFilter) { + auto plan = std::make_shared(); + auto df = std::make_shared(plan, schema_); + + // Create filter expression: age > 30 + auto age_col = std::make_shared("age"); + auto literal = std::make_shared("30"); + auto filter_expr = std::make_shared(age_col, BinaryExpr::Op::GT, literal); + + // Apply filter + auto filtered_df = df->filter(filter_expr); + auto result_plan = filtered_df->logical_plan(); + + // Verify filter was added to plan + EXPECT_NE(result_plan->filter(), nullptr); + EXPECT_EQ(result_plan->filter()->to_string(), "age > 30"); +} + +TEST_F(DataFrameTest, TestVectorSearch) { + auto plan = std::make_shared(); + auto df = std::make_shared(plan, schema_); + + // Create query vector + std::vector query_vector = {0.1f, 0.2f, 0.3f}; + size_t k = 5; + + // Apply vector search + auto search_df = df->vector_search(query_vector, k, "L2"); + auto result_plan = search_df->logical_plan(); + + // Verify vector search was added to plan + EXPECT_EQ(result_plan->query_vector(), query_vector); + EXPECT_EQ(result_plan->k(), k); + EXPECT_EQ(result_plan->distance_metric(), "L2"); +} + +TEST_F(DataFrameTest, TestComplexQuery) { + auto plan = std::make_shared(); + auto df = std::make_shared(plan, schema_); + + // Create a complex query: project id and name, filter age > 30, then do vector search + auto id_col = std::make_shared("id"); + auto name_col = std::make_shared("name"); + auto age_col = std::make_shared("age"); + auto literal = std::make_shared("30"); + auto 
filter_expr = std::make_shared(age_col, BinaryExpr::Op::GT, literal); + + // Apply operations in sequence + auto projected_df = df->project({id_col, name_col}); + auto filtered_df = projected_df->filter(filter_expr); + + std::vector query_vector = {0.1f, 0.2f, 0.3f}; + auto search_df = filtered_df->vector_search(query_vector, 10, "L2"); + auto result_plan = search_df->logical_plan(); + + // Verify all operations were added to plan + EXPECT_EQ(result_plan->projections().size(), 2); + EXPECT_NE(result_plan->filter(), nullptr); + EXPECT_EQ(result_plan->query_vector(), query_vector); + EXPECT_EQ(result_plan->k(), 10); + EXPECT_EQ(result_plan->distance_metric(), "L2"); +} + +TEST_F(DataFrameTest, TestLogicalPlan) { + auto plan = std::make_shared(); + auto df = std::make_shared(plan, schema_); + + // Test that logical plan is accessible and matches the input plan + EXPECT_EQ(df->logical_plan(), plan); +} + +TEST_F(DataFrameTest, TestSchema) { + auto plan = std::make_shared(); + auto df = std::make_shared(plan, schema_); + + // Test that schema is accessible and matches the input schema + EXPECT_EQ(df->schema(), schema_); +} \ No newline at end of file diff --git a/tests/query/physical/PhysicalPlanTest.cpp b/tests/query/physical/PhysicalPlanTest.cpp new file mode 100644 index 0000000..7d520dc --- /dev/null +++ b/tests/query/physical/PhysicalPlanTest.cpp @@ -0,0 +1,269 @@ +#include +#include +#include +#include +#include +#include +#include +#include "lintdb/query/physical/PhysicalPlan.h" +#include "lintdb/query/physical/ScanPhysicalPlan.h" +#include "lintdb/query/physical/FilterPhysicalPlan.h" +#include "lintdb/query/physical/ProjectionPhysicalPlan.h" +#include "lintdb/query/physical/VectorSearchPhysicalPlan.h" +#include "lintdb/query/LogicalExpr.h" +#include "lintdb/schema/Schema.h" +#include "lintdb/datasources/DataSource.h" +#include "lintdb/datasources/DataSourceIterator.h" + +using namespace lintdb; +using namespace lintdb::query; + +class MockDataSource : public 
DataSource { +public: + std::vector> batches; + + Schema schema() const override { + return Schema({}); + } + + std::unique_ptr scan(uint64_t tenant_id) override { + return std::make_unique(batches); + } + + std::unique_ptr scan_prefix(const std::string& prefix) override { + return nullptr; + } + + std::string get(const std::string& key) override { + return ""; + } +}; + +class MockDataSourceIterator : public DataSourceIterator { +public: + explicit MockDataSourceIterator(const std::vector>& batches) + : batches_(batches), current_(0) {} + + bool has_next() override { + return current_ < batches_.size(); + } + + std::shared_ptr next() override { + if (!has_next()) { + return nullptr; + } + return batches_[current_++]; + } + +private: + std::vector> batches_; + size_t current_; +}; + +class PhysicalPlanTest : public ::testing::Test { +protected: + void SetUp() override { + // Create a simple schema with two columns: id (int64) and value (string) + std::vector fields = { + Field("id", DataType::INT64, {}, {}), + Field("value", DataType::STRING, {}, {}) + }; + schema_ = std::make_shared(fields); + + // Create a mock data source with some test data + data_source_ = std::make_shared(); + + // Create a record batch with test data + arrow::Int64Builder id_builder; + arrow::StringBuilder value_builder; + + ASSERT_TRUE(id_builder.AppendValues({1, 2, 3, 4, 5}).ok()); + ASSERT_TRUE(value_builder.AppendValues({"a", "b", "c", "d", "e"}).ok()); + + std::shared_ptr id_array; + std::shared_ptr value_array; + + ASSERT_TRUE(id_builder.Finish(&id_array).ok()); + ASSERT_TRUE(value_builder.Finish(&value_array).ok()); + + auto batch = arrow::RecordBatch::Make( + schema_->to_arrow_schema(), + 5, + {id_array, value_array} + ); + + data_source_->batches.push_back(batch); + } + + std::shared_ptr schema_; + std::shared_ptr data_source_; +}; + +TEST_F(PhysicalPlanTest, TestScanPhysicalPlan) { + auto scan_plan = std::make_shared(data_source_, 1, schema_); + + // Test schema + 
EXPECT_EQ(scan_plan->schema(), schema_); + + // Test execution + auto batches = scan_plan->execute(); + ASSERT_EQ(batches.size(), 1); + + auto batch = batches[0]; + EXPECT_EQ(batch->num_rows(), 5); + EXPECT_EQ(batch->num_columns(), 2); + + // Verify data + auto id_array = std::static_pointer_cast(batch->column(0)); + auto value_array = std::static_pointer_cast(batch->column(1)); + + EXPECT_EQ(id_array->Value(0), 1); + EXPECT_EQ(value_array->GetString(0), "a"); + EXPECT_EQ(id_array->Value(4), 5); + EXPECT_EQ(value_array->GetString(4), "e"); +} + +TEST_F(PhysicalPlanTest, TestFilterPhysicalPlan) { + // Create a scan plan + auto scan_plan = std::make_shared(data_source_, 1, schema_); + + // Create a filter expression: id > 3 + auto id_col = std::make_shared(0, "id"); + auto lit = std::make_shared(3); + auto filter_expr = std::make_shared(id_col, lit, BinaryExpr::Op::GT); + + // Create a filter plan + auto filter_plan = std::make_shared(scan_plan, filter_expr); + + // Test execution + auto batches = filter_plan->execute(); + ASSERT_EQ(batches.size(), 1); + + auto batch = batches[0]; + EXPECT_EQ(batch->num_rows(), 2); // Only rows with id > 3 + + // Verify filtered data + auto id_array = std::static_pointer_cast(batch->column(0)); + auto value_array = std::static_pointer_cast(batch->column(1)); + + EXPECT_EQ(id_array->Value(0), 4); + EXPECT_EQ(value_array->GetString(0), "d"); + EXPECT_EQ(id_array->Value(1), 5); + EXPECT_EQ(value_array->GetString(1), "e"); +} + +TEST_F(PhysicalPlanTest, TestProjectionPhysicalPlan) { + // Create a scan plan + auto scan_plan = std::make_shared(data_source_, 1, schema_); + + // Create projection expressions: [id, value] + std::vector> projections = { + std::make_shared(0, "id"), + std::make_shared(1, "value") + }; + + // Create a projection plan + auto projection_plan = std::make_shared(scan_plan, projections); + + // Test execution + auto batches = projection_plan->execute(); + ASSERT_EQ(batches.size(), 1); + + auto batch = batches[0]; + 
EXPECT_EQ(batch->num_rows(), 5); + EXPECT_EQ(batch->num_columns(), 2); + + // Verify projected data + auto id_array = std::static_pointer_cast(batch->column(0)); + auto value_array = std::static_pointer_cast(batch->column(1)); + + EXPECT_EQ(id_array->Value(0), 1); + EXPECT_EQ(value_array->GetString(0), "a"); + EXPECT_EQ(id_array->Value(4), 5); + EXPECT_EQ(value_array->GetString(4), "e"); +} + +TEST_F(PhysicalPlanTest, TestVectorSearchPhysicalPlan) { + // Create a scan plan + auto scan_plan = std::make_shared(data_source_, 1, schema_); + + // Create a query vector + std::vector query_vector = {1.0f, 2.0f, 3.0f}; + + // Create a vector search plan + auto vector_search_plan = std::make_shared( + data_source_, + 1, + query_vector, + 2, // k=2 + "L2" // L2 distance + ); + + // Add the scan plan as a child + vector_search_plan->add_child(scan_plan); + + // Test execution + auto batches = vector_search_plan->execute(); + // Note: The actual results will depend on the vector search implementation + // This test just verifies that the plan executes without errors + EXPECT_NO_THROW(vector_search_plan->execute()); +} + +TEST_F(PhysicalPlanTest, TestPlanToString) { + // Create a scan plan + auto scan_plan = std::make_shared(data_source_, 1, schema_); + EXPECT_EQ(scan_plan->to_string(), "PhysicalScan"); + + // Create a filter plan + auto filter_expr = std::make_shared( + std::make_shared(0, "id"), + std::make_shared(3), + BinaryExpr::Op::GT + ); + auto filter_plan = std::make_shared(scan_plan, filter_expr); + EXPECT_EQ(filter_plan->to_string(), "PhysicalFilter(id > 3)"); + + // Create a projection plan + std::vector> projections = { + std::make_shared(0, "id"), + std::make_shared(1, "value") + }; + auto projection_plan = std::make_shared(scan_plan, projections); + EXPECT_EQ(projection_plan->to_string(), "PhysicalProject(id, value)"); + + // Create a vector search plan + std::vector query_vector = {1.0f, 2.0f, 3.0f}; + auto vector_search_plan = std::make_shared( + data_source_, 
+ 1, + query_vector, + 2, + "L2" + ); + EXPECT_EQ(vector_search_plan->to_string(), "PhysicalVectorSearch(k=2, metric=L2)"); +} + +TEST_F(PhysicalPlanTest, TestPlanChildren) { + // Create a scan plan + auto scan_plan = std::make_shared(data_source_, 1, schema_); + EXPECT_EQ(scan_plan->children().size(), 0); + + // Create a filter plan with scan as child + auto filter_expr = std::make_shared( + std::make_shared(0, "id"), + std::make_shared(3), + BinaryExpr::Op::GT + ); + auto filter_plan = std::make_shared(scan_plan, filter_expr); + EXPECT_EQ(filter_plan->children().size(), 1); + EXPECT_EQ(filter_plan->children()[0], scan_plan); + + // Create a projection plan with filter as child + std::vector> projections = { + std::make_shared(0, "id"), + std::make_shared(1, "value") + }; + auto projection_plan = std::make_shared(filter_plan, projections); + EXPECT_EQ(projection_plan->children().size(), 1); + EXPECT_EQ(projection_plan->children()[0], filter_plan); +} \ No newline at end of file diff --git a/tests/query/physical/VectorSearchPhysicalPlanTest.cpp b/tests/query/physical/VectorSearchPhysicalPlanTest.cpp new file mode 100644 index 0000000..77a14eb --- /dev/null +++ b/tests/query/physical/VectorSearchPhysicalPlanTest.cpp @@ -0,0 +1,319 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lintdb/query/physical/VectorSearchPhysicalPlan.h" +#include "lintdb/query/physical/ScanPhysicalPlan.h" +#include "lintdb/schema/Schema.h" +#include "lintdb/datasources/DataSource.h" +#include "lintdb/datasources/DataSourceIterator.h" + +using namespace lintdb; +using namespace lintdb::query; + +class MockDataSource : public DataSource { +public: + std::vector> batches; + + Schema schema() const override { + return Schema({}); + } + + std::unique_ptr scan(uint64_t tenant_id) override { + return std::make_unique(batches); + } + + std::unique_ptr scan_prefix(const std::string& prefix) override { + return nullptr; + } + + std::string 
get(const std::string& key) override { + return ""; + } +}; + +class MockDataSourceIterator : public DataSourceIterator { +public: + explicit MockDataSourceIterator(const std::vector>& batches) + : batches_(batches), current_(0) {} + + bool has_next() override { + return current_ < batches_.size(); + } + + std::shared_ptr next() override { + if (!has_next()) { + return nullptr; + } + return batches_[current_++]; + } + +private: + std::vector> batches_; + size_t current_; +}; + +class VectorSearchPhysicalPlanTest : public ::testing::Test { +protected: + void SetUp() override { + // Create a schema with id, vector, and metadata columns + std::vector fields = { + Field("id", DataType::INT64, {}, {}), + Field("vector", DataType::FLOAT32, {FieldType::FIXED_SIZE_LIST}, {{"size", "3"}}), + Field("metadata", DataType::STRING, {}, {}) + }; + schema_ = std::make_shared(fields); + + // Create a mock data source with test data + data_source_ = std::make_shared(); + + // Create test data + arrow::Int64Builder id_builder; + arrow::StringBuilder metadata_builder; + + // Create vector data + std::vector vectors = { + 1.0f, 2.0f, 3.0f, // Vector 1 + 2.0f, 3.0f, 4.0f, // Vector 2 + 3.0f, 4.0f, 5.0f, // Vector 3 + 4.0f, 5.0f, 6.0f, // Vector 4 + 5.0f, 6.0f, 7.0f // Vector 5 + }; + + // Create fixed-size list array for vectors + auto vector_type = arrow::fixed_size_list(arrow::float32(), 3); + arrow::FixedSizeListBuilder vector_builder( + arrow::default_memory_pool(), + std::make_shared(), + vector_type + ); + + // Append vectors + for (size_t i = 0; i < 5; ++i) { + ASSERT_TRUE(vector_builder.Append().ok()); + auto value_builder = std::static_pointer_cast(vector_builder.value_builder()); + ASSERT_TRUE(value_builder->AppendValues(vectors.data() + i * 3, 3).ok()); + } + + // Append IDs and metadata + ASSERT_TRUE(id_builder.AppendValues({1, 2, 3, 4, 5}).ok()); + ASSERT_TRUE(metadata_builder.AppendValues({"a", "b", "c", "d", "e"}).ok()); + + // Finish building arrays + std::shared_ptr 
id_array; + std::shared_ptr vector_array; + std::shared_ptr metadata_array; + + ASSERT_TRUE(id_builder.Finish(&id_array).ok()); + ASSERT_TRUE(vector_builder.Finish(&vector_array).ok()); + ASSERT_TRUE(metadata_builder.Finish(&metadata_array).ok()); + + // Create record batch + auto batch = arrow::RecordBatch::Make( + schema_->to_arrow_schema(), + 5, + {id_array, vector_array, metadata_array} + ); + + data_source_->batches.push_back(batch); + } + + std::shared_ptr schema_; + std::shared_ptr data_source_; +}; + +TEST_F(VectorSearchPhysicalPlanTest, TestBasicVectorSearch) { + // Create a scan plan + auto scan_plan = std::make_shared(data_source_, 1, schema_); + + // Create a query vector + std::vector query_vector = {1.0f, 2.0f, 3.0f}; + + // Create a vector search plan + auto vector_search_plan = std::make_shared( + data_source_, + 1, + query_vector, + 2, // k=2 + "L2" // L2 distance + ); + + // Add the scan plan as a child + vector_search_plan->add_child(scan_plan); + + // Test execution + auto batches = vector_search_plan->execute(); + ASSERT_EQ(batches.size(), 1); + + auto batch = batches[0]; + EXPECT_EQ(batch->num_rows(), 2); // Should return top 2 results + EXPECT_EQ(batch->num_columns(), 3); // id, vector, metadata + + // Verify the results are ordered by distance + auto id_array = std::static_pointer_cast(batch->column(0)); + auto vector_array = std::static_pointer_cast(batch->column(1)); + auto metadata_array = std::static_pointer_cast(batch->column(2)); + + // First result should be closest to query vector + EXPECT_EQ(id_array->Value(0), 1); // First vector is closest + EXPECT_EQ(metadata_array->GetString(0), "a"); + + // Second result should be second closest + EXPECT_EQ(id_array->Value(1), 2); // Second vector is second closest + EXPECT_EQ(metadata_array->GetString(1), "b"); +} + +TEST_F(VectorSearchPhysicalPlanTest, TestDifferentKValues) { + auto scan_plan = std::make_shared(data_source_, 1, schema_); + std::vector query_vector = {1.0f, 2.0f, 3.0f}; + + // 
Test with k=1 + auto plan_k1 = std::make_shared( + data_source_, + 1, + query_vector, + 1, + "L2" + ); + plan_k1->add_child(scan_plan); + auto batches_k1 = plan_k1->execute(); + ASSERT_EQ(batches_k1.size(), 1); + EXPECT_EQ(batches_k1[0]->num_rows(), 1); + + // Test with k=3 + auto plan_k3 = std::make_shared( + data_source_, + 1, + query_vector, + 3, + "L2" + ); + plan_k3->add_child(scan_plan); + auto batches_k3 = plan_k3->execute(); + ASSERT_EQ(batches_k3.size(), 1); + EXPECT_EQ(batches_k3[0]->num_rows(), 3); + + // Test with k=5 (all vectors) + auto plan_k5 = std::make_shared( + data_source_, + 1, + query_vector, + 5, + "L2" + ); + plan_k5->add_child(scan_plan); + auto batches_k5 = plan_k5->execute(); + ASSERT_EQ(batches_k5.size(), 1); + EXPECT_EQ(batches_k5[0]->num_rows(), 5); +} + +TEST_F(VectorSearchPhysicalPlanTest, TestDifferentDistanceMetrics) { + auto scan_plan = std::make_shared(data_source_, 1, schema_); + std::vector query_vector = {1.0f, 2.0f, 3.0f}; + + // Test L2 distance + auto plan_l2 = std::make_shared( + data_source_, + 1, + query_vector, + 2, + "L2" + ); + plan_l2->add_child(scan_plan); + auto batches_l2 = plan_l2->execute(); + ASSERT_EQ(batches_l2.size(), 1); + + // Test cosine distance + auto plan_cosine = std::make_shared( + data_source_, + 1, + query_vector, + 2, + "cosine" + ); + plan_cosine->add_child(scan_plan); + auto batches_cosine = plan_cosine->execute(); + ASSERT_EQ(batches_cosine.size(), 1); + + // Results might be different for different metrics + auto id_array_l2 = std::static_pointer_cast(batches_l2[0]->column(0)); + auto id_array_cosine = std::static_pointer_cast(batches_cosine[0]->column(0)); + + // Note: We don't assert specific values here since the ordering might be different + // for different distance metrics + EXPECT_EQ(id_array_l2->length(), 2); + EXPECT_EQ(id_array_cosine->length(), 2); +} + +TEST_F(VectorSearchPhysicalPlanTest, TestEmptyResults) { + auto scan_plan = std::make_shared(data_source_, 1, schema_); + 
std::vector query_vector = {1.0f, 2.0f, 3.0f}; + + // Test with k=0 + auto plan_k0 = std::make_shared( + data_source_, + 1, + query_vector, + 0, + "L2" + ); + plan_k0->add_child(scan_plan); + auto batches_k0 = plan_k0->execute(); + ASSERT_EQ(batches_k0.size(), 1); + EXPECT_EQ(batches_k0[0]->num_rows(), 0); +} + +TEST_F(VectorSearchPhysicalPlanTest, TestInvalidInputs) { + auto scan_plan = std::make_shared(data_source_, 1, schema_); + + // Test with empty query vector + std::vector empty_vector; + auto plan_empty = std::make_shared( + data_source_, + 1, + empty_vector, + 2, + "L2" + ); + plan_empty->add_child(scan_plan); + EXPECT_THROW(plan_empty->execute(), std::runtime_error); + + // Test with invalid distance metric + std::vector query_vector = {1.0f, 2.0f, 3.0f}; + auto plan_invalid_metric = std::make_shared( + data_source_, + 1, + query_vector, + 2, + "invalid_metric" + ); + plan_invalid_metric->add_child(scan_plan); + EXPECT_THROW(plan_invalid_metric->execute(), std::runtime_error); +} + +TEST_F(VectorSearchPhysicalPlanTest, TestSchemaConsistency) { + auto scan_plan = std::make_shared(data_source_, 1, schema_); + std::vector query_vector = {1.0f, 2.0f, 3.0f}; + + auto vector_search_plan = std::make_shared( + data_source_, + 1, + query_vector, + 2, + "L2" + ); + vector_search_plan->add_child(scan_plan); + + // Verify that the schema is preserved + EXPECT_EQ(vector_search_plan->schema(), schema_); + + // Verify that the output batch has the same schema + auto batches = vector_search_plan->execute(); + ASSERT_EQ(batches.size(), 1); + EXPECT_TRUE(batches[0]->schema()->Equals(schema_->to_arrow_schema())); +} \ No newline at end of file diff --git a/third_party/faiss b/third_party/faiss new file mode 160000 index 0000000..dc55e11 --- /dev/null +++ b/third_party/faiss @@ -0,0 +1 @@ +Subproject commit dc55e11874e4e32d0f04a9d62156ebe2ca25192d diff --git a/third_party/tantivy/CMakeLists.txt b/third_party/tantivy/CMakeLists.txt new file mode 100644 index 0000000..f007a2a 
--- /dev/null +++ b/third_party/tantivy/CMakeLists.txt @@ -0,0 +1,110 @@ +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CARGO_CMD cargo build) + set(TARGET_DIR "debug") +else () + set(CARGO_CMD cargo build --release) + set(TARGET_DIR "release") +endif () + +set(TANTIVY_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}") +set(TANTIVY_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}") +set(TANTIVY_NAME "libtantivy_binding${CMAKE_STATIC_LIBRARY_SUFFIX}") + +set(LIB_FILE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_DIR}/${TANTIVY_NAME}") +set(LIB_HEADER_FOLDER "${CMAKE_CURRENT_SOURCE_DIR}/tantivy-binding/include") + +message(STATUS "TANTIVY_LIB_DIR: ${TANTIVY_LIB_DIR}") + +# In fact, cargo was already installed on our builder environment. +# Below settings are used to suit for first local development. +set(HOME_VAR $ENV{HOME}) +set(PATH_VAR $ENV{PATH}) +set(ENV{PATH} ${HOME_VAR}/.cargo/bin:${PATH_VAR}) +message($ENV{PATH}) + +add_custom_command(OUTPUT ls_cargo + COMMENT "ls cargo" + COMMAND ls ${HOME_VAR}/.cargo/bin/ + ) +add_custom_target(ls_cargo_target DEPENDS ls_cargo) + +add_custom_command(OUTPUT compile_tantivy + COMMENT "Compiling tantivy binding" + COMMAND CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR} ${CARGO_CMD} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tantivy-binding) +add_custom_target(tantivy_binding_target DEPENDS compile_tantivy ls_cargo_target) + +set(INSTALL_COMMAND + cp ${LIB_HEADER_FOLDER}/tantivy-binding.h ${TANTIVY_INCLUDE_DIR}/ && + cp ${CMAKE_CURRENT_SOURCE_DIR}/*.h ${TANTIVY_INCLUDE_DIR}/ && + cp ${LIB_FILE} ${TANTIVY_LIB_DIR}/) +add_custom_command(OUTPUT install_tantivy + COMMENT "Install tantivy target ${LIB_FILE} to ${TANTIVY_LIB_DIR}" + COMMAND ${INSTALL_COMMAND} + ) +add_custom_target(install_tantivy_target DEPENDS install_tantivy tantivy_binding_target) + +add_library(tantivy_binding STATIC IMPORTED) +add_dependencies(tantivy_binding + install_tantivy_target + ) + +set_target_properties(tantivy_binding + PROPERTIES + IMPORTED_GLOBAL TRUE + IMPORTED_LOCATION 
"${TANTIVY_LIB_DIR}/${TANTIVY_NAME}" + INTERFACE_INCLUDE_DIRECTORIES ${TANTIVY_INCLUDE_DIR}) # CMAKE_CURRENT_BINARY_DIR + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + add_compile_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address) + add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address) +endif() + +# TODO: move these below tests to ut. + +option(BUILD_TANTIVY_WITH_UT "compile tantivy with ut" OFF) + +if (BUILD_TANTIVY_WITH_UT) + message(STATUS "compile tantivy with ut") + + add_executable(test_tantivy test.cpp) + target_link_libraries(test_tantivy + tantivy_binding + boost_filesystem + dl + ) + + add_executable(bench_tantivy bench.cpp) + target_link_libraries(bench_tantivy + tantivy_binding + boost_filesystem + dl + ) + + add_executable(ffi_demo ffi_demo.cpp) + target_link_libraries(ffi_demo + tantivy_binding + dl + ) + + add_executable(tokenizer_demo tokenizer_demo.cpp) + target_link_libraries(tokenizer_demo + tantivy_binding + dl + ) + + add_executable(text_demo text_demo.cpp) + target_link_libraries(text_demo + tantivy_binding + dl + ) + + add_executable(jieba_demo jieba_demo.cpp) + target_link_libraries(jieba_demo + tantivy_binding + dl + ) +else () +endif () + +set( TANTIVY_INCLUDE_DIR ${LIB_HEADER_FOLDER};${CMAKE_CURRENT_SOURCE_DIR} CACHE INTERNAL "Path to tantivy include directory" ) diff --git a/third_party/tantivy/bench.cpp b/third_party/tantivy/bench.cpp new file mode 100644 index 0000000..c8136f7 --- /dev/null +++ b/third_party/tantivy/bench.cpp @@ -0,0 +1,65 @@ +#include +#include +#include +#include +#include + +#include "tantivy-binding.h" +#include "tantivy-wrapper.h" +#include "time_recorder.h" + +using namespace milvus::tantivy; + +void +build_index(size_t n = 1000000) { + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + + auto w = + TantivyIndexWrapper("test_field_name", 
TantivyDataType::Keyword, path); + + std::vector arr; + arr.reserve(n); + + std::default_random_engine er(42); + int64_t sample = 10000; + for (size_t i = 0; i < n; i++) { + auto x = er() % sample; + arr.push_back(std::to_string(x)); + } + + w.add_data(arr.data(), arr.size(), 0); + + w.finish(); + assert(w.count() == n); +} + +void +search(size_t repeat = 10) { + TimeRecorder tr("bench-tantivy-search"); + + auto path = "/tmp/inverted-index/test-binding/"; + assert(tantivy_index_exist(path)); + tr.RecordSection("check if index exist"); + + auto w = TantivyIndexWrapper(path); + auto cnt = w.count(); + tr.RecordSection("count num_entities"); + std::cout << "index already exist, open it, count: " << cnt << std::endl; + + for (size_t i = 0; i < repeat; i++) { + w.lower_bound_range_query(std::to_string(45), false); + tr.RecordSection("query"); + } + + tr.ElapseFromBegin("done"); +} + +int +main(int argc, char* argv[]) { + build_index(1000000); + search(10); + + return 0; +} diff --git a/third_party/tantivy/ffi_demo.cpp b/third_party/tantivy/ffi_demo.cpp new file mode 100644 index 0000000..1626d65 --- /dev/null +++ b/third_party/tantivy/ffi_demo.cpp @@ -0,0 +1,17 @@ +#include +#include + +#include "tantivy-binding.h" + +int +main(int argc, char* argv[]) { + std::vector data{"data1", "data2", "data3"}; + std::vector datas{}; + for (auto& s : data) { + datas.push_back(s.c_str()); + } + + print_vector_of_strings(datas.data(), datas.size()); + + return 0; +} diff --git a/third_party/tantivy/jieba_demo.cpp b/third_party/tantivy/jieba_demo.cpp new file mode 100644 index 0000000..0e8106b --- /dev/null +++ b/third_party/tantivy/jieba_demo.cpp @@ -0,0 +1,63 @@ +#include +#include +#include + +#include "tantivy-binding.h" +#include "tantivy-wrapper.h" + +using namespace milvus::tantivy; + +std::set +to_set(const RustArrayWrapper& w) { + std::set s(w.array_.array, w.array_.array + w.array_.len); + return s; +} + +int +main(int argc, char* argv[]) { + std::string tokenizer_name = 
"jieba"; + std::map tokenizer_params; + tokenizer_params["tokenizer"] = tokenizer_name; + + auto text_index = TantivyIndexWrapper( + "text_demo", true, "", tokenizer_name.c_str(), tokenizer_params); + auto write_single_text = [&text_index](const std::string& s, + int64_t offset) { + text_index.add_data(&s, 1, offset); + }; + + { + write_single_text( + "张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我" + "们都有光明的前途", + 0); + write_single_text("测试中文分词器的效果", 1); + write_single_text("黄金时代", 2); + write_single_text("青铜时代", 3); + text_index.commit(); + } + + text_index.create_reader(); + text_index.register_tokenizer(tokenizer_name.c_str(), tokenizer_params); + + { + auto result = to_set(text_index.match_query("北京")); + assert(result.size() == 1); + assert(result.find(0) != result.end()); + } + + { + auto result = to_set(text_index.match_query("效果")); + assert(result.size() == 1); + assert(result.find(1) != result.end()); + } + + { + auto result = to_set(text_index.match_query("时代")); + assert(result.size() == 2); + assert(result.find(2) != result.end()); + assert(result.find(3) != result.end()); + } + + return 0; +} diff --git a/third_party/tantivy/rust-array.h b/third_party/tantivy/rust-array.h new file mode 100644 index 0000000..ba9baec --- /dev/null +++ b/third_party/tantivy/rust-array.h @@ -0,0 +1,65 @@ +#pragma once + +#include +#include + +#include "tantivy-binding.h" +#include "rust-binding.h" + +namespace milvus::tantivy { + +struct RustArrayWrapper { + NO_COPY_OR_ASSIGN(RustArrayWrapper); + + explicit RustArrayWrapper(RustArray array) : array_(array) { + } + + RustArrayWrapper(RustArrayWrapper&& other) noexcept { + array_.array = other.array_.array; + array_.len = other.array_.len; + array_.cap = other.array_.cap; + other.array_.array = nullptr; + other.array_.len = 0; + other.array_.cap = 0; + } + + RustArrayWrapper& + operator=(RustArrayWrapper&& other) noexcept { + if (this != &other) { + free(); + array_.array = other.array_.array; + array_.len = other.array_.len; + 
array_.cap = other.array_.cap; + other.array_.array = nullptr; + other.array_.len = 0; + other.array_.cap = 0; + } + return *this; + } + + ~RustArrayWrapper() { + free(); + } + + void + debug() { + std::stringstream ss; + ss << "[ "; + for (int i = 0; i < array_.len; i++) { + ss << array_.array[i] << " "; + } + ss << "]"; + std::cout << ss.str() << std::endl; + } + + RustArray array_; + + private: + void + free() { + if (array_.array != nullptr) { + free_rust_array(array_); + } + } +}; +} // namespace milvus::tantivy diff --git a/third_party/tantivy/rust-binding.h b/third_party/tantivy/rust-binding.h new file mode 100644 index 0000000..b5001c7 --- /dev/null +++ b/third_party/tantivy/rust-binding.h @@ -0,0 +1,7 @@ +#pragma once + +namespace milvus::tantivy { +#define NO_COPY_OR_ASSIGN(ClassName) \ + ClassName(const ClassName&) = delete; \ + ClassName& operator=(const ClassName&) = delete; +} // namespace milvus::tantivy diff --git a/third_party/tantivy/rust-hashmap.h b/third_party/tantivy/rust-hashmap.h new file mode 100644 index 0000000..0376de9 --- /dev/null +++ b/third_party/tantivy/rust-hashmap.h @@ -0,0 +1,44 @@ +#pragma once + +#include + +#include "tantivy-binding.h" +#include "rust-binding.h" + +namespace milvus::tantivy { + +struct RustHashMap { + public: + NO_COPY_OR_ASSIGN(RustHashMap); + + RustHashMap() { + ptr_ = create_hashmap(); + } + + ~RustHashMap() { + if (ptr_ != nullptr) { + free_hashmap(ptr_); + } + } + + void + from(const std::map& m) { + for (const auto& [k, v] : m) { + set(k, v); + } + } + + void* + get_pointer() { + return ptr_; + } + + void + set(const std::string& k, const std::string& v) { + hashmap_set_value(ptr_, k.c_str(), v.c_str()); + } + + private: + void* ptr_ = nullptr; +}; +} // namespace milvus::tantivy diff --git a/third_party/tantivy/tantivy-binding/.gitignore b/third_party/tantivy/tantivy-binding/.gitignore new file mode 100644 index 0000000..39a92a7 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/.gitignore @@ -0,0 
+1,16 @@ +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +# Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb + +.vscode/ diff --git a/third_party/tantivy/tantivy-binding/Cargo.toml b/third_party/tantivy/tantivy-binding/Cargo.toml new file mode 100644 index 0000000..3bf9759 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "tantivy-binding" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tantivy = "=0.21.1" +futures = "0.3.21" +libc = "0.2" +scopeguard = "1.2" +zstd-sys = "=2.0.9" +env_logger = "0.11.3" +log = "0.4.21" +tantivy-jieba = "0.10.0" +lazy_static = "1.4.0" + +[build-dependencies] +cbindgen = "0.26.0" + +[lib] +crate-type = ["staticlib"] diff --git a/third_party/tantivy/tantivy-binding/build.rs b/third_party/tantivy/tantivy-binding/build.rs new file mode 100644 index 0000000..9d583e0 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/build.rs @@ -0,0 +1,12 @@ +use std::{env, path::PathBuf}; + +fn main() { + let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); + let package_name = env::var("CARGO_PKG_NAME").unwrap(); + let output_file = PathBuf::from(&crate_dir) + .join("include") + .join(format!("{}.h", package_name)); + cbindgen::generate(&crate_dir) + .unwrap() + .write_to_file(output_file); +} diff --git a/third_party/tantivy/tantivy-binding/cbindgen.toml b/third_party/tantivy/tantivy-binding/cbindgen.toml new file mode 100644 index 0000000..318f9b0 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/cbindgen.toml @@ -0,0 +1,2 @@ +language = "C++" 
+pragma_once = true diff --git a/third_party/tantivy/tantivy-binding/include/tantivy-binding.h b/third_party/tantivy/tantivy-binding/include/tantivy-binding.h new file mode 100644 index 0000000..c443ec7 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/include/tantivy-binding.h @@ -0,0 +1,166 @@ +#pragma once + +#include +#include +#include +#include +#include + +enum class TantivyDataType : uint8_t { + Text, + Keyword, + I64, + F64, + Bool, +}; + +struct RustArray { + uint32_t *array; + size_t len; + size_t cap; +}; + +extern "C" { + +void free_rust_array(RustArray array); + +void print_vector_of_strings(const char *const *ptr, uintptr_t len); + +void *create_hashmap(); + +void hashmap_set_value(void *map, const char *key, const char *value); + +void free_hashmap(void *map); + +void *tantivy_load_index(const char *path); + +void tantivy_free_index_reader(void *ptr); + +void tantivy_reload_index(void *ptr); + +uint32_t tantivy_index_count(void *ptr); + +RustArray tantivy_term_query_i64(void *ptr, int64_t term); + +RustArray tantivy_lower_bound_range_query_i64(void *ptr, int64_t lower_bound, bool inclusive); + +RustArray tantivy_upper_bound_range_query_i64(void *ptr, int64_t upper_bound, bool inclusive); + +RustArray tantivy_range_query_i64(void *ptr, + int64_t lower_bound, + int64_t upper_bound, + bool lb_inclusive, + bool ub_inclusive); + +RustArray tantivy_term_query_f64(void *ptr, double term); + +RustArray tantivy_lower_bound_range_query_f64(void *ptr, double lower_bound, bool inclusive); + +RustArray tantivy_upper_bound_range_query_f64(void *ptr, double upper_bound, bool inclusive); + +RustArray tantivy_range_query_f64(void *ptr, + double lower_bound, + double upper_bound, + bool lb_inclusive, + bool ub_inclusive); + +RustArray tantivy_term_query_bool(void *ptr, bool term); + +RustArray tantivy_term_query_keyword(void *ptr, const char *term); + +RustArray tantivy_lower_bound_range_query_keyword(void *ptr, + const char *lower_bound, + bool inclusive); + 
+RustArray tantivy_upper_bound_range_query_keyword(void *ptr, + const char *upper_bound, + bool inclusive); + +RustArray tantivy_range_query_keyword(void *ptr, + const char *lower_bound, + const char *upper_bound, + bool lb_inclusive, + bool ub_inclusive); + +RustArray tantivy_prefix_query_keyword(void *ptr, const char *prefix); + +RustArray tantivy_regex_query(void *ptr, const char *pattern); + +RustArray tantivy_match_query(void *ptr, const char *query); + +void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, void *tokenizer_params); + +void *tantivy_create_index(const char *field_name, + TantivyDataType data_type, + const char *path, + uintptr_t num_threads, + uintptr_t overall_memory_budget_in_bytes); + +void tantivy_free_index_writer(void *ptr); + +void tantivy_finish_index(void *ptr); + +void tantivy_commit_index(void *ptr); + +void *tantivy_create_reader_from_writer(void *ptr); + +void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len, int64_t offset_begin); + +void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len, int64_t offset_begin); + +void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len, int64_t offset_begin); + +void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len, int64_t offset_begin); + +void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len, int64_t offset_begin); + +void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len, int64_t offset_begin); + +void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len, int64_t offset_begin); + +void tantivy_index_add_string(void *ptr, const char *s, int64_t offset); + +void tantivy_index_add_multi_int8s(void *ptr, const int8_t *array, uintptr_t len, int64_t offset); + +void tantivy_index_add_multi_int16s(void *ptr, const int16_t *array, uintptr_t len, int64_t offset); + +void tantivy_index_add_multi_int32s(void *ptr, const int32_t *array, uintptr_t len, 
int64_t offset); + +void tantivy_index_add_multi_int64s(void *ptr, const int64_t *array, uintptr_t len, int64_t offset); + +void tantivy_index_add_multi_f32s(void *ptr, const float *array, uintptr_t len, int64_t offset); + +void tantivy_index_add_multi_f64s(void *ptr, const double *array, uintptr_t len, int64_t offset); + +void tantivy_index_add_multi_bools(void *ptr, const bool *array, uintptr_t len, int64_t offset); + +void tantivy_index_add_multi_keywords(void *ptr, + const char *const *array, + uintptr_t len, + int64_t offset); + +void *tantivy_create_text_writer(const char *field_name, + const char *path, + const char *tokenizer_name, + void *tokenizer_params, + uintptr_t num_threads, + uintptr_t overall_memory_budget_in_bytes, + bool in_ram); + +void free_rust_string(const char *ptr); + +void *tantivy_create_token_stream(void *tokenizer, const char *text); + +void tantivy_free_token_stream(void *token_stream); + +bool tantivy_token_stream_advance(void *token_stream); + +const char *tantivy_token_stream_get_token(void *token_stream); + +void *tantivy_create_tokenizer(void *tokenizer_params); + +void tantivy_free_tokenizer(void *tokenizer); + +bool tantivy_index_exist(const char *path); + +} // extern "C" diff --git a/third_party/tantivy/tantivy-binding/src/array.rs b/third_party/tantivy/tantivy-binding/src/array.rs new file mode 100644 index 0000000..9d71ffa --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/array.rs @@ -0,0 +1,29 @@ +use libc::size_t; + +#[repr(C)] +pub struct RustArray { + array: *mut u32, + len: size_t, + cap: size_t, +} + +impl RustArray { + pub fn from_vec(vec: Vec) -> RustArray { + let len = vec.len(); + let cap = vec.capacity(); + let v = vec.leak(); + RustArray { + array: v.as_mut_ptr(), + len, + cap, + } + } +} + +#[no_mangle] +pub extern "C" fn free_rust_array(array: RustArray) { + let RustArray { array, len, cap } = array; + unsafe { + Vec::from_raw_parts(array, len, cap); + } +} diff --git 
a/third_party/tantivy/tantivy-binding/src/data_type.rs b/third_party/tantivy/tantivy-binding/src/data_type.rs new file mode 100644 index 0000000..fc4672e --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/data_type.rs @@ -0,0 +1,9 @@ +#[repr(u8)] +pub enum TantivyDataType { + Text, + Keyword, + // U64, + I64, + F64, + Bool, +} diff --git a/third_party/tantivy/tantivy-binding/src/demo_c.rs b/third_party/tantivy/tantivy-binding/src/demo_c.rs new file mode 100644 index 0000000..af44121 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/demo_c.rs @@ -0,0 +1,13 @@ +use std::{ + ffi::{c_char, CStr}, + slice, +}; + +#[no_mangle] +pub extern "C" fn print_vector_of_strings(ptr: *const *const c_char, len: usize) { + let arr: &[*const c_char] = unsafe { slice::from_raw_parts(ptr, len) }; + for element in arr { + let c_str = unsafe { CStr::from_ptr(*element) }; + println!("{}", c_str.to_str().unwrap()); + } +} diff --git a/third_party/tantivy/tantivy-binding/src/docid_collector.rs b/third_party/tantivy/tantivy-binding/src/docid_collector.rs new file mode 100644 index 0000000..95d585b --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/docid_collector.rs @@ -0,0 +1,60 @@ +use tantivy::{ + collector::{Collector, SegmentCollector}, + fastfield::Column, + DocId, Score, SegmentOrdinal, SegmentReader, +}; + +pub(crate) struct DocIdCollector; + +impl Collector for DocIdCollector { + type Fruit = Vec; + type Child = DocIdChildCollector; + + fn for_segment( + &self, + _segment_local_id: SegmentOrdinal, + segment: &SegmentReader, + ) -> tantivy::Result { + Ok(DocIdChildCollector { + docs: Vec::new(), + column: segment.fast_fields().i64("doc_id").unwrap(), + }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits( + &self, + segment_fruits: Vec<::Fruit>, + ) -> tantivy::Result { + let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum(); + let mut result = Vec::with_capacity(len); + for docs in segment_fruits { + for doc in 
docs { + result.push(doc); + } + } + Ok(result) + } +} + +pub(crate) struct DocIdChildCollector { + docs: Vec, + column: Column, +} + +impl SegmentCollector for DocIdChildCollector { + type Fruit = Vec; + + fn collect(&mut self, doc: DocId, _score: Score) { + self.column.values_for_doc(doc).for_each(|doc_id| { + self.docs.push(doc_id as u32); + }) + } + + fn harvest(self) -> Self::Fruit { + self.docs + } +} diff --git a/third_party/tantivy/tantivy-binding/src/hashmap_c.rs b/third_party/tantivy/tantivy-binding/src/hashmap_c.rs new file mode 100644 index 0000000..8185a27 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/hashmap_c.rs @@ -0,0 +1,28 @@ +use std::collections::HashMap; +use std::ffi::CStr; +use std::os::raw::c_char; + +use libc::c_void; + +use crate::util::{create_binding, free_binding}; + +#[no_mangle] +pub extern "C" fn create_hashmap() -> *mut c_void { + let map: HashMap = HashMap::new(); + create_binding(map) +} + +#[no_mangle] +pub extern "C" fn hashmap_set_value(map: *mut c_void, key: *const c_char, value: *const c_char) { + let m = map as *mut HashMap; + let k = unsafe { CStr::from_ptr(key).to_str().unwrap() }; + let v = unsafe { CStr::from_ptr(value).to_str().unwrap() }; + unsafe { + (*m).insert(String::from(k), String::from(v)); + } +} + +#[no_mangle] +pub extern "C" fn free_hashmap(map: *mut c_void) { + free_binding::>(map); +} diff --git a/third_party/tantivy/tantivy-binding/src/index_reader.rs b/third_party/tantivy/tantivy-binding/src/index_reader.rs new file mode 100644 index 0000000..3ac5147 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/index_reader.rs @@ -0,0 +1,216 @@ +use std::ops::Bound; +use std::sync::Arc; + +use tantivy::query::{Query, RangeQuery, RegexQuery, TermQuery}; +use tantivy::schema::{Field, IndexRecordOption}; +use tantivy::{Index, IndexReader, ReloadPolicy, Term}; + +use crate::docid_collector::DocIdCollector; +use crate::log::init_log; +use crate::util::make_bounds; +use 
crate::vec_collector::VecCollector; + +pub(crate) struct IndexReaderWrapper { + pub(crate) field_name: String, + pub(crate) field: Field, + pub(crate) reader: IndexReader, + pub(crate) index: Arc, + pub(crate) id_field: Option, +} + +impl IndexReaderWrapper { + pub fn load(path: &str) -> IndexReaderWrapper { + init_log(); + + let index = Index::open_in_dir(path).unwrap(); + + IndexReaderWrapper::from_index(Arc::new(index)) + } + + pub fn from_index(index: Arc) -> IndexReaderWrapper { + let field = index.schema().fields().next().unwrap().0; + let schema = index.schema(); + let field_name = String::from(schema.get_field_name(field)); + let id_field: Option = match schema.get_field("doc_id") { + Ok(field) => Some(field), + Err(_) => None, + }; + + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::OnCommit) // OnCommit serve for growing segment. + .try_into() + .unwrap(); + reader.reload().unwrap(); + + IndexReaderWrapper { + field_name, + field, + reader, + index, + id_field, + } + } + + pub fn reload(&self) { + self.reader.reload().unwrap(); + } + + pub fn count(&self) -> u32 { + let metas = self.index.searchable_segment_metas().unwrap(); + let mut sum: u32 = 0; + for meta in metas { + sum += meta.max_doc(); + } + sum + } + + pub(crate) fn search(&self, q: &dyn Query) -> Vec { + let searcher = self.reader.searcher(); + match self.id_field { + Some(_) => { + // newer version with doc_id. + searcher.search(q, &DocIdCollector {}).unwrap() + } + None => { + // older version without doc_id, only one segment. 
+ searcher.search(q, &VecCollector {}).unwrap() + } + } + } + + pub fn term_query_i64(&self, term: i64) -> Vec { + let q = TermQuery::new( + Term::from_field_i64(self.field, term), + IndexRecordOption::Basic, + ); + self.search(&q) + } + + pub fn lower_bound_range_query_i64(&self, lower_bound: i64, inclusive: bool) -> Vec { + let q = RangeQuery::new_i64_bounds( + self.field_name.to_string(), + make_bounds(lower_bound, inclusive), + Bound::Unbounded, + ); + self.search(&q) + } + + pub fn upper_bound_range_query_i64(&self, upper_bound: i64, inclusive: bool) -> Vec { + let q = RangeQuery::new_i64_bounds( + self.field_name.to_string(), + Bound::Unbounded, + make_bounds(upper_bound, inclusive), + ); + self.search(&q) + } + + pub fn range_query_i64( + &self, + lower_bound: i64, + upper_bound: i64, + lb_inclusive: bool, + ub_inclusive: bool, + ) -> Vec { + let lb = make_bounds(lower_bound, lb_inclusive); + let ub = make_bounds(upper_bound, ub_inclusive); + let q = RangeQuery::new_i64_bounds(self.field_name.to_string(), lb, ub); + self.search(&q) + } + + pub fn term_query_f64(&self, term: f64) -> Vec { + let q = TermQuery::new( + Term::from_field_f64(self.field, term), + IndexRecordOption::Basic, + ); + self.search(&q) + } + + pub fn lower_bound_range_query_f64(&self, lower_bound: f64, inclusive: bool) -> Vec { + let q = RangeQuery::new_f64_bounds( + self.field_name.to_string(), + make_bounds(lower_bound, inclusive), + Bound::Unbounded, + ); + self.search(&q) + } + + pub fn upper_bound_range_query_f64(&self, upper_bound: f64, inclusive: bool) -> Vec { + let q = RangeQuery::new_f64_bounds( + self.field_name.to_string(), + Bound::Unbounded, + make_bounds(upper_bound, inclusive), + ); + self.search(&q) + } + + pub fn range_query_f64( + &self, + lower_bound: f64, + upper_bound: f64, + lb_inclusive: bool, + ub_inclusive: bool, + ) -> Vec { + let lb = make_bounds(lower_bound, lb_inclusive); + let ub = make_bounds(upper_bound, ub_inclusive); + let q = 
RangeQuery::new_f64_bounds(self.field_name.to_string(), lb, ub); + self.search(&q) + } + + pub fn term_query_bool(&self, term: bool) -> Vec { + let q = TermQuery::new( + Term::from_field_bool(self.field, term), + IndexRecordOption::Basic, + ); + self.search(&q) + } + + pub fn term_query_keyword(&self, term: &str) -> Vec { + let q = TermQuery::new( + Term::from_field_text(self.field, term), + IndexRecordOption::Basic, + ); + self.search(&q) + } + + pub fn lower_bound_range_query_keyword(&self, lower_bound: &str, inclusive: bool) -> Vec { + let q = RangeQuery::new_str_bounds( + self.field_name.to_string(), + make_bounds(lower_bound, inclusive), + Bound::Unbounded, + ); + self.search(&q) + } + + pub fn upper_bound_range_query_keyword(&self, upper_bound: &str, inclusive: bool) -> Vec { + let q = RangeQuery::new_str_bounds( + self.field_name.to_string(), + Bound::Unbounded, + make_bounds(upper_bound, inclusive), + ); + self.search(&q) + } + + pub fn range_query_keyword( + &self, + lower_bound: &str, + upper_bound: &str, + lb_inclusive: bool, + ub_inclusive: bool, + ) -> Vec { + let lb = make_bounds(lower_bound, lb_inclusive); + let ub = make_bounds(upper_bound, ub_inclusive); + let q = RangeQuery::new_str_bounds(self.field_name.to_string(), lb, ub); + self.search(&q) + } + + pub fn prefix_query_keyword(&self, prefix: &str) -> Vec { + let pattern = format!("{}(.|\n)*", prefix); + self.regex_query(&pattern) + } + + pub fn regex_query(&self, pattern: &str) -> Vec { + let q = RegexQuery::from_pattern(&pattern, self.field).unwrap(); + self.search(&q) + } +} diff --git a/third_party/tantivy/tantivy-binding/src/index_reader_c.rs b/third_party/tantivy/tantivy-binding/src/index_reader_c.rs new file mode 100644 index 0000000..60e6136 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/index_reader_c.rs @@ -0,0 +1,230 @@ +use std::ffi::{c_char, c_void, CStr}; + +use crate::{ + array::RustArray, + index_reader::IndexReaderWrapper, + util::{create_binding, free_binding}, + 
util_c::tantivy_index_exist, +}; + +#[no_mangle] +pub extern "C" fn tantivy_load_index(path: *const c_char) -> *mut c_void { + assert!(tantivy_index_exist(path)); + let path_str = unsafe { CStr::from_ptr(path) }; + let wrapper = IndexReaderWrapper::load(path_str.to_str().unwrap()); + create_binding(wrapper) +} + +#[no_mangle] +pub extern "C" fn tantivy_free_index_reader(ptr: *mut c_void) { + free_binding::(ptr); +} + +// -------------------------query-------------------- +#[no_mangle] +pub extern "C" fn tantivy_reload_index(ptr: *mut c_void) { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + (*real).reload(); + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_count(ptr: *mut c_void) -> u32 { + let real = ptr as *mut IndexReaderWrapper; + unsafe { (*real).count() } +} + +#[no_mangle] +pub extern "C" fn tantivy_term_query_i64(ptr: *mut c_void, term: i64) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).term_query_i64(term); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_lower_bound_range_query_i64( + ptr: *mut c_void, + lower_bound: i64, + inclusive: bool, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).lower_bound_range_query_i64(lower_bound, inclusive); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_upper_bound_range_query_i64( + ptr: *mut c_void, + upper_bound: i64, + inclusive: bool, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).upper_bound_range_query_i64(upper_bound, inclusive); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_range_query_i64( + ptr: *mut c_void, + lower_bound: i64, + upper_bound: i64, + lb_inclusive: bool, + ub_inclusive: bool, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).range_query_i64(lower_bound, upper_bound, lb_inclusive, ub_inclusive); + 
RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_term_query_f64(ptr: *mut c_void, term: f64) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).term_query_f64(term); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_lower_bound_range_query_f64( + ptr: *mut c_void, + lower_bound: f64, + inclusive: bool, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).lower_bound_range_query_f64(lower_bound, inclusive); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_upper_bound_range_query_f64( + ptr: *mut c_void, + upper_bound: f64, + inclusive: bool, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).upper_bound_range_query_f64(upper_bound, inclusive); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_range_query_f64( + ptr: *mut c_void, + lower_bound: f64, + upper_bound: f64, + lb_inclusive: bool, + ub_inclusive: bool, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).range_query_f64(lower_bound, upper_bound, lb_inclusive, ub_inclusive); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_term_query_bool(ptr: *mut c_void, term: bool) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let hits = (*real).term_query_bool(term); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_term_query_keyword(ptr: *mut c_void, term: *const c_char) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let c_str = CStr::from_ptr(term); + let hits = (*real).term_query_keyword(c_str.to_str().unwrap()); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_lower_bound_range_query_keyword( + ptr: *mut c_void, + lower_bound: *const c_char, + inclusive: bool, +) -> RustArray { + let real = ptr as *mut 
IndexReaderWrapper; + unsafe { + let c_lower_bound = CStr::from_ptr(lower_bound); + let hits = + (*real).lower_bound_range_query_keyword(c_lower_bound.to_str().unwrap(), inclusive); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_upper_bound_range_query_keyword( + ptr: *mut c_void, + upper_bound: *const c_char, + inclusive: bool, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let c_upper_bound = CStr::from_ptr(upper_bound); + let hits = + (*real).upper_bound_range_query_keyword(c_upper_bound.to_str().unwrap(), inclusive); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_range_query_keyword( + ptr: *mut c_void, + lower_bound: *const c_char, + upper_bound: *const c_char, + lb_inclusive: bool, + ub_inclusive: bool, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let c_lower_bound = CStr::from_ptr(lower_bound); + let c_upper_bound = CStr::from_ptr(upper_bound); + let hits = (*real).range_query_keyword( + c_lower_bound.to_str().unwrap(), + c_upper_bound.to_str().unwrap(), + lb_inclusive, + ub_inclusive, + ); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_prefix_query_keyword( + ptr: *mut c_void, + prefix: *const c_char, +) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let c_str = CStr::from_ptr(prefix); + let hits = (*real).prefix_query_keyword(c_str.to_str().unwrap()); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_regex_query(ptr: *mut c_void, pattern: *const c_char) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let c_str = CStr::from_ptr(pattern); + let hits = (*real).regex_query(c_str.to_str().unwrap()); + RustArray::from_vec(hits) + } +} diff --git a/third_party/tantivy/tantivy-binding/src/index_reader_text.rs b/third_party/tantivy/tantivy-binding/src/index_reader_text.rs new file mode 100644 index 0000000..654346f --- /dev/null +++ 
b/third_party/tantivy/tantivy-binding/src/index_reader_text.rs @@ -0,0 +1,32 @@ +use tantivy::{ + query::BooleanQuery, + tokenizer::{TextAnalyzer, TokenStream}, + Term, +}; + +use crate::{index_reader::IndexReaderWrapper, tokenizer::default_tokenizer}; + +impl IndexReaderWrapper { + // split the query string into tokens using the field's tokenizer (or the default tokenizer if none is found), + // and then execute the disjunction of the resulting term queries. + pub(crate) fn match_query(&self, q: &str) -> Vec { + // clone the tokenizer to make `match_query` thread-safe. + let mut tokenizer = self + .index + .tokenizer_for_field(self.field) + .unwrap_or(default_tokenizer()) + .clone(); + let mut token_stream = tokenizer.token_stream(q); + let mut terms: Vec = Vec::new(); + while token_stream.advance() { + let token = token_stream.token(); + terms.push(Term::from_field_text(self.field, &token.text)); + } + let query = BooleanQuery::new_multiterms_query(terms); + self.search(&query) + } + + pub(crate) fn register_tokenizer(&self, tokenizer_name: String, tokenizer: TextAnalyzer) { + self.index.tokenizers().register(&tokenizer_name, tokenizer) + } +} diff --git a/third_party/tantivy/tantivy-binding/src/index_reader_text_c.rs b/third_party/tantivy/tantivy-binding/src/index_reader_text_c.rs new file mode 100644 index 0000000..eb0653c --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/index_reader_text_c.rs @@ -0,0 +1,40 @@ +use std::{collections::HashMap, ffi::CStr}; + +use libc::{c_char, c_void}; + +use crate::{array::RustArray, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer}; + +#[no_mangle] +pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray { + let real = ptr as *mut IndexReaderWrapper; + unsafe { + let c_str = CStr::from_ptr(query); + let hits = (*real).match_query(c_str.to_str().unwrap()); + RustArray::from_vec(hits) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_register_tokenizer( + ptr: *mut c_void, + tokenizer_name: *const c_char, + 
tokenizer_params: *mut c_void, +) { + let real = ptr as *mut IndexReaderWrapper; + let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) }; + let analyzer = unsafe { + let m = tokenizer_params as *const HashMap; + create_tokenizer(&(*m)) + }; + match analyzer { + Some(text_analyzer) => unsafe { + (*real).register_tokenizer( + String::from(tokenizer_name_str.to_str().unwrap()), + text_analyzer, + ); + }, + None => { + panic!("unsupported tokenizer"); + } + } +} diff --git a/third_party/tantivy/tantivy-binding/src/index_writer.rs b/third_party/tantivy/tantivy-binding/src/index_writer.rs new file mode 100644 index 0000000..c466d1e --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/index_writer.rs @@ -0,0 +1,232 @@ +use std::ffi::CStr; +use std::sync::Arc; + +use futures::executor::block_on; +use libc::c_char; +use tantivy::schema::{ + Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED, +}; +use tantivy::{doc, tokenizer, Document, Index, IndexWriter}; + +use crate::data_type::TantivyDataType; + +use crate::index_reader::IndexReaderWrapper; +use crate::log::init_log; + +pub(crate) struct IndexWriterWrapper { + pub(crate) field: Field, + pub(crate) index_writer: IndexWriter, + pub(crate) id_field: Field, + pub(crate) index: Arc, +} + +impl IndexWriterWrapper { + pub fn new( + field_name: String, + data_type: TantivyDataType, + path: String, + num_threads: usize, + overall_memory_budget_in_bytes: usize, + ) -> IndexWriterWrapper { + init_log(); + + let field: Field; + let mut schema_builder = Schema::builder(); + let mut use_raw_tokenizer = false; + match data_type { + TantivyDataType::I64 => { + field = schema_builder.add_i64_field(&field_name, INDEXED); + } + TantivyDataType::F64 => { + field = schema_builder.add_f64_field(&field_name, INDEXED); + } + TantivyDataType::Bool => { + field = schema_builder.add_bool_field(&field_name, INDEXED); + } + TantivyDataType::Keyword => { + let text_field_indexing = 
TextFieldIndexing::default() + .set_tokenizer("raw_tokenizer") + .set_index_option(IndexRecordOption::Basic); + let text_options = TextOptions::default().set_indexing_options(text_field_indexing); + field = schema_builder.add_text_field(&field_name, text_options); + use_raw_tokenizer = true; + } + TantivyDataType::Text => { + panic!("text should be indexed with analyzer"); + } + } + let id_field = schema_builder.add_i64_field("doc_id", FAST); + let schema = schema_builder.build(); + let index = Index::create_in_dir(path.clone(), schema).unwrap(); + if use_raw_tokenizer { + index + .tokenizers() + .register("raw_tokenizer", tokenizer::RawTokenizer::default()); + } + let index_writer = index + .writer_with_num_threads(num_threads, overall_memory_budget_in_bytes) + .unwrap(); + IndexWriterWrapper { + field, + index_writer, + id_field, + index: Arc::new(index), + } + } + + pub fn create_reader(&self) -> IndexReaderWrapper { + IndexReaderWrapper::from_index(self.index.clone()) + } + + pub fn add_i8(&mut self, data: i8, offset: i64) { + self.add_i64(data.into(), offset) + } + + pub fn add_i16(&mut self, data: i16, offset: i64) { + self.add_i64(data.into(), offset) + } + + pub fn add_i32(&mut self, data: i32, offset: i64) { + self.add_i64(data.into(), offset) + } + + pub fn add_i64(&mut self, data: i64, offset: i64) { + self.index_writer + .add_document(doc!( + self.field => data, + self.id_field => offset, + )) + .unwrap(); + } + + pub fn add_f32(&mut self, data: f32, offset: i64) { + self.add_f64(data.into(), offset) + } + + pub fn add_f64(&mut self, data: f64, offset: i64) { + self.index_writer + .add_document(doc!( + self.field => data, + self.id_field => offset, + )) + .unwrap(); + } + + pub fn add_bool(&mut self, data: bool, offset: i64) { + self.index_writer + .add_document(doc!( + self.field => data, + self.id_field => offset, + )) + .unwrap(); + } + + pub fn add_string(&mut self, data: &str, offset: i64) { + self.index_writer + .add_document(doc!( + self.field => 
data, + self.id_field => offset, + )) + .unwrap(); + } + + pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) { + let mut document = Document::default(); + for data in datas { + document.add_field_value(self.field, *data as i64); + } + document.add_i64(self.id_field, offset); + self.index_writer.add_document(document).unwrap(); + } + + pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) { + let mut document = Document::default(); + for data in datas { + document.add_field_value(self.field, *data as i64); + } + document.add_i64(self.id_field, offset); + self.index_writer.add_document(document).unwrap(); + } + + pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) { + let mut document = Document::default(); + for data in datas { + document.add_field_value(self.field, *data as i64); + } + document.add_i64(self.id_field, offset); + self.index_writer.add_document(document).unwrap(); + } + + pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) { + let mut document = Document::default(); + for data in datas { + document.add_field_value(self.field, *data); + } + document.add_i64(self.id_field, offset); + self.index_writer.add_document(document).unwrap(); + } + + pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) { + let mut document = Document::default(); + for data in datas { + document.add_field_value(self.field, *data as f64); + } + document.add_i64(self.id_field, offset); + self.index_writer.add_document(document).unwrap(); + } + + pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) { + let mut document = Document::default(); + for data in datas { + document.add_field_value(self.field, *data); + } + document.add_i64(self.id_field, offset); + self.index_writer.add_document(document).unwrap(); + } + + pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) { + let mut document = Document::default(); + for data in datas { + document.add_field_value(self.field, *data); + } + document.add_i64(self.id_field, offset); 
+ self.index_writer.add_document(document).unwrap(); + } + + pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) { + let mut document = Document::default(); + for element in datas { + let data = unsafe { CStr::from_ptr(*element) }; + document.add_field_value(self.field, data.to_str().unwrap()); + } + document.add_i64(self.id_field, offset); + self.index_writer.add_document(document).unwrap(); + } + + fn manual_merge(&mut self) { + let metas = self + .index_writer + .index() + .searchable_segment_metas() + .unwrap(); + let policy = self.index_writer.get_merge_policy(); + let candidates = policy.compute_merge_candidates(metas.as_slice()); + for candidate in candidates { + self.index_writer + .merge(candidate.0.as_slice()) + .wait() + .unwrap(); + } + } + + pub fn finish(mut self) { + self.index_writer.commit().unwrap(); + // self.manual_merge(); + block_on(self.index_writer.garbage_collect_files()).unwrap(); + self.index_writer.wait_merging_threads().unwrap(); + } + + pub(crate) fn commit(&mut self) { + self.index_writer.commit().unwrap(); + } +} diff --git a/third_party/tantivy/tantivy-binding/src/index_writer_c.rs b/third_party/tantivy/tantivy-binding/src/index_writer_c.rs new file mode 100644 index 0000000..9cb81d7 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/index_writer_c.rs @@ -0,0 +1,292 @@ +use core::slice; +use std::ffi::{c_char, c_void, CStr}; + +use crate::{ + data_type::TantivyDataType, + index_writer::IndexWriterWrapper, + util::{create_binding, free_binding}, +}; + +#[no_mangle] +pub extern "C" fn tantivy_create_index( + field_name: *const c_char, + data_type: TantivyDataType, + path: *const c_char, + num_threads: usize, + overall_memory_budget_in_bytes: usize, +) -> *mut c_void { + let field_name_str = unsafe { CStr::from_ptr(field_name) }; + let path_str = unsafe { CStr::from_ptr(path) }; + let wrapper = IndexWriterWrapper::new( + String::from(field_name_str.to_str().unwrap()), + data_type, + 
String::from(path_str.to_str().unwrap()), + num_threads, + overall_memory_budget_in_bytes, + ); + create_binding(wrapper) +} + +#[no_mangle] +pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) { + free_binding::(ptr); +} + +// tantivy_finish_index will finish the index writer, and the index writer can't be used any more. +// After this was called, you should reset the pointer to null. +#[no_mangle] +pub extern "C" fn tantivy_finish_index(ptr: *mut c_void) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { Box::from_raw(real).finish() } +} + +#[no_mangle] +pub extern "C" fn tantivy_commit_index(ptr: *mut c_void) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { + (*real).commit(); + } +} + +#[no_mangle] +pub extern "C" fn tantivy_create_reader_from_writer(ptr: *mut c_void) -> *mut c_void { + let writer = ptr as *mut IndexWriterWrapper; + let reader = unsafe { (*writer).create_reader() }; + create_binding(reader) +} + +// -------------------------build-------------------- +#[no_mangle] +pub extern "C" fn tantivy_index_add_int8s( + ptr: *mut c_void, + array: *const i8, + len: usize, + offset_begin: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + let arr = unsafe { slice::from_raw_parts(array, len) }; + unsafe { + for (index, data) in arr.iter().enumerate() { + (*real).add_i8(*data, offset_begin + (index as i64)); + } + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_int16s( + ptr: *mut c_void, + array: *const i16, + len: usize, + offset_begin: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + let arr = unsafe { slice::from_raw_parts(array, len) }; + unsafe { + for (index, data) in arr.iter().enumerate() { + (*real).add_i16(*data, offset_begin + (index as i64)); + } + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_int32s( + ptr: *mut c_void, + array: *const i32, + len: usize, + offset_begin: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + let arr = unsafe { slice::from_raw_parts(array, len) }; + 
unsafe { + for (index, data) in arr.iter().enumerate() { + (*real).add_i32(*data, offset_begin + (index as i64)); + } + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_int64s( + ptr: *mut c_void, + array: *const i64, + len: usize, + offset_begin: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + let arr = unsafe { slice::from_raw_parts(array, len) }; + unsafe { + for (index, data) in arr.iter().enumerate() { + (*real).add_i64(*data, offset_begin + (index as i64)); + } + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_f32s( + ptr: *mut c_void, + array: *const f32, + len: usize, + offset_begin: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + let arr = unsafe { slice::from_raw_parts(array, len) }; + unsafe { + for (index, data) in arr.iter().enumerate() { + (*real).add_f32(*data, offset_begin + (index as i64)); + } + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_f64s( + ptr: *mut c_void, + array: *const f64, + len: usize, + offset_begin: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + let arr = unsafe { slice::from_raw_parts(array, len) }; + unsafe { + for (index, data) in arr.iter().enumerate() { + (*real).add_f64(*data, offset_begin + (index as i64)); + } + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_bools( + ptr: *mut c_void, + array: *const bool, + len: usize, + offset_begin: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + let arr = unsafe { slice::from_raw_parts(array, len) }; + unsafe { + for (index, data) in arr.iter().enumerate() { + (*real).add_bool(*data, offset_begin + (index as i64)); + } + } +} + +// TODO: this is not a very efficient way, since we must call this function many times, which +// will bring a lot of overhead caused by the rust binding. 
+#[no_mangle] +pub extern "C" fn tantivy_index_add_string(ptr: *mut c_void, s: *const c_char, offset: i64) { + let real = ptr as *mut IndexWriterWrapper; + let c_str = unsafe { CStr::from_ptr(s) }; + unsafe { (*real).add_string(c_str.to_str().unwrap(), offset) } +} + +// --------------------------------------------- array ------------------------------------------ + +#[no_mangle] +pub extern "C" fn tantivy_index_add_multi_int8s( + ptr: *mut c_void, + array: *const i8, + len: usize, + offset: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { + let arr = slice::from_raw_parts(array, len); + (*real).add_multi_i8s(arr, offset) + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_multi_int16s( + ptr: *mut c_void, + array: *const i16, + len: usize, + offset: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { + let arr = slice::from_raw_parts(array, len); + (*real).add_multi_i16s(arr, offset); + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_multi_int32s( + ptr: *mut c_void, + array: *const i32, + len: usize, + offset: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { + let arr = slice::from_raw_parts(array, len); + (*real).add_multi_i32s(arr, offset); + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_multi_int64s( + ptr: *mut c_void, + array: *const i64, + len: usize, + offset: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { + let arr = slice::from_raw_parts(array, len); + (*real).add_multi_i64s(arr, offset); + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_multi_f32s( + ptr: *mut c_void, + array: *const f32, + len: usize, + offset: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { + let arr = slice::from_raw_parts(array, len); + (*real).add_multi_f32s(arr, offset); + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_multi_f64s( + ptr: *mut c_void, + array: *const f64, + len: usize, + offset: i64, +) { + let real = ptr as *mut 
IndexWriterWrapper; + unsafe { + let arr = slice::from_raw_parts(array, len); + (*real).add_multi_f64s(arr, offset); + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_multi_bools( + ptr: *mut c_void, + array: *const bool, + len: usize, + offset: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { + let arr = slice::from_raw_parts(array, len); + (*real).add_multi_bools(arr, offset); + } +} + +#[no_mangle] +pub extern "C" fn tantivy_index_add_multi_keywords( + ptr: *mut c_void, + array: *const *const c_char, + len: usize, + offset: i64, +) { + let real = ptr as *mut IndexWriterWrapper; + unsafe { + let arr = slice::from_raw_parts(array, len); + (*real).add_multi_keywords(arr, offset) + } +} diff --git a/third_party/tantivy/tantivy-binding/src/index_writer_text.rs b/third_party/tantivy/tantivy-binding/src/index_writer_text.rs new file mode 100644 index 0000000..923fb6e --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/index_writer_text.rs @@ -0,0 +1,52 @@ +use std::sync::Arc; + +use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST}; +use tantivy::tokenizer::TextAnalyzer; +use tantivy::Index; + +use crate::{index_writer::IndexWriterWrapper, log::init_log}; + +fn build_text_schema(field_name: &String, tokenizer_name: &String) -> (Schema, Field, Field) { + let mut schema_builder = Schema::builder(); + // positions is required for matching phase. 
+ let indexing = TextFieldIndexing::default() + .set_tokenizer(&tokenizer_name) + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let option = TextOptions::default().set_indexing_options(indexing); + let field = schema_builder.add_text_field(&field_name, option); + let id_field = schema_builder.add_i64_field("doc_id", FAST); + (schema_builder.build(), field, id_field) +} + +impl IndexWriterWrapper { + pub(crate) fn create_text_writer( + field_name: String, + path: String, + tokenizer_name: String, + tokenizer: TextAnalyzer, + num_threads: usize, + overall_memory_budget_in_bytes: usize, + in_ram: bool, + ) -> IndexWriterWrapper { + init_log(); + + let (schema, field, id_field) = build_text_schema(&field_name, &tokenizer_name); + let index: Index; + if in_ram { + index = Index::create_in_ram(schema); + } else { + index = Index::create_in_dir(path.clone(), schema).unwrap(); + } + index.tokenizers().register(&tokenizer_name, tokenizer); + let index_writer = index + .writer_with_num_threads(num_threads, overall_memory_budget_in_bytes) + .unwrap(); + + IndexWriterWrapper { + field, + index_writer, + id_field, + index: Arc::new(index), + } + } +} diff --git a/third_party/tantivy/tantivy-binding/src/index_writer_text_c.rs b/third_party/tantivy/tantivy-binding/src/index_writer_text_c.rs new file mode 100644 index 0000000..1ca70ac --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/index_writer_text_c.rs @@ -0,0 +1,44 @@ +use std::collections::HashMap; +use std::ffi::c_char; +use std::ffi::c_void; +use std::ffi::CStr; + +use crate::index_writer::IndexWriterWrapper; +use crate::tokenizer::create_tokenizer; +use crate::util::create_binding; + +#[no_mangle] +pub extern "C" fn tantivy_create_text_writer( + field_name: *const c_char, + path: *const c_char, + tokenizer_name: *const c_char, + tokenizer_params: *mut c_void, + num_threads: usize, + overall_memory_budget_in_bytes: usize, + in_ram: bool, +) -> *mut c_void { + let field_name_str = unsafe { 
CStr::from_ptr(field_name).to_str().unwrap() }; + let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() }; + let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() }; + let analyzer = unsafe { + let m = tokenizer_params as *const HashMap; + create_tokenizer(&(*m)) + }; + match analyzer { + Some(text_analyzer) => { + let wrapper = IndexWriterWrapper::create_text_writer( + String::from(field_name_str), + String::from(path_str), + String::from(tokenizer_name_str), + text_analyzer, + num_threads, + overall_memory_budget_in_bytes, + in_ram, + ); + create_binding(wrapper) + } + None => { + std::ptr::null_mut() + } + } +} diff --git a/third_party/tantivy/tantivy-binding/src/lib.rs b/third_party/tantivy/tantivy-binding/src/lib.rs new file mode 100644 index 0000000..fd73108 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/lib.rs @@ -0,0 +1,36 @@ +mod array; +mod data_type; +mod demo_c; +mod docid_collector; +mod hashmap_c; +mod index_reader; +mod index_reader_c; +mod index_reader_text; +mod index_reader_text_c; +mod index_writer; +mod index_writer_c; +mod index_writer_text; +mod index_writer_text_c; +mod log; +mod string_c; +mod token_stream_c; +mod tokenizer; +mod tokenizer_c; +mod util; +mod util_c; +mod vec_collector; + +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} diff --git a/third_party/tantivy/tantivy-binding/src/log.rs b/third_party/tantivy/tantivy-binding/src/log.rs new file mode 100644 index 0000000..112fa86 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/log.rs @@ -0,0 +1,10 @@ +use env_logger::Env; +use std::sync::Once; + +pub(crate) fn init_log() { + static _INITIALIZED: Once = Once::new(); + _INITIALIZED.call_once(|| { + let _env = Env::default().filter_or("MY_LOG_LEVEL", "info"); + env_logger::init_from_env(_env); + }); +} diff --git 
a/third_party/tantivy/tantivy-binding/src/string_c.rs b/third_party/tantivy/tantivy-binding/src/string_c.rs new file mode 100644 index 0000000..fc1c1ea --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/string_c.rs @@ -0,0 +1,22 @@ +use std::ffi::{CStr, CString}; + +use libc::c_char; + +use std::str; + +// Use this function with care: the returned str borrows the input buffer, which must not be freed while the str is in use. +pub(crate) unsafe fn c_str_to_str<'a>(s: *const c_char) -> &'a str { + let rs = CStr::from_ptr(s); + str::from_utf8_unchecked(rs.to_bytes()) +} + +pub(crate) fn create_string(s: &str) -> *const c_char { + CString::new(s).unwrap().into_raw() +} + +#[no_mangle] +pub extern "C" fn free_rust_string(ptr: *const c_char) { + unsafe { + let _ = CString::from_raw(ptr as *mut c_char); + } +} diff --git a/third_party/tantivy/tantivy-binding/src/token_stream_c.rs b/third_party/tantivy/tantivy-binding/src/token_stream_c.rs new file mode 100644 index 0000000..810b1ae --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/token_stream_c.rs @@ -0,0 +1,40 @@ +use std::ffi::c_char; + +use libc::c_void; +use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer}; + +use crate::string_c::c_str_to_str; +use crate::{ + string_c::create_string, + util::{create_binding, free_binding}, +}; + +// Note: the tokenizer and text must be released after the token_stream. 
+#[no_mangle] +pub extern "C" fn tantivy_create_token_stream( + tokenizer: *mut c_void, + text: *const c_char, +) -> *mut c_void { + let analyzer = tokenizer as *mut TextAnalyzer; + let token_stream = unsafe { (*analyzer).token_stream(c_str_to_str(text)) }; + create_binding(token_stream) +} + +#[no_mangle] +pub extern "C" fn tantivy_free_token_stream(token_stream: *mut c_void) { + free_binding::>(token_stream); +} + +#[no_mangle] +pub extern "C" fn tantivy_token_stream_advance(token_stream: *mut c_void) -> bool { + let real = token_stream as *mut BoxTokenStream<'_>; + unsafe { (*real).advance() } +} + +// Note: the returned token should be released by calling `free_rust_string` after use. +#[no_mangle] +pub extern "C" fn tantivy_token_stream_get_token(token_stream: *mut c_void) -> *const c_char { + let real = token_stream as *mut BoxTokenStream<'_>; + let token = unsafe { (*real).token().text.as_str() }; + create_string(token) +} diff --git a/third_party/tantivy/tantivy-binding/src/tokenizer.rs b/third_party/tantivy/tantivy-binding/src/tokenizer.rs new file mode 100644 index 0000000..9a1d34b --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/tokenizer.rs @@ -0,0 +1,55 @@ +use lazy_static::lazy_static; +use log::{info, warn}; +use std::collections::HashMap; +use tantivy::tokenizer::{TextAnalyzer, TokenizerManager}; +use crate::log::init_log; + +lazy_static! 
{ + static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default(); +} + +pub(crate) fn default_tokenizer() -> TextAnalyzer { + DEFAULT_TOKENIZER_MANAGER.get("default").unwrap() +} + +fn jieba_tokenizer() -> TextAnalyzer { + tantivy_jieba::JiebaTokenizer {}.into() +} + +pub(crate) fn create_tokenizer(params: &HashMap) -> Option { + init_log(); + + match params.get("tokenizer") { + Some(tokenizer_name) => match tokenizer_name.as_str() { + "default" => { + Some(default_tokenizer()) + } + "jieba" => { + Some(jieba_tokenizer()) + } + s => { + warn!("unsupported tokenizer: {}", s); + None + } + }, + None => { + info!("no tokenizer is specific, use default tokenizer"); + Some(default_tokenizer()) + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use crate::tokenizer::create_tokenizer; + + #[test] + fn test_create_tokenizer() { + let mut params : HashMap = HashMap::new(); + params.insert("tokenizer".parse().unwrap(), "jieba".parse().unwrap()); + + let tokenizer = create_tokenizer(¶ms); + assert!(tokenizer.is_some()); + } +} diff --git a/third_party/tantivy/tantivy-binding/src/tokenizer_c.rs b/third_party/tantivy/tantivy-binding/src/tokenizer_c.rs new file mode 100644 index 0000000..c2caf09 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/tokenizer_c.rs @@ -0,0 +1,26 @@ +use std::collections::HashMap; + +use libc::c_void; +use tantivy::tokenizer::TextAnalyzer; + +use crate::{ + tokenizer::create_tokenizer, + util::{create_binding, free_binding}, +}; + +#[no_mangle] +pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void { + let analyzer = unsafe { + let m = tokenizer_params as *const HashMap; + create_tokenizer(&(*m)) + }; + match analyzer { + Some(text_analyzer) => create_binding(text_analyzer), + None => std::ptr::null_mut(), + } +} + +#[no_mangle] +pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) { + free_binding::(tokenizer); +} diff --git 
a/third_party/tantivy/tantivy-binding/src/util.rs b/third_party/tantivy/tantivy-binding/src/util.rs new file mode 100644 index 0000000..1f1c165 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/util.rs @@ -0,0 +1,30 @@ +use std::ffi::c_void; +use std::ops::Bound; + +use tantivy::{directory::MmapDirectory, Index}; + +pub fn index_exist(path: &str) -> bool { + let dir = MmapDirectory::open(path).unwrap(); + Index::exists(&dir).unwrap() +} + +pub fn make_bounds(bound: T, inclusive: bool) -> Bound { + if inclusive { + Bound::Included(bound) + } else { + Bound::Excluded(bound) + } +} + +pub fn create_binding(wrapper: T) -> *mut c_void { + let bp = Box::new(wrapper); + let p_heap: *mut T = Box::into_raw(bp); + p_heap as *mut c_void +} + +pub fn free_binding(ptr: *mut c_void) { + let real = ptr as *mut T; + unsafe { + drop(Box::from_raw(real)); + } +} diff --git a/third_party/tantivy/tantivy-binding/src/util_c.rs b/third_party/tantivy/tantivy-binding/src/util_c.rs new file mode 100644 index 0000000..cc35e0c --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/util_c.rs @@ -0,0 +1,9 @@ +use std::ffi::{c_char, CStr}; + +use crate::util::index_exist; + +#[no_mangle] +pub extern "C" fn tantivy_index_exist(path: *const c_char) -> bool { + let path_str = unsafe { CStr::from_ptr(path) }; + index_exist(path_str.to_str().unwrap()) +} diff --git a/third_party/tantivy/tantivy-binding/src/vec_collector.rs b/third_party/tantivy/tantivy-binding/src/vec_collector.rs new file mode 100644 index 0000000..73299f2 --- /dev/null +++ b/third_party/tantivy/tantivy-binding/src/vec_collector.rs @@ -0,0 +1,60 @@ +use log::warn; +use tantivy::{ + collector::{Collector, SegmentCollector}, + DocId, +}; + +pub struct VecCollector; + +impl Collector for VecCollector { + type Fruit = Vec; + + type Child = VecChildCollector; + + fn for_segment( + &self, + _segment_local_id: tantivy::SegmentOrdinal, + _segment: &tantivy::SegmentReader, + ) -> tantivy::Result { + Ok(VecChildCollector { 
docs: Vec::new() }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits(&self, segment_fruits: Vec>) -> tantivy::Result> { + if segment_fruits.len() == 1 { + Ok(segment_fruits.into_iter().next().unwrap()) + } else { + warn!( + "inverted index should have only one segment, but got {} segments", + segment_fruits.len() + ); + let len: usize = segment_fruits.iter().map(|docset| docset.len()).sum(); + let mut result = Vec::with_capacity(len); + for docs in segment_fruits { + for doc in docs { + result.push(doc); + } + } + Ok(result) + } + } +} + +pub struct VecChildCollector { + docs: Vec, +} + +impl SegmentCollector for VecChildCollector { + type Fruit = Vec; + + fn collect(&mut self, doc: DocId, _score: tantivy::Score) { + self.docs.push(doc); + } + + fn harvest(self) -> Self::Fruit { + self.docs + } +} diff --git a/third_party/tantivy/tantivy-wrapper.h b/third_party/tantivy/tantivy-wrapper.h new file mode 100644 index 0000000..17822d1 --- /dev/null +++ b/third_party/tantivy/tantivy-wrapper.h @@ -0,0 +1,469 @@ +#include +#include +#include +#include +#include +#include + +#include "tantivy-binding.h" +#include "rust-binding.h" +#include "rust-array.h" +#include "rust-hashmap.h" + +namespace milvus::tantivy { +using Map = std::map; + +static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer"; +static Map DEFAULT_TOKENIZER_PARAMS = {}; +static constexpr uintptr_t DEFAULT_NUM_THREADS = 4; +static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES = + DEFAULT_NUM_THREADS * 15 * 1024 * 1024; + +template +inline TantivyDataType +guess_data_type() { + if constexpr (std::is_same_v) { + return TantivyDataType::Bool; + } + + if constexpr (std::is_integral_v) { + return TantivyDataType::I64; + } + + if constexpr (std::is_floating_point_v) { + return TantivyDataType::F64; + } + + throw fmt::format("guess_data_type: unsupported data type: {}", + typeid(T).name()); +} + +// TODO: should split this into IndexWriter & IndexReader. 
+struct TantivyIndexWrapper { + using IndexWriter = void*; + using IndexReader = void*; + + NO_COPY_OR_ASSIGN(TantivyIndexWrapper); + + TantivyIndexWrapper() = default; + + TantivyIndexWrapper(TantivyIndexWrapper&& other) noexcept { + writer_ = other.writer_; + reader_ = other.reader_; + finished_ = other.finished_; + path_ = other.path_; + other.writer_ = nullptr; + other.reader_ = nullptr; + other.finished_ = false; + other.path_ = ""; + } + + TantivyIndexWrapper& + operator=(TantivyIndexWrapper&& other) noexcept { + if (this != &other) { + free(); + writer_ = other.writer_; + reader_ = other.reader_; + path_ = other.path_; + finished_ = other.finished_; + other.writer_ = nullptr; + other.reader_ = nullptr; + other.finished_ = false; + other.path_ = ""; + } + return *this; + } + + // create index writer for non-text type. + TantivyIndexWrapper(const char* field_name, + TantivyDataType data_type, + const char* path, + uintptr_t num_threads = DEFAULT_NUM_THREADS, + uintptr_t overall_memory_budget_in_bytes = + DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) { + writer_ = tantivy_create_index(field_name, + data_type, + path, + num_threads, + overall_memory_budget_in_bytes); + path_ = std::string(path); + } + + // load index. create index reader. + explicit TantivyIndexWrapper(const char* path) { + assert(tantivy_index_exist(path)); + reader_ = tantivy_load_index(path); + path_ = std::string(path); + } + + // create index writer for text type with tokenizer. 
+ TantivyIndexWrapper(const char* field_name, + bool in_ram, + const char* path, + const char* tokenizer_name = DEFAULT_TOKENIZER_NAME, + const std::map& + tokenizer_params = DEFAULT_TOKENIZER_PARAMS, + uintptr_t num_threads = DEFAULT_NUM_THREADS, + uintptr_t overall_memory_budget_in_bytes = + DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) { + RustHashMap m; + m.from(tokenizer_params); + writer_ = tantivy_create_text_writer(field_name, + path, + tokenizer_name, + m.get_pointer(), + num_threads, + overall_memory_budget_in_bytes, + in_ram); + path_ = std::string(path); + } + + // create reader. + void + create_reader() { + if (writer_ != nullptr) { + reader_ = tantivy_create_reader_from_writer(writer_); + } else if (!path_.empty()) { + assert(tantivy_index_exist(path_.c_str())); + reader_ = tantivy_load_index(path_.c_str()); + } + } + + ~TantivyIndexWrapper() { + free(); + } + + void + register_tokenizer( + const char* tokenizer_name, + const std::map& tokenizer_params) { + RustHashMap m; + m.from(tokenizer_params); + if (reader_ != nullptr) { + tantivy_register_tokenizer( + reader_, tokenizer_name, m.get_pointer()); + } + } + + template + void + add_data(const T* array, uintptr_t len, int64_t offset_begin) { + assert(!finished_); + + if constexpr (std::is_same_v) { + tantivy_index_add_bools(writer_, array, len, offset_begin); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_int8s(writer_, array, len, offset_begin); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_int16s(writer_, array, len, offset_begin); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_int32s(writer_, array, len, offset_begin); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_int64s(writer_, array, len, offset_begin); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_f32s(writer_, array, len, offset_begin); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_f64s(writer_, array, len, 
offset_begin); + return; + } + + if constexpr (std::is_same_v) { + // TODO: not very efficient, a lot of overhead due to rust-ffi call. + for (uintptr_t i = 0; i < len; i++) { + tantivy_index_add_string( + writer_, + static_cast(array)[i].c_str(), + offset_begin + i); + } + return; + } + + throw fmt::format("InvertedIndex.add_data: unsupported data type: {}", + typeid(T).name()); + } + + template + void + add_multi_data(const T* array, uintptr_t len, int64_t offset) { + assert(!finished_); + + if constexpr (std::is_same_v) { + tantivy_index_add_multi_bools(writer_, array, len, offset); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_multi_int8s(writer_, array, len, offset); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_multi_int16s(writer_, array, len, offset); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_multi_int32s(writer_, array, len, offset); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_multi_int64s(writer_, array, len, offset); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_multi_f32s(writer_, array, len, offset); + return; + } + + if constexpr (std::is_same_v) { + tantivy_index_add_multi_f64s(writer_, array, len, offset); + return; + } + + if constexpr (std::is_same_v) { + std::vector views; + for (uintptr_t i = 0; i < len; i++) { + views.push_back(array[i].c_str()); + } + tantivy_index_add_multi_keywords( + writer_, views.data(), len, offset); + return; + } + + throw fmt::format( + "InvertedIndex.add_multi_data: unsupported data type: {}", + typeid(T).name()); + } + + inline void + finish() { + if (finished_) { + return; + } + + tantivy_finish_index(writer_); + writer_ = nullptr; + finished_ = true; + } + + inline void + commit() { + if (writer_ != nullptr) { + tantivy_commit_index(writer_); + } + } + + inline void + reload() { + if (reader_ != nullptr) { + tantivy_reload_index(reader_); + } + } + + inline uint32_t + count() { + return 
tantivy_index_count(reader_); + } + + public: + template + RustArrayWrapper + term_query(T term) { + auto array = [&]() { + if constexpr (std::is_same_v) { + return tantivy_term_query_bool(reader_, term); + } + + if constexpr (std::is_integral_v) { + return tantivy_term_query_i64(reader_, + static_cast(term)); + } + + if constexpr (std::is_floating_point_v) { + return tantivy_term_query_f64(reader_, + static_cast(term)); + } + + if constexpr (std::is_same_v) { + return tantivy_term_query_keyword( + reader_, static_cast(term).c_str()); + } + + throw fmt::format( + "InvertedIndex.term_query: unsupported data type: {}", + typeid(T).name()); + }(); + return RustArrayWrapper(array); + } + + template + RustArrayWrapper + lower_bound_range_query(T lower_bound, bool inclusive) { + auto array = [&]() { + if constexpr (std::is_integral_v) { + return tantivy_lower_bound_range_query_i64( + reader_, static_cast(lower_bound), inclusive); + } + + if constexpr (std::is_floating_point_v) { + return tantivy_lower_bound_range_query_f64( + reader_, static_cast(lower_bound), inclusive); + } + + if constexpr (std::is_same_v) { + return tantivy_lower_bound_range_query_keyword( + reader_, + static_cast(lower_bound).c_str(), + inclusive); + } + + throw fmt::format( + "InvertedIndex.lower_bound_range_query: unsupported data type: " + "{}", + typeid(T).name()); + }(); + return RustArrayWrapper(array); + } + + template + RustArrayWrapper + upper_bound_range_query(T upper_bound, bool inclusive) { + auto array = [&]() { + if constexpr (std::is_integral_v) { + return tantivy_upper_bound_range_query_i64( + reader_, static_cast(upper_bound), inclusive); + } + + if constexpr (std::is_floating_point_v) { + return tantivy_upper_bound_range_query_f64( + reader_, static_cast(upper_bound), inclusive); + } + + if constexpr (std::is_same_v) { + return tantivy_upper_bound_range_query_keyword( + reader_, + static_cast(upper_bound).c_str(), + inclusive); + } + + throw fmt::format( + 
"InvertedIndex.upper_bound_range_query: unsupported data type: " + "{}", + typeid(T).name()); + }(); + return RustArrayWrapper(array); + } + + template + RustArrayWrapper + range_query(T lower_bound, + T upper_bound, + bool lb_inclusive, + bool ub_inclusive) { + auto array = [&]() { + if constexpr (std::is_integral_v) { + return tantivy_range_query_i64( + reader_, + static_cast(lower_bound), + static_cast(upper_bound), + lb_inclusive, + ub_inclusive); + } + + if constexpr (std::is_floating_point_v) { + return tantivy_range_query_f64(reader_, + static_cast(lower_bound), + static_cast(upper_bound), + lb_inclusive, + ub_inclusive); + } + + if constexpr (std::is_same_v) { + return tantivy_range_query_keyword( + reader_, + static_cast(lower_bound).c_str(), + static_cast(upper_bound).c_str(), + lb_inclusive, + ub_inclusive); + } + + throw fmt::format( + "InvertedIndex.range_query: unsupported data type: {}", + typeid(T).name()); + }(); + return RustArrayWrapper(array); + } + + RustArrayWrapper + prefix_query(const std::string& prefix) { + auto array = tantivy_prefix_query_keyword(reader_, prefix.c_str()); + return RustArrayWrapper(array); + } + + RustArrayWrapper + regex_query(const std::string& pattern) { + auto array = tantivy_regex_query(reader_, pattern.c_str()); + return RustArrayWrapper(array); + } + + RustArrayWrapper + match_query(const std::string& query) { + auto array = tantivy_match_query(reader_, query.c_str()); + return RustArrayWrapper(array); + } + + public: + inline IndexWriter + get_writer() { + return writer_; + } + + inline IndexReader + get_reader() { + return reader_; + } + + private: + void + check_search() { + // TODO + } + + void + free() { + if (writer_ != nullptr) { + tantivy_free_index_writer(writer_); + } + + if (reader_ != nullptr) { + tantivy_free_index_reader(reader_); + } + } + + private: + bool finished_ = false; + IndexWriter writer_ = nullptr; + IndexReader reader_ = nullptr; + std::string path_; +}; +} // namespace milvus::tantivy 
diff --git a/third_party/tantivy/test.cpp b/third_party/tantivy/test.cpp new file mode 100644 index 0000000..4ba283d --- /dev/null +++ b/third_party/tantivy/test.cpp @@ -0,0 +1,302 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "tantivy-binding.h" +#include "tantivy-wrapper.h" + +using namespace milvus::tantivy; + +template +void +run() { + std::cout << "run " << typeid(T).name() << std::endl; + + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + + if (tantivy_index_exist(path)) { + auto w = TantivyIndexWrapper(path); + auto cnt = w.count(); + std::cout << "index already exist, open it, count: " << cnt + << std::endl; + return; + } + + auto w = TantivyIndexWrapper("test_field_name", guess_data_type(), path); + + T arr[] = {1, 2, 3, 4, 5, 6}; + auto l = sizeof(arr) / sizeof(T); + + w.add_data(arr, l, 0); + + w.finish(); + + assert(w.count() == l); + + { + auto hits = w.term_query(2); + hits.debug(); + } + + { + auto hits = w.lower_bound_range_query(1, false); + hits.debug(); + } + + { + auto hits = w.upper_bound_range_query(4, false); + hits.debug(); + } + + { + auto hits = w.range_query(2, 4, false, false); + hits.debug(); + } +} + +template <> +void +run() { + std::cout << "run bool" << std::endl; + + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + + if (tantivy_index_exist(path)) { + auto w = TantivyIndexWrapper(path); + auto cnt = w.count(); + std::cout << "index already exist, open it, count: " << cnt + << std::endl; + return; + } + + auto w = + TantivyIndexWrapper("test_field_name", TantivyDataType::Bool, path); + + bool arr[] = {true, false, false, true, false, true}; + auto l = sizeof(arr) / sizeof(bool); + + w.add_data(arr, l, 0); + + w.finish(); + + assert(w.count() == l); + + { + auto hits = w.term_query(true); + hits.debug(); + } +} + +template 
<> +void +run() { + std::cout << "run string" << std::endl; + + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + + if (tantivy_index_exist(path)) { + auto w = TantivyIndexWrapper(path); + auto cnt = w.count(); + std::cout << "index already exist, open it, count: " << cnt + << std::endl; + return; + } + + auto w = + TantivyIndexWrapper("test_field_name", TantivyDataType::Keyword, path); + + std::vector arr = {"a", "b", "aaa", "abbb"}; + auto l = arr.size(); + + w.add_data(arr.data(), l, 0); + + w.finish(); + + assert(w.count() == l); + + { + auto hits = w.term_query("a"); + hits.debug(); + } + + { + auto hits = w.lower_bound_range_query("aa", true); + hits.debug(); + } + + { + auto hits = w.upper_bound_range_query("ab", true); + hits.debug(); + } + + { + auto hits = w.range_query("aa", "ab", true, true); + hits.debug(); + } + + { + auto hits = w.prefix_query("a"); + hits.debug(); + } + + { + auto hits = w.regex_query("a(.|\n)*"); + hits.debug(); + } +} + +void +test_32717() { + using T = int16_t; + + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + + if (tantivy_index_exist(path)) { + auto w = TantivyIndexWrapper(path); + auto cnt = w.count(); + std::cout << "index already exist, open it, count: " << cnt + << std::endl; + return; + } + + auto w = TantivyIndexWrapper("test_field_name", guess_data_type(), path); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(1, 1000); + std::vector arr; + std::map> inverted; + size_t l = 1000000; + for (size_t i = 0; i < l; i++) { + auto n = static_cast(dis(gen)); + arr.push_back(n); + if (inverted.find(n) == inverted.end()) { + inverted[n] = std::set(); + } + inverted[n].insert(i); + } + + w.add_data(arr.data(), l, 0); + w.finish(); + assert(w.count() == l); + + for (int16_t term = 1; term < 1000; term += 10) { + auto 
hits = w.term_query(term); + for (size_t i = 0; i < hits.array_.len; i++) { + assert(arr[hits.array_.array[i]] == term); + } + } +} + +std::set +to_set(const RustArrayWrapper& w) { + std::set s(w.array_.array, w.array_.array + w.array_.len); + return s; +} + +template +std::map> +build_inverted_index(const std::vector>& vec_of_array) { + std::map> inverted_index; + for (uint32_t i = 0; i < vec_of_array.size(); i++) { + for (const auto& term : vec_of_array[i]) { + inverted_index[term].insert(i); + } + } + return inverted_index; +} + +void +test_array_int() { + using T = int64_t; + + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + auto w = TantivyIndexWrapper("test_field_name", guess_data_type(), path); + + std::vector> vec_of_array{ + {10, 40, 50}, + {20, 50}, + {10, 50, 60}, + }; + + int64_t offset = 0; + for (const auto& arr : vec_of_array) { + w.add_multi_data(arr.data(), arr.size(), offset++); + } + w.finish(); + + assert(w.count() == vec_of_array.size()); + + auto inverted_index = build_inverted_index(vec_of_array); + for (const auto& [term, posting_list] : inverted_index) { + auto hits = to_set(w.term_query(term)); + assert(posting_list == hits); + } +} + +void +test_array_string() { + using T = std::string; + + auto path = "/tmp/inverted-index/test-binding/"; + boost::filesystem::remove_all(path); + boost::filesystem::create_directories(path); + auto w = + TantivyIndexWrapper("test_field_name", TantivyDataType::Keyword, path); + + std::vector> vec_of_array{ + {"10", "40", "50"}, + {"20", "50"}, + {"10", "50", "60"}, + }; + + int64_t offset = 0; + for (const auto& arr : vec_of_array) { + w.add_multi_data(arr.data(), arr.size(), offset++); + } + w.finish(); + + assert(w.count() == vec_of_array.size()); + + auto inverted_index = build_inverted_index(vec_of_array); + for (const auto& [term, posting_list] : inverted_index) { + auto hits = to_set(w.term_query(term)); + 
assert(posting_list == hits); + } +} + +int +main(int argc, char* argv[]) { + test_32717(); + + run(); + run(); + run(); + run(); + + run(); + run(); + + run(); + + run(); + + test_array_int(); + test_array_string(); + + return 0; +} diff --git a/third_party/tantivy/text_demo.cpp b/third_party/tantivy/text_demo.cpp new file mode 100644 index 0000000..c084319 --- /dev/null +++ b/third_party/tantivy/text_demo.cpp @@ -0,0 +1,68 @@ +#include +#include +#include + +#include "tantivy-binding.h" +#include "tantivy-wrapper.h" + +using namespace milvus::tantivy; + +std::set +to_set(const RustArrayWrapper& w) { + std::set s(w.array_.array, w.array_.array + w.array_.len); + return s; +} + +int +main(int argc, char* argv[]) { + auto text_index = TantivyIndexWrapper("text_demo", true, ""); + auto write_single_text = [&text_index](const std::string& s, + int64_t offset) { + text_index.add_data(&s, 1, offset); + }; + + { + write_single_text("football, basketball, pingpang", 0); + write_single_text("swimming, football", 1); + write_single_text("Avatar", 2); + write_single_text("Action, Adventure, Fantasy, Science Fiction", 3); + write_single_text("Ingenious Film Partners, Twentiesth Century Fox", 4); + write_single_text("Sam Worthington as Jack Sully", 5); + text_index.commit(); + } + + text_index.create_reader(); + { + auto result = to_set(text_index.match_query("football")); + assert(result.size() == 2); + assert(result.find(0) != result.end()); + assert(result.find(1) != result.end()); + } + + { + auto result = to_set(text_index.match_query("basketball")); + assert(result.size() == 1); + assert(result.find(0) != result.end()); + } + + { + auto result = to_set(text_index.match_query("swimming")); + assert(result.size() == 1); + assert(result.find(1) != result.end()); + } + + { + auto result = to_set(text_index.match_query("basketball, swimming")); + assert(result.size() == 2); + assert(result.find(0) != result.end()); + assert(result.find(1) != result.end()); + } + + { + auto 
result = to_set(text_index.match_query("avatar")); + assert(result.size() == 1); + assert(result.find(2) != result.end()); + } + + return 0; +} diff --git a/third_party/tantivy/time_recorder.h b/third_party/tantivy/time_recorder.h new file mode 100644 index 0000000..c2a8d7b --- /dev/null +++ b/third_party/tantivy/time_recorder.h @@ -0,0 +1,65 @@ +#pragma once + +#include +#include +#include + +class TimeRecorder { + using stdclock = std::chrono::high_resolution_clock; + + public: + // trace = 0, debug = 1, info = 2, warn = 3, error = 4, critical = 5 + explicit TimeRecorder(std::string hdr, int64_t log_level = 0) + : header_(std::move(hdr)), log_level_(log_level) { + start_ = last_ = stdclock::now(); + } + virtual ~TimeRecorder() = default; + + double + RecordSection(const std::string& msg) { + stdclock::time_point curr = stdclock::now(); + double span = + (std::chrono::duration(curr - last_)).count(); + last_ = curr; + + PrintTimeRecord(msg, span); + return span; + } + + double + ElapseFromBegin(const std::string& msg) { + stdclock::time_point curr = stdclock::now(); + double span = + (std::chrono::duration(curr - start_)).count(); + + PrintTimeRecord(msg, span); + return span; + } + + static std::string + GetTimeSpanStr(double span) { + std::string str_ms = std::to_string(span * 0.001) + " ms"; + return str_ms; + } + + private: + void + PrintTimeRecord(const std::string& msg, double span) { + std::string str_log; + if (!header_.empty()) { + str_log += header_ + ": "; + } + str_log += msg; + str_log += " ("; + str_log += TimeRecorder::GetTimeSpanStr(span); + str_log += ")"; + + std::cout << str_log << std::endl; + } + + private: + std::string header_; + stdclock::time_point start_; + stdclock::time_point last_; + int64_t log_level_; +}; diff --git a/third_party/tantivy/token-stream.h b/third_party/tantivy/token-stream.h new file mode 100644 index 0000000..03718be --- /dev/null +++ b/third_party/tantivy/token-stream.h @@ -0,0 +1,50 @@ +#pragma once + +#include 
+#include +#include + +#include "tantivy-binding.h" +#include "rust-binding.h" + +namespace milvus::tantivy { +struct TokenStream { + public: + NO_COPY_OR_ASSIGN(TokenStream); + + TokenStream(void* ptr, std::shared_ptr text) + : ptr_(ptr), text_(text) { + assert(ptr != nullptr); + } + + ~TokenStream() { + if (ptr_ != nullptr) { + tantivy_free_token_stream(ptr_); + } + } + + public: + bool + advance() { + return tantivy_token_stream_advance(ptr_); + } + + std::string + get_token() { + auto token = tantivy_token_stream_get_token(ptr_); + std::string s(token); + free_rust_string(token); + return s; + } + + // Note: the returned token must be freed by calling `free_rust_string`. + const char* + get_token_no_copy() { + return tantivy_token_stream_get_token(ptr_); + } + + public: + void* ptr_; + std::shared_ptr text_; +}; +} // namespace milvus::tantivy diff --git a/third_party/tantivy/tokenizer.h b/third_party/tantivy/tokenizer.h new file mode 100644 index 0000000..dd75320 --- /dev/null +++ b/third_party/tantivy/tokenizer.h @@ -0,0 +1,50 @@ +#pragma once + +#include "tantivy-binding.h" +#include "rust-binding.h" +#include "rust-hashmap.h" +#include "token-stream.h" + +namespace milvus::tantivy { + +struct Tokenizer { + public: + NO_COPY_OR_ASSIGN(Tokenizer); + + explicit Tokenizer(const std::map& params) { + RustHashMap m; + m.from(params); + ptr_ = tantivy_create_tokenizer(m.get_pointer()); + if (ptr_ == nullptr) { + throw std::invalid_argument("invalid tokenizer parameters"); + } + } + + ~Tokenizer() { + if (ptr_ != nullptr) { + tantivy_free_tokenizer(ptr_); + } + } + + std::unique_ptr + CreateTokenStream(std::string&& text) { + auto shared_text = std::make_shared(std::move(text)); + auto token_stream = + tantivy_create_token_stream(ptr_, shared_text->c_str()); + return std::make_unique(token_stream, shared_text); + } + + // CreateTokenStreamCopyText will copy the text and then create token stream based on the text. 
+ std::unique_ptr + CreateTokenStreamCopyText(const std::string& text) { + auto shared_text = std::make_shared(text); + auto token_stream = + tantivy_create_token_stream(ptr_, shared_text->c_str()); + return std::make_unique(token_stream, shared_text); + } + + private: + void* ptr_; +}; + +} // namespace milvus::tantivy diff --git a/third_party/tantivy/tokenizer_demo.cpp b/third_party/tantivy/tokenizer_demo.cpp new file mode 100644 index 0000000..7538751 --- /dev/null +++ b/third_party/tantivy/tokenizer_demo.cpp @@ -0,0 +1,47 @@ +#include +#include "token-stream.h" +#include "tokenizer.h" + +using Map = std::map; + +using namespace milvus::tantivy; + +void +test_tokenizer(const Map& m, std::string&& text) { + Tokenizer tokenizer(m); + + auto token_stream = tokenizer.CreateTokenStream(std::move(text)); + while (token_stream->advance()) { + auto token = token_stream->get_token(); + std::cout << token << std::endl; + } +} + +int +main(int argc, char* argv[]) { + // default tokenizer + { + Map m; + test_tokenizer(m, "football, basketball, pingpang"); + test_tokenizer(m, "Avatar"); + test_tokenizer(m, "Action, Adventure, Fantasy, Science Fiction"); + test_tokenizer(m, "Ingenious Film Partners, Twentiesth Century Fox"); + test_tokenizer(m, "Sam Worthington as Jack Sully"); + } + + // jieba tokenizer + { + Map m; + std::string tokenizer_name = "jieba"; + m["tokenizer"] = tokenizer_name; + test_tokenizer(m, + "张华考上了北京大学;李萍进了中等技术学校;我在百货公司" + "当售货员:我们都有光明的前途"); + test_tokenizer(m, "青铜时代"); + test_tokenizer(m, "黄金时代"); + test_tokenizer(m, "时代"); + test_tokenizer(m, "测试中文分词器的效果"); + } + + return 0; +} diff --git a/tools/vcpkg b/tools/vcpkg deleted file mode 160000 index 7ba0ba7..0000000 --- a/tools/vcpkg +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7ba0ba7334c3346e7eee1e049ba85da193a8d821 diff --git a/vcpkg-configuration.json b/vcpkg-configuration.json deleted file mode 100644 index 7699f3f..0000000 --- a/vcpkg-configuration.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - 
"overlay-ports": [ - "./ports" - ] -} diff --git a/vcpkg.json b/vcpkg.json deleted file mode 100644 index 1b7f4e9..0000000 --- a/vcpkg.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "name": "lintdb", - "version-string": "0.5.1", - "license": "MIT", - "dependencies": [ - "faiss", - "rocksdb", - "flatbuffers", - "gtest", - "glog", - "jsoncpp", - "ms-gsl", - "benchmark", - "intel-mkl", - "openblas", - "bitsery", - "drogon", - "args" - ] -}